In [13]:
import pandas as pd
from pathlib import Path

In [14]:
# Defining Paths
population_path = Path("../Group3Project1/Resources/sub-est2022.csv")
snp500_path = Path("../Group3Project1/Resources/constituents.csv")

# Reading CSV
population_data = pd.read_csv(population_path, header=0) 
snp500_data = pd.read_csv(snp500_path, index_col="Symbol")

In [15]:
# Preparing and Cleaning SUMLEV 162(Cities) Population Data
cities_population_data = population_data[population_data["SUMLEV"] == 162]
cities_population_data = cities_population_data[['NAME', 'STNAME', 'POPESTIMATE2022']]
cities_population_data.columns = ['city', 'state', 'population']
cities_population_data['city'] = cities_population_data['city'].str.replace(
  r' (City|city|town|village|borough)', '', regex=True)
cities_population_data['city'] = cities_population_data['city'].str.replace(r'\bSt\. \b', 'Saint ', regex=True)

In [16]:
# Preparing and Cleaning SUMLEV 050(County) Population Data
counties_population_data = population_data[population_data["SUMLEV"] == 50]
counties_population_data = counties_population_data[['NAME', 'STNAME', 'POPESTIMATE2022']]
counties_population_data.columns = ['city', 'state', 'population']

counties_population_data['city'] = counties_population_data['city'].str.replace(
  r' (city|town|village|County|Parish|Planning Region|Census Area|City and Borough|Borough|Municipality)$',
  '', regex=True)
counties_population_data['city'] = counties_population_data['city'].str.replace(r'\bSt\. \b', 'Saint ', regex=True)
counties_population_data['city'] = counties_population_data['city'].str.strip()

In [17]:
# Preparing and Cleaning SUMLEV 061(Towns) Population Data
towns_population_data = population_data[population_data["SUMLEV"] == 61]
towns_population_data = towns_population_data[['NAME', 'STNAME', 'POPESTIMATE2022']]
towns_population_data.columns = ['city', 'state', 'population']
towns_population_data['city'] = towns_population_data['city'].str.replace(
  r' (City|city|township|town|village|-Troy Hills)', '', regex=True)
towns_population_data['city'] = towns_population_data['city'].str.replace(r'\bSt\. \b', 'Saint ', regex=True)

In [18]:
# Cleaning and preparing Louisville(SUMLEV 157) and Nashville(SUMLEV 172) data
targeted_population_data = population_data[population_data["SUMLEV"].isin([157, 172])]
targeted_population_data = targeted_population_data[['NAME', 'STNAME', 'POPESTIMATE2022']]
targeted_population_data.columns = ['city', 'state', 'population']

targeted_population_data['city'] = targeted_population_data['city'].str.replace(
  r'(-Davidson metropolitan government \(balance\)|/Jefferson County metro government \(balance\))', '', regex=True)

In [19]:
#Preparing and cleaning S&P500 Data
split_locations = snp500_data['Headquarters Location'].str.split(',', expand=True)
snp500_data['city'] = split_locations[0]
snp500_data['state'] = split_locations[1] if split_locations.shape[1] > 1 else None

snp500_data['city'] = snp500_data['city'].str.strip()
snp500_data['state'] = snp500_data['state'].str.strip() if snp500_data['state'] is not None else None
snp500_data['city'] = snp500_data['city'].str.replace(r' (City|County|Village|Ranch)$', '', regex=True)
snp500_data['city'] = snp500_data['city'].str.replace(r'\bSt\. \b', 'Saint ', regex=True)
if 'state' in snp500_data.columns:
  snp500_data['state'] = snp500_data['state'].str.replace(r'\bD.C\.', 'District of Columbia', regex=True)

In [20]:
#Intial Merge with cities data
snp500_with_population = pd.merge(snp500_data, cities_population_data, on=['city', 'state'], how='left')


Unnamed: 0,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,city,state,population
0,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902,Saint Paul,Minnesota,303176.0
1,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916,Milwaukee,Wisconsin,563305.0
2,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888,North Chicago,Illinois,30490.0
3,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888),North Chicago,Illinois,30490.0
4,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,Dublin,Ireland,


In [21]:
# Function to check for NANs and fill them with population_data
def fill_population(row):
	if pd.isna(row['population']):
		# Try to get the population from the county data
		county_population = counties_population_data[
			(counties_population_data['city'] == row['city']) &
			(counties_population_data['state'] == row['state'])
			]['population'].values
		if len(county_population) > 0:
			return county_population[0]
		# If County data not found try Towns data
		town_population = towns_population_data[
			(towns_population_data['city'] == row['city']) &
			(towns_population_data['state'] == row['state'])
		]['population'].values
		if len(town_population) > 0:
			return town_population[0]
		targeted_population = targeted_population_data[
			(targeted_population_data['city'] == row['city']) &
			(targeted_population_data['state'] == row['state'])
			]['population'].values
		return targeted_population[0] if len(targeted_population) > 0 else None
	return row['population']

snp500_with_population['population'] = snp500_with_population.apply(fill_population, axis=1)

path_to_save_csv = '../Group3Project1/Resources/snp500_with_population.csv'
snp500_with_population.to_csv(path_to_save_csv, index=False)

In [22]:
#print(snp500_with_population.info())
print(snp500_with_population['population'].isna().sum())

51


In [23]:
#Creating CSV with NAN population values rows removed
snp500_no_nan_population = snp500_with_population.dropna(subset=['population'])

snp500_no_nan_population_path = '../Group3Project1/Resources/snp500_no_nan_population.csv'
snp500_no_nan_population.to_csv(snp500_no_nan_population_path, index=False)