This notebook contains the entire ETL pipeline executed to generate the county data required for subsequent model training. In short, data is imported from xlsx files published by the U.S. Census Bureau, cleaned and transformed, enriched with segmentation labels via the unsupervised learning algorithm 'KMeans', and finally joined with the target (labeled outcome) column used for binary classification. Model composition, training and optimization occur within another repository notebook.

In [1]:
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

**Demographic Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [2]:
# Load the raw 2012 U.S. Census Bureau demographic workbook into a DataFrame and preview it
initial_pa_demographic_2012 = pd.read_excel(io="Resources/PA_Demographics_2012.xlsx")
initial_pa_demographic_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,Label,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
2,Total population,101482,1229338,68409,170245,413491,127121,627053,184970,141584,...,168798,808460,299267,94428,1547607,147063,76957,208716,363395,437846
3,Male,49.2%,48.0%,49.6%,48.3%,49.1%,48.8%,49.0%,49.6%,49.6%,...,49.0%,48.5%,49.2%,51.0%,47.3%,50.5%,51.7%,48.8%,48.8%,49.3%
4,Female,50.8%,52.0%,50.4%,51.7%,50.9%,51.2%,51.0%,50.4%,50.4%,...,51.0%,51.5%,50.8%,49.0%,52.7%,49.5%,48.3%,51.2%,51.2%,50.7%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,Some other race alone,0.0%,0.1%,0.0%,0.1%,0.0%,0.0%,0.1%,0.0%,0.1%,...,0.3%,0.3%,0.2%,0.1%,0.2%,0.0%,0.0%,0.1%,0.1%,0.0%
82,Two or more races,1.7%,2.1%,0.9%,1.6%,1.2%,1.6%,1.5%,1.2%,1.7%,...,2.5%,1.9%,1.7%,0.5%,2.2%,1.2%,0.7%,2.2%,0.9%,1.8%
83,Two races including Some other race,0.1%,0.1%,0.0%,0.0%,0.0%,0.0%,0.1%,0.0%,0.0%,...,0.1%,0.0%,0.1%,0.0%,0.1%,0.0%,0.0%,0.0%,0.0%,0.1%
84,"Two races excluding Some other race, and Three...",1.6%,2.0%,0.9%,1.6%,1.2%,1.6%,1.4%,1.2%,1.7%,...,2.4%,1.9%,1.6%,0.5%,2.1%,1.2%,0.7%,2.2%,0.8%,1.7%


In [3]:
# Keep only the eight summary rows of interest, picked by their position in the raw sheet
# (labels such as 'Male'/'Female' repeat, so the rows cannot be selected by label alone):
# % Male, % Female, % Male > 18 Years Old, % Female > 18 Years Old, % White,
# % Black or African American, % American Indian and Alaska Native, % Asian
initial_pa_demographic_2012 = initial_pa_demographic_2012.iloc[[3, 4, 24, 25, 62, 63, 64, 65]]
initial_pa_demographic_2012.head(8)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
3,Male,49.2%,48.0%,49.6%,48.3%,49.1%,48.8%,49.0%,49.6%,49.6%,...,49.0%,48.5%,49.2%,51.0%,47.3%,50.5%,51.7%,48.8%,48.8%,49.3%
4,Female,50.8%,52.0%,50.4%,51.7%,50.9%,51.2%,51.0%,50.4%,50.4%,...,51.0%,51.5%,50.8%,49.0%,52.7%,49.5%,48.3%,51.2%,51.2%,50.7%
24,Male,48.6%,47.3%,49.3%,47.7%,48.4%,48.0%,48.4%,48.8%,49.2%,...,48.9%,47.8%,48.4%,50.7%,46.2%,50.5%,51.8%,48.3%,48.4%,48.8%
25,Female,51.4%,52.7%,50.7%,52.3%,51.6%,52.0%,51.6%,51.2%,50.8%,...,51.1%,52.2%,51.6%,49.3%,53.8%,49.5%,48.2%,51.7%,51.6%,51.2%
62,White,94.4%,83.1%,98.6%,92.7%,87.5%,97.7%,90.9%,97.6%,95.7%,...,82.1%,83.4%,89.3%,95.2%,43.4%,96.0%,96.6%,96.1%,95.9%,91.1%
63,Black or African American,2.0%,14.5%,1.4%,7.6%,6.1%,2.4%,4.5%,1.7%,3.8%,...,15.0%,9.9%,7.0%,3.7%,44.9%,3.5%,2.9%,4.3%,3.5%,7.1%
64,American Indian and Alaska Native,0.5%,0.7%,0.6%,0.5%,1.0%,0.7%,0.5%,0.4%,1.2%,...,0.6%,0.4%,0.7%,0.9%,0.9%,0.5%,0.6%,0.7%,0.2%,0.5%
65,Asian,1.0%,3.6%,0.3%,0.7%,1.8%,1.2%,4.8%,1.3%,0.8%,...,2.7%,7.4%,3.0%,0.5%,7.1%,0.8%,0.4%,1.0%,1.1%,1.7%


In [4]:
# Flip rows and columns so that each county becomes a row and each statistic a column,
# which is the shape needed for the later merges
initial_pa_demographic_2012 = initial_pa_demographic_2012.transpose()
initial_pa_demographic_2012.head()


Unnamed: 0,3,4,24,25,62,63,64,65
Unnamed: 0,Male,Female,Male,Female,White,Black or African American,American Indian and Alaska Native,Asian
"Adams County, Pennsylvania",49.2%,50.8%,48.6%,51.4%,94.4%,2.0%,0.5%,1.0%
"Allegheny County, Pennsylvania",48.0%,52.0%,47.3%,52.7%,83.1%,14.5%,0.7%,3.6%
"Armstrong County, Pennsylvania",49.6%,50.4%,49.3%,50.7%,98.6%,1.4%,0.6%,0.3%
"Beaver County, Pennsylvania",48.3%,51.7%,47.7%,52.3%,92.7%,7.6%,0.5%,0.7%


In [5]:
# Move the county names out of the index into a regular column and label that column 'County'
initial_pa_demographic_2012 = (
    initial_pa_demographic_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_demographic_2012.head()


Unnamed: 0,County,3,4,24,25,62,63,64,65
0,Unnamed: 0,Male,Female,Male,Female,White,Black or African American,American Indian and Alaska Native,Asian
1,"Adams County, Pennsylvania",49.2%,50.8%,48.6%,51.4%,94.4%,2.0%,0.5%,1.0%
2,"Allegheny County, Pennsylvania",48.0%,52.0%,47.3%,52.7%,83.1%,14.5%,0.7%,3.6%
3,"Armstrong County, Pennsylvania",49.6%,50.4%,49.3%,50.7%,98.6%,1.4%,0.6%,0.3%
4,"Beaver County, Pennsylvania",48.3%,51.7%,47.7%,52.3%,92.7%,7.6%,0.5%,0.7%


In [6]:
# Inspecting the column labels prior to renaming: apart from 'County' they are the
# original row positions carried over by the transpose, i.e. integers rather than strings
print(initial_pa_demographic_2012.columns)


Index(['County', 3, 4, 24, 25, 62, 63, 64, 65], dtype='object')


In [7]:
# Cast every column label to a string so the rename mapping can use uniform string keys
initial_pa_demographic_2012.columns = initial_pa_demographic_2012.columns.map(str)
print(initial_pa_demographic_2012.columns)


Index(['County', '3', '4', '24', '25', '62', '63', '64', '65'], dtype='object')


In [8]:
# Swap every cell that is exactly 'N' for NaN so the later numeric conversion treats it as missing
# (presumably 'N' is the source's marker for an unavailable estimate - TODO confirm)
initial_pa_demographic_2012 = initial_pa_demographic_2012.replace('N', np.nan)


In [9]:
# Final cleaning pass for the 2012 demographic frame:
#   1. give the positional column labels descriptive names,
#   2. drop the obsolete label row (index 0) and renumber the remaining rows,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string (e.g. '49.2%') to a float fraction (0.492).
percent_column_names = {
    '3': '% Male',
    '4': '% Female',
    '24': '% Male > 18 Years Old',
    '25': '% Female > 18 Years Old',
    '62': '% White',
    '63': '% Black or African American',
    '64': '% American Indian and Alaska Native',
    '65': '% Asian',
}
initial_pa_demographic_2012 = (
    initial_pa_demographic_2012
    .rename(columns=percent_column_names)
    .drop(0)
    .reset_index(drop=True)
)
# Vectorized .str methods pass missing values through as NaN instead of raising
initial_pa_demographic_2012['County'] = (
    initial_pa_demographic_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_demographic_2012.insert(0, 'Year', 2012)
# Every renamed column holds percentage strings; NaN placeholders survive the conversion as NaN
for percent_column in percent_column_names.values():
    initial_pa_demographic_2012[percent_column] = (
        initial_pa_demographic_2012[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_demographic_2012


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.944,0.02,0.005,0.01
1,2012,ALLEGHENY,0.48,0.52,0.473,0.527,0.831,0.145,0.007,0.036
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.986,0.014,0.006,0.003
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.927,0.076,0.005,0.007
4,2012,BERKS,0.491,0.509,0.484,0.516,0.875,0.061,0.01,0.018
5,2012,BLAIR,0.488,0.512,0.48,0.52,0.977,0.024,0.007,0.012
6,2012,BUCKS,0.49,0.51,0.484,0.516,0.909,0.045,0.005,0.048
7,2012,BUTLER,0.496,0.504,0.488,0.512,0.976,0.017,0.004,0.013
8,2012,CAMBRIA,0.496,0.504,0.492,0.508,0.957,0.038,0.012,0.008
9,2012,CARBON,0.489,0.511,0.481,0.519,0.968,0.03,0.013,


**2016**
-

In [10]:
# Load the raw 2016 U.S. Census Bureau demographic workbook into a DataFrame and preview it
initial_pa_demographic_2016 = pd.read_excel(io="Resources/PA_Demographics_2016.xlsx")
initial_pa_demographic_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,Label,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
2,Total population,102180,1225365,66486,167429,414812,124650,626399,186847,134732,...,166098,821725,302294,92541,1567872,143573,75061,207981,355458,443744
3,Male,49.3%,48.3%,49.7%,48.7%,49.3%,49.0%,49.1%,49.7%,49.1%,...,49.6%,48.6%,49.4%,51.1%,47.4%,51.2%,51.6%,48.9%,48.8%,49.3%
4,Female,50.7%,51.7%,50.3%,51.3%,50.7%,51.0%,50.9%,50.3%,50.9%,...,50.4%,51.4%,50.6%,48.9%,52.6%,48.8%,48.4%,51.1%,51.2%,50.7%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,Total housing units,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
86,"CITIZEN, VOTING AGE POPULATION",,,,,,,,,,...,,,,,,,,,,
87,"Citizen, 18 and over population",79061,959493,53519,133884,307577,98678,473185,146228,108117,...,127134,612189,231516,73819,1116915,114204,61071,165178,287378,336501
88,Male,48.6%,47.5%,49.3%,48.3%,49.0%,48.0%,48.5%,48.9%,48.4%,...,49.2%,47.7%,48.5%,50.5%,45.7%,51.4%,52.0%,48.2%,48.4%,48.7%


In [11]:
# Keep only the eight summary rows of interest, picked by their position in the raw sheet
# (labels such as 'Male'/'Female' repeat, so the rows cannot be selected by label alone):
# % Male, % Female, % Male > 18 Years Old, % Female > 18 Years Old, % White,
# % Black or African American, % American Indian and Alaska Native, % Asian
initial_pa_demographic_2016 = initial_pa_demographic_2016.iloc[[3, 4, 24, 25, 62, 63, 64, 65]]
initial_pa_demographic_2016.head(8)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
3,Male,49.3%,48.3%,49.7%,48.7%,49.3%,49.0%,49.1%,49.7%,49.1%,...,49.6%,48.6%,49.4%,51.1%,47.4%,51.2%,51.6%,48.9%,48.8%,49.3%
4,Female,50.7%,51.7%,50.3%,51.3%,50.7%,51.0%,50.9%,50.3%,50.9%,...,50.4%,51.4%,50.6%,48.9%,52.6%,48.8%,48.4%,51.1%,51.2%,50.7%
24,Male,49.0%,47.7%,49.3%,48.2%,48.6%,48.0%,48.5%,49.0%,48.6%,...,49.0%,47.9%,48.6%,50.7%,46.3%,51.3%,51.9%,48.3%,48.3%,48.8%
25,Female,51.0%,52.3%,50.7%,51.8%,51.4%,52.0%,51.5%,51.0%,51.4%,...,51.0%,52.1%,51.4%,49.3%,53.7%,48.7%,48.1%,51.7%,51.7%,51.2%
62,White,94.8%,82.8%,98.8%,93.5%,89.2%,96.8%,90.0%,96.9%,96.4%,...,77.8%,81.9%,89.9%,95.8%,42.8%,94.4%,96.3%,95.2%,96.0%,90.5%
63,Black or African American,1.9%,14.7%,0.5%,7.4%,8.6%,2.6%,4.9%,1.6%,4.0%,...,15.9%,10.4%,7.4%,3.3%,44.2%,3.8%,3.2%,4.3%,3.4%,7.4%
64,American Indian and Alaska Native,1.0%,0.8%,0.6%,0.5%,4.0%,0.3%,0.6%,0.3%,1.2%,...,0.8%,0.7%,0.6%,0.4%,1.2%,0.5%,0.4%,0.7%,0.3%,0.8%
65,Asian,1.1%,4.3%,1.0%,0.8%,1.8%,1.0%,5.4%,1.8%,0.8%,...,2.9%,8.3%,3.3%,0.6%,7.9%,0.9%,N,1.3%,1.2%,1.8%


In [12]:
# Flip rows and columns so that each county becomes a row and each statistic a column,
# which is the shape needed for the later merges
initial_pa_demographic_2016 = initial_pa_demographic_2016.transpose()
initial_pa_demographic_2016.head()


Unnamed: 0,3,4,24,25,62,63,64,65
Unnamed: 0,Male,Female,Male,Female,White,Black or African American,American Indian and Alaska Native,Asian
"Adams County, Pennsylvania",49.3%,50.7%,49.0%,51.0%,94.8%,1.9%,1.0%,1.1%
"Allegheny County, Pennsylvania",48.3%,51.7%,47.7%,52.3%,82.8%,14.7%,0.8%,4.3%
"Armstrong County, Pennsylvania",49.7%,50.3%,49.3%,50.7%,98.8%,0.5%,0.6%,1.0%
"Beaver County, Pennsylvania",48.7%,51.3%,48.2%,51.8%,93.5%,7.4%,0.5%,0.8%


In [13]:
# Move the county names out of the index into a regular column and label that column 'County'
initial_pa_demographic_2016 = (
    initial_pa_demographic_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_demographic_2016.head()


Unnamed: 0,County,3,4,24,25,62,63,64,65
0,Unnamed: 0,Male,Female,Male,Female,White,Black or African American,American Indian and Alaska Native,Asian
1,"Adams County, Pennsylvania",49.3%,50.7%,49.0%,51.0%,94.8%,1.9%,1.0%,1.1%
2,"Allegheny County, Pennsylvania",48.3%,51.7%,47.7%,52.3%,82.8%,14.7%,0.8%,4.3%
3,"Armstrong County, Pennsylvania",49.7%,50.3%,49.3%,50.7%,98.8%,0.5%,0.6%,1.0%
4,"Beaver County, Pennsylvania",48.7%,51.3%,48.2%,51.8%,93.5%,7.4%,0.5%,0.8%


In [14]:
# Inspecting the column labels prior to renaming: apart from 'County' they are the
# original row positions carried over by the transpose, i.e. integers rather than strings
print(initial_pa_demographic_2016.columns)


Index(['County', 3, 4, 24, 25, 62, 63, 64, 65], dtype='object')


In [15]:
# Cast every column label to a string so the rename mapping can use uniform string keys
initial_pa_demographic_2016.columns = initial_pa_demographic_2016.columns.map(str)
print(initial_pa_demographic_2016.columns)


Index(['County', '3', '4', '24', '25', '62', '63', '64', '65'], dtype='object')


In [16]:
# Swap every cell that is exactly 'N' for NaN so the later numeric conversion treats it as missing
# (presumably 'N' is the source's marker for an unavailable estimate - TODO confirm)
initial_pa_demographic_2016 = initial_pa_demographic_2016.replace('N', np.nan)


In [17]:
# Final cleaning pass for the 2016 demographic frame:
#   1. give the positional column labels descriptive names,
#   2. drop the obsolete label row (index 0) and renumber the remaining rows,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string (e.g. '49.3%') to a float fraction (0.493).
percent_column_names = {
    '3': '% Male',
    '4': '% Female',
    '24': '% Male > 18 Years Old',
    '25': '% Female > 18 Years Old',
    '62': '% White',
    '63': '% Black or African American',
    '64': '% American Indian and Alaska Native',
    '65': '% Asian',
}
initial_pa_demographic_2016 = (
    initial_pa_demographic_2016
    .rename(columns=percent_column_names)
    .drop(0)
    .reset_index(drop=True)
)
# Vectorized .str methods pass missing values through as NaN instead of raising
initial_pa_demographic_2016['County'] = (
    initial_pa_demographic_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_demographic_2016.insert(0, 'Year', 2016)
# Every renamed column holds percentage strings; NaN placeholders survive the conversion as NaN
for percent_column in percent_column_names.values():
    initial_pa_demographic_2016[percent_column] = (
        initial_pa_demographic_2016[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_demographic_2016


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian
0,2016,ADAMS,0.493,0.507,0.49,0.51,0.948,0.019,0.01,0.011
1,2016,ALLEGHENY,0.483,0.517,0.477,0.523,0.828,0.147,0.008,0.043
2,2016,ARMSTRONG,0.497,0.503,0.493,0.507,0.988,0.005,0.006,0.01
3,2016,BEAVER,0.487,0.513,0.482,0.518,0.935,0.074,0.005,0.008
4,2016,BERKS,0.493,0.507,0.486,0.514,0.892,0.086,0.04,0.018
5,2016,BLAIR,0.49,0.51,0.48,0.52,0.968,0.026,0.003,0.01
6,2016,BUCKS,0.491,0.509,0.485,0.515,0.9,0.049,0.006,0.054
7,2016,BUTLER,0.497,0.503,0.49,0.51,0.969,0.016,0.003,0.018
8,2016,CAMBRIA,0.491,0.509,0.486,0.514,0.964,0.04,0.012,0.008
9,2016,CARBON,0.505,0.495,0.5,0.5,0.963,0.031,,


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable; as such, data from the closest year prior to the election (2019) was utilized.

In [18]:
# Load the raw 2019 U.S. Census Bureau demographic workbook into a DataFrame and preview it
initial_pa_demographic_2019 = pd.read_excel(io="Resources/PA_Demographics_2019.xlsx")
initial_pa_demographic_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,Label,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
2,Total population,103009,1216045,64735,163929,421164,121829,628270,187853,130192,...,170271,830915,305285,90843,1584064,141359,73447,206865,348899,449058
3,Male,49.4%,48.4%,50.1%,48.3%,49.2%,49.1%,49.1%,49.1%,49.0%,...,49.5%,48.6%,49.3%,50.6%,47.3%,51.5%,52.2%,49.1%,49.1%,49.6%
4,Female,50.6%,51.6%,49.9%,51.7%,50.8%,50.9%,50.9%,50.9%,51.0%,...,50.5%,51.4%,50.7%,49.4%,52.7%,48.5%,47.8%,50.9%,50.9%,50.4%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Total housing units,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
91,"CITIZEN, VOTING AGE POPULATION",,,,,,,,,,...,,,,,,,,,,
92,"Citizen, 18 and over population",81064,953365,52096,131153,316613,96704,479735,149363,105148,...,133897,623630,236683,72014,1147538,112395,59678,164801,283418,340800
93,Male,48.5%,47.5%,49.9%,47.8%,48.9%,48.3%,48.3%,48.6%,48.5%,...,48.8%,47.9%,48.6%,50.3%,46.0%,51.5%,52.2%,48.3%,48.5%,48.8%


In [19]:
# Keep only the eight summary rows of interest, picked by their position in the raw sheet
# (the positions used here differ from the ones used for the 2012 and 2016 sheets):
# % Male, % Female, % Male > 18 Years Old, % Female > 18 Years Old, % White,
# % Black or African American, % American Indian and Alaska Native, % Asian
initial_pa_demographic_2019 = initial_pa_demographic_2019.iloc[[3, 4, 27, 28, 67, 68, 69, 70]]
initial_pa_demographic_2019.head(8)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
3,Male,49.4%,48.4%,50.1%,48.3%,49.2%,49.1%,49.1%,49.1%,49.0%,...,49.5%,48.6%,49.3%,50.6%,47.3%,51.5%,52.2%,49.1%,49.1%,49.6%
4,Female,50.6%,51.6%,49.9%,51.7%,50.8%,50.9%,50.9%,50.9%,51.0%,...,50.5%,51.4%,50.7%,49.4%,52.7%,48.5%,47.8%,50.9%,50.9%,50.4%
27,Male,48.7%,47.8%,49.8%,47.9%,48.7%,48.2%,48.6%,48.6%,48.4%,...,49.2%,48.0%,48.6%,50.4%,46.3%,51.3%,52.4%,48.6%,48.5%,48.8%
28,Female,51.3%,52.2%,50.2%,52.1%,51.3%,51.8%,51.4%,51.4%,51.6%,...,50.8%,52.0%,51.4%,49.6%,53.7%,48.7%,47.6%,51.4%,51.5%,51.2%
67,White,94.4%,82.1%,98.1%,92.9%,82.4%,97.5%,88.6%,97.6%,94.9%,...,80.4%,81.3%,86.5%,96.0%,41.4%,95.4%,96.8%,95.4%,96.3%,90.6%
68,Black or African American,3.0%,14.7%,2.1%,8.4%,7.5%,3.3%,5.1%,1.7%,5.4%,...,15.8%,10.9%,7.5%,4.1%,43.8%,4.0%,3.3%,4.8%,3.6%,8.0%
69,American Indian and Alaska Native,0.7%,0.6%,N,0.5%,0.5%,N,0.4%,0.4%,0.2%,...,1.3%,0.5%,0.5%,N,1.6%,0.6%,0.9%,0.4%,0.4%,0.8%
70,Asian,1.2%,4.8%,0.4%,0.9%,1.7%,0.4%,5.9%,1.9%,0.7%,...,3.5%,8.8%,3.7%,0.6%,8.3%,0.8%,0.7%,1.5%,1.3%,1.9%


In [20]:
# Flip rows and columns so that each county becomes a row and each statistic a column,
# which is the shape needed for the later merges
initial_pa_demographic_2019 = initial_pa_demographic_2019.transpose()
initial_pa_demographic_2019.head()


Unnamed: 0,3,4,27,28,67,68,69,70
Unnamed: 0,Male,Female,Male,Female,White,Black or African American,American Indian and Alaska Native,Asian
"Adams County, Pennsylvania",49.4%,50.6%,48.7%,51.3%,94.4%,3.0%,0.7%,1.2%
"Allegheny County, Pennsylvania",48.4%,51.6%,47.8%,52.2%,82.1%,14.7%,0.6%,4.8%
"Armstrong County, Pennsylvania",50.1%,49.9%,49.8%,50.2%,98.1%,2.1%,N,0.4%
"Beaver County, Pennsylvania",48.3%,51.7%,47.9%,52.1%,92.9%,8.4%,0.5%,0.9%


In [21]:
# Move the county names out of the index into a regular column and label that column 'County'
initial_pa_demographic_2019 = (
    initial_pa_demographic_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_demographic_2019.head()


Unnamed: 0,County,3,4,27,28,67,68,69,70
0,Unnamed: 0,Male,Female,Male,Female,White,Black or African American,American Indian and Alaska Native,Asian
1,"Adams County, Pennsylvania",49.4%,50.6%,48.7%,51.3%,94.4%,3.0%,0.7%,1.2%
2,"Allegheny County, Pennsylvania",48.4%,51.6%,47.8%,52.2%,82.1%,14.7%,0.6%,4.8%
3,"Armstrong County, Pennsylvania",50.1%,49.9%,49.8%,50.2%,98.1%,2.1%,N,0.4%
4,"Beaver County, Pennsylvania",48.3%,51.7%,47.9%,52.1%,92.9%,8.4%,0.5%,0.9%


In [22]:
# Inspecting the column labels prior to renaming: apart from 'County' they are the
# original row positions carried over by the transpose, i.e. integers rather than strings
print(initial_pa_demographic_2019.columns)


Index(['County', 3, 4, 27, 28, 67, 68, 69, 70], dtype='object')


In [23]:
# Cast every column label to a string so the rename mapping can use uniform string keys
initial_pa_demographic_2019.columns = initial_pa_demographic_2019.columns.map(str)
print(initial_pa_demographic_2019.columns)


Index(['County', '3', '4', '27', '28', '67', '68', '69', '70'], dtype='object')


In [24]:
# Swap every cell that is exactly 'N' for NaN so the later numeric conversion treats it as missing
# (presumably 'N' is the source's marker for an unavailable estimate - TODO confirm)
initial_pa_demographic_2019 = initial_pa_demographic_2019.replace('N', np.nan)


In [25]:
# Final cleaning pass for the 2019 demographic frame:
#   1. give the positional column labels descriptive names,
#   2. drop the obsolete label row (index 0) and renumber the remaining rows,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string (e.g. '49.4%') to a float fraction (0.494).
percent_column_names = {
    '3': '% Male',
    '4': '% Female',
    '27': '% Male > 18 Years Old',
    '28': '% Female > 18 Years Old',
    '67': '% White',
    '68': '% Black or African American',
    '69': '% American Indian and Alaska Native',
    '70': '% Asian',
}
initial_pa_demographic_2019 = (
    initial_pa_demographic_2019
    .rename(columns=percent_column_names)
    .drop(0)
    .reset_index(drop=True)
)
# Vectorized .str methods pass missing values through as NaN instead of raising
initial_pa_demographic_2019['County'] = (
    initial_pa_demographic_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# NOTE(review): the 2019 data is stamped with Year 2020, presumably so it lines up with the
# 2020 election records in later merges (2019 stands in for the election year) - confirm
initial_pa_demographic_2019.insert(0, 'Year', 2020)
# Every renamed column holds percentage strings; NaN placeholders survive the conversion as NaN
for percent_column in percent_column_names.values():
    initial_pa_demographic_2019[percent_column] = (
        initial_pa_demographic_2019[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_demographic_2019


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian
0,2020,ADAMS,0.494,0.506,0.487,0.513,0.944,0.03,0.007,0.012
1,2020,ALLEGHENY,0.484,0.516,0.478,0.522,0.821,0.147,0.006,0.048
2,2020,ARMSTRONG,0.501,0.499,0.498,0.502,0.981,0.021,,0.004
3,2020,BEAVER,0.483,0.517,0.479,0.521,0.929,0.084,0.005,0.009
4,2020,BERKS,0.492,0.508,0.487,0.513,0.824,0.075,0.005,0.017
5,2020,BLAIR,0.491,0.509,0.482,0.518,0.975,0.033,,0.004
6,2020,BUCKS,0.491,0.509,0.486,0.514,0.886,0.051,0.004,0.059
7,2020,BUTLER,0.491,0.509,0.486,0.514,0.976,0.017,0.004,0.019
8,2020,CAMBRIA,0.49,0.51,0.484,0.516,0.949,0.054,0.002,0.007
9,2020,CARBON,0.499,0.501,0.498,0.502,0.942,0.029,0.015,0.013


**Education Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [26]:
# Load the raw 2012 U.S. Census Bureau education workbook into a DataFrame and preview it
initial_pa_education_2012 = pd.read_excel(io="Resources/PA_Education_2012.xlsx")
initial_pa_education_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Total,Total,Total,Total,Total,Total,Total,Total,Total,...,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Population 18 to 24 years,10743,120782,4864,13587,42728,11697,48604,17793,13496,...,17948,63418,31757,7356,194497,10452,5891,18373,28817,37135
3,Less than high school graduate,11.4%,10.6%,13.8%,14.0%,15.3%,6.7%,11.5%,10.6%,7.7%,...,13.2%,11.2%,7.0%,30.3%,15.4%,14.4%,12.9%,9.4%,12.9%,14.9%
4,High school graduate (includes equivalency),36.1%,25.7%,47.9%,29.1%,36.7%,47.2%,34.2%,29.7%,31.8%,...,34.5%,33.6%,36.9%,31.2%,30.6%,45.0%,50.3%,32.3%,30.2%,41.5%
5,Some college or associate's degree,47.5%,44.1%,27.9%,44.3%,42.2%,39.1%,39.3%,49.7%,54.0%,...,47.1%,39.1%,45.0%,31.8%,40.8%,32.4%,29.5%,45.3%,45.0%,35.8%
6,Bachelor's degree or higher,4.9%,19.5%,10.4%,12.6%,5.8%,7.0%,14.9%,9.9%,6.5%,...,5.2%,16.1%,11.1%,6.8%,13.2%,8.2%,7.3%,13.0%,11.9%,7.8%
7,Population 25 years and over,69168,871384,50003,122708,274513,89086,439920,127213,100802,...,113419,564697,204478,68537,1004571,107830,57008,148353,264558,300874
8,Less than 9th grade,6.0%,2.0%,4.2%,3.4%,6.2%,1.8%,1.8%,2.4%,3.3%,...,2.6%,2.1%,3.0%,3.6%,6.3%,3.1%,4.7%,2.6%,2.3%,3.7%
9,"9th to 12th grade, no diploma",9.2%,4.5%,7.1%,5.6%,9.6%,7.7%,4.8%,5.7%,6.8%,...,7.7%,4.0%,7.3%,10.0%,12.3%,10.2%,9.5%,6.2%,6.2%,8.8%


In [27]:
# Keep only the rows holding the desired county summary education statistics, picked by
# their position in the raw sheet (the row labels repeat across age groups):
#   - 18 to 24 years old: % High School Graduate and % Bachelor's degree or higher
#   - 25 to 34, 35 to 44, 45 to 64 and 65 & older: % High School Graduate or Higher
#     and % Bachelor's degree or higher
initial_pa_education_2012 = initial_pa_education_2012.iloc[[4, 6, 18, 19, 21, 22, 24, 25, 27, 28]]
initial_pa_education_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
4,High school graduate (includes equivalency),36.1%,25.7%,47.9%,29.1%,36.7%,47.2%,34.2%,29.7%,31.8%,...,34.5%,33.6%,36.9%,31.2%,30.6%,45.0%,50.3%,32.3%,30.2%,41.5%
6,Bachelor's degree or higher,4.9%,19.5%,10.4%,12.6%,5.8%,7.0%,14.9%,9.9%,6.5%,...,5.2%,16.1%,11.1%,6.8%,13.2%,8.2%,7.3%,13.0%,11.9%,7.8%
18,High school graduate or higher,91.9%,96.7%,95.1%,93.5%,84.6%,92.8%,94.9%,96.9%,94.7%,...,93.5%,96.4%,96.2%,87.7%,88.6%,87.4%,89.9%,92.9%,93.8%,89.9%
19,Bachelor's degree or higher,21.9%,51.5%,25.0%,33.7%,27.5%,22.9%,38.8%,38.7%,27.1%,...,24.1%,54.8%,32.1%,23.6%,37.4%,19.2%,20.0%,36.6%,33.4%,28.9%
21,High school graduate or higher,86.9%,96.2%,93.3%,95.6%,87.3%,95.4%,96.3%,95.8%,93.2%,...,92.2%,96.3%,92.3%,95.0%,84.9%,91.0%,92.7%,93.5%,96.1%,91.9%
22,Bachelor's degree or higher,20.7%,45.3%,18.6%,24.5%,26.5%,25.8%,45.9%,39.3%,23.2%,...,20.0%,54.7%,31.0%,15.8%,25.4%,15.1%,19.0%,34.7%,32.0%,27.8%
24,High school graduate or higher,88.3%,95.4%,93.0%,95.1%,89.1%,93.0%,95.4%,94.0%,94.1%,...,92.1%,95.6%,92.8%,89.7%,80.7%,90.9%,88.1%,94.4%,95.6%,90.4%
25,Bachelor's degree or higher,22.3%,34.8%,14.7%,21.4%,22.9%,17.9%,36.7%,27.7%,19.0%,...,24.1%,46.4%,25.4%,15.5%,18.7%,12.3%,15.1%,26.3%,26.6%,20.7%
27,High school graduate or higher,73.4%,86.2%,75.4%,80.4%,72.5%,82.0%,85.6%,81.9%,79.0%,...,78.9%,86.5%,77.9%,74.9%,69.1%,76.5%,75.7%,83.4%,81.1%,76.1%
28,Bachelor's degree or higher,17.7%,22.4%,11.6%,12.3%,13.0%,9.2%,26.1%,20.7%,9.4%,...,19.8%,33.5%,16.9%,7.7%,15.8%,9.8%,7.5%,15.0%,16.9%,13.8%


In [28]:
# Swap rows and columns so each county becomes a row and each selected statistic a column,
# the orientation used for process integrity and simpler merging later on
initial_pa_education_2012 = initial_pa_education_2012.transpose()
initial_pa_education_2012.head()


Unnamed: 0,4,6,18,19,21,22,24,25,27,28
Unnamed: 0,High school graduate (includes equivalency),Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher
"Adams County, Pennsylvania",36.1%,4.9%,91.9%,21.9%,86.9%,20.7%,88.3%,22.3%,73.4%,17.7%
"Allegheny County, Pennsylvania",25.7%,19.5%,96.7%,51.5%,96.2%,45.3%,95.4%,34.8%,86.2%,22.4%
"Armstrong County, Pennsylvania",47.9%,10.4%,95.1%,25.0%,93.3%,18.6%,93.0%,14.7%,75.4%,11.6%
"Beaver County, Pennsylvania",29.1%,12.6%,93.5%,33.7%,95.6%,24.5%,95.1%,21.4%,80.4%,12.3%


In [29]:
# Move the county names out of the index into a regular column and label that column 'County'
initial_pa_education_2012 = initial_pa_education_2012.reset_index().rename(columns={'index': 'County'})
initial_pa_education_2012.head()


Unnamed: 0,County,4,6,18,19,21,22,24,25,27,28
0,Unnamed: 0,High school graduate (includes equivalency),Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher
1,"Adams County, Pennsylvania",36.1%,4.9%,91.9%,21.9%,86.9%,20.7%,88.3%,22.3%,73.4%,17.7%
2,"Allegheny County, Pennsylvania",25.7%,19.5%,96.7%,51.5%,96.2%,45.3%,95.4%,34.8%,86.2%,22.4%
3,"Armstrong County, Pennsylvania",47.9%,10.4%,95.1%,25.0%,93.3%,18.6%,93.0%,14.7%,75.4%,11.6%
4,"Beaver County, Pennsylvania",29.1%,12.6%,93.5%,33.7%,95.6%,24.5%,95.1%,21.4%,80.4%,12.3%


In [30]:
# Inspecting the column labels prior to renaming: apart from 'County', the labels are the
# integer row positions carried over from the selection step rather than strings
print(initial_pa_education_2012.columns)


Index(['County', 4, 6, 18, 19, 21, 22, 24, 25, 27, 28], dtype='object')


In [31]:
# Cast every column label to a string so that the rename mapping in the cleaning step can use string keys
initial_pa_education_2012.columns = initial_pa_education_2012.columns.map(str)
print(initial_pa_education_2012.columns)


Index(['County', '4', '6', '18', '19', '21', '22', '24', '25', '27', '28'], dtype='object')


In [32]:
# Swap every cell that is exactly 'N' for NaN across the whole DataFrame so that
# later manipulation treats those cells as missing values
initial_pa_education_2012 = initial_pa_education_2012.replace('N', np.nan)


In [33]:
# Final cleaning pass for the 2012 education table:
#   1. give the numbered columns descriptive names
#   2. drop the obsolete row 0 (it only holds the original row labels) and renumber the rows
#   3. reduce 'County' to the bare, upper-case county name (strips ' County, Pennsylvania')
#   4. add a 'Year' column for future merges
#   5. convert every percentage string (e.g. '36.1%') to a float fraction (0.361)
# NOTE: run this cell once per kernel session. It overwrites its own input, so a second run
# would drop the first county row and then fail when 'Year' is inserted again.
education_column_names = {
    '4': '% High School Graduate (18-24)',
    '6': '% Bachelors Degree or Higher (18-24)',
    '18': '% High School Graduate or Higher (25-34)',
    '19': '% Bachelors Degree or Higher (25-34)',
    '21': '% High School Graduate or Higher (35-44)',
    '22': '% Bachelors Degree or Higher (35-44)',
    '24': '% High School Graduate or Higher (45-64)',
    '25': '% Bachelors Degree or Higher (45-64)',
    '27': '% High School Graduate or Higher (65 & Older)',
    '28': '% Bachelors Degree or Higher (65 & Older)',
}
# Every renamed column holds a percentage, so the mapping doubles as the list of columns to convert
percent_columns = list(education_column_names.values())

initial_pa_education_2012 = (
    initial_pa_education_2012
    .rename(columns=education_column_names)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_education_2012['County'] = (
    initial_pa_education_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_education_2012.insert(0, 'Year', 2012)
# NaN cells pass through rstrip/astype unchanged and remain NaN
initial_pa_education_2012[percent_columns] = initial_pa_education_2012[percent_columns].apply(
    lambda column: column.str.rstrip('%').astype(float) / 100
)
initial_pa_education_2012


Unnamed: 0,Year,County,% High School Graduate (18-24),% Bachelors Degree or Higher (18-24),% High School Graduate or Higher (25-34),% Bachelors Degree or Higher (25-34),% High School Graduate or Higher (35-44),% Bachelors Degree or Higher (35-44),% High School Graduate or Higher (45-64),% Bachelors Degree or Higher (45-64),% High School Graduate or Higher (65 & Older),% Bachelors Degree or Higher (65 & Older)
0,2012,ADAMS,0.361,0.049,0.919,0.219,0.869,0.207,0.883,0.223,0.734,0.177
1,2012,ALLEGHENY,0.257,0.195,0.967,0.515,0.962,0.453,0.954,0.348,0.862,0.224
2,2012,ARMSTRONG,0.479,0.104,0.951,0.25,0.933,0.186,0.93,0.147,0.754,0.116
3,2012,BEAVER,0.291,0.126,0.935,0.337,0.956,0.245,0.951,0.214,0.804,0.123
4,2012,BERKS,0.367,0.058,0.846,0.275,0.873,0.265,0.891,0.229,0.725,0.13
5,2012,BLAIR,0.472,0.07,0.928,0.229,0.954,0.258,0.93,0.179,0.82,0.092
6,2012,BUCKS,0.342,0.149,0.949,0.388,0.963,0.459,0.954,0.367,0.856,0.261
7,2012,BUTLER,0.297,0.099,0.969,0.387,0.958,0.393,0.94,0.277,0.819,0.207
8,2012,CAMBRIA,0.318,0.065,0.947,0.271,0.932,0.232,0.941,0.19,0.79,0.094
9,2012,CARBON,0.323,0.098,0.872,0.22,0.956,0.182,0.894,0.132,0.805,0.096


**2016**
-

In [34]:
# Reading in 2016 U.S. Census Bureau education dataset
# As the output shows, the sheet has one column per county and one row per attainment label;
# rows 0 and 1 ('Percent' / 'Estimate') are header rows rather than data
initial_pa_education_2016 = pd.read_excel("Resources/PA_Education_2016.xlsx")
initial_pa_education_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,AGE BY EDUCATIONAL ATTAINMENT,,,,,,,,,,...,,,,,,,,,,
3,Population 18 to 24 years,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
4,Less than high school graduate,12.4%,9.1%,18.2%,13.2%,14.0%,11.4%,11.1%,7.7%,13.8%,...,10.3%,10.2%,9.2%,17.8%,13.0%,18.7%,17.8%,14.2%,9.1%,15.2%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Less than high school graduate,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
66,High school graduate (includes equivalency),(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
67,Some college or associate's degree,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
68,Bachelor's degree,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)


In [35]:
# Keep only the ten educational-attainment summary rows needed per county:
#   - ages 18 to 24: % high school graduate and % bachelor's degree or higher
#   - ages 25 to 34, 35 to 44, 45 to 64 and 65 & older: % high school graduate or higher
#     and % bachelor's degree or higher
# The values are row positions within the 2016 sheet as it was read in.
education_2016_row_positions = [5, 7, 19, 20, 22, 23, 25, 26, 28, 29]
initial_pa_education_2016 = initial_pa_education_2016.iloc[education_2016_row_positions]
initial_pa_education_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
5,High school graduate (includes equivalency),40.5%,28.3%,38.0%,29.3%,40.6%,38.2%,33.1%,30.8%,25.8%,...,36.7%,29.4%,32.9%,37.0%,30.9%,46.8%,46.6%,33.4%,29.6%,43.9%
7,Bachelor's degree or higher,9.8%,20.4%,14.5%,13.4%,9.3%,10.3%,18.5%,13.7%,8.9%,...,8.4%,18.9%,9.6%,8.6%,11.6%,6.6%,9.9%,13.7%,14.3%,6.3%
19,High school graduate or higher,89.5%,96.9%,88.7%,92.6%,88.5%,90.9%,95.6%,97.0%,95.9%,...,96.1%,95.7%,94.8%,80.8%,91.0%,87.8%,88.4%,94.1%,95.7%,92.2%
20,Bachelor's degree or higher,22.8%,56.0%,18.9%,38.3%,27.2%,23.3%,46.2%,45.9%,29.3%,...,33.5%,54.6%,33.5%,19.1%,44.2%,21.1%,19.1%,37.3%,38.4%,30.1%
22,High school graduate or higher,90.4%,96.8%,94.9%,92.1%,86.2%,94.4%,95.4%,97.6%,96.6%,...,90.5%,96.2%,94.4%,92.6%,86.4%,87.9%,91.6%,95.6%,97.0%,93.0%
23,Bachelor's degree or higher,31.7%,53.9%,24.2%,36.0%,30.1%,25.9%,45.3%,52.4%,30.2%,...,25.0%,58.9%,36.3%,28.6%,29.1%,21.1%,19.4%,42.6%,37.0%,31.7%
25,High school graduate or higher,88.9%,95.2%,90.6%,95.8%,88.9%,93.5%,94.8%,95.4%,93.1%,...,89.8%,95.6%,92.6%,92.1%,81.2%,89.9%,91.5%,93.1%,96.0%,91.8%
26,Bachelor's degree or higher,18.7%,37.7%,15.9%,21.8%,26.5%,19.3%,41.3%,35.5%,20.2%,...,23.5%,47.0%,29.0%,15.4%,20.9%,15.9%,14.9%,26.5%,26.7%,22.6%
28,High school graduate or higher,79.8%,89.5%,82.2%,87.7%,80.7%,88.0%,89.0%,91.3%,86.1%,...,86.9%,89.7%,83.0%,80.5%,72.6%,87.2%,79.4%,88.6%,89.4%,82.0%
29,Bachelor's degree or higher,20.1%,27.3%,10.9%,18.0%,17.4%,9.8%,30.9%,21.4%,17.3%,...,24.8%,37.2%,20.8%,13.5%,19.4%,8.6%,14.0%,21.7%,20.7%,17.3%


In [36]:
# Swap rows and columns so each county becomes a row and each selected statistic a column,
# the orientation used for process integrity and simpler merging later on
initial_pa_education_2016 = initial_pa_education_2016.transpose()
initial_pa_education_2016.head()


Unnamed: 0,5,7,19,20,22,23,25,26,28,29
Unnamed: 0,High school graduate (includes equivalency),Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher
"Adams County, Pennsylvania",40.5%,9.8%,89.5%,22.8%,90.4%,31.7%,88.9%,18.7%,79.8%,20.1%
"Allegheny County, Pennsylvania",28.3%,20.4%,96.9%,56.0%,96.8%,53.9%,95.2%,37.7%,89.5%,27.3%
"Armstrong County, Pennsylvania",38.0%,14.5%,88.7%,18.9%,94.9%,24.2%,90.6%,15.9%,82.2%,10.9%
"Beaver County, Pennsylvania",29.3%,13.4%,92.6%,38.3%,92.1%,36.0%,95.8%,21.8%,87.7%,18.0%


In [37]:
# Move the county names out of the index into a regular column and label that column 'County'
initial_pa_education_2016 = initial_pa_education_2016.reset_index().rename(columns={'index': 'County'})
initial_pa_education_2016.head()


Unnamed: 0,County,5,7,19,20,22,23,25,26,28,29
0,Unnamed: 0,High school graduate (includes equivalency),Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher
1,"Adams County, Pennsylvania",40.5%,9.8%,89.5%,22.8%,90.4%,31.7%,88.9%,18.7%,79.8%,20.1%
2,"Allegheny County, Pennsylvania",28.3%,20.4%,96.9%,56.0%,96.8%,53.9%,95.2%,37.7%,89.5%,27.3%
3,"Armstrong County, Pennsylvania",38.0%,14.5%,88.7%,18.9%,94.9%,24.2%,90.6%,15.9%,82.2%,10.9%
4,"Beaver County, Pennsylvania",29.3%,13.4%,92.6%,38.3%,92.1%,36.0%,95.8%,21.8%,87.7%,18.0%


In [38]:
# Inspecting the column labels prior to renaming: apart from 'County', the labels are the
# integer row positions carried over from the selection step rather than strings
print(initial_pa_education_2016.columns)


Index(['County', 5, 7, 19, 20, 22, 23, 25, 26, 28, 29], dtype='object')


In [39]:
# Cast every column label to a string so that the rename mapping in the cleaning step can use string keys
initial_pa_education_2016.columns = initial_pa_education_2016.columns.map(str)
print(initial_pa_education_2016.columns)


Index(['County', '5', '7', '19', '20', '22', '23', '25', '26', '28', '29'], dtype='object')


In [40]:
# Swap every cell that is exactly 'N' for NaN across the whole DataFrame so that
# later manipulation treats those cells as missing values
initial_pa_education_2016 = initial_pa_education_2016.replace('N', np.nan)


In [41]:
# Final cleaning pass for the 2016 education table:
#   1. give the numbered columns descriptive names
#   2. drop the obsolete row 0 (it only holds the original row labels) and renumber the rows
#   3. reduce 'County' to the bare, upper-case county name (strips ' County, Pennsylvania')
#   4. add a 'Year' column for future merges
#   5. convert every percentage string (e.g. '40.5%') to a float fraction (0.405)
# NOTE: run this cell once per kernel session. It overwrites its own input, so a second run
# would drop the first county row and then fail when 'Year' is inserted again.
education_column_names = {
    '5': '% High School Graduate (18-24)',
    '7': '% Bachelors Degree or Higher (18-24)',
    '19': '% High School Graduate or Higher (25-34)',
    '20': '% Bachelors Degree or Higher (25-34)',
    '22': '% High School Graduate or Higher (35-44)',
    '23': '% Bachelors Degree or Higher (35-44)',
    '25': '% High School Graduate or Higher (45-64)',
    '26': '% Bachelors Degree or Higher (45-64)',
    '28': '% High School Graduate or Higher (65 & Older)',
    '29': '% Bachelors Degree or Higher (65 & Older)',
}
# Every renamed column holds a percentage, so the mapping doubles as the list of columns to convert
percent_columns = list(education_column_names.values())

initial_pa_education_2016 = (
    initial_pa_education_2016
    .rename(columns=education_column_names)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_education_2016['County'] = (
    initial_pa_education_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_education_2016.insert(0, 'Year', 2016)
# NaN cells pass through rstrip/astype unchanged and remain NaN
initial_pa_education_2016[percent_columns] = initial_pa_education_2016[percent_columns].apply(
    lambda column: column.str.rstrip('%').astype(float) / 100
)
initial_pa_education_2016


Unnamed: 0,Year,County,% High School Graduate (18-24),% Bachelors Degree or Higher (18-24),% High School Graduate or Higher (25-34),% Bachelors Degree or Higher (25-34),% High School Graduate or Higher (35-44),% Bachelors Degree or Higher (35-44),% High School Graduate or Higher (45-64),% Bachelors Degree or Higher (45-64),% High School Graduate or Higher (65 & Older),% Bachelors Degree or Higher (65 & Older)
0,2016,ADAMS,0.405,0.098,0.895,0.228,0.904,0.317,0.889,0.187,0.798,0.201
1,2016,ALLEGHENY,0.283,0.204,0.969,0.56,0.968,0.539,0.952,0.377,0.895,0.273
2,2016,ARMSTRONG,0.38,0.145,0.887,0.189,0.949,0.242,0.906,0.159,0.822,0.109
3,2016,BEAVER,0.293,0.134,0.926,0.383,0.921,0.36,0.958,0.218,0.877,0.18
4,2016,BERKS,0.406,0.093,0.885,0.272,0.862,0.301,0.889,0.265,0.807,0.174
5,2016,BLAIR,0.382,0.103,0.909,0.233,0.944,0.259,0.935,0.193,0.88,0.098
6,2016,BUCKS,0.331,0.185,0.956,0.462,0.954,0.453,0.948,0.413,0.89,0.309
7,2016,BUTLER,0.308,0.137,0.97,0.459,0.976,0.524,0.954,0.355,0.913,0.214
8,2016,CAMBRIA,0.258,0.089,0.959,0.293,0.966,0.302,0.931,0.202,0.861,0.173
9,2016,CARBON,0.607,0.096,0.869,0.207,0.963,0.214,0.908,0.169,0.834,0.136


**2019**
-
Please note: 2020 U.S. Census Bureau data was unavailable, so data from the closest available year prior to the election (2019) was used instead. The cleaned 2019 rows are labeled with `Year` 2020 in the cleaning cell below.

In [42]:
# Reading in 2019 U.S. Census Bureau education dataset
# Previewing the first 35 rows, which covers every row position selected in the next step
# (the highest position used there is 29)
initial_pa_education_2019 = pd.read_excel("Resources/PA_Education_2019.xlsx")
initial_pa_education_2019.head(35)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,AGE BY EDUCATIONAL ATTAINMENT,,,,,,,,,,...,,,,,,,,,,
3,Population 18 to 24 years,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
4,Less than high school graduate,6.5%,8.9%,8.4%,10.2%,14.2%,14.7%,11.5%,7.2%,6.7%,...,15.2%,9.5%,12.1%,23.6%,10.7%,21.7%,17.0%,8.8%,8.3%,12.6%
5,High school graduate (includes equivalency),48.7%,29.8%,55.6%,36.9%,37.7%,44.2%,30.5%,37.7%,37.6%,...,37.5%,28.9%,30.2%,49.5%,33.2%,42.7%,52.0%,37.8%,32.6%,38.4%
6,Some college or associate's degree,37.1%,41.0%,24.9%,44.1%,38.1%,29.3%,35.3%,40.5%,49.5%,...,36.5%,42.3%,46.2%,22.0%,39.0%,31.9%,22.3%,41.9%,46.3%,39.1%
7,Bachelor's degree or higher,7.7%,20.3%,11.0%,8.7%,10.0%,11.9%,22.7%,14.7%,6.2%,...,10.9%,19.3%,11.5%,4.8%,17.1%,3.7%,8.8%,11.5%,12.8%,9.9%
8,Population 25 years and over,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,Less than 9th grade,3.6%,1.5%,2.7%,1.7%,5.2%,2.6%,2.0%,0.7%,1.4%,...,2.1%,1.8%,2.9%,3.4%,5.0%,2.1%,3.7%,2.2%,1.5%,3.0%


In [43]:
# Keep only the ten educational-attainment summary rows needed per county:
#   - ages 18 to 24: % high school graduate and % bachelor's degree or higher
#   - ages 25 to 34, 35 to 44, 45 to 64 and 65 & older: % high school graduate or higher
#     and % bachelor's degree or higher
# The values are row positions within the 2019 sheet as it was read in.
education_2019_row_positions = [5, 7, 19, 20, 22, 23, 25, 26, 28, 29]
initial_pa_education_2019 = initial_pa_education_2019.iloc[education_2019_row_positions]
initial_pa_education_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
5,High school graduate (includes equivalency),48.7%,29.8%,55.6%,36.9%,37.7%,44.2%,30.5%,37.7%,37.6%,...,37.5%,28.9%,30.2%,49.5%,33.2%,42.7%,52.0%,37.8%,32.6%,38.4%
7,Bachelor's degree or higher,7.7%,20.3%,11.0%,8.7%,10.0%,11.9%,22.7%,14.7%,6.2%,...,10.9%,19.3%,11.5%,4.8%,17.1%,3.7%,8.8%,11.5%,12.8%,9.9%
19,High school graduate or higher,90.8%,95.7%,91.9%,95.5%,91.1%,91.7%,95.5%,97.3%,95.3%,...,91.7%,97.9%,97.1%,88.7%,92.2%,93.2%,94.8%,96.3%,94.1%,92.0%
20,Bachelor's degree or higher,24.0%,56.8%,19.7%,33.0%,25.7%,26.0%,50.0%,42.1%,26.0%,...,28.5%,56.5%,39.8%,18.8%,44.8%,22.9%,22.2%,37.4%,35.9%,32.7%
22,High school graduate or higher,89.1%,96.4%,94.7%,95.4%,87.3%,92.4%,94.8%,97.9%,98.1%,...,93.1%,95.1%,94.6%,92.2%,88.6%,90.3%,90.7%,95.3%,95.4%,91.9%
23,Bachelor's degree or higher,27.2%,53.8%,26.7%,38.8%,28.5%,37.3%,47.2%,53.4%,30.0%,...,20.6%,57.4%,35.9%,25.4%,35.0%,19.3%,25.2%,45.0%,36.2%,32.7%
25,High school graduate or higher,89.4%,96.2%,93.9%,92.2%,87.0%,94.3%,95.5%,95.8%,95.0%,...,90.6%,95.3%,92.3%,90.8%,83.8%,92.3%,91.3%,92.6%,95.7%,91.4%
26,Bachelor's degree or higher,18.5%,38.8%,13.3%,24.0%,26.0%,20.4%,41.4%,36.8%,19.9%,...,28.7%,50.8%,29.8%,14.5%,22.6%,15.4%,18.1%,25.8%,28.5%,24.1%
28,High school graduate or higher,87.8%,92.9%,89.2%,89.8%,84.7%,87.7%,90.3%,92.3%,90.0%,...,90.8%,92.2%,84.3%,84.1%,78.2%,86.4%,81.3%,89.9%,92.6%,85.3%
29,Bachelor's degree or higher,21.6%,30.5%,11.3%,21.2%,22.1%,17.5%,35.7%,26.2%,14.9%,...,23.1%,39.4%,23.5%,11.7%,22.2%,12.9%,11.3%,26.7%,24.1%,21.2%


In [44]:
# Swap rows and columns so each county becomes a row and each selected statistic a column,
# the orientation used for process integrity and simpler merging later on
initial_pa_education_2019 = initial_pa_education_2019.transpose()
initial_pa_education_2019.head()


Unnamed: 0,5,7,19,20,22,23,25,26,28,29
Unnamed: 0,High school graduate (includes equivalency),Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher
"Adams County, Pennsylvania",48.7%,7.7%,90.8%,24.0%,89.1%,27.2%,89.4%,18.5%,87.8%,21.6%
"Allegheny County, Pennsylvania",29.8%,20.3%,95.7%,56.8%,96.4%,53.8%,96.2%,38.8%,92.9%,30.5%
"Armstrong County, Pennsylvania",55.6%,11.0%,91.9%,19.7%,94.7%,26.7%,93.9%,13.3%,89.2%,11.3%
"Beaver County, Pennsylvania",36.9%,8.7%,95.5%,33.0%,95.4%,38.8%,92.2%,24.0%,89.8%,21.2%


In [45]:
# Move the county names out of the index into a regular column and label that column 'County'
initial_pa_education_2019 = initial_pa_education_2019.reset_index().rename(columns={'index': 'County'})
initial_pa_education_2019.head()


Unnamed: 0,County,5,7,19,20,22,23,25,26,28,29
0,Unnamed: 0,High school graduate (includes equivalency),Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher,High school graduate or higher,Bachelor's degree or higher
1,"Adams County, Pennsylvania",48.7%,7.7%,90.8%,24.0%,89.1%,27.2%,89.4%,18.5%,87.8%,21.6%
2,"Allegheny County, Pennsylvania",29.8%,20.3%,95.7%,56.8%,96.4%,53.8%,96.2%,38.8%,92.9%,30.5%
3,"Armstrong County, Pennsylvania",55.6%,11.0%,91.9%,19.7%,94.7%,26.7%,93.9%,13.3%,89.2%,11.3%
4,"Beaver County, Pennsylvania",36.9%,8.7%,95.5%,33.0%,95.4%,38.8%,92.2%,24.0%,89.8%,21.2%


In [46]:
# Inspecting the column labels prior to renaming: apart from 'County', the labels are the
# integer row positions carried over from the selection step rather than strings
print(initial_pa_education_2019.columns)


Index(['County', 5, 7, 19, 20, 22, 23, 25, 26, 28, 29], dtype='object')


In [47]:
# Cast every column label to a string so that the rename mapping in the cleaning step can use string keys
initial_pa_education_2019.columns = initial_pa_education_2019.columns.map(str)
print(initial_pa_education_2019.columns)


Index(['County', '5', '7', '19', '20', '22', '23', '25', '26', '28', '29'], dtype='object')


In [48]:
# Swap every cell that is exactly 'N' for NaN across the whole DataFrame so that
# later manipulation treats those cells as missing values
initial_pa_education_2019 = initial_pa_education_2019.replace('N', np.nan)


In [49]:
# Final cleaning pass for the 2019 education table:
#   1. give the numbered columns descriptive names
#   2. drop the obsolete row 0 (it only holds the original row labels) and renumber the rows
#   3. reduce 'County' to the bare, upper-case county name (strips ' County, Pennsylvania')
#   4. add a 'Year' column for future merges
#   5. convert every percentage string (e.g. '48.7%') to a float fraction (0.487)
# NOTE: run this cell once per kernel session. It overwrites its own input, so a second run
# would drop the first county row and then fail when 'Year' is inserted again.
education_column_names = {
    '5': '% High School Graduate (18-24)',
    '7': '% Bachelors Degree or Higher (18-24)',
    '19': '% High School Graduate or Higher (25-34)',
    '20': '% Bachelors Degree or Higher (25-34)',
    '22': '% High School Graduate or Higher (35-44)',
    '23': '% Bachelors Degree or Higher (35-44)',
    '25': '% High School Graduate or Higher (45-64)',
    '26': '% Bachelors Degree or Higher (45-64)',
    '28': '% High School Graduate or Higher (65 & Older)',
    '29': '% Bachelors Degree or Higher (65 & Older)',
}
# Every renamed column holds a percentage, so the mapping doubles as the list of columns to convert
percent_columns = list(education_column_names.values())

initial_pa_education_2019 = (
    initial_pa_education_2019
    .rename(columns=education_column_names)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_education_2019['County'] = (
    initial_pa_education_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# The year is deliberately recorded as 2020 even though the figures are from 2019: 2020 Census
# data was unavailable, so 2019 stands in as the closest year prior to the election
initial_pa_education_2019.insert(0, 'Year', 2020)
# NaN cells pass through rstrip/astype unchanged and remain NaN
initial_pa_education_2019[percent_columns] = initial_pa_education_2019[percent_columns].apply(
    lambda column: column.str.rstrip('%').astype(float) / 100
)
initial_pa_education_2019


Unnamed: 0,Year,County,% High School Graduate (18-24),% Bachelors Degree or Higher (18-24),% High School Graduate or Higher (25-34),% Bachelors Degree or Higher (25-34),% High School Graduate or Higher (35-44),% Bachelors Degree or Higher (35-44),% High School Graduate or Higher (45-64),% Bachelors Degree or Higher (45-64),% High School Graduate or Higher (65 & Older),% Bachelors Degree or Higher (65 & Older)
0,2020,ADAMS,0.487,0.077,0.908,0.24,0.891,0.272,0.894,0.185,0.878,0.216
1,2020,ALLEGHENY,0.298,0.203,0.957,0.568,0.964,0.538,0.962,0.388,0.929,0.305
2,2020,ARMSTRONG,0.556,0.11,0.919,0.197,0.947,0.267,0.939,0.133,0.892,0.113
3,2020,BEAVER,0.369,0.087,0.955,0.33,0.954,0.388,0.922,0.24,0.898,0.212
4,2020,BERKS,0.377,0.1,0.911,0.257,0.873,0.285,0.87,0.26,0.847,0.221
5,2020,BLAIR,0.442,0.119,0.917,0.26,0.924,0.373,0.943,0.204,0.877,0.175
6,2020,BUCKS,0.305,0.227,0.955,0.5,0.948,0.472,0.955,0.414,0.903,0.357
7,2020,BUTLER,0.377,0.147,0.973,0.421,0.979,0.534,0.958,0.368,0.923,0.262
8,2020,CAMBRIA,0.376,0.062,0.953,0.26,0.981,0.3,0.95,0.199,0.9,0.149
9,2020,CARBON,0.446,0.093,0.957,0.187,0.951,0.231,0.9,0.211,0.829,0.126


**Income (Households) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [50]:
# Reading in 2012 U.S. Census Bureau income (household) dataset
# Previewing the first 19 rows; in the rows shown, rows 0 and 1 are header rows
# ('Households' / 'Estimate'), row 2 is the total household count and the following rows
# give the share of households in each income bracket
initial_pa_households_2012 = pd.read_excel("Resources/PA_Income_Households_2012.xlsx")
initial_pa_households_2012.head(19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Households,Households,Households,Households,Households,Households,Households,Households,Households,...,Households,Households,Households,Households,Households,Households,Households,Households,Households,Households
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,38102,527219,29099,70767,151291,50098,231626,73318,58228,...,57561,308795,112574,39070,579874,59338,30048,84509,152420,168508
3,"Less than $10,000",3.5%,7.9%,8.7%,6.1%,6.4%,8.1%,3.7%,5.4%,8.4%,...,4.8%,3.6%,4.9%,7.7%,14.8%,6.9%,6.4%,6.1%,6.0%,5.5%
4,"$10,000 to $14,999",5.5%,5.2%,5.7%,6.1%,5.6%,7.1%,3.0%,5.4%,10.0%,...,5.8%,3.4%,5.3%,8.1%,8.5%,7.3%,6.5%,5.3%,5.7%,3.1%
5,"$15,000 to $24,999",10.4%,11.5%,13.1%,13.8%,11.0%,13.6%,7.2%,9.9%,13.1%,...,8.2%,7.4%,9.0%,14.5%,13.9%,13.6%,12.9%,10.9%,12.4%,10.5%
6,"$25,000 to $34,999",10.3%,10.7%,14.1%,12.8%,10.5%,12.6%,6.9%,8.7%,12.1%,...,11.3%,7.1%,11.0%,12.6%,12.4%,10.1%,12.3%,8.7%,12.2%,10.4%
7,"$35,000 to $49,999",11.5%,13.7%,17.2%,13.2%,14.8%,16.1%,11.4%,13.5%,16.4%,...,13.6%,10.0%,13.7%,17.1%,12.9%,17.5%,15.7%,14.3%,13.8%,15.8%
8,"$50,000 to $74,999",21.1%,18.0%,18.0%,19.6%,19.5%,21.3%,17.5%,18.8%,18.1%,...,19.6%,17.1%,19.7%,18.4%,16.0%,20.0%,21.9%,20.8%,19.2%,19.7%
9,"$75,000 to $99,999",16.9%,12.2%,11.4%,13.2%,13.2%,9.1%,14.4%,13.3%,10.0%,...,16.2%,14.3%,14.6%,11.0%,8.6%,12.7%,12.4%,12.0%,12.3%,14.8%


In [51]:
# Keep only the household income summary rows (positions 2 through 14):
#   - row 2:      total number of households
#   - rows 3-12:  % of households per income bracket (Less than $10,000, $10,000-$14,999,
#                 $15,000-$24,999, $25,000-$34,999, $35,000-$49,999, $50,000-$74,999,
#                 $75,000-$99,999, $100,000-$149,999, $150,000-$199,999, $200,000 or more)
#   - row 13:     median household income (dollars)
#   - row 14:     mean household income (dollars)
# Rows 0-1 only carry the 'Households' / 'Estimate' header labels and are left out.
initial_pa_households_2012 = initial_pa_households_2012.iloc[list(range(2, 15)), :]
initial_pa_households_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,38102,527219,29099,70767,151291,50098,231626,73318,58228,...,57561,308795,112574,39070,579874,59338,30048,84509,152420,168508
3,"Less than $10,000",3.5%,7.9%,8.7%,6.1%,6.4%,8.1%,3.7%,5.4%,8.4%,...,4.8%,3.6%,4.9%,7.7%,14.8%,6.9%,6.4%,6.1%,6.0%,5.5%
4,"$10,000 to $14,999",5.5%,5.2%,5.7%,6.1%,5.6%,7.1%,3.0%,5.4%,10.0%,...,5.8%,3.4%,5.3%,8.1%,8.5%,7.3%,6.5%,5.3%,5.7%,3.1%
5,"$15,000 to $24,999",10.4%,11.5%,13.1%,13.8%,11.0%,13.6%,7.2%,9.9%,13.1%,...,8.2%,7.4%,9.0%,14.5%,13.9%,13.6%,12.9%,10.9%,12.4%,10.5%
6,"$25,000 to $34,999",10.3%,10.7%,14.1%,12.8%,10.5%,12.6%,6.9%,8.7%,12.1%,...,11.3%,7.1%,11.0%,12.6%,12.4%,10.1%,12.3%,8.7%,12.2%,10.4%
7,"$35,000 to $49,999",11.5%,13.7%,17.2%,13.2%,14.8%,16.1%,11.4%,13.5%,16.4%,...,13.6%,10.0%,13.7%,17.1%,12.9%,17.5%,15.7%,14.3%,13.8%,15.8%
8,"$50,000 to $74,999",21.1%,18.0%,18.0%,19.6%,19.5%,21.3%,17.5%,18.8%,18.1%,...,19.6%,17.1%,19.7%,18.4%,16.0%,20.0%,21.9%,20.8%,19.2%,19.7%
9,"$75,000 to $99,999",16.9%,12.2%,11.4%,13.2%,13.2%,9.1%,14.4%,13.3%,10.0%,...,16.2%,14.3%,14.6%,11.0%,8.6%,12.7%,12.4%,12.0%,12.3%,14.8%
10,"$100,000 to $149,999",14.1%,12.1%,8.7%,11.9%,11.4%,8.1%,19.2%,15.3%,8.2%,...,14.6%,18.2%,13.1%,7.8%,7.8%,8.6%,8.7%,13.6%,12.3%,14.3%
11,"$150,000 to $199,999",4.6%,4.5%,1.1%,2.0%,4.1%,1.8%,8.4%,5.5%,1.9%,...,3.1%,8.4%,4.8%,1.9%,2.5%,2.3%,1.0%,4.2%,3.0%,3.4%


In [52]:
# Flip the table so that every county becomes a row and every retained statistic becomes a column
# (columns are still labelled 2-14 after their original row positions); this shape is easier to merge later
initial_pa_households_2012 = initial_pa_households_2012.transpose()
initial_pa_households_2012.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",38102,3.5%,5.5%,10.4%,10.3%,11.5%,21.1%,16.9%,14.1%,4.6%,2.0%,60136,69324
"Allegheny County, Pennsylvania",527219,7.9%,5.2%,11.5%,10.7%,13.7%,18.0%,12.2%,12.1%,4.5%,4.2%,50884,72077
"Armstrong County, Pennsylvania",29099,8.7%,5.7%,13.1%,14.1%,17.2%,18.0%,11.4%,8.7%,1.1%,1.9%,41841,52635
"Beaver County, Pennsylvania",70767,6.1%,6.1%,13.8%,12.8%,13.2%,19.6%,13.2%,11.9%,2.0%,1.5%,47083,58980


In [53]:
# Move the transposed index (the 'Unnamed: 0' label row plus the county names) into a regular
# column and call that column 'County'
initial_pa_households_2012 = (
    initial_pa_households_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_households_2012.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",38102,3.5%,5.5%,10.4%,10.3%,11.5%,21.1%,16.9%,14.1%,4.6%,2.0%,60136,69324
2,"Allegheny County, Pennsylvania",527219,7.9%,5.2%,11.5%,10.7%,13.7%,18.0%,12.2%,12.1%,4.5%,4.2%,50884,72077
3,"Armstrong County, Pennsylvania",29099,8.7%,5.7%,13.1%,14.1%,17.2%,18.0%,11.4%,8.7%,1.1%,1.9%,41841,52635
4,"Beaver County, Pennsylvania",70767,6.1%,6.1%,13.8%,12.8%,13.2%,19.6%,13.2%,11.9%,2.0%,1.5%,47083,58980


In [54]:
# Inspect the column labels before renaming: 'County' is a string, while the remaining labels
# (2 through 14) are integers carried over from the original row positions
print(initial_pa_households_2012.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [55]:
# Turn every column label into a string (2 -> '2', ...) so the rename mapping can use string keys,
# then print the labels to confirm the conversion
initial_pa_households_2012.columns = [str(label) for label in initial_pa_households_2012.columns]
print(initial_pa_households_2012.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [56]:
# Swap every cell that is exactly 'N' for NaN across the whole DataFrame so the later numeric
# conversions treat it as missing (presumably 'N' marks an unavailable estimate - confirm against the source file)
initial_pa_households_2012 = initial_pa_households_2012.replace('N', np.nan)


In [57]:
# Final cleaning pass for the 2012 household-income table:
#   1. rename the positional columns ('2'..'14') to descriptive names,
#   2. drop the obsolete row 0 (it only holds the original row labels) and rebuild the index,
#   3. reduce 'County' to the upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert the percentage strings to fractions and the count/dollar strings to floats.
initial_pa_households_2012 = initial_pa_households_2012.rename(columns={
    '2': '# of Households',
    '3': '% Households <$10,000',
    '4': '% Households $10,000-$14,999',
    '5': '% Households $15,000-$24,999',
    '6': '% Households $25,000-$34,999',
    '7': '% Households $35,000-$49,999',
    '8': '% Households $50,000-$74,999',
    '9': '% Households $75,000-$99,999',
    '10': '% Households $100,000-$149,999',
    '11': '% Households $150,000-$199,999',
    '12': '% Households $200,000 or More',
    '13': 'Median Household Income ($)',
    '14': 'Mean Household Income ($)',
})
initial_pa_households_2012 = initial_pa_households_2012.drop(0).reset_index(drop=True)

# Vectorised string methods keep missing values as NaN instead of raising on them
initial_pa_households_2012['County'] = (
    initial_pa_households_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_households_2012.insert(0, 'Year', 2012)

# Income-bracket shares: '3.5%' -> 0.035
for share_column in [
    '% Households <$10,000',
    '% Households $10,000-$14,999',
    '% Households $15,000-$24,999',
    '% Households $25,000-$34,999',
    '% Households $35,000-$49,999',
    '% Households $50,000-$74,999',
    '% Households $75,000-$99,999',
    '% Households $100,000-$149,999',
    '% Households $150,000-$199,999',
    '% Households $200,000 or More',
]:
    initial_pa_households_2012[share_column] = (
        initial_pa_households_2012[share_column].str.rstrip('%').astype(float) / 100
    )

# Household count and dollar amounts: strip any thousands separators, then cast to float
for amount_column in ['# of Households', 'Median Household Income ($)', 'Mean Household Income ($)']:
    initial_pa_households_2012[amount_column] = (
        initial_pa_households_2012[amount_column].str.replace(',', '', regex=False).astype(float)
    )

initial_pa_households_2012


Unnamed: 0,Year,County,# of Households,"% Households <$10,000","% Households $10,000-$14,999","% Households $15,000-$24,999","% Households $25,000-$34,999","% Households $35,000-$49,999","% Households $50,000-$74,999","% Households $75,000-$99,999","% Households $100,000-$149,999","% Households $150,000-$199,999","% Households $200,000 or More",Median Household Income ($),Mean Household Income ($)
0,2012,ADAMS,38102.0,0.035,0.055,0.104,0.103,0.115,0.211,0.169,0.141,0.046,0.02,60136.0,69324.0
1,2012,ALLEGHENY,527219.0,0.079,0.052,0.115,0.107,0.137,0.18,0.122,0.121,0.045,0.042,50884.0,72077.0
2,2012,ARMSTRONG,29099.0,0.087,0.057,0.131,0.141,0.172,0.18,0.114,0.087,0.011,0.019,41841.0,52635.0
3,2012,BEAVER,70767.0,0.061,0.061,0.138,0.128,0.132,0.196,0.132,0.119,0.02,0.015,47083.0,58980.0
4,2012,BERKS,151291.0,0.064,0.056,0.11,0.105,0.148,0.195,0.132,0.114,0.041,0.036,52022.0,68265.0
5,2012,BLAIR,50098.0,0.081,0.071,0.136,0.126,0.161,0.213,0.091,0.081,0.018,0.022,41258.0,53745.0
6,2012,BUCKS,231626.0,0.037,0.03,0.072,0.069,0.114,0.175,0.144,0.192,0.084,0.083,75471.0,95823.0
7,2012,BUTLER,73318.0,0.054,0.054,0.099,0.087,0.135,0.188,0.133,0.153,0.055,0.042,57185.0,74144.0
8,2012,CAMBRIA,58228.0,0.084,0.1,0.131,0.121,0.164,0.181,0.1,0.082,0.019,0.018,39969.0,53409.0
9,2012,CARBON,26476.0,0.055,0.073,0.107,0.071,0.186,0.273,0.119,0.078,0.031,0.007,50742.0,61145.0


**2016**
-

In [58]:
# Load the raw 2016 household-income workbook (U.S. Census Bureau) and preview its first 19 rows
households_2016_workbook = "Resources/PA_Income_Households_2016.xlsx"
initial_pa_households_2016 = pd.read_excel(households_2016_workbook)
initial_pa_households_2016.head(19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Households,Households,Households,Households,Households,Households,Households,Households,Households,...,Households,Households,Households,Households,Households,Households,Households,Households,Households,Households
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,38125,528892,26105,68811,150402,51154,234126,75472,56443,...,57284,312447,111760,38098,580205,58149,29189,83117,148238,169885
3,"Less than $10,000",3.5%,7.2%,6.1%,5.2%,5.9%,6.5%,4.0%,4.0%,7.0%,...,4.9%,3.0%,2.9%,6.3%,14.5%,7.0%,7.8%,4.7%,5.4%,4.3%
4,"$10,000 to $14,999",4.7%,5.1%,7.1%,4.5%,4.2%,6.3%,2.3%,2.8%,7.1%,...,2.8%,2.9%,4.4%,5.1%,6.4%,4.7%,4.7%,4.9%,4.9%,3.5%
5,"$15,000 to $24,999",7.5%,10.3%,12.6%,9.6%,9.3%,12.3%,6.6%,8.7%,12.7%,...,8.8%,6.3%,7.5%,14.0%,12.1%,12.5%,13.2%,10.6%,10.7%,8.9%
6,"$25,000 to $34,999",12.8%,9.5%,12.0%,10.9%,8.2%,14.0%,6.3%,8.0%,11.7%,...,10.0%,6.7%,9.1%,12.3%,11.1%,10.0%,14.4%,9.8%,9.5%,9.2%
7,"$35,000 to $49,999",15.3%,12.8%,14.0%,14.9%,14.3%,16.5%,10.3%,13.5%,16.1%,...,13.2%,10.5%,13.1%,14.4%,12.6%,14.7%,15.4%,13.0%,13.3%,12.2%
8,"$50,000 to $74,999",16.7%,17.1%,21.3%,18.9%,18.5%,19.7%,17.4%,18.4%,18.2%,...,21.8%,15.2%,18.5%,21.0%,16.0%,21.1%,20.5%,18.7%,20.4%,22.5%
9,"$75,000 to $99,999",16.3%,12.4%,9.6%,13.9%,14.5%,11.7%,14.1%,13.9%,12.3%,...,12.9%,13.6%,16.7%,11.8%,9.9%,13.5%,10.5%,12.1%,12.2%,14.0%


In [59]:
# Keep only the household income summary rows (positions 2 through 14):
#   - row 2:      total number of households
#   - rows 3-12:  % of households per income bracket (Less than $10,000, $10,000-$14,999,
#                 $15,000-$24,999, $25,000-$34,999, $35,000-$49,999, $50,000-$74,999,
#                 $75,000-$99,999, $100,000-$149,999, $150,000-$199,999, $200,000 or more)
#   - row 13:     median household income (dollars)
#   - row 14:     mean household income (dollars)
# Rows 0-1 only carry the 'Households' / 'Estimate' header labels and are left out.
initial_pa_households_2016 = initial_pa_households_2016.iloc[list(range(2, 15)), :]
initial_pa_households_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,38125,528892,26105,68811,150402,51154,234126,75472,56443,...,57284,312447,111760,38098,580205,58149,29189,83117,148238,169885
3,"Less than $10,000",3.5%,7.2%,6.1%,5.2%,5.9%,6.5%,4.0%,4.0%,7.0%,...,4.9%,3.0%,2.9%,6.3%,14.5%,7.0%,7.8%,4.7%,5.4%,4.3%
4,"$10,000 to $14,999",4.7%,5.1%,7.1%,4.5%,4.2%,6.3%,2.3%,2.8%,7.1%,...,2.8%,2.9%,4.4%,5.1%,6.4%,4.7%,4.7%,4.9%,4.9%,3.5%
5,"$15,000 to $24,999",7.5%,10.3%,12.6%,9.6%,9.3%,12.3%,6.6%,8.7%,12.7%,...,8.8%,6.3%,7.5%,14.0%,12.1%,12.5%,13.2%,10.6%,10.7%,8.9%
6,"$25,000 to $34,999",12.8%,9.5%,12.0%,10.9%,8.2%,14.0%,6.3%,8.0%,11.7%,...,10.0%,6.7%,9.1%,12.3%,11.1%,10.0%,14.4%,9.8%,9.5%,9.2%
7,"$35,000 to $49,999",15.3%,12.8%,14.0%,14.9%,14.3%,16.5%,10.3%,13.5%,16.1%,...,13.2%,10.5%,13.1%,14.4%,12.6%,14.7%,15.4%,13.0%,13.3%,12.2%
8,"$50,000 to $74,999",16.7%,17.1%,21.3%,18.9%,18.5%,19.7%,17.4%,18.4%,18.2%,...,21.8%,15.2%,18.5%,21.0%,16.0%,21.1%,20.5%,18.7%,20.4%,22.5%
9,"$75,000 to $99,999",16.3%,12.4%,9.6%,13.9%,14.5%,11.7%,14.1%,13.9%,12.3%,...,12.9%,13.6%,16.7%,11.8%,9.9%,13.5%,10.5%,12.1%,12.2%,14.0%
10,"$100,000 to $149,999",15.5%,14.3%,12.1%,14.0%,14.7%,8.8%,19.2%,17.0%,11.4%,...,15.9%,19.7%,16.0%,10.4%,10.0%,12.7%,9.3%,15.1%,14.9%,16.7%
11,"$150,000 to $199,999",4.6%,5.5%,2.4%,4.8%,6.0%,2.5%,8.7%,6.5%,2.2%,...,5.1%,9.7%,6.8%,3.2%,3.9%,2.8%,1.9%,6.1%,5.5%,5.0%


In [60]:
# Flip the table so that every county becomes a row and every retained statistic becomes a column
# (columns are still labelled 2-14 after their original row positions); this shape is easier to merge later
initial_pa_households_2016 = initial_pa_households_2016.transpose()
initial_pa_households_2016.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",38125,3.5%,4.7%,7.5%,12.8%,15.3%,16.7%,16.3%,15.5%,4.6%,3.2%,59300,72296
"Allegheny County, Pennsylvania",528892,7.2%,5.1%,10.3%,9.5%,12.8%,17.1%,12.4%,14.3%,5.5%,5.8%,56140,78027
"Armstrong County, Pennsylvania",26105,6.1%,7.1%,12.6%,12.0%,14.0%,21.3%,9.6%,12.1%,2.4%,2.7%,47398,60137
"Beaver County, Pennsylvania",68811,5.2%,4.5%,9.6%,10.9%,14.9%,18.9%,13.9%,14.0%,4.8%,3.3%,55221,69655


In [61]:
# Move the transposed index (the 'Unnamed: 0' label row plus the county names) into a regular
# column and call that column 'County'
initial_pa_households_2016 = (
    initial_pa_households_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_households_2016.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",38125,3.5%,4.7%,7.5%,12.8%,15.3%,16.7%,16.3%,15.5%,4.6%,3.2%,59300,72296
2,"Allegheny County, Pennsylvania",528892,7.2%,5.1%,10.3%,9.5%,12.8%,17.1%,12.4%,14.3%,5.5%,5.8%,56140,78027
3,"Armstrong County, Pennsylvania",26105,6.1%,7.1%,12.6%,12.0%,14.0%,21.3%,9.6%,12.1%,2.4%,2.7%,47398,60137
4,"Beaver County, Pennsylvania",68811,5.2%,4.5%,9.6%,10.9%,14.9%,18.9%,13.9%,14.0%,4.8%,3.3%,55221,69655


In [62]:
# Inspect the column labels before renaming: 'County' is a string, while the remaining labels
# (2 through 14) are integers carried over from the original row positions
print(initial_pa_households_2016.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [63]:
# Turn every column label into a string (2 -> '2', ...) so the rename mapping can use string keys,
# then print the labels to confirm the conversion
initial_pa_households_2016.columns = [str(label) for label in initial_pa_households_2016.columns]
print(initial_pa_households_2016.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [64]:
# Swap every cell that is exactly 'N' for NaN across the whole DataFrame so the later numeric
# conversions treat it as missing (presumably 'N' marks an unavailable estimate - confirm against the source file)
initial_pa_households_2016 = initial_pa_households_2016.replace('N', np.nan)


In [65]:
# Final cleaning pass for the 2016 household-income table:
#   1. rename the positional columns ('2'..'14') to descriptive names,
#   2. drop the obsolete row 0 (it only holds the original row labels) and rebuild the index,
#   3. reduce 'County' to the upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert the percentage strings to fractions and the count/dollar strings to floats.
initial_pa_households_2016 = initial_pa_households_2016.rename(columns={
    '2': '# of Households',
    '3': '% Households <$10,000',
    '4': '% Households $10,000-$14,999',
    '5': '% Households $15,000-$24,999',
    '6': '% Households $25,000-$34,999',
    '7': '% Households $35,000-$49,999',
    '8': '% Households $50,000-$74,999',
    '9': '% Households $75,000-$99,999',
    '10': '% Households $100,000-$149,999',
    '11': '% Households $150,000-$199,999',
    '12': '% Households $200,000 or More',
    '13': 'Median Household Income ($)',
    '14': 'Mean Household Income ($)',
})
initial_pa_households_2016 = initial_pa_households_2016.drop(0).reset_index(drop=True)

# Vectorised string methods keep missing values as NaN instead of raising on them
initial_pa_households_2016['County'] = (
    initial_pa_households_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_households_2016.insert(0, 'Year', 2016)

# Income-bracket shares: '3.5%' -> 0.035
for share_column in [
    '% Households <$10,000',
    '% Households $10,000-$14,999',
    '% Households $15,000-$24,999',
    '% Households $25,000-$34,999',
    '% Households $35,000-$49,999',
    '% Households $50,000-$74,999',
    '% Households $75,000-$99,999',
    '% Households $100,000-$149,999',
    '% Households $150,000-$199,999',
    '% Households $200,000 or More',
]:
    initial_pa_households_2016[share_column] = (
        initial_pa_households_2016[share_column].str.rstrip('%').astype(float) / 100
    )

# Household count and dollar amounts: strip any thousands separators, then cast to float
for amount_column in ['# of Households', 'Median Household Income ($)', 'Mean Household Income ($)']:
    initial_pa_households_2016[amount_column] = (
        initial_pa_households_2016[amount_column].str.replace(',', '', regex=False).astype(float)
    )

initial_pa_households_2016


Unnamed: 0,Year,County,# of Households,"% Households <$10,000","% Households $10,000-$14,999","% Households $15,000-$24,999","% Households $25,000-$34,999","% Households $35,000-$49,999","% Households $50,000-$74,999","% Households $75,000-$99,999","% Households $100,000-$149,999","% Households $150,000-$199,999","% Households $200,000 or More",Median Household Income ($),Mean Household Income ($)
0,2016,ADAMS,38125.0,0.035,0.047,0.075,0.128,0.153,0.167,0.163,0.155,0.046,0.032,59300.0,72296.0
1,2016,ALLEGHENY,528892.0,0.072,0.051,0.103,0.095,0.128,0.171,0.124,0.143,0.055,0.058,56140.0,78027.0
2,2016,ARMSTRONG,26105.0,0.061,0.071,0.126,0.12,0.14,0.213,0.096,0.121,0.024,0.027,47398.0,60137.0
3,2016,BEAVER,68811.0,0.052,0.045,0.096,0.109,0.149,0.189,0.139,0.14,0.048,0.033,55221.0,69655.0
4,2016,BERKS,150402.0,0.059,0.042,0.093,0.082,0.143,0.185,0.145,0.147,0.06,0.043,59286.0,76779.0
5,2016,BLAIR,51154.0,0.065,0.063,0.123,0.14,0.165,0.197,0.117,0.088,0.025,0.016,43443.0,58867.0
6,2016,BUCKS,234126.0,0.04,0.023,0.066,0.063,0.103,0.174,0.141,0.192,0.087,0.111,79936.0,106713.0
7,2016,BUTLER,75472.0,0.04,0.028,0.087,0.08,0.135,0.184,0.139,0.17,0.065,0.074,66426.0,88042.0
8,2016,CAMBRIA,56443.0,0.07,0.071,0.127,0.117,0.161,0.182,0.123,0.114,0.022,0.014,44100.0,57562.0
9,2016,CARBON,25986.0,0.042,0.066,0.101,0.129,0.15,0.205,0.128,0.121,0.038,0.02,51676.0,63116.0


**2019**
-
Please note: 2020 U.S. Census Bureau data was unavailable, so data from the closest year prior to the election (2019) was used instead.

In [66]:
# Load the raw 2019 household-income workbook (U.S. Census Bureau) and preview its first 19 rows
households_2019_workbook = "Resources/PA_Income_Households_2019.xlsx"
initial_pa_households_2019 = pd.read_excel(households_2019_workbook)
initial_pa_households_2019.head(19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Households,Households,Households,Households,Households,Households,Households,Households,Households,...,Households,Households,Households,Households,Households,Households,Households,Households,Households,Households
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,38703,553858,29095,70745,154696,52497,240491,77920,56490,...,56274,321373,114950,38592,619505,58242,29456,86716,155147,175441
3,"Less than $10,000",2.6%,6.4%,3.1%,5.2%,5.0%,6.5%,3.0%,6.2%,8.0%,...,5.6%,3.0%,4.1%,5.6%,12.2%,5.0%,5.9%,5.1%,7.2%,4.3%
4,"$10,000 to $14,999",3.2%,3.6%,5.7%,4.8%,3.1%,4.8%,1.9%,3.3%,7.1%,...,3.5%,2.0%,2.4%,5.6%,6.6%,6.1%,4.3%,5.5%,3.9%,3.1%
5,"$15,000 to $24,999",9.3%,8.4%,12.2%,9.9%,7.8%,10.0%,7.0%,6.9%,10.8%,...,6.3%,5.5%,6.4%,9.8%,9.7%,10.5%,13.4%,7.0%,9.1%,8.0%
6,"$25,000 to $34,999",8.1%,8.6%,8.2%,8.7%,7.6%,11.7%,5.4%,7.6%,10.8%,...,10.4%,6.2%,9.1%,15.5%,9.9%,9.2%,10.8%,8.6%,9.3%,7.4%
7,"$35,000 to $49,999",13.2%,11.9%,15.3%,14.3%,13.7%,15.9%,8.7%,11.0%,14.0%,...,13.3%,8.2%,10.2%,15.9%,13.3%,12.9%,17.3%,12.6%,13.7%,12.3%
8,"$50,000 to $74,999",18.0%,17.5%,22.6%,17.5%,18.1%,17.5%,14.1%,16.4%,18.2%,...,17.3%,14.6%,18.2%,19.3%,16.3%,20.6%,17.4%,16.8%,17.9%,19.4%
9,"$75,000 to $99,999",17.4%,12.7%,13.6%,13.7%,13.6%,12.6%,12.9%,12.9%,13.8%,...,13.7%,13.9%,14.8%,11.2%,10.8%,15.3%,13.1%,12.9%,14.0%,14.2%


In [67]:
# Keep only the household income summary rows (positions 2 through 14):
#   - row 2:      total number of households
#   - rows 3-12:  % of households per income bracket (Less than $10,000, $10,000-$14,999,
#                 $15,000-$24,999, $25,000-$34,999, $35,000-$49,999, $50,000-$74,999,
#                 $75,000-$99,999, $100,000-$149,999, $150,000-$199,999, $200,000 or more)
#   - row 13:     median household income (dollars)
#   - row 14:     mean household income (dollars)
# Rows 0-1 only carry the 'Households' / 'Estimate' header labels and are left out.
initial_pa_households_2019 = initial_pa_households_2019.iloc[list(range(2, 15)), :]
initial_pa_households_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,38703,553858,29095,70745,154696,52497,240491,77920,56490,...,56274,321373,114950,38592,619505,58242,29456,86716,155147,175441
3,"Less than $10,000",2.6%,6.4%,3.1%,5.2%,5.0%,6.5%,3.0%,6.2%,8.0%,...,5.6%,3.0%,4.1%,5.6%,12.2%,5.0%,5.9%,5.1%,7.2%,4.3%
4,"$10,000 to $14,999",3.2%,3.6%,5.7%,4.8%,3.1%,4.8%,1.9%,3.3%,7.1%,...,3.5%,2.0%,2.4%,5.6%,6.6%,6.1%,4.3%,5.5%,3.9%,3.1%
5,"$15,000 to $24,999",9.3%,8.4%,12.2%,9.9%,7.8%,10.0%,7.0%,6.9%,10.8%,...,6.3%,5.5%,6.4%,9.8%,9.7%,10.5%,13.4%,7.0%,9.1%,8.0%
6,"$25,000 to $34,999",8.1%,8.6%,8.2%,8.7%,7.6%,11.7%,5.4%,7.6%,10.8%,...,10.4%,6.2%,9.1%,15.5%,9.9%,9.2%,10.8%,8.6%,9.3%,7.4%
7,"$35,000 to $49,999",13.2%,11.9%,15.3%,14.3%,13.7%,15.9%,8.7%,11.0%,14.0%,...,13.3%,8.2%,10.2%,15.9%,13.3%,12.9%,17.3%,12.6%,13.7%,12.3%
8,"$50,000 to $74,999",18.0%,17.5%,22.6%,17.5%,18.1%,17.5%,14.1%,16.4%,18.2%,...,17.3%,14.6%,18.2%,19.3%,16.3%,20.6%,17.4%,16.8%,17.9%,19.4%
9,"$75,000 to $99,999",17.4%,12.7%,13.6%,13.7%,13.6%,12.6%,12.9%,12.9%,13.8%,...,13.7%,13.9%,14.8%,11.2%,10.8%,15.3%,13.1%,12.9%,14.0%,14.2%
10,"$100,000 to $149,999",16.2%,16.1%,12.6%,15.2%,19.3%,13.2%,20.3%,18.6%,11.6%,...,17.0%,18.9%,19.9%,12.1%,10.5%,14.0%,13.1%,15.8%,14.1%,19.0%
11,"$150,000 to $199,999",6.7%,7.0%,4.2%,6.8%,6.6%,5.1%,12.0%,7.4%,3.3%,...,8.1%,11.5%,7.2%,3.4%,5.1%,4.0%,2.8%,7.9%,5.9%,6.9%


In [68]:
# Flip the table so that every county becomes a row and every retained statistic becomes a column
# (columns are still labelled 2-14 after their original row positions); this shape is easier to merge later
initial_pa_households_2019 = initial_pa_households_2019.transpose()
initial_pa_households_2019.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",38703,2.6%,3.2%,9.3%,8.1%,13.2%,18.0%,17.4%,16.2%,6.7%,5.2%,67715,84262
"Allegheny County, Pennsylvania",553858,6.4%,3.6%,8.4%,8.6%,11.9%,17.5%,12.7%,16.1%,7.0%,7.9%,64871,91448
"Armstrong County, Pennsylvania",29095,3.1%,5.7%,12.2%,8.2%,15.3%,22.6%,13.6%,12.6%,4.2%,2.5%,57636,68661
"Beaver County, Pennsylvania",70745,5.2%,4.8%,9.9%,8.7%,14.3%,17.5%,13.7%,15.2%,6.8%,3.8%,60672,75836


In [69]:
# Move the transposed index (the 'Unnamed: 0' label row plus the county names) into a regular
# column and call that column 'County'
initial_pa_households_2019 = (
    initial_pa_households_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_households_2019.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",38703,2.6%,3.2%,9.3%,8.1%,13.2%,18.0%,17.4%,16.2%,6.7%,5.2%,67715,84262
2,"Allegheny County, Pennsylvania",553858,6.4%,3.6%,8.4%,8.6%,11.9%,17.5%,12.7%,16.1%,7.0%,7.9%,64871,91448
3,"Armstrong County, Pennsylvania",29095,3.1%,5.7%,12.2%,8.2%,15.3%,22.6%,13.6%,12.6%,4.2%,2.5%,57636,68661
4,"Beaver County, Pennsylvania",70745,5.2%,4.8%,9.9%,8.7%,14.3%,17.5%,13.7%,15.2%,6.8%,3.8%,60672,75836


In [70]:
# Inspecting the column labels prior to renaming: apart from 'County' they are
# the integer row positions (2-14) carried over by the transpose, not strings
print(initial_pa_households_2019.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [71]:
# Cast every column label to a string so the labels can be renamed with string keys
initial_pa_households_2019.columns = [str(label) for label in initial_pa_households_2019.columns]
print(initial_pa_households_2019.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [72]:
# Replace every cell that is exactly 'N' with NaN across the whole DataFrame
# so that later numeric conversions treat those cells as missing
initial_pa_households_2019 = initial_pa_households_2019.replace(to_replace='N', value=np.nan)


In [73]:
# Final cleaning pass for the 2019 household income table:
#   1. rename the positional columns ('2'-'14') to descriptive names,
#   2. drop the obsolete label row at index 0 and renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert '%' strings to fractional floats and count/income strings to floats.
household_column_names = {
    '2': '# of Households',
    '3': '% Households <$10,000',
    '4': '% Households $10,000-$14,999',
    '5': '% Households $15,000-$24,999',
    '6': '% Households $25,000-$34,999',
    '7': '% Households $35,000-$49,999',
    '8': '% Households $50,000-$74,999',
    '9': '% Households $75,000-$99,999',
    '10': '% Households $100,000-$149,999',
    '11': '% Households $150,000-$199,999',
    '12': '% Households $200,000 or More',
    '13': 'Median Household Income ($)',
    '14': 'Mean Household Income ($)',
}
initial_pa_households_2019 = (
    initial_pa_households_2019
    .rename(columns=household_column_names)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_households_2019['County'] = (
    initial_pa_households_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# NOTE(review): the 2019 dataset is labelled 2020 here, presumably so it lines up with
# 2020 election data in later merges (2019 is used because 2020 data was unavailable) - confirm
initial_pa_households_2019.insert(0, 'Year', 2020)

# Percentage columns hold strings such as '6.7%'; store them as fractions (0.067)
for percent_column in [name for name in household_column_names.values() if name.startswith('%')]:
    initial_pa_households_2019[percent_column] = (
        initial_pa_households_2019[percent_column].str.rstrip('%').astype(float) / 100
    )

# Count and income columns hold numeric strings; strip any thousands separators before casting.
# regex=False keeps the comma literal regardless of the pandas version's default.
for amount_column in [name for name in household_column_names.values() if not name.startswith('%')]:
    initial_pa_households_2019[amount_column] = (
        initial_pa_households_2019[amount_column].str.replace(',', '', regex=False).astype(float)
    )
initial_pa_households_2019


Unnamed: 0,Year,County,# of Households,"% Households <$10,000","% Households $10,000-$14,999","% Households $15,000-$24,999","% Households $25,000-$34,999","% Households $35,000-$49,999","% Households $50,000-$74,999","% Households $75,000-$99,999","% Households $100,000-$149,999","% Households $150,000-$199,999","% Households $200,000 or More",Median Household Income ($),Mean Household Income ($)
0,2020,ADAMS,38703.0,0.026,0.032,0.093,0.081,0.132,0.18,0.174,0.162,0.067,0.052,67715.0,84262.0
1,2020,ALLEGHENY,553858.0,0.064,0.036,0.084,0.086,0.119,0.175,0.127,0.161,0.07,0.079,64871.0,91448.0
2,2020,ARMSTRONG,29095.0,0.031,0.057,0.122,0.082,0.153,0.226,0.136,0.126,0.042,0.025,57636.0,68661.0
3,2020,BEAVER,70745.0,0.052,0.048,0.099,0.087,0.143,0.175,0.137,0.152,0.068,0.038,60672.0,75836.0
4,2020,BERKS,154696.0,0.05,0.031,0.078,0.076,0.137,0.181,0.136,0.193,0.066,0.052,67708.0,84720.0
5,2020,BLAIR,52497.0,0.065,0.048,0.1,0.117,0.159,0.175,0.126,0.132,0.051,0.028,51004.0,69143.0
6,2020,BUCKS,240491.0,0.03,0.019,0.07,0.054,0.087,0.141,0.129,0.203,0.12,0.148,93767.0,120851.0
7,2020,BUTLER,77920.0,0.062,0.033,0.069,0.076,0.11,0.164,0.129,0.186,0.074,0.097,72262.0,95378.0
8,2020,CAMBRIA,56490.0,0.08,0.071,0.108,0.108,0.14,0.182,0.138,0.116,0.033,0.023,49076.0,63805.0
9,2020,CARBON,27107.0,0.036,0.034,0.08,0.099,0.14,0.189,0.16,0.167,0.053,0.041,61111.0,82287.0


**Income (Families) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

In [74]:
# Load the 2012 U.S. Census Bureau income (families) workbook and preview its first 19 rows
initial_pa_families_2012 = pd.read_excel(io="Resources/PA_Income_Families_2012.xlsx")
initial_pa_families_2012.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Families,Families,Families,Families,Families,Families,Families,Families,Families,...,Families,Families,Families,Families,Families,Families,Families,Families,Families,Families
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,27025,304375,19535,45560,103312,32068,166398,50201,37266,...,40865,210306,76899,24864,307880,38613,21076,54650,101414,118190
3,"Less than $10,000",1.2%,4.8%,5.1%,4.2%,5.1%,3.3%,1.3%,2.7%,4.9%,...,4.4%,1.5%,3.7%,4.4%,9.7%,5.2%,2.0%,4.3%,3.2%,3.0%
4,"$10,000 to $14,999",2.8%,2.6%,3.0%,1.8%,2.8%,3.4%,1.6%,1.8%,3.9%,...,3.8%,1.3%,2.5%,3.0%,6.4%,3.5%,3.3%,2.2%,2.4%,1.7%
5,"$15,000 to $24,999",8.1%,7.1%,7.7%,9.4%,7.5%,9.5%,3.9%,6.5%,9.1%,...,5.7%,3.9%,5.6%,10.0%,13.3%,9.1%,9.9%,7.1%,7.6%,6.6%
6,"$25,000 to $34,999",7.7%,7.9%,12.7%,9.1%,8.8%,11.7%,5.5%,7.1%,12.3%,...,9.5%,5.1%,10.1%,13.1%,12.2%,10.2%,9.9%,6.5%,10.1%,8.4%
7,"$35,000 to $49,999",10.1%,12.4%,20.7%,13.2%,13.6%,19.6%,10.2%,11.8%,18.0%,...,12.7%,8.7%,12.6%,17.8%,13.0%,18.0%,16.6%,11.7%,14.0%,15.7%
8,"$50,000 to $74,999",22.1%,18.9%,21.9%,23.8%,20.8%,24.5%,16.9%,20.2%,21.8%,...,22.1%,16.1%,20.5%,21.5%,17.4%,22.3%,25.8%,23.6%,22.5%,21.2%
9,"$75,000 to $99,999",21.0%,15.9%,12.9%,16.1%,16.2%,11.0%,16.3%,16.8%,13.6%,...,17.2%,15.7%,18.0%,15.0%,10.7%,15.5%,16.6%,15.4%,15.3%,17.5%


In [75]:
# Keep only the rows holding the desired county summary statistics (row positions 2-14):
#   - total count of families
#   - % of families in each income category: Less than $10,000, $10,000-$14,999, $15,000-$24,999,
#     $25,000-$34,999, $35,000-$49,999, $50,000-$74,999, $75,000-$99,999, $100,000-$149,999,
#     $150,000-$199,999, $200,000 or more
#   - median income
#   - mean income
initial_pa_families_2012 = initial_pa_families_2012.iloc[list(range(2, 15)), :]
initial_pa_families_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,27025,304375,19535,45560,103312,32068,166398,50201,37266,...,40865,210306,76899,24864,307880,38613,21076,54650,101414,118190
3,"Less than $10,000",1.2%,4.8%,5.1%,4.2%,5.1%,3.3%,1.3%,2.7%,4.9%,...,4.4%,1.5%,3.7%,4.4%,9.7%,5.2%,2.0%,4.3%,3.2%,3.0%
4,"$10,000 to $14,999",2.8%,2.6%,3.0%,1.8%,2.8%,3.4%,1.6%,1.8%,3.9%,...,3.8%,1.3%,2.5%,3.0%,6.4%,3.5%,3.3%,2.2%,2.4%,1.7%
5,"$15,000 to $24,999",8.1%,7.1%,7.7%,9.4%,7.5%,9.5%,3.9%,6.5%,9.1%,...,5.7%,3.9%,5.6%,10.0%,13.3%,9.1%,9.9%,7.1%,7.6%,6.6%
6,"$25,000 to $34,999",7.7%,7.9%,12.7%,9.1%,8.8%,11.7%,5.5%,7.1%,12.3%,...,9.5%,5.1%,10.1%,13.1%,12.2%,10.2%,9.9%,6.5%,10.1%,8.4%
7,"$35,000 to $49,999",10.1%,12.4%,20.7%,13.2%,13.6%,19.6%,10.2%,11.8%,18.0%,...,12.7%,8.7%,12.6%,17.8%,13.0%,18.0%,16.6%,11.7%,14.0%,15.7%
8,"$50,000 to $74,999",22.1%,18.9%,21.9%,23.8%,20.8%,24.5%,16.9%,20.2%,21.8%,...,22.1%,16.1%,20.5%,21.5%,17.4%,22.3%,25.8%,23.6%,22.5%,21.2%
9,"$75,000 to $99,999",21.0%,15.9%,12.9%,16.1%,16.2%,11.0%,16.3%,16.8%,13.6%,...,17.2%,15.7%,18.0%,15.0%,10.7%,15.5%,16.6%,15.4%,15.3%,17.5%
10,"$100,000 to $149,999",18.2%,17.7%,11.7%,17.2%,15.2%,11.4%,23.1%,19.8%,11.5%,...,16.7%,22.3%,16.0%,11.0%,10.5%,11.7%,11.5%,18.6%,16.7%,18.0%
11,"$150,000 to $199,999",6.0%,6.5%,1.5%,3.0%,5.2%,2.5%,10.9%,7.2%,2.2%,...,4.3%,11.1%,6.5%,2.7%,3.6%,3.5%,1.4%,5.3%,4.2%,4.6%


In [76]:
# Flip rows and columns so that each county becomes a row, which keeps the
# processing consistent and simplifies future merges
initial_pa_families_2012 = initial_pa_families_2012.transpose()
initial_pa_families_2012.head(n=5)


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",27025,1.2%,2.8%,8.1%,7.7%,10.1%,22.1%,21.0%,18.2%,6.0%,2.8%,71371,80738
"Allegheny County, Pennsylvania",304375,4.8%,2.6%,7.1%,7.9%,12.4%,18.9%,15.9%,17.7%,6.5%,6.2%,69212,91391
"Armstrong County, Pennsylvania",19535,5.1%,3.0%,7.7%,12.7%,20.7%,21.9%,12.9%,11.7%,1.5%,2.8%,50469,61919
"Beaver County, Pennsylvania",45560,4.2%,1.8%,9.4%,9.1%,13.2%,23.8%,16.1%,17.2%,3.0%,2.2%,63001,71953


In [77]:
# Move the county labels out of the index and into a regular column named 'County'
initial_pa_families_2012 = (
    initial_pa_families_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_families_2012.head(n=5)


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",27025,1.2%,2.8%,8.1%,7.7%,10.1%,22.1%,21.0%,18.2%,6.0%,2.8%,71371,80738
2,"Allegheny County, Pennsylvania",304375,4.8%,2.6%,7.1%,7.9%,12.4%,18.9%,15.9%,17.7%,6.5%,6.2%,69212,91391
3,"Armstrong County, Pennsylvania",19535,5.1%,3.0%,7.7%,12.7%,20.7%,21.9%,12.9%,11.7%,1.5%,2.8%,50469,61919
4,"Beaver County, Pennsylvania",45560,4.2%,1.8%,9.4%,9.1%,13.2%,23.8%,16.1%,17.2%,3.0%,2.2%,63001,71953


In [78]:
# Inspecting the column labels prior to renaming: apart from 'County' they are
# the integer row positions (2-14) carried over by the transpose, not strings
print(initial_pa_families_2012.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [79]:
# Cast every column label to a string so the labels can be renamed with string keys
initial_pa_families_2012.columns = [str(label) for label in initial_pa_families_2012.columns]
print(initial_pa_families_2012.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [80]:
# Replace every cell that is exactly 'N' with NaN across the whole DataFrame
# so that later numeric conversions treat those cells as missing
initial_pa_families_2012 = initial_pa_families_2012.replace(to_replace='N', value=np.nan)


In [81]:
# Final cleaning pass for the 2012 family income table:
#   1. rename the positional columns ('2'-'14') to descriptive names,
#   2. drop the obsolete label row at index 0 and renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert '%' strings to fractional floats and count/income strings to floats.
families_column_names = {
    '2': '# of Families',
    '3': '% Families <$10,000',
    '4': '% Families $10,000-$14,999',
    '5': '% Families $15,000-$24,999',
    '6': '% Families $25,000-$34,999',
    '7': '% Families $35,000-$49,999',
    '8': '% Families $50,000-$74,999',
    '9': '% Families $75,000-$99,999',
    '10': '% Families $100,000-$149,999',
    '11': '% Families $150,000-$199,999',
    '12': '% Families $200,000 or More',
    '13': 'Median Families Income ($)',
    '14': 'Mean Families Income ($)',
}
initial_pa_families_2012 = (
    initial_pa_families_2012
    .rename(columns=families_column_names)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_families_2012['County'] = (
    initial_pa_families_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_families_2012.insert(0, 'Year', 2012)

# Percentage columns hold strings such as '6.0%'; store them as fractions (0.06)
for percent_column in [name for name in families_column_names.values() if name.startswith('%')]:
    initial_pa_families_2012[percent_column] = (
        initial_pa_families_2012[percent_column].str.rstrip('%').astype(float) / 100
    )

# Count and income columns hold numeric strings; strip any thousands separators before casting.
# regex=False keeps the comma literal regardless of the pandas version's default.
for amount_column in [name for name in families_column_names.values() if not name.startswith('%')]:
    initial_pa_families_2012[amount_column] = (
        initial_pa_families_2012[amount_column].str.replace(',', '', regex=False).astype(float)
    )
initial_pa_families_2012


Unnamed: 0,Year,County,# of Families,"% Families <$10,000","% Families $10,000-$14,999","% Families $15,000-$24,999","% Families $25,000-$34,999","% Families $35,000-$49,999","% Families $50,000-$74,999","% Families $75,000-$99,999","% Families $100,000-$149,999","% Families $150,000-$199,999","% Families $200,000 or More",Median Families Income ($),Mean Families Income ($)
0,2012,ADAMS,27025.0,0.012,0.028,0.081,0.077,0.101,0.221,0.21,0.182,0.06,0.028,71371.0,80738.0
1,2012,ALLEGHENY,304375.0,0.048,0.026,0.071,0.079,0.124,0.189,0.159,0.177,0.065,0.062,69212.0,91391.0
2,2012,ARMSTRONG,19535.0,0.051,0.03,0.077,0.127,0.207,0.219,0.129,0.117,0.015,0.028,50469.0,61919.0
3,2012,BEAVER,45560.0,0.042,0.018,0.094,0.091,0.132,0.238,0.161,0.172,0.03,0.022,63001.0,71953.0
4,2012,BERKS,103312.0,0.051,0.028,0.075,0.088,0.136,0.208,0.162,0.152,0.052,0.047,64264.0,80280.0
5,2012,BLAIR,32068.0,0.033,0.034,0.095,0.117,0.196,0.245,0.11,0.114,0.025,0.031,52079.0,64426.0
6,2012,BUCKS,166398.0,0.013,0.016,0.039,0.055,0.102,0.169,0.163,0.231,0.109,0.104,90345.0,110787.0
7,2012,BUTLER,50201.0,0.027,0.018,0.065,0.071,0.118,0.202,0.168,0.198,0.072,0.06,74808.0,89069.0
8,2012,CAMBRIA,37266.0,0.049,0.039,0.091,0.123,0.18,0.218,0.136,0.115,0.022,0.028,52802.0,65400.0
9,2012,CARBON,17537.0,0.008,0.041,0.081,0.095,0.153,0.324,0.153,0.097,0.036,0.011,56639.0,71563.0


**2016**
-

In [82]:
# Load the 2016 U.S. Census Bureau income (families) workbook and preview its first 19 rows
initial_pa_families_2016 = pd.read_excel(io="Resources/PA_Income_Families_2016.xlsx")
initial_pa_families_2016.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Families,Families,Families,Families,Families,Families,Families,Families,Families,...,Families,Families,Families,Families,Families,Families,Families,Families,Families,Families
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,26243,295807,17519,44810,102478,32897,165984,50214,35951,...,37671,214764,78821,24845,309558,38214,19345,54369,99302,117745
3,"Less than $10,000",3.8%,3.9%,4.2%,2.8%,4.3%,2.8%,1.8%,1.7%,3.8%,...,2.4%,1.5%,2.1%,4.3%,8.1%,4.2%,4.8%,2.7%,3.2%,3.1%
4,"$10,000 to $14,999",1.0%,2.6%,4.2%,1.1%,2.4%,2.8%,1.1%,0.7%,3.4%,...,1.7%,1.6%,1.7%,2.2%,5.3%,1.9%,1.6%,1.4%,1.8%,1.9%
5,"$15,000 to $24,999",4.0%,5.3%,6.8%,5.3%,6.1%,8.2%,3.4%,3.6%,7.8%,...,5.0%,3.4%,5.2%,9.7%,11.2%,8.6%,6.9%,5.0%,4.5%,4.7%
6,"$25,000 to $34,999",9.2%,7.0%,7.6%,9.2%,7.0%,11.5%,4.8%,7.5%,10.5%,...,8.2%,4.3%,6.7%,10.5%,11.8%,8.4%,12.9%,7.8%,8.4%,7.7%
7,"$35,000 to $49,999",15.2%,10.8%,15.0%,14.8%,12.4%,16.5%,8.2%,11.7%,14.1%,...,10.8%,8.2%,12.3%,15.1%,13.3%,14.0%,15.7%,12.6%,11.9%,11.9%
8,"$50,000 to $74,999",17.2%,17.7%,26.1%,20.9%,20.4%,24.5%,17.0%,18.3%,22.6%,...,22.6%,13.8%,18.8%,22.5%,17.3%,24.9%,26.2%,20.7%,23.1%,22.7%
9,"$75,000 to $99,999",20.5%,15.8%,13.8%,16.8%,15.8%,15.6%,14.8%,15.2%,17.0%,...,14.5%,14.5%,18.6%,14.8%,11.6%,15.9%,13.7%,14.1%,15.0%,16.6%


In [83]:
# Keep only the rows holding the desired county summary statistics (row positions 2-14):
#   - total count of families
#   - % of families in each income category: Less than $10,000, $10,000-$14,999, $15,000-$24,999,
#     $25,000-$34,999, $35,000-$49,999, $50,000-$74,999, $75,000-$99,999, $100,000-$149,999,
#     $150,000-$199,999, $200,000 or more
#   - median income
#   - mean income
initial_pa_families_2016 = initial_pa_families_2016.iloc[list(range(2, 15)), :]
initial_pa_families_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,26243,295807,17519,44810,102478,32897,165984,50214,35951,...,37671,214764,78821,24845,309558,38214,19345,54369,99302,117745
3,"Less than $10,000",3.8%,3.9%,4.2%,2.8%,4.3%,2.8%,1.8%,1.7%,3.8%,...,2.4%,1.5%,2.1%,4.3%,8.1%,4.2%,4.8%,2.7%,3.2%,3.1%
4,"$10,000 to $14,999",1.0%,2.6%,4.2%,1.1%,2.4%,2.8%,1.1%,0.7%,3.4%,...,1.7%,1.6%,1.7%,2.2%,5.3%,1.9%,1.6%,1.4%,1.8%,1.9%
5,"$15,000 to $24,999",4.0%,5.3%,6.8%,5.3%,6.1%,8.2%,3.4%,3.6%,7.8%,...,5.0%,3.4%,5.2%,9.7%,11.2%,8.6%,6.9%,5.0%,4.5%,4.7%
6,"$25,000 to $34,999",9.2%,7.0%,7.6%,9.2%,7.0%,11.5%,4.8%,7.5%,10.5%,...,8.2%,4.3%,6.7%,10.5%,11.8%,8.4%,12.9%,7.8%,8.4%,7.7%
7,"$35,000 to $49,999",15.2%,10.8%,15.0%,14.8%,12.4%,16.5%,8.2%,11.7%,14.1%,...,10.8%,8.2%,12.3%,15.1%,13.3%,14.0%,15.7%,12.6%,11.9%,11.9%
8,"$50,000 to $74,999",17.2%,17.7%,26.1%,20.9%,20.4%,24.5%,17.0%,18.3%,22.6%,...,22.6%,13.8%,18.8%,22.5%,17.3%,24.9%,26.2%,20.7%,23.1%,22.7%
9,"$75,000 to $99,999",20.5%,15.8%,13.8%,16.8%,15.8%,15.6%,14.8%,15.2%,17.0%,...,14.5%,14.5%,18.6%,14.8%,11.6%,15.9%,13.7%,14.1%,15.0%,16.6%
10,"$100,000 to $149,999",18.8%,19.6%,15.9%,17.6%,18.0%,12.4%,23.6%,22.4%,15.8%,...,21.0%,23.6%,19.8%,14.5%,11.8%,16.7%,12.7%,19.9%,19.9%,20.5%
11,"$150,000 to $199,999",6.0%,8.5%,2.9%,6.8%,7.9%,3.4%,11.1%,8.8%,3.1%,...,7.1%,12.3%,8.6%,4.3%,5.3%,4.0%,2.9%,8.9%,7.5%,6.3%


In [84]:
# Flip rows and columns so that each county becomes a row, which keeps the
# processing consistent and simplifies future merges
initial_pa_families_2016 = initial_pa_families_2016.transpose()
initial_pa_families_2016.head(n=5)


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",26243,3.8%,1.0%,4.0%,9.2%,15.2%,17.2%,20.5%,18.8%,6.0%,4.4%,74680,83756
"Allegheny County, Pennsylvania",295807,3.9%,2.6%,5.3%,7.0%,10.8%,17.7%,15.8%,19.6%,8.5%,8.8%,79075,99613
"Armstrong County, Pennsylvania",17519,4.2%,4.2%,6.8%,7.6%,15.0%,26.1%,13.8%,15.9%,2.9%,3.5%,61167,71921
"Beaver County, Pennsylvania",44810,2.8%,1.1%,5.3%,9.2%,14.8%,20.9%,16.8%,17.6%,6.8%,4.6%,68243,83078


In [85]:
# Move the county labels out of the index and into a regular column named 'County'
initial_pa_families_2016 = (
    initial_pa_families_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_families_2016.head(n=5)


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",26243,3.8%,1.0%,4.0%,9.2%,15.2%,17.2%,20.5%,18.8%,6.0%,4.4%,74680,83756
2,"Allegheny County, Pennsylvania",295807,3.9%,2.6%,5.3%,7.0%,10.8%,17.7%,15.8%,19.6%,8.5%,8.8%,79075,99613
3,"Armstrong County, Pennsylvania",17519,4.2%,4.2%,6.8%,7.6%,15.0%,26.1%,13.8%,15.9%,2.9%,3.5%,61167,71921
4,"Beaver County, Pennsylvania",44810,2.8%,1.1%,5.3%,9.2%,14.8%,20.9%,16.8%,17.6%,6.8%,4.6%,68243,83078


In [86]:
# Inspecting the column labels prior to renaming: apart from 'County' they are
# the integer row positions (2-14) carried over by the transpose, not strings
print(initial_pa_families_2016.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [87]:
# Cast every column label to a string so the labels can be renamed with string keys
initial_pa_families_2016.columns = [str(label) for label in initial_pa_families_2016.columns]
print(initial_pa_families_2016.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [88]:
# Replace every cell that is exactly 'N' with NaN across the whole DataFrame
# so that later numeric conversions treat those cells as missing
initial_pa_families_2016 = initial_pa_families_2016.replace(to_replace='N', value=np.nan)


In [89]:
# Final cleaning pass for the 2016 family income table:
#   1. rename the positional columns ('2'-'14') to descriptive names,
#   2. drop the obsolete label row at index 0 and renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert '%' strings to fractional floats and count/income strings to floats.
families_column_names = {
    '2': '# of Families',
    '3': '% Families <$10,000',
    '4': '% Families $10,000-$14,999',
    '5': '% Families $15,000-$24,999',
    '6': '% Families $25,000-$34,999',
    '7': '% Families $35,000-$49,999',
    '8': '% Families $50,000-$74,999',
    '9': '% Families $75,000-$99,999',
    '10': '% Families $100,000-$149,999',
    '11': '% Families $150,000-$199,999',
    '12': '% Families $200,000 or More',
    '13': 'Median Families Income ($)',
    '14': 'Mean Families Income ($)',
}
initial_pa_families_2016 = (
    initial_pa_families_2016
    .rename(columns=families_column_names)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_families_2016['County'] = (
    initial_pa_families_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_families_2016.insert(0, 'Year', 2016)

# Percentage columns hold strings such as '6.0%'; store them as fractions (0.06)
for percent_column in [name for name in families_column_names.values() if name.startswith('%')]:
    initial_pa_families_2016[percent_column] = (
        initial_pa_families_2016[percent_column].str.rstrip('%').astype(float) / 100
    )

# Count and income columns hold numeric strings; strip any thousands separators before casting.
# regex=False keeps the comma literal regardless of the pandas version's default.
for amount_column in [name for name in families_column_names.values() if not name.startswith('%')]:
    initial_pa_families_2016[amount_column] = (
        initial_pa_families_2016[amount_column].str.replace(',', '', regex=False).astype(float)
    )
initial_pa_families_2016


Unnamed: 0,Year,County,# of Families,"% Families <$10,000","% Families $10,000-$14,999","% Families $15,000-$24,999","% Families $25,000-$34,999","% Families $35,000-$49,999","% Families $50,000-$74,999","% Families $75,000-$99,999","% Families $100,000-$149,999","% Families $150,000-$199,999","% Families $200,000 or More",Median Families Income ($),Mean Families Income ($)
0,2016,ADAMS,26243.0,0.038,0.01,0.04,0.092,0.152,0.172,0.205,0.188,0.06,0.044,74680.0,83756.0
1,2016,ALLEGHENY,295807.0,0.039,0.026,0.053,0.07,0.108,0.177,0.158,0.196,0.085,0.088,79075.0,99613.0
2,2016,ARMSTRONG,17519.0,0.042,0.042,0.068,0.076,0.15,0.261,0.138,0.159,0.029,0.035,61167.0,71921.0
3,2016,BEAVER,44810.0,0.028,0.011,0.053,0.092,0.148,0.209,0.168,0.176,0.068,0.046,68243.0,83078.0
4,2016,BERKS,102478.0,0.043,0.024,0.061,0.07,0.124,0.204,0.158,0.18,0.079,0.057,71795.0,89282.0
5,2016,BLAIR,32897.0,0.028,0.028,0.082,0.115,0.165,0.245,0.156,0.124,0.034,0.022,59272.0,71014.0
6,2016,BUCKS,165984.0,0.018,0.011,0.034,0.048,0.082,0.17,0.148,0.236,0.111,0.141,97670.0,124255.0
7,2016,BUTLER,50214.0,0.017,0.007,0.036,0.075,0.117,0.183,0.152,0.224,0.088,0.1,85893.0,105891.0
8,2016,CAMBRIA,35951.0,0.038,0.034,0.078,0.105,0.141,0.226,0.17,0.158,0.031,0.021,60023.0,70733.0
9,2016,CARBON,16260.0,0.025,0.031,0.052,0.126,0.113,0.232,0.169,0.171,0.062,0.019,66525.0,75418.0


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable, and as such the closest year prior to the election (2019) was utilized.

In [90]:
# Load the 2019 U.S. Census Bureau income (families) workbook and preview its first 19 rows
initial_pa_families_2019 = pd.read_excel(io="Resources/PA_Income_Families_2019.xlsx")
initial_pa_families_2019.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Families,Families,Families,Families,Families,Families,Families,Families,Families,...,Families,Families,Families,Families,Families,Families,Families,Families,Families,Families
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,26375,301596,18184,43107,105402,34637,167024,50071,35187,...,40663,217598,76846,24939,338852,37730,19642,55227,94737,118266
3,"Less than $10,000",1.3%,3.5%,2.6%,3.0%,3.5%,3.8%,1.7%,2.9%,4.3%,...,3.2%,1.7%,2.3%,6.6%,7.7%,4.2%,3.0%,2.6%,3.0%,1.5%
4,"$10,000 to $14,999",2.3%,1.3%,2.4%,1.7%,1.2%,3.4%,0.6%,1.1%,2.6%,...,1.8%,0.7%,1.1%,1.4%,5.0%,1.3%,3.2%,2.6%,2.0%,1.7%
5,"$15,000 to $24,999",3.4%,4.0%,5.8%,5.9%,4.9%,7.0%,3.1%,2.7%,5.7%,...,5.3%,2.3%,3.2%,6.2%,9.0%,5.1%,8.1%,4.0%,4.5%,3.9%
6,"$25,000 to $34,999",7.2%,5.5%,7.1%,6.5%,5.2%,9.2%,3.7%,4.5%,9.0%,...,6.5%,3.8%,6.2%,10.0%,10.0%,7.0%,9.0%,5.7%,5.9%,5.5%
7,"$35,000 to $49,999",11.9%,8.2%,14.6%,11.5%,11.9%,14.9%,6.9%,8.4%,13.3%,...,12.9%,6.4%,9.0%,15.3%,13.5%,15.2%,16.0%,12.1%,12.8%,9.9%
8,"$50,000 to $74,999",18.4%,17.8%,23.1%,17.1%,19.1%,18.5%,12.0%,17.1%,21.5%,...,18.6%,13.0%,19.2%,22.8%,16.9%,22.8%,21.2%,16.7%,17.9%,22.0%
9,"$75,000 to $99,999",19.3%,14.5%,17.8%,18.0%,15.3%,15.1%,13.8%,15.5%,18.8%,...,15.3%,14.2%,16.1%,13.8%,12.5%,18.3%,16.7%,14.5%,18.7%,15.0%


In [91]:
# Keep only positional rows 2-14 of the families income sheet:
#   row 2      -> total number of families
#   rows 3-12  -> share of families per income bracket
#                 (Less than $10,000 ... $200,000 or more)
#   row 13     -> median income (dollars)
#   row 14     -> mean income (dollars)
# Rows 0-1 only carry the group name and the 'Label'/'Estimate' descriptors.
initial_pa_families_2019 = initial_pa_families_2019.iloc[list(range(2, 15)), :]
initial_pa_families_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,26375,301596,18184,43107,105402,34637,167024,50071,35187,...,40663,217598,76846,24939,338852,37730,19642,55227,94737,118266
3,"Less than $10,000",1.3%,3.5%,2.6%,3.0%,3.5%,3.8%,1.7%,2.9%,4.3%,...,3.2%,1.7%,2.3%,6.6%,7.7%,4.2%,3.0%,2.6%,3.0%,1.5%
4,"$10,000 to $14,999",2.3%,1.3%,2.4%,1.7%,1.2%,3.4%,0.6%,1.1%,2.6%,...,1.8%,0.7%,1.1%,1.4%,5.0%,1.3%,3.2%,2.6%,2.0%,1.7%
5,"$15,000 to $24,999",3.4%,4.0%,5.8%,5.9%,4.9%,7.0%,3.1%,2.7%,5.7%,...,5.3%,2.3%,3.2%,6.2%,9.0%,5.1%,8.1%,4.0%,4.5%,3.9%
6,"$25,000 to $34,999",7.2%,5.5%,7.1%,6.5%,5.2%,9.2%,3.7%,4.5%,9.0%,...,6.5%,3.8%,6.2%,10.0%,10.0%,7.0%,9.0%,5.7%,5.9%,5.5%
7,"$35,000 to $49,999",11.9%,8.2%,14.6%,11.5%,11.9%,14.9%,6.9%,8.4%,13.3%,...,12.9%,6.4%,9.0%,15.3%,13.5%,15.2%,16.0%,12.1%,12.8%,9.9%
8,"$50,000 to $74,999",18.4%,17.8%,23.1%,17.1%,19.1%,18.5%,12.0%,17.1%,21.5%,...,18.6%,13.0%,19.2%,22.8%,16.9%,22.8%,21.2%,16.7%,17.9%,22.0%
9,"$75,000 to $99,999",19.3%,14.5%,17.8%,18.0%,15.3%,15.1%,13.8%,15.5%,18.8%,...,15.3%,14.2%,16.1%,13.8%,12.5%,18.3%,16.7%,14.5%,18.7%,15.0%
10,"$100,000 to $149,999",20.3%,22.6%,17.2%,20.7%,24.1%,17.6%,24.3%,24.3%,16.3%,...,21.1%,21.7%,23.5%,16.3%,12.3%,18.3%,16.6%,20.7%,19.5%,24.3%
11,"$150,000 to $199,999",8.6%,10.2%,6.2%,9.6%,8.3%,6.8%,16.0%,10.3%,5.2%,...,8.7%,14.8%,9.3%,5.1%,6.0%,5.1%,4.1%,10.9%,8.4%,8.8%


In [92]:
# Flip rows and columns so that each county becomes one row and each statistic one column;
# this is the layout used for the later merges.
initial_pa_families_2019 = initial_pa_families_2019.transpose()
initial_pa_families_2019.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",26375,1.3%,2.3%,3.4%,7.2%,11.9%,18.4%,19.3%,20.3%,8.6%,7.1%,81210,99218
"Allegheny County, Pennsylvania",301596,3.5%,1.3%,4.0%,5.5%,8.2%,17.8%,14.5%,22.6%,10.2%,12.3%,90891,119863
"Armstrong County, Pennsylvania",18184,2.6%,2.4%,5.8%,7.1%,14.6%,23.1%,17.8%,17.2%,6.2%,3.3%,68733,82941
"Beaver County, Pennsylvania",43107,3.0%,1.7%,5.9%,6.5%,11.5%,17.1%,18.0%,20.7%,9.6%,5.8%,79450,94002


In [93]:
# Move the county names out of the index into a regular column and call it 'County'
initial_pa_families_2019 = (
    initial_pa_families_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_families_2019.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",26375,1.3%,2.3%,3.4%,7.2%,11.9%,18.4%,19.3%,20.3%,8.6%,7.1%,81210,99218
2,"Allegheny County, Pennsylvania",301596,3.5%,1.3%,4.0%,5.5%,8.2%,17.8%,14.5%,22.6%,10.2%,12.3%,90891,119863
3,"Armstrong County, Pennsylvania",18184,2.6%,2.4%,5.8%,7.1%,14.6%,23.1%,17.8%,17.2%,6.2%,3.3%,68733,82941
4,"Beaver County, Pennsylvania",43107,3.0%,1.7%,5.9%,6.5%,11.5%,17.1%,18.0%,20.7%,9.6%,5.8%,79450,94002


In [94]:
# Inspect the column labels before renaming: apart from 'County' they are the integer
# row positions (2-14) carried over by the transpose, not strings.
print(initial_pa_families_2019.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [95]:
# Cast every column label to str so the rename step can address them with string keys ('2', '3', ...)
initial_pa_families_2019.columns = initial_pa_families_2019.columns.map(str)
print(initial_pa_families_2019.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [96]:
# Swap every cell that is exactly 'N' for NaN so the numeric conversions later on
# treat it as missing (presumably 'N' is the Census "not available" marker - confirm).
initial_pa_families_2019 = initial_pa_families_2019.replace('N', np.nan)


In [97]:
# Final cleaning pass for the 2019 families income table:
#   1. rename the positional columns ('2'..'14') to descriptive names,
#   2. drop the obsolete row 0 (the transposed 'Unnamed: 0' label row) and renumber the index,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add the 'Year' column used as a key in future merges,
#   5. convert the string-typed percentages, counts and dollar amounts to floats.
families_2019_column_names = {
    '2': '# of Families',
    '3': '% Families <$10,000',
    '4': '% Families $10,000-$14,999',
    '5': '% Families $15,000-$24,999',
    '6': '% Families $25,000-$34,999',
    '7': '% Families $35,000-$49,999',
    '8': '% Families $50,000-$74,999',
    '9': '% Families $75,000-$99,999',
    '10': '% Families $100,000-$149,999',
    '11': '% Families $150,000-$199,999',
    '12': '% Families $200,000 or More',
    '13': 'Median Families Income ($)',
    '14': 'Mean Families Income ($)',
}
# Both conversion groups are derived from the rename map so they cannot drift out of sync with it
families_2019_pct_columns = [name for name in families_2019_column_names.values() if name.startswith('%')]
families_2019_amount_columns = [name for name in families_2019_column_names.values() if not name.startswith('%')]

initial_pa_families_2019 = initial_pa_families_2019.rename(columns=families_2019_column_names)
initial_pa_families_2019 = initial_pa_families_2019.drop(0).reset_index(drop=True)
initial_pa_families_2019['County'] = (
    initial_pa_families_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# The figures come from the 2019 release, but the rows are deliberately labelled 2020:
# 2019 stands in for the 2020 election year because 2020 Census data was unavailable.
initial_pa_families_2019.insert(0, 'Year', 2020)

# Percentages arrive as strings such as '3.5%'; store them as fractions (0.035)
for pct_column in families_2019_pct_columns:
    initial_pa_families_2019[pct_column] = (
        initial_pa_families_2019[pct_column].str.rstrip('%').astype(float) / 100
    )

# Counts and dollar amounts arrive as strings, possibly with thousands separators
for amount_column in families_2019_amount_columns:
    initial_pa_families_2019[amount_column] = (
        initial_pa_families_2019[amount_column].str.replace(',', '', regex=False).astype(float)
    )

initial_pa_families_2019


Unnamed: 0,Year,County,# of Families,"% Families <$10,000","% Families $10,000-$14,999","% Families $15,000-$24,999","% Families $25,000-$34,999","% Families $35,000-$49,999","% Families $50,000-$74,999","% Families $75,000-$99,999","% Families $100,000-$149,999","% Families $150,000-$199,999","% Families $200,000 or More",Median Families Income ($),Mean Families Income ($)
0,2020,ADAMS,26375.0,0.013,0.023,0.034,0.072,0.119,0.184,0.193,0.203,0.086,0.071,81210.0,99218.0
1,2020,ALLEGHENY,301596.0,0.035,0.013,0.04,0.055,0.082,0.178,0.145,0.226,0.102,0.123,90891.0,119863.0
2,2020,ARMSTRONG,18184.0,0.026,0.024,0.058,0.071,0.146,0.231,0.178,0.172,0.062,0.033,68733.0,82941.0
3,2020,BEAVER,43107.0,0.03,0.017,0.059,0.065,0.115,0.171,0.18,0.207,0.096,0.058,79450.0,94002.0
4,2020,BERKS,105402.0,0.035,0.012,0.049,0.052,0.119,0.191,0.153,0.241,0.083,0.067,81575.0,97936.0
5,2020,BLAIR,34637.0,0.038,0.034,0.07,0.092,0.149,0.185,0.151,0.176,0.068,0.036,65157.0,80697.0
6,2020,BUCKS,167024.0,0.017,0.006,0.031,0.037,0.069,0.12,0.138,0.243,0.16,0.18,112006.0,139798.0
7,2020,BUTLER,50071.0,0.029,0.011,0.027,0.045,0.084,0.171,0.155,0.243,0.103,0.132,96134.0,115734.0
8,2020,CAMBRIA,35187.0,0.043,0.026,0.057,0.09,0.133,0.215,0.188,0.163,0.052,0.033,66461.0,79360.0
9,2020,CARBON,17562.0,0.02,0.008,0.039,0.071,0.119,0.25,0.171,0.196,0.072,0.055,72639.0,89053.0


**Income (Married Couple Families) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [98]:
# Load the 2012 U.S. Census Bureau income (married-couple families) workbook.
# The preview spans the two descriptor rows (0-1) and the summary rows that follow them.
initial_pa_mc_families_2012 = pd.read_excel(io="Resources/PA_Income_Married_Couple_Families_2012.xlsx")
initial_pa_mc_families_2012.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,...,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,21723,223942,15553,35185,76132,24351,136406,41674,27432,...,29060,171313,56774,N,154319,28076,17874,44139,78714,91558
3,"Less than $10,000",0.7%,1.1%,2.0%,1.6%,1.6%,1.3%,0.6%,0.7%,1.8%,...,0.9%,0.8%,0.6%,N,2.9%,1.7%,0.9%,1.5%,1.3%,0.8%
4,"$10,000 to $14,999",1.8%,0.9%,2.3%,0.9%,1.1%,1.6%,0.9%,0.7%,1.6%,...,2.9%,0.6%,1.0%,N,2.9%,0.9%,1.6%,0.7%,0.9%,0.3%
5,"$15,000 to $24,999",5.3%,4.4%,5.5%,6.5%,4.5%,6.4%,2.7%,3.7%,7.0%,...,2.9%,2.4%,3.1%,N,9.1%,7.4%,7.2%,4.3%,4.2%,4.7%
6,"$25,000 to $34,999",5.9%,6.3%,11.3%,7.3%,7.0%,10.8%,4.3%,5.3%,9.9%,...,6.2%,3.9%,7.8%,N,9.5%,9.0%,8.9%,4.6%,9.1%,6.3%
7,"$35,000 to $49,999",9.8%,10.1%,20.0%,13.2%,12.5%,17.8%,7.6%,11.4%,18.2%,...,12.5%,7.0%,12.1%,N,12.6%,16.3%,16.8%,12.1%,11.9%,14.5%
8,"$50,000 to $74,999",21.6%,20.0%,24.2%,23.7%,22.3%,27.5%,16.5%,21.3%,24.7%,...,22.7%,15.0%,19.9%,N,18.9%,24.0%,27.9%,24.8%,24.1%,22.3%
9,"$75,000 to $99,999",23.0%,18.7%,15.0%,19.5%,19.3%,13.4%,17.3%,18.2%,16.5%,...,21.1%,15.5%,21.3%,N,14.5%,19.3%,18.2%,17.6%,18.0%,19.0%


In [99]:
# Keep only positional rows 2-14 of the married-couple families income sheet:
#   row 2      -> total number of married-couple families
#   rows 3-12  -> share of married-couple families per income bracket
#                 (Less than $10,000 ... $200,000 or more)
#   row 13     -> median income (dollars)
#   row 14     -> mean income (dollars)
# Rows 0-1 only carry the group name and the 'Label'/'Estimate' descriptors.
initial_pa_mc_families_2012 = initial_pa_mc_families_2012.iloc[list(range(2, 15)), :]
initial_pa_mc_families_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,21723,223942,15553,35185,76132,24351,136406,41674,27432,...,29060,171313,56774,N,154319,28076,17874,44139,78714,91558
3,"Less than $10,000",0.7%,1.1%,2.0%,1.6%,1.6%,1.3%,0.6%,0.7%,1.8%,...,0.9%,0.8%,0.6%,N,2.9%,1.7%,0.9%,1.5%,1.3%,0.8%
4,"$10,000 to $14,999",1.8%,0.9%,2.3%,0.9%,1.1%,1.6%,0.9%,0.7%,1.6%,...,2.9%,0.6%,1.0%,N,2.9%,0.9%,1.6%,0.7%,0.9%,0.3%
5,"$15,000 to $24,999",5.3%,4.4%,5.5%,6.5%,4.5%,6.4%,2.7%,3.7%,7.0%,...,2.9%,2.4%,3.1%,N,9.1%,7.4%,7.2%,4.3%,4.2%,4.7%
6,"$25,000 to $34,999",5.9%,6.3%,11.3%,7.3%,7.0%,10.8%,4.3%,5.3%,9.9%,...,6.2%,3.9%,7.8%,N,9.5%,9.0%,8.9%,4.6%,9.1%,6.3%
7,"$35,000 to $49,999",9.8%,10.1%,20.0%,13.2%,12.5%,17.8%,7.6%,11.4%,18.2%,...,12.5%,7.0%,12.1%,N,12.6%,16.3%,16.8%,12.1%,11.9%,14.5%
8,"$50,000 to $74,999",21.6%,20.0%,24.2%,23.7%,22.3%,27.5%,16.5%,21.3%,24.7%,...,22.7%,15.0%,19.9%,N,18.9%,24.0%,27.9%,24.8%,24.1%,22.3%
9,"$75,000 to $99,999",23.0%,18.7%,15.0%,19.5%,19.3%,13.4%,17.3%,18.2%,16.5%,...,21.1%,15.5%,21.3%,N,14.5%,19.3%,18.2%,17.6%,18.0%,19.0%
10,"$100,000 to $149,999",21.4%,22.3%,14.3%,21.3%,19.1%,14.3%,24.8%,22.7%,13.7%,...,20.9%,24.6%,19.8%,N,17.3%,15.5%,13.3%,21.4%,20.3%,22.1%
11,"$150,000 to $199,999",7.0%,8.4%,1.9%,3.6%,6.6%,3.2%,12.8%,8.7%,2.9%,...,6.1%,13.0%,8.3%,N,6.5%,4.6%,1.7%,6.4%,5.4%,5.9%


In [100]:
# Flip rows and columns so that each county becomes one row and each statistic one column;
# this is the layout used for the later merges.
initial_pa_mc_families_2012 = initial_pa_mc_families_2012.transpose()
initial_pa_mc_families_2012.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",21723,0.7%,1.8%,5.3%,5.9%,9.8%,21.6%,23.0%,21.4%,7.0%,3.5%,79010,N
"Allegheny County, Pennsylvania",223942,1.1%,0.9%,4.4%,6.3%,10.1%,20.0%,18.7%,22.3%,8.4%,8.0%,83482,107991
"Armstrong County, Pennsylvania",15553,2.0%,2.3%,5.5%,11.3%,20.0%,24.2%,15.0%,14.3%,1.9%,3.5%,56338,N
"Beaver County, Pennsylvania",35185,1.6%,0.9%,6.5%,7.3%,13.2%,23.7%,19.5%,21.3%,3.6%,2.5%,72440,N


In [101]:
# Move the county names out of the index into a regular column and call it 'County'
initial_pa_mc_families_2012 = (
    initial_pa_mc_families_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_mc_families_2012.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",21723,0.7%,1.8%,5.3%,5.9%,9.8%,21.6%,23.0%,21.4%,7.0%,3.5%,79010,N
2,"Allegheny County, Pennsylvania",223942,1.1%,0.9%,4.4%,6.3%,10.1%,20.0%,18.7%,22.3%,8.4%,8.0%,83482,107991
3,"Armstrong County, Pennsylvania",15553,2.0%,2.3%,5.5%,11.3%,20.0%,24.2%,15.0%,14.3%,1.9%,3.5%,56338,N
4,"Beaver County, Pennsylvania",35185,1.6%,0.9%,6.5%,7.3%,13.2%,23.7%,19.5%,21.3%,3.6%,2.5%,72440,N


In [102]:
# Inspect the column labels before renaming: apart from 'County' they are the integer
# row positions (2-14) carried over by the transpose, not strings.
print(initial_pa_mc_families_2012.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [103]:
# Cast every column label to str so the rename step can address them with string keys ('2', '3', ...)
initial_pa_mc_families_2012.columns = initial_pa_mc_families_2012.columns.map(str)
print(initial_pa_mc_families_2012.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [104]:
# Swap every cell that is exactly 'N' for NaN so the numeric conversions later on
# treat it as missing (presumably 'N' is the Census "not available" marker - confirm).
initial_pa_mc_families_2012 = initial_pa_mc_families_2012.replace('N', np.nan)


In [105]:
# Final cleaning pass for the 2012 married-couple families income table:
#   1. rename the positional columns ('2'..'14') to descriptive names,
#   2. drop the obsolete row 0 (the transposed 'Unnamed: 0' label row) and renumber the index,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add the 'Year' column used as a key in future merges,
#   5. convert the string-typed percentages, counts and dollar amounts to floats
#      (cells previously replaced by NaN stay NaN).
mc_families_2012_column_names = {
    '2': '# of Married Couple Families',
    '3': '% Married Couple Families <$10,000',
    '4': '% Married Couple Families $10,000-$14,999',
    '5': '% Married Couple Families $15,000-$24,999',
    '6': '% Married Couple Families $25,000-$34,999',
    '7': '% Married Couple Families $35,000-$49,999',
    '8': '% Married Couple Families $50,000-$74,999',
    '9': '% Married Couple Families $75,000-$99,999',
    '10': '% Married Couple Families $100,000-$149,999',
    '11': '% Married Couple Families $150,000-$199,999',
    '12': '% Married Couple Families $200,000 or More',
    '13': 'Median Married Couple Families Income ($)',
    '14': 'Mean Married Couple Families Income ($)',
}
# Both conversion groups are derived from the rename map so they cannot drift out of sync with it
mc_families_2012_pct_columns = [name for name in mc_families_2012_column_names.values() if name.startswith('%')]
mc_families_2012_amount_columns = [name for name in mc_families_2012_column_names.values() if not name.startswith('%')]

initial_pa_mc_families_2012 = initial_pa_mc_families_2012.rename(columns=mc_families_2012_column_names)
initial_pa_mc_families_2012 = initial_pa_mc_families_2012.drop(0).reset_index(drop=True)
initial_pa_mc_families_2012['County'] = (
    initial_pa_mc_families_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_mc_families_2012.insert(0, 'Year', 2012)

# Percentages arrive as strings such as '1.1%'; store them as fractions (0.011)
for pct_column in mc_families_2012_pct_columns:
    initial_pa_mc_families_2012[pct_column] = (
        initial_pa_mc_families_2012[pct_column].str.rstrip('%').astype(float) / 100
    )

# Counts and dollar amounts arrive as strings, possibly with thousands separators
for amount_column in mc_families_2012_amount_columns:
    initial_pa_mc_families_2012[amount_column] = (
        initial_pa_mc_families_2012[amount_column].str.replace(',', '', regex=False).astype(float)
    )

initial_pa_mc_families_2012


Unnamed: 0,Year,County,# of Married Couple Families,"% Married Couple Families <$10,000","% Married Couple Families $10,000-$14,999","% Married Couple Families $15,000-$24,999","% Married Couple Families $25,000-$34,999","% Married Couple Families $35,000-$49,999","% Married Couple Families $50,000-$74,999","% Married Couple Families $75,000-$99,999","% Married Couple Families $100,000-$149,999","% Married Couple Families $150,000-$199,999","% Married Couple Families $200,000 or More",Median Married Couple Families Income ($),Mean Married Couple Families Income ($)
0,2012,ADAMS,21723.0,0.007,0.018,0.053,0.059,0.098,0.216,0.23,0.214,0.07,0.035,79010.0,
1,2012,ALLEGHENY,223942.0,0.011,0.009,0.044,0.063,0.101,0.2,0.187,0.223,0.084,0.08,83482.0,107991.0
2,2012,ARMSTRONG,15553.0,0.02,0.023,0.055,0.113,0.2,0.242,0.15,0.143,0.019,0.035,56338.0,
3,2012,BEAVER,35185.0,0.016,0.009,0.065,0.073,0.132,0.237,0.195,0.213,0.036,0.025,72440.0,
4,2012,BERKS,76132.0,0.016,0.011,0.045,0.07,0.125,0.223,0.193,0.191,0.066,0.061,76113.0,93235.0
5,2012,BLAIR,24351.0,0.013,0.016,0.064,0.108,0.178,0.275,0.134,0.143,0.032,0.038,59183.0,
6,2012,BUCKS,136406.0,0.006,0.009,0.027,0.043,0.076,0.165,0.173,0.248,0.128,0.125,100086.0,121754.0
7,2012,BUTLER,41674.0,0.007,0.007,0.037,0.053,0.114,0.213,0.182,0.227,0.087,0.072,84986.0,
8,2012,CAMBRIA,27432.0,0.018,0.016,0.07,0.099,0.182,0.247,0.165,0.137,0.029,0.037,61238.0,
9,2012,CARBON,,,,,,,,,,,,60563.0,


**2016**
-

In [106]:
# Load the 2016 U.S. Census Bureau income (married-couple families) workbook.
# The preview spans the two descriptor rows (0-1) and the summary rows that follow them.
initial_pa_mc_families_2016 = pd.read_excel(io="Resources/PA_Income_Married_Couple_Families_2016.xlsx")
initial_pa_mc_families_2016.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,...,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,N,219048,13582,34077,74186,24110,133809,42219,27102,...,N,174229,61272,17817,160379,28091,15784,42957,79115,90540
3,"Less than $10,000",N,1.2%,1.3%,1.1%,1.1%,0.9%,1.2%,0.6%,0.7%,...,N,0.9%,0.7%,1.2%,3.7%,1.7%,1.8%,1.6%,1.1%,1.0%
4,"$10,000 to $14,999",N,0.9%,2.6%,0.8%,0.9%,1.1%,0.4%,0.5%,1.1%,...,N,0.6%,0.2%,1.3%,2.6%,0.8%,1.3%,1.1%,0.9%,1.3%
5,"$15,000 to $24,999",N,2.9%,4.8%,2.3%,3.4%,5.5%,1.9%,2.7%,4.8%,...,N,1.7%,2.8%,5.1%,6.7%,5.8%,5.3%,3.1%,2.6%,2.9%
6,"$25,000 to $34,999",N,4.6%,7.5%,6.7%,4.9%,11.2%,4.1%,6.2%,9.7%,...,N,3.1%,5.0%,9.4%,8.6%,6.3%,11.2%,6.2%,7.1%,6.0%
7,"$35,000 to $49,999",N,9.2%,14.0%,12.0%,10.8%,15.5%,7.1%,10.4%,12.9%,...,N,7.1%,11.2%,14.9%,11.1%,12.2%,15.7%,10.5%,11.2%,10.1%
8,"$50,000 to $74,999",N,17.0%,26.8%,21.5%,19.8%,25.0%,15.6%,17.8%,23.2%,...,N,12.0%,18.7%,23.3%,18.3%,25.6%,27.5%,20.1%,23.2%,22.7%
9,"$75,000 to $99,999",N,17.5%,15.5%,19.9%,19.0%,18.0%,14.6%,15.4%,20.9%,...,N,14.7%,20.0%,18.5%,14.7%,19.3%,15.1%,15.6%,16.0%,18.8%


In [107]:
# Keep only positional rows 2-14 of the married-couple families income sheet:
#   row 2      -> total number of married-couple families
#   rows 3-12  -> share of married-couple families per income bracket
#                 (Less than $10,000 ... $200,000 or more)
#   row 13     -> median income (dollars)
#   row 14     -> mean income (dollars)
# Rows 0-1 only carry the group name and the 'Label'/'Estimate' descriptors.
initial_pa_mc_families_2016 = initial_pa_mc_families_2016.iloc[list(range(2, 15)), :]
initial_pa_mc_families_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,N,219048,13582,34077,74186,24110,133809,42219,27102,...,N,174229,61272,17817,160379,28091,15784,42957,79115,90540
3,"Less than $10,000",N,1.2%,1.3%,1.1%,1.1%,0.9%,1.2%,0.6%,0.7%,...,N,0.9%,0.7%,1.2%,3.7%,1.7%,1.8%,1.6%,1.1%,1.0%
4,"$10,000 to $14,999",N,0.9%,2.6%,0.8%,0.9%,1.1%,0.4%,0.5%,1.1%,...,N,0.6%,0.2%,1.3%,2.6%,0.8%,1.3%,1.1%,0.9%,1.3%
5,"$15,000 to $24,999",N,2.9%,4.8%,2.3%,3.4%,5.5%,1.9%,2.7%,4.8%,...,N,1.7%,2.8%,5.1%,6.7%,5.8%,5.3%,3.1%,2.6%,2.9%
6,"$25,000 to $34,999",N,4.6%,7.5%,6.7%,4.9%,11.2%,4.1%,6.2%,9.7%,...,N,3.1%,5.0%,9.4%,8.6%,6.3%,11.2%,6.2%,7.1%,6.0%
7,"$35,000 to $49,999",N,9.2%,14.0%,12.0%,10.8%,15.5%,7.1%,10.4%,12.9%,...,N,7.1%,11.2%,14.9%,11.1%,12.2%,15.7%,10.5%,11.2%,10.1%
8,"$50,000 to $74,999",N,17.0%,26.8%,21.5%,19.8%,25.0%,15.6%,17.8%,23.2%,...,N,12.0%,18.7%,23.3%,18.3%,25.6%,27.5%,20.1%,23.2%,22.7%
9,"$75,000 to $99,999",N,17.5%,15.5%,19.9%,19.0%,18.0%,14.6%,15.4%,20.9%,...,N,14.7%,20.0%,18.5%,14.7%,19.3%,15.1%,15.6%,16.0%,18.8%
10,"$100,000 to $149,999",N,24.3%,19.3%,21.5%,22.3%,16.3%,25.3%,24.6%,20.0%,...,N,25.6%,23.2%,17.8%,17.8%,21.2%,15.3%,22.9%,22.8%,24.3%
11,"$150,000 to $199,999",N,10.8%,3.6%,8.5%,10.5%,4.1%,13.1%,10.4%,4.1%,...,N,14.3%,10.6%,5.9%,9.0%,5.2%,3.6%,10.6%,9.2%,7.9%


In [108]:
# Flip rows and columns so that each county becomes one row and each statistic one column;
# this is the layout used for the later merges.
initial_pa_mc_families_2016 = initial_pa_mc_families_2016.transpose()
initial_pa_mc_families_2016.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",N,N,N,N,N,N,N,N,N,N,N,81817,N
"Allegheny County, Pennsylvania",219048,1.2%,0.9%,2.9%,4.6%,9.2%,17.0%,17.5%,24.3%,10.8%,11.6%,94674,117354
"Armstrong County, Pennsylvania",13582,1.3%,2.6%,4.8%,7.5%,14.0%,26.8%,15.5%,19.3%,3.6%,4.6%,68241,N
"Beaver County, Pennsylvania",34077,1.1%,0.8%,2.3%,6.7%,12.0%,21.5%,19.9%,21.5%,8.5%,5.8%,80740,94471


In [109]:
# Move the county names out of the index into a regular column and call it 'County'
initial_pa_mc_families_2016 = (
    initial_pa_mc_families_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_mc_families_2016.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",N,N,N,N,N,N,N,N,N,N,N,81817,N
2,"Allegheny County, Pennsylvania",219048,1.2%,0.9%,2.9%,4.6%,9.2%,17.0%,17.5%,24.3%,10.8%,11.6%,94674,117354
3,"Armstrong County, Pennsylvania",13582,1.3%,2.6%,4.8%,7.5%,14.0%,26.8%,15.5%,19.3%,3.6%,4.6%,68241,N
4,"Beaver County, Pennsylvania",34077,1.1%,0.8%,2.3%,6.7%,12.0%,21.5%,19.9%,21.5%,8.5%,5.8%,80740,94471


In [110]:
# Inspect the column labels before renaming: apart from 'County' they are the integer
# row positions (2-14) carried over by the transpose, not strings.
print(initial_pa_mc_families_2016.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [111]:
# Cast every column label to str so the rename step can address them with string keys ('2', '3', ...)
initial_pa_mc_families_2016.columns = initial_pa_mc_families_2016.columns.map(str)
print(initial_pa_mc_families_2016.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [112]:
# Swap every cell that is exactly 'N' for NaN so the numeric conversions later on
# treat it as missing (presumably 'N' is the Census "not available" marker - confirm).
initial_pa_mc_families_2016 = initial_pa_mc_families_2016.replace('N', np.nan)


In [113]:
# Final cleaning pass for the 2016 married-couple-families income table:
#   1. rename the positional columns ('2'-'14') to descriptive names
#   2. drop the obsolete row 0 (it holds the source row labels, not a county) and renumber the index
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania')
#   4. add the corresponding 'Year' column for future merges
#   5. convert percentage strings (e.g. '1.2%') to fractions (0.012) and the count/income strings to floats
# All steps run on a temporary frame that is assigned back only at the very end, so a
# failed or accidental re-run of this cell cannot leave the table partially converted.
mc_2016_names = {
    '2': '# of Married Couple Families',
    '3': '% Married Couple Families <$10,000',
    '4': '% Married Couple Families $10,000-$14,999',
    '5': '% Married Couple Families $15,000-$24,999',
    '6': '% Married Couple Families $25,000-$34,999',
    '7': '% Married Couple Families $35,000-$49,999',
    '8': '% Married Couple Families $50,000-$74,999',
    '9': '% Married Couple Families $75,000-$99,999',
    '10': '% Married Couple Families $100,000-$149,999',
    '11': '% Married Couple Families $150,000-$199,999',
    '12': '% Married Couple Families $200,000 or More',
    '13': 'Median Married Couple Families Income ($)',
    '14': 'Mean Married Couple Families Income ($)',
}
# Percentage columns are exactly the renamed columns starting with '% '; the rest are counts/incomes
mc_2016_percent_cols = [name for name in mc_2016_names.values() if name.startswith('% ')]
mc_2016_amount_cols = [name for name in mc_2016_names.values() if not name.startswith('% ')]

mc_2016_clean = (
    initial_pa_mc_families_2016
    .rename(columns=mc_2016_names)
    .drop(0)
    .reset_index(drop=True)
)
mc_2016_clean['County'] = (
    mc_2016_clean['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
mc_2016_clean.insert(0, 'Year', 2016)

# '1.2%' -> 0.012
for column in mc_2016_percent_cols:
    mc_2016_clean[column] = mc_2016_clean[column].str.rstrip('%').astype(float) / 100
# Strip any thousands separators before the float conversion (literal match, not a regex)
for column in mc_2016_amount_cols:
    mc_2016_clean[column] = mc_2016_clean[column].str.replace(',', '', regex=False).astype(float)

initial_pa_mc_families_2016 = mc_2016_clean
initial_pa_mc_families_2016


Unnamed: 0,Year,County,# of Married Couple Families,"% Married Couple Families <$10,000","% Married Couple Families $10,000-$14,999","% Married Couple Families $15,000-$24,999","% Married Couple Families $25,000-$34,999","% Married Couple Families $35,000-$49,999","% Married Couple Families $50,000-$74,999","% Married Couple Families $75,000-$99,999","% Married Couple Families $100,000-$149,999","% Married Couple Families $150,000-$199,999","% Married Couple Families $200,000 or More",Median Married Couple Families Income ($),Mean Married Couple Families Income ($)
0,2016,ADAMS,,,,,,,,,,,,81817.0,
1,2016,ALLEGHENY,219048.0,0.012,0.009,0.029,0.046,0.092,0.17,0.175,0.243,0.108,0.116,94674.0,117354.0
2,2016,ARMSTRONG,13582.0,0.013,0.026,0.048,0.075,0.14,0.268,0.155,0.193,0.036,0.046,68241.0,
3,2016,BEAVER,34077.0,0.011,0.008,0.023,0.067,0.12,0.215,0.199,0.215,0.085,0.058,80740.0,94471.0
4,2016,BERKS,74186.0,0.011,0.009,0.034,0.049,0.108,0.198,0.19,0.223,0.105,0.073,88174.0,104518.0
5,2016,BLAIR,24110.0,0.009,0.011,0.055,0.112,0.155,0.25,0.18,0.163,0.041,0.023,65874.0,
6,2016,BUCKS,133809.0,0.012,0.004,0.019,0.041,0.071,0.156,0.146,0.253,0.131,0.169,108258.0,
7,2016,BUTLER,42219.0,0.006,0.005,0.027,0.062,0.104,0.178,0.154,0.246,0.104,0.113,94695.0,114003.0
8,2016,CAMBRIA,27102.0,0.007,0.011,0.048,0.097,0.129,0.232,0.209,0.2,0.041,0.026,71415.0,
9,2016,CARBON,,,,,,,,,,,,73699.0,


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable, and as such the closest year prior to the election (2019) was utilized.


In [114]:
# Load the 2019 U.S. Census Bureau income (Married Couple Families) workbook and
# preview its first 19 rows, which include the header rows and the statistic rows selected next
initial_pa_mc_families_2019 = pd.read_excel(io="Resources/PA_Income_Married_Couple_Families_2019.xlsx")
initial_pa_mc_families_2019.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,...,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families,Married-couple families
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,21827,225213,N,31950,75762,24065,135073,42584,26658,...,N,173673,57892,N,174209,26926,15244,42556,74209,90877
3,"Less than $10,000",0.6%,1.1%,N,0.9%,1.4%,0.5%,0.7%,1.0%,1.1%,...,N,0.6%,1.2%,N,2.6%,0.7%,0.5%,0.8%,0.7%,0.8%
4,"$10,000 to $14,999",0.5%,0.5%,N,0.8%,0.3%,1.0%,0.4%,0.3%,0.4%,...,N,0.4%,0.6%,N,2.9%,0.8%,1.3%,1.4%,0.4%,0.8%
5,"$15,000 to $24,999",3.2%,2.3%,N,3.7%,2.3%,2.7%,1.6%,1.9%,2.7%,...,N,1.2%,1.4%,N,6.5%,3.1%,6.0%,3.0%,2.7%,1.6%
6,"$25,000 to $34,999",5.2%,3.3%,N,4.6%,3.3%,6.6%,2.6%,3.7%,6.9%,...,N,2.5%,3.0%,N,5.8%,4.9%,7.1%,4.4%,5.2%,4.0%
7,"$35,000 to $49,999",10.2%,6.4%,N,7.4%,9.8%,14.0%,5.2%,7.3%,12.6%,...,N,5.2%,7.1%,N,11.2%,13.4%,17.0%,8.2%,10.6%,9.5%
8,"$50,000 to $74,999",18.1%,15.7%,N,17.1%,18.0%,19.3%,11.0%,16.9%,23.4%,...,N,10.9%,17.9%,N,16.6%,23.2%,21.0%,16.1%,18.0%,19.3%
9,"$75,000 to $99,999",21.7%,15.8%,N,19.3%,15.8%,18.2%,13.6%,16.0%,21.9%,...,N,13.2%,16.8%,N,14.1%,22.4%,19.9%,15.6%,21.1%,16.4%


In [115]:
# Keep only the thirteen county summary rows for this grouping (married-couple families),
# i.e. row positions 2-14:
#   - 'Total' count
#   - % in each income category: Less than $10,000, $10,000-$14,999, $15,000-$24,999, $25,000-$34,999,
#     $35,000-$49,999, $50,000-$74,999, $75,000-$99,999, $100,000-$149,999, $150,000-$199,999, $200,000 or more
#   - 'Median income (dollars)'
#   - 'Mean income (dollars)'
# Every other row (such as the two header rows at positions 0-1) is discarded
initial_pa_mc_families_2019 = initial_pa_mc_families_2019.iloc[list(range(2, 15)), :]
initial_pa_mc_families_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,21827,225213,N,31950,75762,24065,135073,42584,26658,...,N,173673,57892,N,174209,26926,15244,42556,74209,90877
3,"Less than $10,000",0.6%,1.1%,N,0.9%,1.4%,0.5%,0.7%,1.0%,1.1%,...,N,0.6%,1.2%,N,2.6%,0.7%,0.5%,0.8%,0.7%,0.8%
4,"$10,000 to $14,999",0.5%,0.5%,N,0.8%,0.3%,1.0%,0.4%,0.3%,0.4%,...,N,0.4%,0.6%,N,2.9%,0.8%,1.3%,1.4%,0.4%,0.8%
5,"$15,000 to $24,999",3.2%,2.3%,N,3.7%,2.3%,2.7%,1.6%,1.9%,2.7%,...,N,1.2%,1.4%,N,6.5%,3.1%,6.0%,3.0%,2.7%,1.6%
6,"$25,000 to $34,999",5.2%,3.3%,N,4.6%,3.3%,6.6%,2.6%,3.7%,6.9%,...,N,2.5%,3.0%,N,5.8%,4.9%,7.1%,4.4%,5.2%,4.0%
7,"$35,000 to $49,999",10.2%,6.4%,N,7.4%,9.8%,14.0%,5.2%,7.3%,12.6%,...,N,5.2%,7.1%,N,11.2%,13.4%,17.0%,8.2%,10.6%,9.5%
8,"$50,000 to $74,999",18.1%,15.7%,N,17.1%,18.0%,19.3%,11.0%,16.9%,23.4%,...,N,10.9%,17.9%,N,16.6%,23.2%,21.0%,16.1%,18.0%,19.3%
9,"$75,000 to $99,999",21.7%,15.8%,N,19.3%,15.8%,18.2%,13.6%,16.0%,21.9%,...,N,13.2%,16.8%,N,14.1%,22.4%,19.9%,15.6%,21.1%,16.4%
10,"$100,000 to $149,999",22.1%,26.4%,N,26.5%,29.7%,24.3%,25.6%,26.1%,20.5%,...,N,23.5%,27.3%,N,18.4%,21.1%,19.8%,24.3%,22.2%,27.3%
11,"$150,000 to $199,999",10.1%,12.8%,N,12.1%,10.7%,9.2%,18.0%,11.5%,6.3%,...,N,16.9%,11.8%,N,9.9%,6.8%,5.3%,13.4%,10.0%,11.2%


In [116]:
# Flip rows and columns so each county becomes one row and each statistic one column,
# which keeps the processing consistent and simplifies the later merges
initial_pa_mc_families_2019 = initial_pa_mc_families_2019.transpose()
initial_pa_mc_families_2019.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",21827,0.6%,0.5%,3.2%,5.2%,10.2%,18.1%,21.7%,22.1%,10.1%,8.2%,88173,N
"Allegheny County, Pennsylvania",225213,1.1%,0.5%,2.3%,3.3%,6.4%,15.7%,15.8%,26.4%,12.8%,15.7%,107939,139626
"Armstrong County, Pennsylvania",N,N,N,N,N,N,N,N,N,N,N,74786,N
"Beaver County, Pennsylvania",31950,0.9%,0.8%,3.7%,4.6%,7.4%,17.1%,19.3%,26.5%,12.1%,7.6%,95142,N


In [117]:
# Move the county names out of the index and into an ordinary column:
# reset_index() exposes the old index as a column called 'index', which is renamed to 'County'
initial_pa_mc_families_2019 = (
    initial_pa_mc_families_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_mc_families_2019.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",21827,0.6%,0.5%,3.2%,5.2%,10.2%,18.1%,21.7%,22.1%,10.1%,8.2%,88173,N
2,"Allegheny County, Pennsylvania",225213,1.1%,0.5%,2.3%,3.3%,6.4%,15.7%,15.8%,26.4%,12.8%,15.7%,107939,139626
3,"Armstrong County, Pennsylvania",N,N,N,N,N,N,N,N,N,N,N,74786,N
4,"Beaver County, Pennsylvania",31950,0.9%,0.8%,3.7%,4.6%,7.4%,17.1%,19.3%,26.5%,12.1%,7.6%,95142,N


In [118]:
# Inspect the current column labels before renaming: apart from 'County' they are
# still the integer positions (2-14) carried over from the transposed table
mc_families_2019_columns = initial_pa_mc_families_2019.columns
print(mc_families_2019_columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [119]:
# Turn every column label into a string so the rename mapping used in the cleaning
# cell can address the columns with string keys ('2'-'14'); 'County' is unchanged
initial_pa_mc_families_2019.columns = [str(label) for label in initial_pa_mc_families_2019.columns]
print(initial_pa_mc_families_2019.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [120]:
# Swap every cell whose value is exactly 'N' for NaN across the whole DataFrame so the
# numeric conversions in the next cell see a missing value instead of a letter
# ('N' is presumably the Census Bureau marker for an unavailable estimate - confirm against the source notes)
initial_pa_mc_families_2019 = initial_pa_mc_families_2019.replace(to_replace='N', value=np.nan)


In [121]:
# Final cleaning pass for the 2019 married-couple-families income table:
#   1. rename the positional columns ('2'-'14') to descriptive names
#   2. drop the obsolete row 0 (it holds the source row labels, not a county) and renumber the index
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania')
#   4. add the corresponding 'Year' column for future merges
#   5. convert percentage strings (e.g. '0.6%') to fractions (0.006) and the count/income strings to floats
# All steps run on a temporary frame that is assigned back only at the very end, so a
# failed or accidental re-run of this cell cannot leave the table partially converted.
mc_2019_names = {
    '2': '# of Married Couple Families',
    '3': '% Married Couple Families <$10,000',
    '4': '% Married Couple Families $10,000-$14,999',
    '5': '% Married Couple Families $15,000-$24,999',
    '6': '% Married Couple Families $25,000-$34,999',
    '7': '% Married Couple Families $35,000-$49,999',
    '8': '% Married Couple Families $50,000-$74,999',
    '9': '% Married Couple Families $75,000-$99,999',
    '10': '% Married Couple Families $100,000-$149,999',
    '11': '% Married Couple Families $150,000-$199,999',
    '12': '% Married Couple Families $200,000 or More',
    '13': 'Median Married Couple Families Income ($)',
    '14': 'Mean Married Couple Families Income ($)',
}
# Percentage columns are exactly the renamed columns starting with '% '; the rest are counts/incomes
mc_2019_percent_cols = [name for name in mc_2019_names.values() if name.startswith('% ')]
mc_2019_amount_cols = [name for name in mc_2019_names.values() if not name.startswith('% ')]

mc_2019_clean = (
    initial_pa_mc_families_2019
    .rename(columns=mc_2019_names)
    .drop(0)
    .reset_index(drop=True)
)
mc_2019_clean['County'] = (
    mc_2019_clean['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# NOTE: the 2019 figures are labelled 2020 on purpose - 2019 stands in for the unavailable
# 2020 release; presumably this lets the rows merge with the 2020 election data (confirm at the merge step)
mc_2019_clean.insert(0, 'Year', 2020)

# '0.6%' -> 0.006
for column in mc_2019_percent_cols:
    mc_2019_clean[column] = mc_2019_clean[column].str.rstrip('%').astype(float) / 100
# Strip any thousands separators before the float conversion (literal match, not a regex)
for column in mc_2019_amount_cols:
    mc_2019_clean[column] = mc_2019_clean[column].str.replace(',', '', regex=False).astype(float)

initial_pa_mc_families_2019 = mc_2019_clean
initial_pa_mc_families_2019


Unnamed: 0,Year,County,# of Married Couple Families,"% Married Couple Families <$10,000","% Married Couple Families $10,000-$14,999","% Married Couple Families $15,000-$24,999","% Married Couple Families $25,000-$34,999","% Married Couple Families $35,000-$49,999","% Married Couple Families $50,000-$74,999","% Married Couple Families $75,000-$99,999","% Married Couple Families $100,000-$149,999","% Married Couple Families $150,000-$199,999","% Married Couple Families $200,000 or More",Median Married Couple Families Income ($),Mean Married Couple Families Income ($)
0,2020,ADAMS,21827.0,0.006,0.005,0.032,0.052,0.102,0.181,0.217,0.221,0.101,0.082,88173.0,
1,2020,ALLEGHENY,225213.0,0.011,0.005,0.023,0.033,0.064,0.157,0.158,0.264,0.128,0.157,107939.0,139626.0
2,2020,ARMSTRONG,,,,,,,,,,,,74786.0,
3,2020,BEAVER,31950.0,0.009,0.008,0.037,0.046,0.074,0.171,0.193,0.265,0.121,0.076,95142.0,
4,2020,BERKS,75762.0,0.014,0.003,0.023,0.033,0.098,0.18,0.158,0.297,0.107,0.088,98170.0,113907.0
5,2020,BLAIR,24065.0,0.005,0.01,0.027,0.066,0.14,0.193,0.182,0.243,0.092,0.043,81980.0,
6,2020,BUCKS,135073.0,0.007,0.004,0.016,0.026,0.052,0.11,0.136,0.256,0.18,0.213,125966.0,
7,2020,BUTLER,42584.0,0.01,0.003,0.019,0.037,0.073,0.169,0.16,0.261,0.115,0.153,103524.0,
8,2020,CAMBRIA,26658.0,0.011,0.004,0.027,0.069,0.126,0.234,0.219,0.205,0.063,0.041,78495.0,91000.0
9,2020,CARBON,,,,,,,,,,,,84242.0,


**Income (Nonfamily Households) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [122]:
# Load the 2012 U.S. Census Bureau income (Nonfamily Households) workbook and
# preview its first 19 rows, which include the header rows and the statistic rows selected next
initial_pa_nonfamily_2012 = pd.read_excel(io="Resources/PA_Income_Nonfamily_Households_2012.xlsx")
initial_pa_nonfamily_2012.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,...,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,11077,222844,9564,25207,47979,18030,65228,23117,20962,...,16696,98489,35675,14206,271994,20725,8972,29859,51006,50318
3,"Less than $10,000",9.2%,13.2%,17.2%,10.1%,11.8%,17.4%,10.9%,12.4%,15.4%,...,9.7%,8.6%,9.8%,15.8%,21.6%,12.0%,17.0%,10.8%,13.3%,12.9%
4,"$10,000 to $14,999",13.2%,9.0%,11.4%,14.1%,11.1%,14.5%,7.3%,13.1%,21.5%,...,10.5%,8.1%,11.2%,17.6%,11.2%,15.7%,14.8%,11.9%,12.4%,7.5%
5,"$15,000 to $24,999",16.3%,17.8%,24.7%,22.8%,18.8%,24.0%,15.6%,18.0%,20.6%,...,17.2%,15.1%,17.9%,22.8%,15.2%,22.5%,22.7%,18.3%,22.4%,21.0%
6,"$25,000 to $34,999",18.3%,14.5%,17.8%,19.1%,15.1%,14.7%,10.9%,11.9%,13.1%,...,14.2%,11.6%,14.3%,10.5%,12.6%,10.9%,17.0%,12.5%,16.0%,14.9%
7,"$35,000 to $49,999",15.4%,15.3%,10.2%,12.9%,18.0%,9.7%,15.0%,16.9%,12.6%,...,15.4%,13.8%,16.2%,16.6%,12.5%,15.4%,12.4%,18.5%,14.7%,16.3%
8,"$50,000 to $74,999",16.4%,16.5%,9.8%,12.5%,15.3%,13.5%,18.4%,15.0%,10.5%,...,16.0%,19.2%,16.2%,11.2%,14.0%,15.5%,12.8%,15.7%,12.4%,15.0%
9,"$75,000 to $99,999",6.3%,6.4%,7.1%,6.1%,5.9%,3.8%,8.9%,5.6%,3.2%,...,11.8%,11.0%,6.5%,4.1%,5.6%,5.5%,1.2%,5.4%,4.8%,6.6%


In [123]:
# Keep only the thirteen county summary rows for this grouping (nonfamily households),
# i.e. row positions 2-14:
#   - 'Total' count
#   - % in each income category: Less than $10,000, $10,000-$14,999, $15,000-$24,999, $25,000-$34,999,
#     $35,000-$49,999, $50,000-$74,999, $75,000-$99,999, $100,000-$149,999, $150,000-$199,999, $200,000 or more
#   - 'Median income (dollars)'
#   - 'Mean income (dollars)'
# Every other row (such as the two header rows at positions 0-1) is discarded
initial_pa_nonfamily_2012 = initial_pa_nonfamily_2012.iloc[list(range(2, 15)), :]
initial_pa_nonfamily_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,11077,222844,9564,25207,47979,18030,65228,23117,20962,...,16696,98489,35675,14206,271994,20725,8972,29859,51006,50318
3,"Less than $10,000",9.2%,13.2%,17.2%,10.1%,11.8%,17.4%,10.9%,12.4%,15.4%,...,9.7%,8.6%,9.8%,15.8%,21.6%,12.0%,17.0%,10.8%,13.3%,12.9%
4,"$10,000 to $14,999",13.2%,9.0%,11.4%,14.1%,11.1%,14.5%,7.3%,13.1%,21.5%,...,10.5%,8.1%,11.2%,17.6%,11.2%,15.7%,14.8%,11.9%,12.4%,7.5%
5,"$15,000 to $24,999",16.3%,17.8%,24.7%,22.8%,18.8%,24.0%,15.6%,18.0%,20.6%,...,17.2%,15.1%,17.9%,22.8%,15.2%,22.5%,22.7%,18.3%,22.4%,21.0%
6,"$25,000 to $34,999",18.3%,14.5%,17.8%,19.1%,15.1%,14.7%,10.9%,11.9%,13.1%,...,14.2%,11.6%,14.3%,10.5%,12.6%,10.9%,17.0%,12.5%,16.0%,14.9%
7,"$35,000 to $49,999",15.4%,15.3%,10.2%,12.9%,18.0%,9.7%,15.0%,16.9%,12.6%,...,15.4%,13.8%,16.2%,16.6%,12.5%,15.4%,12.4%,18.5%,14.7%,16.3%
8,"$50,000 to $74,999",16.4%,16.5%,9.8%,12.5%,15.3%,13.5%,18.4%,15.0%,10.5%,...,16.0%,19.2%,16.2%,11.2%,14.0%,15.5%,12.8%,15.7%,12.4%,15.0%
9,"$75,000 to $99,999",6.3%,6.4%,7.1%,6.1%,5.9%,3.8%,8.9%,5.6%,3.2%,...,11.8%,11.0%,6.5%,4.1%,5.6%,5.5%,1.2%,5.4%,4.8%,6.6%
10,"$100,000 to $149,999",4.5%,4.3%,1.6%,2.2%,2.2%,1.2%,8.5%,5.1%,2.1%,...,3.9%,8.6%,5.8%,1.3%,4.4%,1.8%,1.8%,4.7%,2.8%,4.2%
11,"$150,000 to $199,999",0.2%,1.7%,0.0%,0.2%,0.9%,0.6%,1.8%,1.9%,0.7%,...,0.0%,2.2%,0.5%,0.2%,1.3%,0.0%,0.0%,1.4%,0.3%,0.6%


In [124]:
# Flip rows and columns so each county becomes one row and each statistic one column,
# which keeps the processing consistent and simplifies the later merges
initial_pa_nonfamily_2012 = initial_pa_nonfamily_2012.transpose()
initial_pa_nonfamily_2012.head()


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",11077,9.2%,13.2%,16.3%,18.3%,15.4%,16.4%,6.3%,4.5%,0.2%,0.2%,31678,39002
"Allegheny County, Pennsylvania",222844,13.2%,9.0%,17.8%,14.5%,15.3%,16.5%,6.4%,4.3%,1.7%,1.3%,31395,44363
"Armstrong County, Pennsylvania",9564,17.2%,11.4%,24.7%,17.8%,10.2%,9.8%,7.1%,1.6%,0.0%,0.2%,23832,31013
"Beaver County, Pennsylvania",25207,10.1%,14.1%,22.8%,19.1%,12.9%,12.5%,6.1%,2.2%,0.2%,0.1%,26133,33987


In [125]:
# Move the county names out of the index and into an ordinary column:
# reset_index() exposes the old index as a column called 'index', which is renamed to 'County'
initial_pa_nonfamily_2012 = (
    initial_pa_nonfamily_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_nonfamily_2012.head()


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",11077,9.2%,13.2%,16.3%,18.3%,15.4%,16.4%,6.3%,4.5%,0.2%,0.2%,31678,39002
2,"Allegheny County, Pennsylvania",222844,13.2%,9.0%,17.8%,14.5%,15.3%,16.5%,6.4%,4.3%,1.7%,1.3%,31395,44363
3,"Armstrong County, Pennsylvania",9564,17.2%,11.4%,24.7%,17.8%,10.2%,9.8%,7.1%,1.6%,0.0%,0.2%,23832,31013
4,"Beaver County, Pennsylvania",25207,10.1%,14.1%,22.8%,19.1%,12.9%,12.5%,6.1%,2.2%,0.2%,0.1%,26133,33987


In [126]:
# Inspect the current column labels before renaming: apart from 'County' they are
# still the integer positions (2-14) carried over from the transposed table
nonfamily_2012_columns = initial_pa_nonfamily_2012.columns
print(nonfamily_2012_columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [127]:
# Turn every column label into a string so the rename mapping used in the cleaning
# cell can address the columns with string keys ('2'-'14'); 'County' is unchanged
initial_pa_nonfamily_2012.columns = [str(label) for label in initial_pa_nonfamily_2012.columns]
print(initial_pa_nonfamily_2012.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [128]:
# Swap every cell whose value is exactly 'N' for NaN across the whole DataFrame so the
# numeric conversions in the next cell see a missing value instead of a letter
# ('N' is presumably the Census Bureau marker for an unavailable estimate - confirm against the source notes)
initial_pa_nonfamily_2012 = initial_pa_nonfamily_2012.replace(to_replace='N', value=np.nan)


In [129]:
# Final cleaning pass for the 2012 nonfamily-households income table:
#   1. rename the positional columns ('2'-'14') to descriptive names
#   2. drop the obsolete row 0 (it holds the source row labels, not a county) and renumber the index
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania')
#   4. add the corresponding 'Year' column for future merges
#   5. convert percentage strings (e.g. '9.2%') to fractions (0.092) and the count/income strings to floats
# All steps run on a temporary frame that is assigned back only at the very end, so a
# failed or accidental re-run of this cell cannot leave the table partially converted.
nonfamily_2012_names = {
    '2': '# of Nonfamily Households',
    '3': '% Nonfamily Households <$10,000',
    '4': '% Nonfamily Households $10,000-$14,999',
    '5': '% Nonfamily Households $15,000-$24,999',
    '6': '% Nonfamily Households $25,000-$34,999',
    '7': '% Nonfamily Households $35,000-$49,999',
    '8': '% Nonfamily Households $50,000-$74,999',
    '9': '% Nonfamily Households $75,000-$99,999',
    '10': '% Nonfamily Households $100,000-$149,999',
    '11': '% Nonfamily Households $150,000-$199,999',
    '12': '% Nonfamily Households $200,000 or More',
    '13': 'Median Nonfamily Households Income ($)',
    '14': 'Mean Nonfamily Households Income ($)',
}
# Percentage columns are exactly the renamed columns starting with '% '; the rest are counts/incomes
nonfamily_2012_percent_cols = [name for name in nonfamily_2012_names.values() if name.startswith('% ')]
nonfamily_2012_amount_cols = [name for name in nonfamily_2012_names.values() if not name.startswith('% ')]

nonfamily_2012_clean = (
    initial_pa_nonfamily_2012
    .rename(columns=nonfamily_2012_names)
    .drop(0)
    .reset_index(drop=True)
)
nonfamily_2012_clean['County'] = (
    nonfamily_2012_clean['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
nonfamily_2012_clean.insert(0, 'Year', 2012)

# '9.2%' -> 0.092
for column in nonfamily_2012_percent_cols:
    nonfamily_2012_clean[column] = nonfamily_2012_clean[column].str.rstrip('%').astype(float) / 100
# Strip any thousands separators before the float conversion (literal match, not a regex)
for column in nonfamily_2012_amount_cols:
    nonfamily_2012_clean[column] = nonfamily_2012_clean[column].str.replace(',', '', regex=False).astype(float)

initial_pa_nonfamily_2012 = nonfamily_2012_clean
initial_pa_nonfamily_2012


Unnamed: 0,Year,County,# of Nonfamily Households,"% Nonfamily Households <$10,000","% Nonfamily Households $10,000-$14,999","% Nonfamily Households $15,000-$24,999","% Nonfamily Households $25,000-$34,999","% Nonfamily Households $35,000-$49,999","% Nonfamily Households $50,000-$74,999","% Nonfamily Households $75,000-$99,999","% Nonfamily Households $100,000-$149,999","% Nonfamily Households $150,000-$199,999","% Nonfamily Households $200,000 or More",Median Nonfamily Households Income ($),Mean Nonfamily Households Income ($)
0,2012,ADAMS,11077.0,0.092,0.132,0.163,0.183,0.154,0.164,0.063,0.045,0.002,0.002,31678.0,39002.0
1,2012,ALLEGHENY,222844.0,0.132,0.09,0.178,0.145,0.153,0.165,0.064,0.043,0.017,0.013,31395.0,44363.0
2,2012,ARMSTRONG,9564.0,0.172,0.114,0.247,0.178,0.102,0.098,0.071,0.016,0.0,0.002,23832.0,31013.0
3,2012,BEAVER,25207.0,0.101,0.141,0.228,0.191,0.129,0.125,0.061,0.022,0.002,0.001,26133.0,33987.0
4,2012,BERKS,47979.0,0.118,0.111,0.188,0.151,0.18,0.153,0.059,0.022,0.009,0.009,30445.0,38835.0
5,2012,BLAIR,18030.0,0.174,0.145,0.24,0.147,0.097,0.135,0.038,0.012,0.006,0.006,21889.0,31403.0
6,2012,BUCKS,65228.0,0.109,0.073,0.156,0.109,0.15,0.184,0.089,0.085,0.018,0.027,40759.0,54310.0
7,2012,BUTLER,23117.0,0.124,0.131,0.18,0.119,0.169,0.15,0.056,0.051,0.019,0.002,30281.0,40470.0
8,2012,CAMBRIA,20962.0,0.154,0.215,0.206,0.131,0.126,0.105,0.032,0.021,0.007,0.002,20384.0,30089.0
9,2012,CARBON,8939.0,0.156,0.143,0.166,0.056,0.243,0.136,0.036,0.05,0.015,0.0,31642.0,37726.0


**2016**
-

In [130]:
# Load the raw 2016 U.S. Census Bureau income (Nonfamily Households) workbook and preview its first 19 rows
initial_pa_nonfamily_2016 = pd.read_excel(io="Resources/PA_Income_Nonfamily_Households_2016.xlsx")
initial_pa_nonfamily_2016.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,...,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,11882,233085,8586,24001,47924,18257,68142,25258,20492,...,19613,97683,32939,13253,270647,19935,9844,28748,48936,52140
3,"Less than $10,000",6.4%,12.0%,10.3%,9.9%,10.7%,14.0%,9.8%,9.6%,13.6%,...,9.9%,6.8%,5.3%,12.5%,22.7%,13.9%,15.1%,9.1%,11.0%,8.8%
4,"$10,000 to $14,999",12.8%,8.6%,13.6%,11.1%,8.7%,13.5%,5.6%,7.0%,13.5%,...,5.4%,5.8%,11.1%,10.5%,8.0%,10.4%,10.6%,11.6%,11.5%,7.4%
5,"$15,000 to $24,999",15.6%,17.0%,26.0%,19.1%,15.8%,20.2%,15.6%,17.8%,22.9%,...,16.2%,12.8%,14.5%,25.8%,13.6%,21.9%,27.0%,22.1%,23.3%,18.2%
6,"$25,000 to $34,999",21.6%,13.2%,20.0%,13.7%,11.9%,18.5%,9.9%,9.9%,14.4%,...,14.5%,12.0%,16.6%,13.5%,11.0%,12.2%,17.3%,13.2%,12.0%,13.4%
7,"$35,000 to $49,999",16.1%,15.2%,12.0%,15.0%,19.1%,15.1%,15.8%,16.7%,18.1%,...,17.5%,16.2%,14.7%,12.4%,11.9%,17.1%,15.0%,15.5%,16.9%,13.4%
8,"$50,000 to $74,999",13.4%,16.0%,10.9%,15.9%,15.7%,12.5%,18.7%,20.3%,11.1%,...,19.2%,19.1%,17.3%,18.0%,13.7%,13.6%,7.9%,15.0%,14.1%,20.9%
9,"$75,000 to $99,999",5.5%,7.9%,1.8%,7.8%,10.2%,3.8%,11.2%,10.0%,3.4%,...,10.2%,10.9%,10.8%,4.3%,7.4%,6.4%,3.3%,7.2%,5.6%,8.4%


In [131]:
# Keep only the nonfamily-household summary rows (row positions 2 through 14 of the raw sheet):
#   - total number of nonfamily households
#   - % of nonfamily households in each income bracket: Less than $10,000, $10,000-$14,999, $15,000-$24,999,
#     $25,000-$34,999, $35,000-$49,999, $50,000-$74,999, $75,000-$99,999, $100,000-$149,999,
#     $150,000-$199,999, $200,000 or more
#   - median income
#   - mean income
# The two header rows at positions 0 and 1, and any rows after position 14, are dropped
initial_pa_nonfamily_2016 = initial_pa_nonfamily_2016.iloc[list(range(2, 15)), :]
initial_pa_nonfamily_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,11882,233085,8586,24001,47924,18257,68142,25258,20492,...,19613,97683,32939,13253,270647,19935,9844,28748,48936,52140
3,"Less than $10,000",6.4%,12.0%,10.3%,9.9%,10.7%,14.0%,9.8%,9.6%,13.6%,...,9.9%,6.8%,5.3%,12.5%,22.7%,13.9%,15.1%,9.1%,11.0%,8.8%
4,"$10,000 to $14,999",12.8%,8.6%,13.6%,11.1%,8.7%,13.5%,5.6%,7.0%,13.5%,...,5.4%,5.8%,11.1%,10.5%,8.0%,10.4%,10.6%,11.6%,11.5%,7.4%
5,"$15,000 to $24,999",15.6%,17.0%,26.0%,19.1%,15.8%,20.2%,15.6%,17.8%,22.9%,...,16.2%,12.8%,14.5%,25.8%,13.6%,21.9%,27.0%,22.1%,23.3%,18.2%
6,"$25,000 to $34,999",21.6%,13.2%,20.0%,13.7%,11.9%,18.5%,9.9%,9.9%,14.4%,...,14.5%,12.0%,16.6%,13.5%,11.0%,12.2%,17.3%,13.2%,12.0%,13.4%
7,"$35,000 to $49,999",16.1%,15.2%,12.0%,15.0%,19.1%,15.1%,15.8%,16.7%,18.1%,...,17.5%,16.2%,14.7%,12.4%,11.9%,17.1%,15.0%,15.5%,16.9%,13.4%
8,"$50,000 to $74,999",13.4%,16.0%,10.9%,15.9%,15.7%,12.5%,18.7%,20.3%,11.1%,...,19.2%,19.1%,17.3%,18.0%,13.7%,13.6%,7.9%,15.0%,14.1%,20.9%
9,"$75,000 to $99,999",5.5%,7.9%,1.8%,7.8%,10.2%,3.8%,11.2%,10.0%,3.4%,...,10.2%,10.9%,10.8%,4.3%,7.4%,6.4%,3.3%,7.2%,5.6%,8.4%
10,"$100,000 to $149,999",6.8%,6.8%,3.3%,6.1%,5.5%,1.6%,7.8%,5.3%,2.5%,...,5.1%,10.2%,6.4%,2.6%,7.4%,4.0%,2.6%,4.5%,3.7%,6.1%
11,"$150,000 to $199,999",1.5%,1.4%,1.1%,0.6%,1.3%,0.3%,2.6%,1.2%,0.3%,...,0.9%,3.7%,1.5%,0.1%,2.2%,0.1%,0.3%,0.8%,1.3%,1.7%


In [132]:
# Swap rows and columns so each county becomes a row and each statistic a column,
# which keeps the layout consistent for future merges
initial_pa_nonfamily_2016 = initial_pa_nonfamily_2016.transpose()
initial_pa_nonfamily_2016.head(n=5)


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",11882,6.4%,12.8%,15.6%,21.6%,16.1%,13.4%,5.5%,6.8%,1.5%,0.4%,31033,42653
"Allegheny County, Pennsylvania",233085,12.0%,8.6%,17.0%,13.2%,15.2%,16.0%,7.9%,6.8%,1.4%,1.8%,33974,48416
"Armstrong County, Pennsylvania",8586,10.3%,13.6%,26.0%,20.0%,12.0%,10.9%,1.8%,3.3%,1.1%,1.1%,25070,34235
"Beaver County, Pennsylvania",24001,9.9%,11.1%,19.1%,13.7%,15.0%,15.9%,7.8%,6.1%,0.6%,0.8%,32343,42638


In [133]:
# Move the county labels out of the index into an ordinary column and name that column 'County'
initial_pa_nonfamily_2016 = initial_pa_nonfamily_2016.reset_index().rename(columns={'index': 'County'})
initial_pa_nonfamily_2016.head(n=5)


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",11882,6.4%,12.8%,15.6%,21.6%,16.1%,13.4%,5.5%,6.8%,1.5%,0.4%,31033,42653
2,"Allegheny County, Pennsylvania",233085,12.0%,8.6%,17.0%,13.2%,15.2%,16.0%,7.9%,6.8%,1.4%,1.8%,33974,48416
3,"Armstrong County, Pennsylvania",8586,10.3%,13.6%,26.0%,20.0%,12.0%,10.9%,1.8%,3.3%,1.1%,1.1%,25070,34235
4,"Beaver County, Pennsylvania",24001,9.9%,11.1%,19.1%,13.7%,15.0%,15.9%,7.8%,6.1%,0.6%,0.8%,32343,42638


In [134]:
# Confirming column data types prior to renaming
# (apart from 'County', the labels are the integer row positions carried over by the transpose, not strings)
print(initial_pa_nonfamily_2016.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [135]:
# Cast every column label to a string so the upcoming rename can address the columns by string key
initial_pa_nonfamily_2016.columns = [str(label) for label in initial_pa_nonfamily_2016.columns]
print(initial_pa_nonfamily_2016.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [136]:
# Swap every cell whose value is exactly 'N' for NaN so that later numeric conversions treat it as missing
initial_pa_nonfamily_2016 = initial_pa_nonfamily_2016.replace(to_replace='N', value=np.nan)


In [137]:
# Final cleaning pass for the 2016 nonfamily-household income table:
#   1. rename the positional columns ('2'..'14') to descriptive names
#   2. drop the obsolete row at index 0 (it holds the old row labels, not a county) and renumber the index
#   3. reduce 'County' to the uppercase county name only (removing ' County, Pennsylvania')
#   4. add a corresponding 'Year' column for future merges
#   5. convert percentage strings (e.g. '6.4%') to fractions and count/income strings to floats
# The per-column conversions are driven by the rename mapping instead of one copy-pasted line per column.
nonfamily_2016_labels = {
    '2': '# of Nonfamily Households',
    '3': '% Nonfamily Households <$10,000',
    '4': '% Nonfamily Households $10,000-$14,999',
    '5': '% Nonfamily Households $15,000-$24,999',
    '6': '% Nonfamily Households $25,000-$34,999',
    '7': '% Nonfamily Households $35,000-$49,999',
    '8': '% Nonfamily Households $50,000-$74,999',
    '9': '% Nonfamily Households $75,000-$99,999',
    '10': '% Nonfamily Households $100,000-$149,999',
    '11': '% Nonfamily Households $150,000-$199,999',
    '12': '% Nonfamily Households $200,000 or More',
    '13': 'Median Nonfamily Households Income ($)',
    '14': 'Mean Nonfamily Households Income ($)',
}
# Every percentage column name starts with '% '; the rest are the household count and the median/mean incomes
nonfamily_2016_pct_cols = [name for name in nonfamily_2016_labels.values() if name.startswith('% ')]
nonfamily_2016_amount_cols = [name for name in nonfamily_2016_labels.values() if not name.startswith('% ')]

initial_pa_nonfamily_2016 = (
    initial_pa_nonfamily_2016
    .rename(columns=nonfamily_2016_labels)
    .drop(0)
    .reset_index(drop=True)
)

# Vectorised .str.upper() replaces the per-element lambda; the result is the same for string county labels
initial_pa_nonfamily_2016['County'] = (
    initial_pa_nonfamily_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_nonfamily_2016.insert(0, 'Year', 2016)

# '6.4%' -> 0.064
for pct_col in nonfamily_2016_pct_cols:
    initial_pa_nonfamily_2016[pct_col] = initial_pa_nonfamily_2016[pct_col].str.rstrip('%').astype(float) / 100

# Strip any thousands separators before casting; regex=False keeps the comma a literal on every pandas version
for amount_col in nonfamily_2016_amount_cols:
    initial_pa_nonfamily_2016[amount_col] = (
        initial_pa_nonfamily_2016[amount_col].str.replace(',', '', regex=False).astype(float)
    )

initial_pa_nonfamily_2016


Unnamed: 0,Year,County,# of Nonfamily Households,"% Nonfamily Households <$10,000","% Nonfamily Households $10,000-$14,999","% Nonfamily Households $15,000-$24,999","% Nonfamily Households $25,000-$34,999","% Nonfamily Households $35,000-$49,999","% Nonfamily Households $50,000-$74,999","% Nonfamily Households $75,000-$99,999","% Nonfamily Households $100,000-$149,999","% Nonfamily Households $150,000-$199,999","% Nonfamily Households $200,000 or More",Median Nonfamily Households Income ($),Mean Nonfamily Households Income ($)
0,2016,ADAMS,11882.0,0.064,0.128,0.156,0.216,0.161,0.134,0.055,0.068,0.015,0.004,31033.0,42653.0
1,2016,ALLEGHENY,233085.0,0.12,0.086,0.17,0.132,0.152,0.16,0.079,0.068,0.014,0.018,33974.0,48416.0
2,2016,ARMSTRONG,8586.0,0.103,0.136,0.26,0.2,0.12,0.109,0.018,0.033,0.011,0.011,25070.0,34235.0
3,2016,BEAVER,24001.0,0.099,0.111,0.191,0.137,0.15,0.159,0.078,0.061,0.006,0.008,32343.0,42638.0
4,2016,BERKS,47924.0,0.107,0.087,0.158,0.119,0.191,0.157,0.102,0.055,0.013,0.012,36853.0,45662.0
5,2016,BLAIR,18257.0,0.14,0.135,0.202,0.185,0.151,0.125,0.038,0.016,0.003,0.005,25956.0,34858.0
6,2016,BUCKS,68142.0,0.098,0.056,0.156,0.099,0.158,0.187,0.112,0.078,0.026,0.03,43036.0,60793.0
7,2016,BUTLER,25258.0,0.096,0.07,0.178,0.099,0.167,0.203,0.1,0.053,0.012,0.02,38937.0,50080.0
8,2016,CAMBRIA,20492.0,0.136,0.135,0.229,0.144,0.181,0.111,0.034,0.025,0.003,0.002,25038.0,32416.0
9,2016,CARBON,9726.0,0.076,0.134,0.198,0.136,0.18,0.183,0.043,0.041,0.0,0.01,31405.0,39293.0


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable, so data from the closest year prior to the election (2019) was utilized. The resulting rows are labeled with `Year` = 2020 in the cleaning step below.


In [138]:
# Load the raw 2019 U.S. Census Bureau income (Nonfamily Households) workbook and preview its first 19 rows
initial_pa_nonfamily_2019 = pd.read_excel(io="Resources/PA_Income_Nonfamily_Households_2019.xlsx")
initial_pa_nonfamily_2019.head(n=19)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,...,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households,Nonfamily households
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Total,12328,252262,10911,27638,49294,17860,73467,27849,21303,...,15611,103775,38104,13653,280653,20512,9814,31489,60410,57175
3,"Less than $10,000",5.6%,10.7%,4.4%,9.7%,9.8%,13.5%,6.5%,12.2%,14.9%,...,14.5%,6.6%,8.6%,8.0%,18.5%,8.0%,11.9%,11.1%,14.2%,10.8%
4,"$10,000 to $14,999",5.5%,6.5%,12.2%,9.5%,7.3%,8.9%,4.9%,7.3%,14.8%,...,8.6%,5.2%,5.6%,13.3%,8.8%,15.1%,7.1%,10.6%,7.2%,5.9%
5,"$15,000 to $24,999",22.5%,14.0%,25.2%,17.7%,15.2%,18.2%,16.2%,14.6%,20.9%,...,10.3%,12.3%,13.1%,17.6%,11.6%,21.4%,24.5%,12.8%,16.9%,17.6%
6,"$25,000 to $34,999",11.1%,12.5%,9.1%,12.3%,13.0%,18.9%,9.3%,13.7%,14.1%,...,17.1%,11.6%,15.9%,23.7%,10.3%,14.0%,14.4%,14.4%,14.7%,11.7%
7,"$35,000 to $49,999",16.4%,16.1%,18.3%,18.3%,18.5%,14.2%,13.5%,16.2%,13.9%,...,14.6%,12.3%,13.8%,15.6%,12.8%,12.2%,20.4%,14.8%,16.4%,17.6%
8,"$50,000 to $74,999",17.3%,17.6%,18.3%,18.1%,17.2%,14.1%,19.8%,16.2%,12.0%,...,16.6%,19.4%,16.0%,12.8%,15.3%,15.0%,9.3%,14.9%,17.2%,16.2%
9,"$75,000 to $99,999",12.5%,10.0%,6.5%,7.1%,9.6%,6.1%,9.9%,9.0%,4.9%,...,9.2%,12.7%,11.1%,6.0%,8.2%,7.8%,5.2%,9.5%,5.5%,10.7%


In [139]:
# Keep only the nonfamily-household summary rows (row positions 2 through 14 of the raw sheet):
#   - total number of nonfamily households
#   - % of nonfamily households in each income bracket: Less than $10,000, $10,000-$14,999, $15,000-$24,999,
#     $25,000-$34,999, $35,000-$49,999, $50,000-$74,999, $75,000-$99,999, $100,000-$149,999,
#     $150,000-$199,999, $200,000 or more
#   - median income
#   - mean income
# The two header rows at positions 0 and 1, and any rows after position 14, are dropped
initial_pa_nonfamily_2019 = initial_pa_nonfamily_2019.iloc[list(range(2, 15)), :]
initial_pa_nonfamily_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
2,Total,12328,252262,10911,27638,49294,17860,73467,27849,21303,...,15611,103775,38104,13653,280653,20512,9814,31489,60410,57175
3,"Less than $10,000",5.6%,10.7%,4.4%,9.7%,9.8%,13.5%,6.5%,12.2%,14.9%,...,14.5%,6.6%,8.6%,8.0%,18.5%,8.0%,11.9%,11.1%,14.2%,10.8%
4,"$10,000 to $14,999",5.5%,6.5%,12.2%,9.5%,7.3%,8.9%,4.9%,7.3%,14.8%,...,8.6%,5.2%,5.6%,13.3%,8.8%,15.1%,7.1%,10.6%,7.2%,5.9%
5,"$15,000 to $24,999",22.5%,14.0%,25.2%,17.7%,15.2%,18.2%,16.2%,14.6%,20.9%,...,10.3%,12.3%,13.1%,17.6%,11.6%,21.4%,24.5%,12.8%,16.9%,17.6%
6,"$25,000 to $34,999",11.1%,12.5%,9.1%,12.3%,13.0%,18.9%,9.3%,13.7%,14.1%,...,17.1%,11.6%,15.9%,23.7%,10.3%,14.0%,14.4%,14.4%,14.7%,11.7%
7,"$35,000 to $49,999",16.4%,16.1%,18.3%,18.3%,18.5%,14.2%,13.5%,16.2%,13.9%,...,14.6%,12.3%,13.8%,15.6%,12.8%,12.2%,20.4%,14.8%,16.4%,17.6%
8,"$50,000 to $74,999",17.3%,17.6%,18.3%,18.1%,17.2%,14.1%,19.8%,16.2%,12.0%,...,16.6%,19.4%,16.0%,12.8%,15.3%,15.0%,9.3%,14.9%,17.2%,16.2%
9,"$75,000 to $99,999",12.5%,10.0%,6.5%,7.1%,9.6%,6.1%,9.9%,9.0%,4.9%,...,9.2%,12.7%,11.1%,6.0%,8.2%,7.8%,5.2%,9.5%,5.5%,10.7%
10,"$100,000 to $149,999",7.6%,7.4%,4.4%,4.7%,5.8%,3.4%,11.7%,6.2%,3.4%,...,8.3%,11.8%,10.8%,2.7%,7.6%,4.6%,5.5%,6.7%,5.1%,7.0%
11,"$150,000 to $199,999",0.6%,2.9%,0.8%,2.0%,2.2%,1.6%,2.6%,1.5%,0.3%,...,0.8%,4.0%,2.8%,0.1%,3.5%,1.0%,0.0%,2.7%,1.7%,1.3%


In [140]:
# Swap rows and columns so each county becomes a row and each statistic a column,
# which keeps the layout consistent for future merges
initial_pa_nonfamily_2019 = initial_pa_nonfamily_2019.transpose()
initial_pa_nonfamily_2019.head(n=5)


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14
Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
"Adams County, Pennsylvania",12328,5.6%,5.5%,22.5%,11.1%,16.4%,17.3%,12.5%,7.6%,0.6%,0.9%,38218,48790
"Allegheny County, Pennsylvania",252262,10.7%,6.5%,14.0%,12.5%,16.1%,17.6%,10.0%,7.4%,2.9%,2.3%,40156,55208
"Armstrong County, Pennsylvania",10911,4.4%,12.2%,25.2%,9.1%,18.3%,18.3%,6.5%,4.4%,0.8%,0.8%,32290,41674
"Beaver County, Pennsylvania",27638,9.7%,9.5%,17.7%,12.3%,18.3%,18.1%,7.1%,4.7%,2.0%,0.6%,35330,44751


In [141]:
# Move the county labels out of the index into an ordinary column and name that column 'County'
initial_pa_nonfamily_2019 = initial_pa_nonfamily_2019.reset_index().rename(columns={'index': 'County'})
initial_pa_nonfamily_2019.head(n=5)


Unnamed: 0,County,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Unnamed: 0,Total,"Less than $10,000","$10,000 to $14,999","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $74,999","$75,000 to $99,999","$100,000 to $149,999","$150,000 to $199,999","$200,000 or more",Median income (dollars),Mean income (dollars)
1,"Adams County, Pennsylvania",12328,5.6%,5.5%,22.5%,11.1%,16.4%,17.3%,12.5%,7.6%,0.6%,0.9%,38218,48790
2,"Allegheny County, Pennsylvania",252262,10.7%,6.5%,14.0%,12.5%,16.1%,17.6%,10.0%,7.4%,2.9%,2.3%,40156,55208
3,"Armstrong County, Pennsylvania",10911,4.4%,12.2%,25.2%,9.1%,18.3%,18.3%,6.5%,4.4%,0.8%,0.8%,32290,41674
4,"Beaver County, Pennsylvania",27638,9.7%,9.5%,17.7%,12.3%,18.3%,18.1%,7.1%,4.7%,2.0%,0.6%,35330,44751


In [142]:
# Confirming column data types prior to renaming
# (apart from 'County', the labels are the integer row positions carried over by the transpose, not strings)
print(initial_pa_nonfamily_2019.columns)


Index(['County', 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')


In [143]:
# Cast every column label to a string so the upcoming rename can address the columns by string key
initial_pa_nonfamily_2019.columns = [str(label) for label in initial_pa_nonfamily_2019.columns]
print(initial_pa_nonfamily_2019.columns)


Index(['County', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [144]:
# Swap every cell whose value is exactly 'N' for NaN so that later numeric conversions treat it as missing
initial_pa_nonfamily_2019 = initial_pa_nonfamily_2019.replace(to_replace='N', value=np.nan)


In [145]:
# Final cleaning pass for the 2019 nonfamily-household income table:
#   1. rename the positional columns ('2'..'14') to descriptive names
#   2. drop the obsolete row at index 0 (it holds the old row labels, not a county) and renumber the index
#   3. reduce 'County' to the uppercase county name only (removing ' County, Pennsylvania')
#   4. add a corresponding 'Year' column for future merges
#   5. convert percentage strings (e.g. '5.6%') to fractions and count/income strings to floats
# The per-column conversions are driven by the rename mapping instead of one copy-pasted line per column.
nonfamily_2019_labels = {
    '2': '# of Nonfamily Households',
    '3': '% Nonfamily Households <$10,000',
    '4': '% Nonfamily Households $10,000-$14,999',
    '5': '% Nonfamily Households $15,000-$24,999',
    '6': '% Nonfamily Households $25,000-$34,999',
    '7': '% Nonfamily Households $35,000-$49,999',
    '8': '% Nonfamily Households $50,000-$74,999',
    '9': '% Nonfamily Households $75,000-$99,999',
    '10': '% Nonfamily Households $100,000-$149,999',
    '11': '% Nonfamily Households $150,000-$199,999',
    '12': '% Nonfamily Households $200,000 or More',
    '13': 'Median Nonfamily Households Income ($)',
    '14': 'Mean Nonfamily Households Income ($)',
}
# Every percentage column name starts with '% '; the rest are the household count and the median/mean incomes
nonfamily_2019_pct_cols = [name for name in nonfamily_2019_labels.values() if name.startswith('% ')]
nonfamily_2019_amount_cols = [name for name in nonfamily_2019_labels.values() if not name.startswith('% ')]

initial_pa_nonfamily_2019 = (
    initial_pa_nonfamily_2019
    .rename(columns=nonfamily_2019_labels)
    .drop(0)
    .reset_index(drop=True)
)

# Vectorised .str.upper() replaces the per-element lambda; the result is the same for string county labels
initial_pa_nonfamily_2019['County'] = (
    initial_pa_nonfamily_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# NOTE(review): the 2019 estimates are tagged with Year 2020 -- presumably so they line up with the 2020
# election data when merged (2020 Census data was unavailable); confirm this labelling is intended
initial_pa_nonfamily_2019.insert(0, 'Year', 2020)

# '5.6%' -> 0.056
for pct_col in nonfamily_2019_pct_cols:
    initial_pa_nonfamily_2019[pct_col] = initial_pa_nonfamily_2019[pct_col].str.rstrip('%').astype(float) / 100

# Strip any thousands separators before casting; regex=False keeps the comma a literal on every pandas version
for amount_col in nonfamily_2019_amount_cols:
    initial_pa_nonfamily_2019[amount_col] = (
        initial_pa_nonfamily_2019[amount_col].str.replace(',', '', regex=False).astype(float)
    )

initial_pa_nonfamily_2019


Unnamed: 0,Year,County,# of Nonfamily Households,"% Nonfamily Households <$10,000","% Nonfamily Households $10,000-$14,999","% Nonfamily Households $15,000-$24,999","% Nonfamily Households $25,000-$34,999","% Nonfamily Households $35,000-$49,999","% Nonfamily Households $50,000-$74,999","% Nonfamily Households $75,000-$99,999","% Nonfamily Households $100,000-$149,999","% Nonfamily Households $150,000-$199,999","% Nonfamily Households $200,000 or More",Median Nonfamily Households Income ($),Mean Nonfamily Households Income ($)
0,2020,ADAMS,12328.0,0.056,0.055,0.225,0.111,0.164,0.173,0.125,0.076,0.006,0.009,38218.0,48790.0
1,2020,ALLEGHENY,252262.0,0.107,0.065,0.14,0.125,0.161,0.176,0.1,0.074,0.029,0.023,40156.0,55208.0
2,2020,ARMSTRONG,10911.0,0.044,0.122,0.252,0.091,0.183,0.183,0.065,0.044,0.008,0.008,32290.0,41674.0
3,2020,BEAVER,27638.0,0.097,0.095,0.177,0.123,0.183,0.181,0.071,0.047,0.02,0.006,35330.0,44751.0
4,2020,BERKS,49294.0,0.098,0.073,0.152,0.13,0.185,0.172,0.096,0.058,0.022,0.013,38557.0,49681.0
5,2020,BLAIR,17860.0,0.135,0.089,0.182,0.189,0.142,0.141,0.061,0.034,0.016,0.011,29644.0,42377.0
6,2020,BUCKS,73467.0,0.065,0.049,0.162,0.093,0.135,0.198,0.099,0.117,0.026,0.057,49565.0,72632.0
7,2020,BUTLER,27849.0,0.122,0.073,0.146,0.137,0.162,0.162,0.09,0.062,0.015,0.031,37006.0,55801.0
8,2020,CAMBRIA,21303.0,0.149,0.148,0.209,0.141,0.139,0.12,0.049,0.034,0.003,0.006,24591.0,36061.0
9,2020,CARBON,9545.0,0.067,0.085,0.156,0.162,0.219,0.082,0.092,0.101,0.018,0.017,36259.0,65593.0


**Auxiliary (Total) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [146]:
# Load the raw 2012 U.S. Census Bureau auxiliary (Total) workbook and preview its first 60 rows
initial_pa_total_aux_2012 = pd.read_excel(io="Resources/PA_Total_Auxillary_2012.xlsx")
initial_pa_total_aux_2012.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Total,Total,Total,Total,Total,Total,Total,Total,Total,...,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,79767,991547,54867,136295,317121,100743,488297,144919,114109,...,131152,627565,236235,75785,1198622,118242,62884,166658,293284,337670
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
5,Gulf War (8/1990 to 8/2001) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
6,Vietnam era veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
7,Korean War veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
8,World War II veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [147]:
# Keep only the rows of the 'Total' sheet that hold the desired county summary statistics:
#   - % of the population in each age bracket: 18 to 34, 35 to 54, 55 to 64, 65 to 74, 75 and over (rows 13-17)
#   - labor force participation rate (row 41)
#   - unemployment rate (row 43)
#   - % below poverty in the past 12 months (row 46)
#   - % with any disability (row 49)
initial_pa_total_aux_2012 = initial_pa_total_aux_2012.iloc[[*range(13, 18), 41, 43, 46, 49], :]
initial_pa_total_aux_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,25.9%,29.4%,21.9%,23.9%,28.0%,25.7%,23.9%,25.6%,24.8%,...,27.2%,26.0%,26.9%,24.6%,38.2%,23.5%,23.0%,23.6%,22.2%,26.1%
14,35 to 54 years,34.5%,32.2%,34.6%,33.5%,35.7%,32.9%,37.7%,36.1%,32.2%,...,37.0%,36.7%,35.0%,33.7%,31.8%,35.4%,34.3%,34.8%,34.2%,37.2%
15,55 to 64 years,18.0%,17.4%,19.4%,18.4%,16.5%,18.1%,18.3%,17.9%,18.8%,...,18.3%,17.0%,16.9%,18.1%,14.2%,17.8%,18.8%,19.0%,19.3%,17.3%
16,65 to 74 years,11.8%,10.3%,12.4%,12.3%,10.1%,11.8%,10.8%,10.8%,11.9%,...,10.4%,10.3%,10.7%,12.2%,8.4%,11.8%,12.4%,11.8%,12.5%,10.7%
17,75 years and over,9.9%,10.7%,11.7%,11.8%,9.6%,11.4%,9.2%,9.6%,12.2%,...,7.1%,10.0%,10.4%,11.3%,7.4%,11.5%,11.6%,10.8%,11.8%,8.7%
41,Labor force participation rate,80.6%,78.1%,74.2%,78.5%,79.6%,74.5%,81.7%,77.9%,70.3%,...,76.9%,81.9%,76.4%,71.1%,68.8%,73.6%,69.9%,77.4%,78.3%,79.4%
43,Unemployment rate,5.4%,7.3%,8.4%,8.1%,9.6%,7.5%,7.8%,6.3%,9.7%,...,13.4%,6.8%,8.1%,10.8%,15.7%,10.3%,7.5%,7.0%,5.9%,8.0%
46,Below poverty in the past 12 months,8.0%,11.3%,11.7%,9.5%,12.0%,11.5%,5.1%,9.1%,12.5%,...,12.8%,6.2%,9.2%,12.4%,24.0%,13.2%,10.0%,8.8%,9.1%,8.9%
49,With any disability,16.3%,15.3%,20.8%,17.7%,16.0%,18.9%,11.9%,14.7%,20.9%,...,15.7%,11.5%,16.1%,18.0%,20.0%,20.0%,21.1%,17.4%,16.1%,14.5%


In [148]:
# Swap rows and columns so each county becomes a row and each statistic a column,
# which keeps the layout consistent for future merges
initial_pa_total_aux_2012 = initial_pa_total_aux_2012.transpose()
initial_pa_total_aux_2012.head(n=5)


Unnamed: 0,13,14,15,16,17,41,43,46,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Below poverty in the past 12 months,With any disability
"Adams County, Pennsylvania",25.9%,34.5%,18.0%,11.8%,9.9%,80.6%,5.4%,8.0%,16.3%
"Allegheny County, Pennsylvania",29.4%,32.2%,17.4%,10.3%,10.7%,78.1%,7.3%,11.3%,15.3%
"Armstrong County, Pennsylvania",21.9%,34.6%,19.4%,12.4%,11.7%,74.2%,8.4%,11.7%,20.8%
"Beaver County, Pennsylvania",23.9%,33.5%,18.4%,12.3%,11.8%,78.5%,8.1%,9.5%,17.7%


In [149]:
# Move the county labels out of the index and into a regular column named 'County'
initial_pa_total_aux_2012 = (
    initial_pa_total_aux_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_total_aux_2012.head()


Unnamed: 0,County,13,14,15,16,17,41,43,46,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Below poverty in the past 12 months,With any disability
1,"Adams County, Pennsylvania",25.9%,34.5%,18.0%,11.8%,9.9%,80.6%,5.4%,8.0%,16.3%
2,"Allegheny County, Pennsylvania",29.4%,32.2%,17.4%,10.3%,10.7%,78.1%,7.3%,11.3%,15.3%
3,"Armstrong County, Pennsylvania",21.9%,34.6%,19.4%,12.4%,11.7%,74.2%,8.4%,11.7%,20.8%
4,"Beaver County, Pennsylvania",23.9%,33.5%,18.4%,12.3%,11.8%,78.5%,8.1%,9.5%,17.7%


In [150]:
# Inspecting the column labels prior to renaming: the printed Index shows whether each
# label is an integer (13) or a string ('13'), which determines the keys the rename needs
print(initial_pa_total_aux_2012.columns)


Index(['County', 13, 14, 15, 16, 17, 41, 43, 46, 49], dtype='object')


In [151]:
# Cast every column label to a string so the upcoming rename can use string keys
initial_pa_total_aux_2012.columns = initial_pa_total_aux_2012.columns.map(str)
print(initial_pa_total_aux_2012.columns)


Index(['County', '13', '14', '15', '16', '17', '41', '43', '46', '49'], dtype='object')


In [152]:
# Swap every cell equal to 'N' for a real missing value (NaN) so later steps can handle it
initial_pa_total_aux_2012 = initial_pa_total_aux_2012.replace(to_replace='N', value=np.nan)


In [153]:
# Final clean-up of the 2012 total-population auxiliary table:
#   1. give the positional columns descriptive names,
#   2. drop the obsolete first row (index 0, the transposed label row) and renumber the index,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every '12.3%'-style string into a float fraction (0.123).
total_aux_2012_column_names = {
    '13': '% Total Population 18-34 Years Old',
    '14': '% Total Population 35-54 Years Old',
    '15': '% Total Population 55-64 Years Old',
    '16': '% Total Population 65-74 Years Old',
    '17': '% Total Population 75 Years Old & Over',
    '41': 'Total Labor Force Participation Rate (%)',
    '43': 'Total Unemployment Rate (%)',
    '46': '% Total Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Total Population With Any Disability',
}

# Guard against re-running this cell: once the label row is gone, drop(0) would silently
# discard the first county before the later insert() fails on the existing 'Year' column.
assert initial_pa_total_aux_2012.loc[0, 'County'] == 'Unnamed: 0', (
    "Expected the label row at index 0 - has this cell already been executed?"
)

initial_pa_total_aux_2012 = (
    initial_pa_total_aux_2012
    .rename(columns=total_aux_2012_column_names)
    .drop(0)
    .reset_index(drop=True)
)

# .str.upper() is the vectorised (and NaN-safe) equivalent of applying x.upper() row by row
initial_pa_total_aux_2012['County'] = (
    initial_pa_total_aux_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_total_aux_2012.insert(0, 'Year', 2012)

# One pass over all percentage columns instead of nine copy-pasted statements
pct_columns_2012 = list(total_aux_2012_column_names.values())
initial_pa_total_aux_2012[pct_columns_2012] = initial_pa_total_aux_2012[pct_columns_2012].apply(
    lambda pct: pct.str.rstrip('%').astype(float) / 100
)
initial_pa_total_aux_2012


Unnamed: 0,Year,County,% Total Population 18-34 Years Old,% Total Population 35-54 Years Old,% Total Population 55-64 Years Old,% Total Population 65-74 Years Old,% Total Population 75 Years Old & Over,Total Labor Force Participation Rate (%),Total Unemployment Rate (%),% Total Population With Income Below Poverty Level (Past 12 Months),% Total Population With Any Disability
0,2012,ADAMS,0.259,0.345,0.18,0.118,0.099,0.806,0.054,0.08,0.163
1,2012,ALLEGHENY,0.294,0.322,0.174,0.103,0.107,0.781,0.073,0.113,0.153
2,2012,ARMSTRONG,0.219,0.346,0.194,0.124,0.117,0.742,0.084,0.117,0.208
3,2012,BEAVER,0.239,0.335,0.184,0.123,0.118,0.785,0.081,0.095,0.177
4,2012,BERKS,0.28,0.357,0.165,0.101,0.096,0.796,0.096,0.12,0.16
5,2012,BLAIR,0.257,0.329,0.181,0.118,0.114,0.745,0.075,0.115,0.189
6,2012,BUCKS,0.239,0.377,0.183,0.108,0.092,0.817,0.078,0.051,0.119
7,2012,BUTLER,0.256,0.361,0.179,0.108,0.096,0.779,0.063,0.091,0.147
8,2012,CAMBRIA,0.248,0.322,0.188,0.119,0.122,0.703,0.097,0.125,0.209
9,2012,CARBON,0.213,0.364,0.187,0.127,0.109,0.766,0.074,0.081,0.215


**2016**
-

In [154]:
# Load the 2016 U.S. Census Bureau auxiliary (Total) workbook and preview its first 60 rows
initial_pa_total_aux_2016 = pd.read_excel(
    io="Resources/PA_Total_Auxillary_2016.xlsx",
)
initial_pa_total_aux_2016.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
5,Gulf War (8/1990 to 8/2001) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
6,Vietnam era veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
7,Korean War veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
8,World War II veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [155]:
# Keep only the rows holding the county summary statistics used downstream.
# The same statistics are gathered for three groupings (Total, Veteran and Nonveteran
# population); this cell handles the Total table. Rows are picked by position:
#   13-17 -> % of the group aged 18 to 34, 35 to 54, 55 to 64, 65 to 74 and 75 & over
#   40    -> labor force participation rate
#   42    -> unemployment rate
#   45    -> % of the group with income below poverty level in the past 12 months
#   49    -> % of the group with any disability
initial_pa_total_aux_2016 = initial_pa_total_aux_2016.iloc[
    [13, 14, 15, 16, 17, 40, 42, 45, 49]
]
initial_pa_total_aux_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,25.2%,29.9%,21.8%,24.2%,28.5%,24.8%,24.0%,24.8%,24.5%,...,27.8%,26.1%,27.3%,23.1%,37.6%,23.7%,23.2%,24.6%,22.3%,26.4%
14,35 to 54 years,31.7%,29.8%,31.8%,30.7%,32.8%,30.9%,34.0%,33.4%,29.5%,...,32.5%,34.0%,32.0%,32.9%,31.0%,33.3%,31.7%,31.5%,31.0%,34.0%
15,55 to 64 years,18.4%,18.1%,20.0%,19.7%,17.4%,18.7%,19.7%,19.2%,19.3%,...,19.4%,17.9%,18.1%,18.9%,14.9%,18.4%,19.2%,19.8%,20.0%,18.0%
16,65 to 74 years,14.3%,12.0%,14.8%,13.7%,11.9%,13.9%,12.7%,12.7%,14.6%,...,13.1%,11.9%,12.5%,13.6%,9.4%,13.7%,14.0%,13.6%,14.8%,12.4%
17,75 years and over,10.6%,10.2%,11.6%,11.7%,9.4%,11.6%,9.6%,9.9%,12.1%,...,7.1%,10.1%,10.1%,11.5%,7.1%,11.0%,11.8%,10.5%,11.9%,9.1%
40,Labor force participation rate,78.5%,80.0%,71.1%,77.9%,78.6%,73.8%,80.3%,77.4%,70.7%,...,72.7%,81.8%,78.8%,74.4%,70.0%,71.6%,68.5%,74.8%,77.6%,79.1%
42,Unemployment rate,5.6%,5.3%,5.7%,4.3%,5.8%,4.6%,3.8%,3.6%,7.7%,...,7.3%,4.7%,4.1%,4.1%,9.2%,5.7%,5.8%,5.7%,5.2%,4.1%
45,Income in the past 12 months below poverty level,7.7%,10.9%,12.6%,7.1%,11.3%,11.0%,6.2%,6.3%,13.6%,...,9.8%,5.7%,6.7%,12.9%,22.4%,11.5%,10.7%,9.3%,8.6%,8.9%
49,With any disability,17.7%,16.2%,22.9%,17.4%,16.2%,21.0%,13.8%,14.9%,20.6%,...,18.2%,12.3%,14.9%,18.5%,19.1%,21.6%,21.7%,17.5%,18.6%,16.8%


In [156]:
# Flip rows and columns so every county becomes a row (keeps later merges simple)
initial_pa_total_aux_2016 = initial_pa_total_aux_2016.transpose()
initial_pa_total_aux_2016.head()


Unnamed: 0,13,14,15,16,17,40,42,45,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
"Adams County, Pennsylvania",25.2%,31.7%,18.4%,14.3%,10.6%,78.5%,5.6%,7.7%,17.7%
"Allegheny County, Pennsylvania",29.9%,29.8%,18.1%,12.0%,10.2%,80.0%,5.3%,10.9%,16.2%
"Armstrong County, Pennsylvania",21.8%,31.8%,20.0%,14.8%,11.6%,71.1%,5.7%,12.6%,22.9%
"Beaver County, Pennsylvania",24.2%,30.7%,19.7%,13.7%,11.7%,77.9%,4.3%,7.1%,17.4%


In [157]:
# Move the county labels out of the index and into a regular column named 'County'
initial_pa_total_aux_2016 = (
    initial_pa_total_aux_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_total_aux_2016.head()


Unnamed: 0,County,13,14,15,16,17,40,42,45,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
1,"Adams County, Pennsylvania",25.2%,31.7%,18.4%,14.3%,10.6%,78.5%,5.6%,7.7%,17.7%
2,"Allegheny County, Pennsylvania",29.9%,29.8%,18.1%,12.0%,10.2%,80.0%,5.3%,10.9%,16.2%
3,"Armstrong County, Pennsylvania",21.8%,31.8%,20.0%,14.8%,11.6%,71.1%,5.7%,12.6%,22.9%
4,"Beaver County, Pennsylvania",24.2%,30.7%,19.7%,13.7%,11.7%,77.9%,4.3%,7.1%,17.4%


In [158]:
# Inspecting the column labels prior to renaming: the printed Index shows whether each
# label is an integer (13) or a string ('13'), which determines the keys the rename needs
print(initial_pa_total_aux_2016.columns)


Index(['County', 13, 14, 15, 16, 17, 40, 42, 45, 49], dtype='object')


In [159]:
# Cast every column label to a string so the upcoming rename can use string keys
initial_pa_total_aux_2016.columns = initial_pa_total_aux_2016.columns.map(str)
print(initial_pa_total_aux_2016.columns)


Index(['County', '13', '14', '15', '16', '17', '40', '42', '45', '49'], dtype='object')


In [160]:
# Swap every cell equal to 'N' for a real missing value (NaN) so later steps can handle it
initial_pa_total_aux_2016 = initial_pa_total_aux_2016.replace(to_replace='N', value=np.nan)


In [161]:
# Final clean-up of the 2016 total-population auxiliary table:
#   1. give the positional columns descriptive names,
#   2. drop the obsolete first row (index 0, the transposed label row) and renumber the index,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every '12.3%'-style string into a float fraction (0.123).
total_aux_2016_column_names = {
    '13': '% Total Population 18-34 Years Old',
    '14': '% Total Population 35-54 Years Old',
    '15': '% Total Population 55-64 Years Old',
    '16': '% Total Population 65-74 Years Old',
    '17': '% Total Population 75 Years Old & Over',
    '40': 'Total Labor Force Participation Rate (%)',
    '42': 'Total Unemployment Rate (%)',
    '45': '% Total Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Total Population With Any Disability',
}

# Guard against re-running this cell: once the label row is gone, drop(0) would silently
# discard the first county before the later insert() fails on the existing 'Year' column.
assert initial_pa_total_aux_2016.loc[0, 'County'] == 'Unnamed: 0', (
    "Expected the label row at index 0 - has this cell already been executed?"
)

initial_pa_total_aux_2016 = (
    initial_pa_total_aux_2016
    .rename(columns=total_aux_2016_column_names)
    .drop(0)
    .reset_index(drop=True)
)

# .str.upper() is the vectorised (and NaN-safe) equivalent of applying x.upper() row by row
initial_pa_total_aux_2016['County'] = (
    initial_pa_total_aux_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_total_aux_2016.insert(0, 'Year', 2016)

# One pass over all percentage columns instead of nine copy-pasted statements
pct_columns_2016 = list(total_aux_2016_column_names.values())
initial_pa_total_aux_2016[pct_columns_2016] = initial_pa_total_aux_2016[pct_columns_2016].apply(
    lambda pct: pct.str.rstrip('%').astype(float) / 100
)
initial_pa_total_aux_2016


Unnamed: 0,Year,County,% Total Population 18-34 Years Old,% Total Population 35-54 Years Old,% Total Population 55-64 Years Old,% Total Population 65-74 Years Old,% Total Population 75 Years Old & Over,Total Labor Force Participation Rate (%),Total Unemployment Rate (%),% Total Population With Income Below Poverty Level (Past 12 Months),% Total Population With Any Disability
0,2016,ADAMS,0.252,0.317,0.184,0.143,0.106,0.785,0.056,0.077,0.177
1,2016,ALLEGHENY,0.299,0.298,0.181,0.12,0.102,0.8,0.053,0.109,0.162
2,2016,ARMSTRONG,0.218,0.318,0.2,0.148,0.116,0.711,0.057,0.126,0.229
3,2016,BEAVER,0.242,0.307,0.197,0.137,0.117,0.779,0.043,0.071,0.174
4,2016,BERKS,0.285,0.328,0.174,0.119,0.094,0.786,0.058,0.113,0.162
5,2016,BLAIR,0.248,0.309,0.187,0.139,0.116,0.738,0.046,0.11,0.21
6,2016,BUCKS,0.24,0.34,0.197,0.127,0.096,0.803,0.038,0.062,0.138
7,2016,BUTLER,0.248,0.334,0.192,0.127,0.099,0.774,0.036,0.063,0.149
8,2016,CAMBRIA,0.245,0.295,0.193,0.146,0.121,0.707,0.077,0.136,0.206
9,2016,CARBON,0.216,0.335,0.2,0.147,0.103,0.772,0.063,0.116,0.196


**2019**
-
Please note: 2020 U.S. Census Bureau data was unavailable, so data from the closest year prior to the election (2019) was used instead. The resulting rows are labeled with `Year` = 2020 in the cleaning cell below for future merges.


In [162]:
# Load the 2019 U.S. Census Bureau auxiliary (Total) workbook and preview its first 60 rows
initial_pa_total_aux_2019 = pd.read_excel(
    io="Resources/PA_Total_Auxillary_2019.xlsx",
)
initial_pa_total_aux_2019.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
5,Gulf War (8/1990 to 8/2001) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
6,Vietnam era veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
7,Korean War veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
8,World War II veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [163]:
# Keep only the rows holding the county summary statistics used downstream.
# The same statistics are gathered for three groupings (Total, Veteran and Nonveteran
# population); this cell handles the Total table. Rows are picked by position:
#   13-17 -> % of the group aged 18 to 34, 35 to 54, 55 to 64, 65 to 74 and 75 & over
#   40    -> labor force participation rate
#   42    -> unemployment rate
#   45    -> % of the group with income below poverty level in the past 12 months
#   49    -> % of the group with any disability
initial_pa_total_aux_2019 = initial_pa_total_aux_2019.iloc[
    [13, 14, 15, 16, 17, 40, 42, 45, 49]
]
initial_pa_total_aux_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,25.6%,29.6%,21.9%,24.0%,28.2%,25.3%,23.3%,24.9%,23.4%,...,27.3%,26.0%,27.6%,25.3%,36.9%,23.3%,22.0%,24.4%,22.1%,26.2%
14,35 to 54 years,29.6%,29.1%,29.9%,29.2%,31.3%,29.6%,32.4%,31.8%,28.7%,...,30.4%,32.8%,30.4%,29.2%,30.5%,32.5%,30.7%,30.5%,29.6%,32.5%
15,55 to 64 years,18.6%,17.4%,20.2%,19.5%,17.7%,18.4%,20.1%,19.3%,18.9%,...,20.4%,18.0%,17.7%,18.8%,14.8%,18.3%,19.1%,18.9%,19.8%,18.1%
16,65 to 74 years,15.3%,13.4%,15.9%,15.3%,13.0%,14.7%,13.8%,13.8%,16.3%,...,13.4%,12.9%,13.3%,14.8%,10.3%,14.7%,15.7%,14.8%,16.0%,13.5%
17,75 years and over,10.9%,10.4%,12.1%,12.1%,9.7%,12.1%,10.4%,10.2%,12.7%,...,8.4%,10.2%,11.0%,12.0%,7.5%,11.2%,12.5%,11.4%,12.4%,9.7%
40,Labor force participation rate,78.9%,80.5%,78.8%,79.1%,80.6%,78.2%,82.9%,76.5%,72.0%,...,74.8%,82.4%,79.1%,73.7%,73.1%,73.0%,69.8%,77.7%,78.9%,80.2%
42,Unemployment rate,1.8%,4.1%,2.9%,4.6%,4.8%,3.9%,3.7%,3.5%,4.1%,...,3.2%,3.3%,4.0%,4.9%,7.9%,4.8%,2.6%,4.8%,4.0%,3.6%
45,Income in the past 12 months below poverty level,6.7%,10.1%,8.9%,9.9%,8.2%,13.3%,5.1%,7.7%,12.6%,...,11.2%,5.8%,6.9%,10.8%,20.9%,10.9%,9.9%,9.4%,9.7%,8.4%
49,With any disability,15.8%,16.0%,18.6%,19.3%,16.0%,18.9%,12.0%,15.2%,21.7%,...,15.5%,12.2%,15.2%,18.6%,20.8%,20.2%,20.5%,19.3%,17.6%,14.5%


In [164]:
# Flip rows and columns so every county becomes a row (keeps later merges simple)
initial_pa_total_aux_2019 = initial_pa_total_aux_2019.transpose()
initial_pa_total_aux_2019.head()


Unnamed: 0,13,14,15,16,17,40,42,45,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
"Adams County, Pennsylvania",25.6%,29.6%,18.6%,15.3%,10.9%,78.9%,1.8%,6.7%,15.8%
"Allegheny County, Pennsylvania",29.6%,29.1%,17.4%,13.4%,10.4%,80.5%,4.1%,10.1%,16.0%
"Armstrong County, Pennsylvania",21.9%,29.9%,20.2%,15.9%,12.1%,78.8%,2.9%,8.9%,18.6%
"Beaver County, Pennsylvania",24.0%,29.2%,19.5%,15.3%,12.1%,79.1%,4.6%,9.9%,19.3%


In [165]:
# Move the county labels out of the index and into a regular column named 'County'
initial_pa_total_aux_2019 = (
    initial_pa_total_aux_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_total_aux_2019.head()


Unnamed: 0,County,13,14,15,16,17,40,42,45,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
1,"Adams County, Pennsylvania",25.6%,29.6%,18.6%,15.3%,10.9%,78.9%,1.8%,6.7%,15.8%
2,"Allegheny County, Pennsylvania",29.6%,29.1%,17.4%,13.4%,10.4%,80.5%,4.1%,10.1%,16.0%
3,"Armstrong County, Pennsylvania",21.9%,29.9%,20.2%,15.9%,12.1%,78.8%,2.9%,8.9%,18.6%
4,"Beaver County, Pennsylvania",24.0%,29.2%,19.5%,15.3%,12.1%,79.1%,4.6%,9.9%,19.3%


In [166]:
# Inspecting the column labels prior to renaming: the printed Index shows whether each
# label is an integer (13) or a string ('13'), which determines the keys the rename needs
print(initial_pa_total_aux_2019.columns)


Index(['County', 13, 14, 15, 16, 17, 40, 42, 45, 49], dtype='object')


In [167]:
# Cast every column label to a string so the upcoming rename can use string keys
initial_pa_total_aux_2019.columns = initial_pa_total_aux_2019.columns.map(str)
print(initial_pa_total_aux_2019.columns)


Index(['County', '13', '14', '15', '16', '17', '40', '42', '45', '49'], dtype='object')


In [168]:
# Swap every cell equal to 'N' for a real missing value (NaN) so later steps can handle it
initial_pa_total_aux_2019 = initial_pa_total_aux_2019.replace(to_replace='N', value=np.nan)


In [169]:
# Final clean-up of the 2019 total-population auxiliary table:
#   1. give the positional columns descriptive names,
#   2. drop the obsolete first row (index 0, the transposed label row) and renumber the index,
#   3. reduce 'County' to the bare upper-case county name (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every '12.3%'-style string into a float fraction (0.123).
total_aux_2019_column_names = {
    '13': '% Total Population 18-34 Years Old',
    '14': '% Total Population 35-54 Years Old',
    '15': '% Total Population 55-64 Years Old',
    '16': '% Total Population 65-74 Years Old',
    '17': '% Total Population 75 Years Old & Over',
    '40': 'Total Labor Force Participation Rate (%)',
    '42': 'Total Unemployment Rate (%)',
    '45': '% Total Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Total Population With Any Disability',
}

# Guard against re-running this cell: once the label row is gone, drop(0) would silently
# discard the first county before the later insert() fails on the existing 'Year' column.
assert initial_pa_total_aux_2019.loc[0, 'County'] == 'Unnamed: 0', (
    "Expected the label row at index 0 - has this cell already been executed?"
)

initial_pa_total_aux_2019 = (
    initial_pa_total_aux_2019
    .rename(columns=total_aux_2019_column_names)
    .drop(0)
    .reset_index(drop=True)
)

# .str.upper() is the vectorised (and NaN-safe) equivalent of applying x.upper() row by row
initial_pa_total_aux_2019['County'] = (
    initial_pa_total_aux_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)

# NOTE: the 2019 data is labeled 2020 here, not 2019. Per the section note, 2020 Census data
# was unavailable and 2019 stands in for it - presumably so these rows line up with the
# 2020 records in later merges (confirm against the merge cells).
initial_pa_total_aux_2019.insert(0, 'Year', 2020)

# One pass over all percentage columns instead of nine copy-pasted statements
pct_columns_2019 = list(total_aux_2019_column_names.values())
initial_pa_total_aux_2019[pct_columns_2019] = initial_pa_total_aux_2019[pct_columns_2019].apply(
    lambda pct: pct.str.rstrip('%').astype(float) / 100
)
initial_pa_total_aux_2019


Unnamed: 0,Year,County,% Total Population 18-34 Years Old,% Total Population 35-54 Years Old,% Total Population 55-64 Years Old,% Total Population 65-74 Years Old,% Total Population 75 Years Old & Over,Total Labor Force Participation Rate (%),Total Unemployment Rate (%),% Total Population With Income Below Poverty Level (Past 12 Months),% Total Population With Any Disability
0,2020,ADAMS,0.256,0.296,0.186,0.153,0.109,0.789,0.018,0.067,0.158
1,2020,ALLEGHENY,0.296,0.291,0.174,0.134,0.104,0.805,0.041,0.101,0.16
2,2020,ARMSTRONG,0.219,0.299,0.202,0.159,0.121,0.788,0.029,0.089,0.186
3,2020,BEAVER,0.24,0.292,0.195,0.153,0.121,0.791,0.046,0.099,0.193
4,2020,BERKS,0.282,0.313,0.177,0.13,0.097,0.806,0.048,0.082,0.16
5,2020,BLAIR,0.253,0.296,0.184,0.147,0.121,0.782,0.039,0.133,0.189
6,2020,BUCKS,0.233,0.324,0.201,0.138,0.104,0.829,0.037,0.051,0.12
7,2020,BUTLER,0.249,0.318,0.193,0.138,0.102,0.765,0.035,0.077,0.152
8,2020,CAMBRIA,0.234,0.287,0.189,0.163,0.127,0.72,0.041,0.126,0.217
9,2020,CARBON,0.216,0.325,0.194,0.155,0.11,0.781,0.056,0.085,0.201


**Auxiliary (Veterans) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [170]:
# Load the 2012 U.S. Census Bureau auxiliary (Veterans) workbook and preview its first 60 rows
initial_pa_veterans_aux_2012 = pd.read_excel(
    io="Resources/PA_Veterans_Auxillary_2012.xlsx",
)
initial_pa_veterans_aux_2012.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,...,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans,Veterans
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,8297,90743,6496,15430,30108,12619,41617,13926,13528,...,12417,47335,23557,9788,70017,13890,7371,18049,33140,35326
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,13.6%,8.5%,7.7%,6.3%,8.9%,9.7%,6.7%,7.1%,9.9%,...,5.9%,9.4%,6.2%,9.6%,10.3%,12.7%,11.0%,7.1%,9.7%,13.2%
5,Gulf War (8/1990 to 8/2001) veterans,19.2%,10.9%,9.9%,14.6%,15.6%,13.4%,9.9%,10.0%,12.8%,...,12.1%,13.2%,10.3%,16.8%,8.3%,12.1%,13.3%,10.6%,12.5%,18.9%
6,Vietnam era veterans,33.1%,33.1%,43.6%,33.4%,32.1%,35.9%,39.5%,36.4%,39.3%,...,34.9%,33.3%,32.9%,34.9%,32.1%,35.1%,36.8%,41.8%,33.0%,34.2%
7,Korean War veterans,14.1%,13.9%,10.7%,12.5%,13.1%,13.7%,11.9%,12.7%,13.7%,...,12.2%,14.1%,14.7%,9.1%,10.2%,12.8%,9.6%,13.6%,12.3%,10.9%
8,World War II veterans,5.4%,12.3%,12.9%,12.1%,10.6%,7.8%,11.5%,9.7%,8.3%,...,7.2%,12.8%,11.2%,7.4%,10.2%,6.4%,12.3%,10.4%,10.9%,5.7%
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [171]:
# Keep only the rows holding the county summary statistics used downstream.
# The same statistics are gathered for three groupings (Total, Veteran and Nonveteran
# population); this cell handles the Veterans table. Rows are picked by position:
#   13-17 -> % of the group aged 18 to 34, 35 to 54, 55 to 64, 65 to 74 and 75 & over
#   41    -> labor force participation rate
#   43    -> unemployment rate
#   46    -> % of the group below poverty in the past 12 months
#   49    -> % of the group with any disability
initial_pa_veterans_aux_2012 = initial_pa_veterans_aux_2012.iloc[
    [13, 14, 15, 16, 17, 41, 43, 46, 49]
]
initial_pa_veterans_aux_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,8.4%,6.2%,5.3%,5.3%,5.5%,7.3%,3.9%,4.9%,6.2%,...,3.4%,5.4%,3.9%,7.7%,7.6%,10.3%,5.3%,3.6%,5.3%,7.8%
14,35 to 54 years,23.8%,18.8%,14.4%,21.8%,23.9%,24.9%,16.5%,18.6%,21.1%,...,26.3%,16.6%,20.1%,27.6%,21.8%,18.7%,23.0%,16.7%,20.0%,26.9%
15,55 to 64 years,18.7%,20.8%,27.5%,20.2%,21.7%,21.5%,22.5%,24.0%,23.9%,...,26.5%,20.3%,17.8%,19.1%,22.8%,24.0%,25.9%,21.5%,20.2%,23.6%
16,65 to 74 years,23.8%,22.6%,24.5%,23.2%,21.3%,20.3%,27.9%,23.7%,22.2%,...,22.5%,26.2%,24.4%,24.3%,22.3%,23.3%,19.2%,29.3%,26.8%,22.5%
17,75 years and over,25.3%,31.6%,28.3%,29.4%,27.6%,26.0%,29.2%,28.8%,26.5%,...,21.3%,31.6%,33.8%,21.3%,25.5%,23.7%,26.5%,28.9%,27.6%,19.2%
41,Labor force participation rate,88.1%,74.0%,63.7%,77.9%,80.5%,72.4%,78.5%,70.4%,64.6%,...,81.8%,81.4%,77.2%,66.0%,60.8%,76.8%,60.3%,72.2%,76.5%,82.8%
43,Unemployment rate,4.1%,5.2%,7.7%,6.3%,11.3%,9.0%,9.1%,8.4%,12.5%,...,8.9%,7.7%,8.7%,8.6%,9.8%,3.7%,4.9%,3.5%,7.6%,5.7%
46,Below poverty in the past 12 months,3.4%,6.5%,7.6%,6.2%,5.3%,6.6%,5.0%,6.5%,8.5%,...,6.4%,3.0%,3.3%,10.8%,13.5%,3.9%,4.3%,2.1%,6.2%,3.4%
49,With any disability,26.7%,27.4%,32.9%,29.4%,32.3%,26.0%,20.9%,29.1%,31.7%,...,24.7%,22.0%,26.5%,27.1%,32.4%,25.7%,33.6%,26.1%,25.2%,22.4%


In [172]:
# Flip rows and columns so each county becomes a row and each selected statistic
# a column (simplifies the later merges)
initial_pa_veterans_aux_2012 = initial_pa_veterans_aux_2012.transpose()
initial_pa_veterans_aux_2012.head()


Unnamed: 0,13,14,15,16,17,41,43,46,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Below poverty in the past 12 months,With any disability
"Adams County, Pennsylvania",8.4%,23.8%,18.7%,23.8%,25.3%,88.1%,4.1%,3.4%,26.7%
"Allegheny County, Pennsylvania",6.2%,18.8%,20.8%,22.6%,31.6%,74.0%,5.2%,6.5%,27.4%
"Armstrong County, Pennsylvania",5.3%,14.4%,27.5%,24.5%,28.3%,63.7%,7.7%,7.6%,32.9%
"Beaver County, Pennsylvania",5.3%,21.8%,20.2%,23.2%,29.4%,77.9%,6.3%,6.2%,29.4%


In [173]:
# Move the county names out of the index and into a regular column named 'County'
initial_pa_veterans_aux_2012 = (
    initial_pa_veterans_aux_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_veterans_aux_2012.head()


Unnamed: 0,County,13,14,15,16,17,41,43,46,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Below poverty in the past 12 months,With any disability
1,"Adams County, Pennsylvania",8.4%,23.8%,18.7%,23.8%,25.3%,88.1%,4.1%,3.4%,26.7%
2,"Allegheny County, Pennsylvania",6.2%,18.8%,20.8%,22.6%,31.6%,74.0%,5.2%,6.5%,27.4%
3,"Armstrong County, Pennsylvania",5.3%,14.4%,27.5%,24.5%,28.3%,63.7%,7.7%,7.6%,32.9%
4,"Beaver County, Pennsylvania",5.3%,21.8%,20.2%,23.2%,29.4%,77.9%,6.3%,6.2%,29.4%


In [174]:
# Inspecting the column labels before renaming: apart from 'County', the labels are
# still the original integer row positions (13, 14, ...) rather than strings
print(initial_pa_veterans_aux_2012.columns)


Index(['County', 13, 14, 15, 16, 17, 41, 43, 46, 49], dtype='object')


In [175]:
# Cast every column label to str so the integer labels can be renamed through string keys
initial_pa_veterans_aux_2012.columns = initial_pa_veterans_aux_2012.columns.map(str)
print(initial_pa_veterans_aux_2012.columns)


Index(['County', '13', '14', '15', '16', '17', '41', '43', '46', '49'], dtype='object')


In [176]:
# Swap every 'N' cell for NaN across the whole frame so the later
# percentage-to-float conversion sees those cells as missing values
initial_pa_veterans_aux_2012 = initial_pa_veterans_aux_2012.replace(to_replace='N', value=np.nan)


In [177]:
# Final cleaning pass for the 2012 veterans frame:
#   1. rename the row-position columns to descriptive names,
#   2. drop the obsolete index-0 row (the transposed 'Unnamed: 0' label row) and renumber,
#   3. reduce 'County' to the bare upper-case county name
#      (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string (e.g. '8.4%') to a float fraction (0.084).
veteran_column_names_2012 = {
    '13': '% Veteran Population 18-34 Years Old',
    '14': '% Veteran Population 35-54 Years Old',
    '15': '% Veteran Population 55-64 Years Old',
    '16': '% Veteran Population 65-74 Years Old',
    '17': '% Veteran Population 75 Years Old & Over',
    '41': 'Veteran Labor Force Participation Rate (%)',
    '43': 'Veteran Unemployment Rate (%)',
    '46': '% Veteran Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Veteran Population With Any Disability',
}
initial_pa_veterans_aux_2012 = (
    initial_pa_veterans_aux_2012
    .rename(columns=veteran_column_names_2012)
    .drop(0)
    .reset_index(drop=True)
)
# Vectorised string methods pass missing values through instead of raising
initial_pa_veterans_aux_2012['County'] = (
    initial_pa_veterans_aux_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_veterans_aux_2012.insert(0, 'Year', 2012)
# Every renamed column holds percentage strings; convert them all in one loop
# rather than one hand-copied statement per column
for percent_column in veteran_column_names_2012.values():
    initial_pa_veterans_aux_2012[percent_column] = (
        initial_pa_veterans_aux_2012[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_veterans_aux_2012


Unnamed: 0,Year,County,% Veteran Population 18-34 Years Old,% Veteran Population 35-54 Years Old,% Veteran Population 55-64 Years Old,% Veteran Population 65-74 Years Old,% Veteran Population 75 Years Old & Over,Veteran Labor Force Participation Rate (%),Veteran Unemployment Rate (%),% Veteran Population With Income Below Poverty Level (Past 12 Months),% Veteran Population With Any Disability
0,2012,ADAMS,0.084,0.238,0.187,0.238,0.253,0.881,0.041,0.034,0.267
1,2012,ALLEGHENY,0.062,0.188,0.208,0.226,0.316,0.74,0.052,0.065,0.274
2,2012,ARMSTRONG,0.053,0.144,0.275,0.245,0.283,0.637,0.077,0.076,0.329
3,2012,BEAVER,0.053,0.218,0.202,0.232,0.294,0.779,0.063,0.062,0.294
4,2012,BERKS,0.055,0.239,0.217,0.213,0.276,0.805,0.113,0.053,0.323
5,2012,BLAIR,0.073,0.249,0.215,0.203,0.26,0.724,0.09,0.066,0.26
6,2012,BUCKS,0.039,0.165,0.225,0.279,0.292,0.785,0.091,0.05,0.209
7,2012,BUTLER,0.049,0.186,0.24,0.237,0.288,0.704,0.084,0.065,0.291
8,2012,CAMBRIA,0.062,0.211,0.239,0.222,0.265,0.646,0.125,0.085,0.317
9,2012,CARBON,0.059,0.277,0.221,0.223,0.22,0.755,0.061,0.053,0.382


**2016**
-

In [178]:
# Load the 2016 U.S. Census Bureau auxiliary (Veterans) workbook and preview
# the first 60 rows to locate the rows of interest
initial_pa_veterans_aux_2016 = pd.read_excel(io="Resources/PA_Veterans_Auxillary_2016.xlsx")
initial_pa_veterans_aux_2016.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,10.7%,7.8%,10.3%,8.8%,6.6%,9.6%,7.1%,9.5%,10.3%,...,8.0%,6.7%,7.8%,10.3%,4.6%,10.7%,11.0%,9.8%,9.2%,9.2%
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,12.9%,13.7%,9.5%,12.0%,9.8%,10.8%,12.5%,13.4%,16.0%,...,11.8%,12.9%,9.8%,13.2%,14.6%,14.7%,9.6%,13.8%,12.6%,14.9%
5,Gulf War (8/1990 to 8/2001) veterans,19.5%,14.2%,18.5%,14.1%,9.1%,15.6%,15.3%,19.2%,16.1%,...,13.5%,15.2%,16.7%,13.7%,11.1%,16.4%,21.8%,14.4%,15.9%,16.9%
6,Vietnam era veterans,39.2%,34.1%,38.8%,34.3%,39.0%,36.7%,39.6%,32.8%,40.4%,...,42.1%,35.4%,39.1%,32.9%,37.1%,35.9%,40.2%,35.0%,37.6%,34.7%
7,Korean War veterans,5.8%,12.0%,9.0%,10.7%,12.0%,12.3%,12.5%,8.8%,11.2%,...,11.6%,10.6%,11.4%,12.3%,10.6%,11.3%,7.1%,10.9%,10.6%,9.0%
8,World War II veterans,3.9%,6.8%,3.6%,9.2%,4.6%,6.0%,5.4%,6.5%,4.4%,...,3.3%,6.1%,4.0%,4.8%,5.1%,4.1%,3.5%,3.4%,6.4%,3.8%
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [179]:
# Keep only the nine summary rows needed for the 2016 veteran population,
# picked by their position in the raw Census sheet (the 2016 layout differs from 2012):
#   13-17 -> age brackets (18 to 34, 35 to 54, 55 to 64, 65 to 74, 75 years and over)
#   40    -> Labor force participation rate
#   42    -> Unemployment rate
#   45    -> Income in the past 12 months below poverty level
#   49    -> With any disability
initial_pa_veterans_aux_2016 = initial_pa_veterans_aux_2016.iloc[
    [*range(13, 18), 40, 42, 45, 49]
]
initial_pa_veterans_aux_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,6.3%,8.9%,4.8%,6.5%,6.2%,5.8%,7.0%,4.9%,8.0%,...,7.8%,7.4%,7.8%,8.3%,5.9%,8.1%,1.4%,7.4%,6.9%,8.4%
14,35 to 54 years,21.5%,18.5%,22.4%,16.8%,16.5%,18.5%,19.2%,25.2%,20.0%,...,17.1%,20.1%,18.3%,19.8%,16.5%,21.5%,29.8%,21.9%,19.8%,24.2%
15,55 to 64 years,25.4%,14.1%,16.0%,14.4%,13.6%,19.9%,13.0%,15.8%,19.0%,...,21.2%,13.7%,13.6%,22.3%,23.5%,16.8%,23.7%,17.7%,15.7%,16.7%
16,65 to 74 years,28.2%,28.3%,30.7%,28.7%,34.8%,29.5%,30.4%,26.4%,26.7%,...,32.3%,28.4%,30.7%,26.8%,28.6%,28.2%,24.6%,25.8%,28.1%,26.9%
17,75 years and over,18.6%,30.3%,26.2%,33.5%,29.0%,26.3%,30.5%,27.7%,26.3%,...,21.7%,30.5%,29.6%,22.9%,25.6%,25.4%,20.4%,27.2%,29.5%,23.9%
40,Labor force participation rate,71.0%,78.5%,67.3%,80.8%,83.5%,63.7%,81.9%,87.5%,61.1%,...,66.7%,75.8%,89.5%,73.0%,66.7%,74.1%,56.0%,67.4%,80.3%,85.3%
42,Unemployment rate,11.3%,8.8%,3.5%,2.8%,0.6%,3.9%,7.8%,4.0%,7.3%,...,0.0%,4.0%,3.0%,9.0%,6.2%,1.1%,4.6%,6.6%,8.6%,0.7%
45,Income in the past 12 months below poverty level,3.9%,7.0%,14.4%,5.2%,5.0%,5.3%,4.9%,3.7%,6.5%,...,6.2%,3.5%,3.6%,12.1%,11.3%,5.6%,4.2%,10.5%,6.6%,2.9%
49,With any disability,30.2%,29.6%,33.4%,29.9%,27.7%,33.3%,24.3%,26.2%,31.3%,...,35.9%,25.1%,23.9%,30.4%,33.8%,29.4%,35.5%,34.7%,31.4%,29.7%


In [180]:
# Flip rows and columns so each county becomes a row and each selected statistic
# a column (simplifies the later merges)
initial_pa_veterans_aux_2016 = initial_pa_veterans_aux_2016.transpose()
initial_pa_veterans_aux_2016.head()


Unnamed: 0,13,14,15,16,17,40,42,45,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
"Adams County, Pennsylvania",6.3%,21.5%,25.4%,28.2%,18.6%,71.0%,11.3%,3.9%,30.2%
"Allegheny County, Pennsylvania",8.9%,18.5%,14.1%,28.3%,30.3%,78.5%,8.8%,7.0%,29.6%
"Armstrong County, Pennsylvania",4.8%,22.4%,16.0%,30.7%,26.2%,67.3%,3.5%,14.4%,33.4%
"Beaver County, Pennsylvania",6.5%,16.8%,14.4%,28.7%,33.5%,80.8%,2.8%,5.2%,29.9%


In [181]:
# Move the county names out of the index and into a regular column named 'County'
initial_pa_veterans_aux_2016 = (
    initial_pa_veterans_aux_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_veterans_aux_2016.head()


Unnamed: 0,County,13,14,15,16,17,40,42,45,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
1,"Adams County, Pennsylvania",6.3%,21.5%,25.4%,28.2%,18.6%,71.0%,11.3%,3.9%,30.2%
2,"Allegheny County, Pennsylvania",8.9%,18.5%,14.1%,28.3%,30.3%,78.5%,8.8%,7.0%,29.6%
3,"Armstrong County, Pennsylvania",4.8%,22.4%,16.0%,30.7%,26.2%,67.3%,3.5%,14.4%,33.4%
4,"Beaver County, Pennsylvania",6.5%,16.8%,14.4%,28.7%,33.5%,80.8%,2.8%,5.2%,29.9%


In [182]:
# Inspecting the column labels before renaming: apart from 'County', the labels are
# still the original integer row positions (13, 14, ...) rather than strings
print(initial_pa_veterans_aux_2016.columns)


Index(['County', 13, 14, 15, 16, 17, 40, 42, 45, 49], dtype='object')


In [183]:
# Cast every column label to str so the integer labels can be renamed through string keys
initial_pa_veterans_aux_2016.columns = initial_pa_veterans_aux_2016.columns.map(str)
print(initial_pa_veterans_aux_2016.columns)


Index(['County', '13', '14', '15', '16', '17', '40', '42', '45', '49'], dtype='object')


In [184]:
# Swap every 'N' cell for NaN across the whole frame so the later
# percentage-to-float conversion sees those cells as missing values
initial_pa_veterans_aux_2016 = initial_pa_veterans_aux_2016.replace(to_replace='N', value=np.nan)


In [185]:
# Final cleaning pass for the 2016 veterans frame:
#   1. rename the row-position columns to descriptive names,
#   2. drop the obsolete index-0 row (the transposed 'Unnamed: 0' label row) and renumber,
#   3. reduce 'County' to the bare upper-case county name
#      (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string (e.g. '6.3%') to a float fraction (0.063).
veteran_column_names_2016 = {
    '13': '% Veteran Population 18-34 Years Old',
    '14': '% Veteran Population 35-54 Years Old',
    '15': '% Veteran Population 55-64 Years Old',
    '16': '% Veteran Population 65-74 Years Old',
    '17': '% Veteran Population 75 Years Old & Over',
    '40': 'Veteran Labor Force Participation Rate (%)',
    '42': 'Veteran Unemployment Rate (%)',
    '45': '% Veteran Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Veteran Population With Any Disability',
}
initial_pa_veterans_aux_2016 = (
    initial_pa_veterans_aux_2016
    .rename(columns=veteran_column_names_2016)
    .drop(0)
    .reset_index(drop=True)
)
# Vectorised string methods pass missing values through instead of raising
initial_pa_veterans_aux_2016['County'] = (
    initial_pa_veterans_aux_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_veterans_aux_2016.insert(0, 'Year', 2016)
# Every renamed column holds percentage strings; convert them all in one loop
# rather than one hand-copied statement per column
for percent_column in veteran_column_names_2016.values():
    initial_pa_veterans_aux_2016[percent_column] = (
        initial_pa_veterans_aux_2016[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_veterans_aux_2016


Unnamed: 0,Year,County,% Veteran Population 18-34 Years Old,% Veteran Population 35-54 Years Old,% Veteran Population 55-64 Years Old,% Veteran Population 65-74 Years Old,% Veteran Population 75 Years Old & Over,Veteran Labor Force Participation Rate (%),Veteran Unemployment Rate (%),% Veteran Population With Income Below Poverty Level (Past 12 Months),% Veteran Population With Any Disability
0,2016,ADAMS,0.063,0.215,0.254,0.282,0.186,0.71,0.113,0.039,0.302
1,2016,ALLEGHENY,0.089,0.185,0.141,0.283,0.303,0.785,0.088,0.07,0.296
2,2016,ARMSTRONG,0.048,0.224,0.16,0.307,0.262,0.673,0.035,0.144,0.334
3,2016,BEAVER,0.065,0.168,0.144,0.287,0.335,0.808,0.028,0.052,0.299
4,2016,BERKS,0.062,0.165,0.136,0.348,0.29,0.835,0.006,0.05,0.277
5,2016,BLAIR,0.058,0.185,0.199,0.295,0.263,0.637,0.039,0.053,0.333
6,2016,BUCKS,0.07,0.192,0.13,0.304,0.305,0.819,0.078,0.049,0.243
7,2016,BUTLER,0.049,0.252,0.158,0.264,0.277,0.875,0.04,0.037,0.262
8,2016,CAMBRIA,0.08,0.2,0.19,0.267,0.263,0.611,0.073,0.065,0.313
9,2016,CARBON,0.072,0.26,0.177,0.324,0.167,0.872,0.076,0.054,0.257


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable; as such, data from the closest year prior to the election (2019) was utilized.


In [186]:
# Load the 2019 U.S. Census Bureau auxiliary (Veterans) workbook and preview
# the first 60 rows to locate the rows of interest
initial_pa_veterans_aux_2019 = pd.read_excel(io="Resources/PA_Veterans_Auxillary_2019.xlsx")
initial_pa_veterans_aux_2019.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,...,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans,Percent Veterans
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,8.3%,6.8%,9.4%,8.1%,6.4%,8.6%,6.2%,7.5%,8.9%,...,7.3%,5.6%,7.1%,8.3%,4.5%,8.5%,9.3%,6.7%,9.0%,7.8%
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,N,12.5%,27.1%,15.1%,N,9.6%,14.9%,10.9%,13.9%,...,16.7%,15.3%,13.1%,10.4%,18.7%,12.1%,17.3%,17.6%,13.7%,22.0%
5,Gulf War (8/1990 to 8/2001) veterans,N,15.9%,22.3%,14.9%,N,14.5%,14.4%,19.9%,18.4%,...,20.7%,15.4%,14.7%,21.0%,14.4%,16.4%,22.4%,13.4%,11.4%,22.3%
6,Vietnam era veterans,N,34.4%,36.0%,38.6%,N,37.2%,37.5%,38.6%,37.7%,...,31.8%,40.6%,35.7%,44.9%,35.6%,37.5%,31.3%,38.8%,36.0%,32.9%
7,Korean War veterans,N,11.1%,7.8%,7.5%,N,6.5%,8.9%,4.8%,11.5%,...,5.9%,11.7%,6.1%,5.8%,8.4%,8.5%,8.8%,11.2%,9.2%,7.9%
8,World War II veterans,N,3.9%,2.1%,1.3%,N,2.8%,4.4%,1.1%,1.1%,...,0.0%,3.5%,4.5%,0.6%,2.9%,3.2%,0.9%,2.9%,6.4%,1.3%
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [187]:
# Keep only the nine summary rows needed for the 2019 veteran population,
# picked by their position in the raw Census sheet:
#   13-17 -> age brackets (18 to 34, 35 to 54, 55 to 64, 65 to 74, 75 years and over)
#   40    -> Labor force participation rate
#   42    -> Unemployment rate
#   45    -> Income in the past 12 months below poverty level
#   49    -> With any disability
initial_pa_veterans_aux_2019 = initial_pa_veterans_aux_2019.iloc[
    [*range(13, 18), 40, 42, 45, 49]
]
initial_pa_veterans_aux_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,7.7%,5.0%,15.4%,6.5%,5.8%,3.8%,4.1%,3.2%,2.4%,...,6.1%,7.0%,4.8%,3.4%,8.3%,2.0%,2.9%,7.9%,7.7%,8.6%
14,35 to 54 years,15.3%,19.8%,24.7%,15.8%,15.8%,17.1%,19.8%,22.3%,20.2%,...,24.3%,16.9%,15.3%,21.5%,21.4%,23.7%,31.4%,18.3%,15.9%,25.2%
15,55 to 64 years,24.3%,16.8%,11.1%,22.9%,19.1%,22.9%,12.7%,18.3%,16.8%,...,25.1%,13.1%,26.0%,23.3%,20.3%,14.4%,18.1%,13.3%,15.5%,16.0%
16,65 to 74 years,26.3%,25.8%,27.5%,24.1%,33.0%,26.0%,28.6%,27.3%,30.7%,...,23.4%,29.5%,25.5%,34.6%,26.2%,29.9%,21.6%,30.9%,24.4%,23.9%
17,75 years and over,26.4%,32.7%,21.4%,30.8%,26.3%,30.2%,34.8%,29.0%,29.8%,...,21.1%,33.4%,28.3%,17.2%,23.9%,30.0%,26.0%,29.7%,36.6%,26.2%
40,Labor force participation rate,78.3%,78.6%,82.9%,70.3%,72.0%,71.9%,84.4%,75.0%,68.2%,...,79.5%,71.7%,76.2%,67.0%,75.3%,80.1%,63.2%,84.9%,84.7%,85.8%
42,Unemployment rate,2.1%,6.6%,0.6%,15.0%,4.0%,0.6%,2.2%,1.5%,6.5%,...,4.5%,2.0%,0.0%,3.8%,4.9%,4.2%,3.2%,2.2%,1.0%,1.4%
45,Income in the past 12 months below poverty level,4.5%,7.1%,4.9%,9.9%,4.8%,9.6%,2.8%,7.9%,10.1%,...,9.1%,4.8%,4.2%,9.6%,12.5%,7.0%,12.4%,4.0%,7.5%,5.9%
49,With any disability,16.7%,32.0%,26.6%,25.5%,27.8%,34.0%,22.6%,23.5%,32.2%,...,18.7%,28.6%,24.0%,22.0%,35.6%,36.4%,35.8%,30.7%,30.2%,23.4%


In [188]:
# Flip rows and columns so each county becomes a row and each selected statistic
# a column (simplifies the later merges)
initial_pa_veterans_aux_2019 = initial_pa_veterans_aux_2019.transpose()
initial_pa_veterans_aux_2019.head()


Unnamed: 0,13,14,15,16,17,40,42,45,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
"Adams County, Pennsylvania",7.7%,15.3%,24.3%,26.3%,26.4%,78.3%,2.1%,4.5%,16.7%
"Allegheny County, Pennsylvania",5.0%,19.8%,16.8%,25.8%,32.7%,78.6%,6.6%,7.1%,32.0%
"Armstrong County, Pennsylvania",15.4%,24.7%,11.1%,27.5%,21.4%,82.9%,0.6%,4.9%,26.6%
"Beaver County, Pennsylvania",6.5%,15.8%,22.9%,24.1%,30.8%,70.3%,15.0%,9.9%,25.5%


In [189]:
# Move the county names out of the index and into a regular column named 'County'
initial_pa_veterans_aux_2019 = (
    initial_pa_veterans_aux_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_veterans_aux_2019.head()


Unnamed: 0,County,13,14,15,16,17,40,42,45,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
1,"Adams County, Pennsylvania",7.7%,15.3%,24.3%,26.3%,26.4%,78.3%,2.1%,4.5%,16.7%
2,"Allegheny County, Pennsylvania",5.0%,19.8%,16.8%,25.8%,32.7%,78.6%,6.6%,7.1%,32.0%
3,"Armstrong County, Pennsylvania",15.4%,24.7%,11.1%,27.5%,21.4%,82.9%,0.6%,4.9%,26.6%
4,"Beaver County, Pennsylvania",6.5%,15.8%,22.9%,24.1%,30.8%,70.3%,15.0%,9.9%,25.5%


In [190]:
# Inspecting the column labels before renaming: apart from 'County', the labels are
# still the original integer row positions (13, 14, ...) rather than strings
print(initial_pa_veterans_aux_2019.columns)


Index(['County', 13, 14, 15, 16, 17, 40, 42, 45, 49], dtype='object')


In [191]:
# Cast every column label to str so the integer labels can be renamed through string keys
initial_pa_veterans_aux_2019.columns = initial_pa_veterans_aux_2019.columns.map(str)
print(initial_pa_veterans_aux_2019.columns)


Index(['County', '13', '14', '15', '16', '17', '40', '42', '45', '49'], dtype='object')


In [192]:
# Swap every 'N' cell for NaN across the whole frame so the later
# percentage-to-float conversion sees those cells as missing values
initial_pa_veterans_aux_2019 = initial_pa_veterans_aux_2019.replace(to_replace='N', value=np.nan)


In [193]:
# Final cleaning pass for the 2019 veterans frame:
#   1. rename the row-position columns to descriptive names,
#   2. drop the obsolete index-0 row (the transposed 'Unnamed: 0' label row) and renumber,
#   3. reduce 'County' to the bare upper-case county name
#      (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string (e.g. '7.7%') to a float fraction (0.077).
veteran_column_names_2019 = {
    '13': '% Veteran Population 18-34 Years Old',
    '14': '% Veteran Population 35-54 Years Old',
    '15': '% Veteran Population 55-64 Years Old',
    '16': '% Veteran Population 65-74 Years Old',
    '17': '% Veteran Population 75 Years Old & Over',
    '40': 'Veteran Labor Force Participation Rate (%)',
    '42': 'Veteran Unemployment Rate (%)',
    '45': '% Veteran Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Veteran Population With Any Disability',
}
initial_pa_veterans_aux_2019 = (
    initial_pa_veterans_aux_2019
    .rename(columns=veteran_column_names_2019)
    .drop(0)
    .reset_index(drop=True)
)
# Vectorised string methods pass missing values through instead of raising
initial_pa_veterans_aux_2019['County'] = (
    initial_pa_veterans_aux_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# NOTE(review): the 2019 data is deliberately labelled 2020 here, presumably so these
# rows line up with the 2020 election year in later merges -- confirm
initial_pa_veterans_aux_2019.insert(0, 'Year', 2020)
# Every renamed column holds percentage strings; convert them all in one loop
# rather than one hand-copied statement per column
for percent_column in veteran_column_names_2019.values():
    initial_pa_veterans_aux_2019[percent_column] = (
        initial_pa_veterans_aux_2019[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_veterans_aux_2019


Unnamed: 0,Year,County,% Veteran Population 18-34 Years Old,% Veteran Population 35-54 Years Old,% Veteran Population 55-64 Years Old,% Veteran Population 65-74 Years Old,% Veteran Population 75 Years Old & Over,Veteran Labor Force Participation Rate (%),Veteran Unemployment Rate (%),% Veteran Population With Income Below Poverty Level (Past 12 Months),% Veteran Population With Any Disability
0,2020,ADAMS,0.077,0.153,0.243,0.263,0.264,0.783,0.021,0.045,0.167
1,2020,ALLEGHENY,0.05,0.198,0.168,0.258,0.327,0.786,0.066,0.071,0.32
2,2020,ARMSTRONG,0.154,0.247,0.111,0.275,0.214,0.829,0.006,0.049,0.266
3,2020,BEAVER,0.065,0.158,0.229,0.241,0.308,0.703,0.15,0.099,0.255
4,2020,BERKS,0.058,0.158,0.191,0.33,0.263,0.72,0.04,0.048,0.278
5,2020,BLAIR,0.038,0.171,0.229,0.26,0.302,0.719,0.006,0.096,0.34
6,2020,BUCKS,0.041,0.198,0.127,0.286,0.348,0.844,0.022,0.028,0.226
7,2020,BUTLER,0.032,0.223,0.183,0.273,0.29,0.75,0.015,0.079,0.235
8,2020,CAMBRIA,0.024,0.202,0.168,0.307,0.298,0.682,0.065,0.101,0.322
9,2020,CARBON,0.0,0.083,0.177,0.3,0.44,0.629,0.0,0.06,0.455


**Auxiliary (Nonveterans) Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [194]:
# Load the 2012 U.S. Census Bureau auxiliary (Nonveterans) workbook and preview
# the first 60 rows to locate the rows of interest
initial_pa_nonveterans_aux_2012 = pd.read_excel(io="Resources/PA_Nonveterans_Auxillary_2012.xlsx")
initial_pa_nonveterans_aux_2012.head(n=60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,...,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans,Nonveterans
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,71470,900804,48371,120865,287013,88124,446680,130993,100581,...,118735,580230,212678,65997,1128605,104352,55513,148609,260144,302344
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
5,Gulf War (8/1990 to 8/2001) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
6,Vietnam era veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
7,Korean War veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
8,World War II veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [195]:
# Keep only the nine summary rows needed for the 2012 nonveteran population,
# picked by their position in the raw Census sheet:
#   13-17 -> age brackets (18 to 34, 35 to 54, 55 to 64, 65 to 74, 75 years and over)
#   41    -> Labor force participation rate
#   43    -> Unemployment rate
#   46    -> income below poverty level in the past 12 months (presumably; verify row label)
#   49    -> with any disability (presumably; verify row label)
initial_pa_nonveterans_aux_2012 = initial_pa_nonveterans_aux_2012.iloc[
    [*range(13, 18), 41, 43, 46, 49]
]
initial_pa_nonveterans_aux_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,27.9%,31.8%,24.1%,26.3%,30.4%,28.4%,25.8%,27.8%,27.4%,...,29.7%,27.7%,29.5%,27.1%,40.1%,25.2%,25.3%,26.1%,24.4%,28.2%
14,35 to 54 years,35.7%,33.5%,37.3%,35.0%,37.0%,34.1%,39.7%,38.0%,33.7%,...,38.2%,38.4%,36.7%,34.6%,32.4%,37.6%,35.8%,37.0%,36.0%,38.4%
15,55 to 64 years,17.9%,17.0%,18.3%,18.2%,16.0%,17.6%,18.0%,17.2%,18.2%,...,17.4%,16.7%,16.8%,18.0%,13.7%,17.0%,17.9%,18.7%,19.2%,16.6%
16,65 to 74 years,10.4%,9.1%,10.8%,10.9%,8.9%,10.6%,9.2%,9.4%,10.5%,...,9.1%,9.0%,9.1%,10.5%,7.5%,10.3%,11.5%,9.7%,10.7%,9.3%
17,75 years and over,8.1%,8.6%,9.5%,9.6%,7.7%,9.3%,7.4%,7.5%,10.3%,...,5.7%,8.2%,7.9%,9.8%,6.3%,9.9%,9.6%,8.6%,9.8%,7.5%
41,Labor force participation rate,80.1%,78.3%,75.0%,78.6%,79.6%,74.7%,81.9%,78.4%,70.8%,...,76.6%,81.9%,76.4%,71.7%,69.1%,73.3%,70.8%,77.8%,78.4%,79.1%
43,Unemployment rate,5.5%,7.4%,8.5%,8.3%,9.5%,7.4%,7.7%,6.1%,9.5%,...,13.7%,6.8%,8.0%,11.0%,15.9%,11.0%,7.7%,7.2%,5.8%,8.2%
46,Below poverty in the past 12 months,8.6%,11.8%,12.2%,10.0%,12.7%,12.1%,5.1%,9.4%,13.0%,...,13.5%,6.5%,9.9%,12.6%,24.7%,14.5%,10.7%,9.6%,9.4%,9.6%
49,With any disability,15.1%,14.1%,19.2%,16.2%,14.2%,17.9%,11.1%,13.1%,19.4%,...,14.8%,10.6%,15.0%,16.6%,19.2%,19.3%,19.4%,16.4%,14.9%,13.6%


In [196]:
# Swap rows and columns so that each county becomes a row and each statistic a column;
# this keeps the process consistent and simplifies the later merges
initial_pa_nonveterans_aux_2012 = initial_pa_nonveterans_aux_2012.transpose()
initial_pa_nonveterans_aux_2012.head()


Unnamed: 0,13,14,15,16,17,41,43,46,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Below poverty in the past 12 months,With any disability
"Adams County, Pennsylvania",27.9%,35.7%,17.9%,10.4%,8.1%,80.1%,5.5%,8.6%,15.1%
"Allegheny County, Pennsylvania",31.8%,33.5%,17.0%,9.1%,8.6%,78.3%,7.4%,11.8%,14.1%
"Armstrong County, Pennsylvania",24.1%,37.3%,18.3%,10.8%,9.5%,75.0%,8.5%,12.2%,19.2%
"Beaver County, Pennsylvania",26.3%,35.0%,18.2%,10.9%,9.6%,78.6%,8.3%,10.0%,16.2%


In [197]:
# Move the county names out of the index and expose them as a regular 'County' column
initial_pa_nonveterans_aux_2012 = (
    initial_pa_nonveterans_aux_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_nonveterans_aux_2012.head()


Unnamed: 0,County,13,14,15,16,17,41,43,46,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Below poverty in the past 12 months,With any disability
1,"Adams County, Pennsylvania",27.9%,35.7%,17.9%,10.4%,8.1%,80.1%,5.5%,8.6%,15.1%
2,"Allegheny County, Pennsylvania",31.8%,33.5%,17.0%,9.1%,8.6%,78.3%,7.4%,11.8%,14.1%
3,"Armstrong County, Pennsylvania",24.1%,37.3%,18.3%,10.8%,9.5%,75.0%,8.5%,12.2%,19.2%
4,"Beaver County, Pennsylvania",26.3%,35.0%,18.2%,10.9%,9.6%,78.6%,8.3%,10.0%,16.2%


In [198]:
# Inspect the column labels before renaming (the statistic columns are still integer positions)
column_labels_2012 = initial_pa_nonveterans_aux_2012.columns
print(column_labels_2012)


Index(['County', 13, 14, 15, 16, 17, 41, 43, 46, 49], dtype='object')


In [199]:
# Turn every column label into a string so the rename mapping can use string keys
initial_pa_nonveterans_aux_2012.columns = initial_pa_nonveterans_aux_2012.columns.map(str)
print(initial_pa_nonveterans_aux_2012.columns)


Index(['County', '13', '14', '15', '16', '17', '41', '43', '46', '49'], dtype='object')


In [200]:
# Replace every cell equal to 'N' with NaN across the whole DataFrame so later steps can treat it as missing
initial_pa_nonveterans_aux_2012 = initial_pa_nonveterans_aux_2012.replace('N', np.nan)


In [201]:
# Final cleaning pass for the 2012 nonveteran frame:
#   1. rename the positional columns to descriptive names,
#   2. drop the obsolete row 0 (it holds the original row labels, not a county) and renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string to a float fraction (e.g. '27.9%' -> 0.279).
nonveteran_column_names_2012 = {
    '13': '% Nonveteran Population 18-34 Years Old',
    '14': '% Nonveteran Population 35-54 Years Old',
    '15': '% Nonveteran Population 55-64 Years Old',
    '16': '% Nonveteran Population 65-74 Years Old',
    '17': '% Nonveteran Population 75 Years Old & Over',
    '41': 'Nonveteran Labor Force Participation Rate (%)',
    '43': 'Nonveteran Unemployment Rate (%)',
    '46': '% Nonveteran Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Nonveteran Population With Any Disability',
}
initial_pa_nonveterans_aux_2012 = (
    initial_pa_nonveterans_aux_2012
    .rename(columns=nonveteran_column_names_2012)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_nonveterans_aux_2012['County'] = (
    initial_pa_nonveterans_aux_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_nonveterans_aux_2012.insert(0, 'Year', 2012)
# Every renamed column holds '%'-suffixed strings; convert them all the same way
for percent_column in nonveteran_column_names_2012.values():
    initial_pa_nonveterans_aux_2012[percent_column] = (
        initial_pa_nonveterans_aux_2012[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_nonveterans_aux_2012


Unnamed: 0,Year,County,% Nonveteran Population 18-34 Years Old,% Nonveteran Population 35-54 Years Old,% Nonveteran Population 55-64 Years Old,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability
0,2012,ADAMS,0.279,0.357,0.179,0.104,0.081,0.801,0.055,0.086,0.151
1,2012,ALLEGHENY,0.318,0.335,0.17,0.091,0.086,0.783,0.074,0.118,0.141
2,2012,ARMSTRONG,0.241,0.373,0.183,0.108,0.095,0.75,0.085,0.122,0.192
3,2012,BEAVER,0.263,0.35,0.182,0.109,0.096,0.786,0.083,0.1,0.162
4,2012,BERKS,0.304,0.37,0.16,0.089,0.077,0.796,0.095,0.127,0.142
5,2012,BLAIR,0.284,0.341,0.176,0.106,0.093,0.747,0.074,0.121,0.179
6,2012,BUCKS,0.258,0.397,0.18,0.092,0.074,0.819,0.077,0.051,0.111
7,2012,BUTLER,0.278,0.38,0.172,0.094,0.075,0.784,0.061,0.094,0.131
8,2012,CAMBRIA,0.274,0.337,0.182,0.105,0.103,0.708,0.095,0.13,0.194
9,2012,CARBON,0.237,0.378,0.182,0.112,0.091,0.767,0.075,0.085,0.19


**2016**
-

In [202]:
# Load the 2016 U.S. Census Bureau auxiliary (Nonveterans) workbook and preview up to the first 60 rows
nonveterans_aux_2016_path = "Resources/PA_Nonveterans_Auxillary_2016.xlsx"
initial_pa_nonveterans_aux_2016 = pd.read_excel(nonveterans_aux_2016_path)
initial_pa_nonveterans_aux_2016.head(60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,...,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent,Percent
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,89.3%,92.2%,89.7%,91.2%,93.4%,90.4%,92.9%,90.5%,89.7%,...,92.0%,93.3%,92.2%,89.7%,95.4%,89.3%,89.0%,90.2%,90.8%,90.8%
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
5,Gulf War (8/1990 to 8/2001) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
6,Vietnam era veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
7,Korean War veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
8,World War II veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [203]:
# Keep only the rows that hold the county summary statistics needed downstream
# (nonveteran population figures):
#   - share of the group in each age bracket: 18 to 34, 35 to 54, 55 to 64, 65 to 74, 75 & over
#   - labor force participation rate
#   - unemployment rate
#   - share of the group with income below poverty level in the past 12 months
#   - share of the group with any disability
# The selection is positional, so the list below is tied to the 2016 workbook layout.
nonveteran_rows_2016 = [
    13, 14, 15, 16, 17,  # age brackets
    40,                  # labor force participation rate
    42,                  # unemployment rate
    45,                  # income in the past 12 months below poverty level
    49,                  # with any disability
]
initial_pa_nonveterans_aux_2016 = initial_pa_nonveterans_aux_2016.iloc[nonveteran_rows_2016]
initial_pa_nonveterans_aux_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,27.4%,31.7%,23.8%,25.9%,30.1%,26.9%,25.3%,26.9%,26.4%,...,29.6%,27.5%,29.0%,24.8%,39.2%,25.5%,25.9%,26.4%,23.8%,28.3%
14,35 to 54 years,32.9%,30.7%,32.9%,32.1%,33.9%,32.2%,35.1%,34.3%,30.6%,...,33.9%,35.0%,33.2%,34.4%,31.7%,34.7%,31.9%,32.6%,32.2%,35.0%
15,55 to 64 years,17.5%,18.4%,20.5%,20.2%,17.7%,18.6%,20.2%,19.5%,19.4%,...,19.3%,18.2%,18.4%,18.5%,14.5%,18.6%,18.7%,20.0%,20.4%,18.2%
16,65 to 74 years,12.6%,10.7%,13.0%,12.2%,10.3%,12.3%,11.3%,11.2%,13.2%,...,11.4%,10.7%,10.9%,12.1%,8.5%,11.9%,12.7%,12.3%,13.4%,10.9%
17,75 years and over,9.6%,8.5%,9.9%,9.6%,8.0%,10.0%,8.0%,8.1%,10.4%,...,5.8%,8.6%,8.5%,10.2%,6.2%,9.3%,10.7%,8.7%,10.1%,7.6%
40,Labor force participation rate,79.1%,80.1%,71.4%,77.8%,78.4%,74.4%,80.3%,76.8%,71.4%,...,73.0%,82.0%,78.4%,74.5%,70.1%,71.4%,69.7%,75.3%,77.4%,78.7%
42,Unemployment rate,5.2%,5.2%,5.8%,4.4%,6.0%,4.6%,3.7%,3.5%,7.8%,...,7.7%,4.7%,4.1%,3.7%,9.3%,6.0%,5.9%,5.6%,4.9%,4.3%
45,Income in the past 12 months below poverty level,8.2%,11.2%,12.4%,7.3%,11.7%,11.6%,6.3%,6.6%,14.4%,...,10.1%,5.9%,7.0%,13.0%,22.9%,12.2%,11.5%,9.1%,8.8%,9.5%
49,With any disability,16.1%,15.0%,21.8%,16.2%,15.4%,19.7%,13.0%,13.7%,19.3%,...,16.6%,11.4%,14.1%,17.1%,18.4%,20.7%,20.1%,15.6%,17.3%,15.6%


In [204]:
# Swap rows and columns so that each county becomes a row and each statistic a column;
# this keeps the process consistent and simplifies the later merges
initial_pa_nonveterans_aux_2016 = initial_pa_nonveterans_aux_2016.transpose()
initial_pa_nonveterans_aux_2016.head()


Unnamed: 0,13,14,15,16,17,40,42,45,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
"Adams County, Pennsylvania",27.4%,32.9%,17.5%,12.6%,9.6%,79.1%,5.2%,8.2%,16.1%
"Allegheny County, Pennsylvania",31.7%,30.7%,18.4%,10.7%,8.5%,80.1%,5.2%,11.2%,15.0%
"Armstrong County, Pennsylvania",23.8%,32.9%,20.5%,13.0%,9.9%,71.4%,5.8%,12.4%,21.8%
"Beaver County, Pennsylvania",25.9%,32.1%,20.2%,12.2%,9.6%,77.8%,4.4%,7.3%,16.2%


In [205]:
# Move the county names out of the index and expose them as a regular 'County' column
initial_pa_nonveterans_aux_2016 = (
    initial_pa_nonveterans_aux_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_nonveterans_aux_2016.head()


Unnamed: 0,County,13,14,15,16,17,40,42,45,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
1,"Adams County, Pennsylvania",27.4%,32.9%,17.5%,12.6%,9.6%,79.1%,5.2%,8.2%,16.1%
2,"Allegheny County, Pennsylvania",31.7%,30.7%,18.4%,10.7%,8.5%,80.1%,5.2%,11.2%,15.0%
3,"Armstrong County, Pennsylvania",23.8%,32.9%,20.5%,13.0%,9.9%,71.4%,5.8%,12.4%,21.8%
4,"Beaver County, Pennsylvania",25.9%,32.1%,20.2%,12.2%,9.6%,77.8%,4.4%,7.3%,16.2%


In [206]:
# Inspect the column labels before renaming (the statistic columns are still integer positions)
column_labels_2016 = initial_pa_nonveterans_aux_2016.columns
print(column_labels_2016)


Index(['County', 13, 14, 15, 16, 17, 40, 42, 45, 49], dtype='object')


In [207]:
# Turn every column label into a string so the rename mapping can use string keys
initial_pa_nonveterans_aux_2016.columns = initial_pa_nonveterans_aux_2016.columns.map(str)
print(initial_pa_nonveterans_aux_2016.columns)


Index(['County', '13', '14', '15', '16', '17', '40', '42', '45', '49'], dtype='object')


In [208]:
# Replace every cell equal to 'N' with NaN across the whole DataFrame so later steps can treat it as missing
initial_pa_nonveterans_aux_2016 = initial_pa_nonveterans_aux_2016.replace('N', np.nan)


In [209]:
# Final cleaning pass for the 2016 nonveteran frame:
#   1. rename the positional columns to descriptive names,
#   2. drop the obsolete row 0 (it holds the original row labels, not a county) and renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string to a float fraction (e.g. '27.4%' -> 0.274).
nonveteran_column_names_2016 = {
    '13': '% Nonveteran Population 18-34 Years Old',
    '14': '% Nonveteran Population 35-54 Years Old',
    '15': '% Nonveteran Population 55-64 Years Old',
    '16': '% Nonveteran Population 65-74 Years Old',
    '17': '% Nonveteran Population 75 Years Old & Over',
    '40': 'Nonveteran Labor Force Participation Rate (%)',
    '42': 'Nonveteran Unemployment Rate (%)',
    '45': '% Nonveteran Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Nonveteran Population With Any Disability',
}
initial_pa_nonveterans_aux_2016 = (
    initial_pa_nonveterans_aux_2016
    .rename(columns=nonveteran_column_names_2016)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_nonveterans_aux_2016['County'] = (
    initial_pa_nonveterans_aux_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
initial_pa_nonveterans_aux_2016.insert(0, 'Year', 2016)
# Every renamed column holds '%'-suffixed strings; convert them all the same way
for percent_column in nonveteran_column_names_2016.values():
    initial_pa_nonveterans_aux_2016[percent_column] = (
        initial_pa_nonveterans_aux_2016[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_nonveterans_aux_2016


Unnamed: 0,Year,County,% Nonveteran Population 18-34 Years Old,% Nonveteran Population 35-54 Years Old,% Nonveteran Population 55-64 Years Old,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability
0,2016,ADAMS,0.274,0.329,0.175,0.126,0.096,0.791,0.052,0.082,0.161
1,2016,ALLEGHENY,0.317,0.307,0.184,0.107,0.085,0.801,0.052,0.112,0.15
2,2016,ARMSTRONG,0.238,0.329,0.205,0.13,0.099,0.714,0.058,0.124,0.218
3,2016,BEAVER,0.259,0.321,0.202,0.122,0.096,0.778,0.044,0.073,0.162
4,2016,BERKS,0.301,0.339,0.177,0.103,0.08,0.784,0.06,0.117,0.154
5,2016,BLAIR,0.269,0.322,0.186,0.123,0.1,0.744,0.046,0.116,0.197
6,2016,BUCKS,0.253,0.351,0.202,0.113,0.08,0.803,0.037,0.063,0.13
7,2016,BUTLER,0.269,0.343,0.195,0.112,0.081,0.768,0.035,0.066,0.137
8,2016,CAMBRIA,0.264,0.306,0.194,0.132,0.104,0.714,0.078,0.144,0.193
9,2016,CARBON,0.234,0.344,0.203,0.124,0.094,0.763,0.062,0.124,0.188


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable, and as such the closest year prior to the election (2019) was utilized.


In [210]:
# Load the 2019 U.S. Census Bureau auxiliary (Nonveterans) workbook and preview up to the first 60 rows
nonveterans_aux_2019_path = "Resources/PA_Nonveterans_Auxillary_2019.xlsx"
initial_pa_nonveterans_aux_2019 = pd.read_excel(nonveterans_aux_2019_path)
initial_pa_nonveterans_aux_2019.head(60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,...,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans,Percent Nonveterans
1,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
2,Civilian population 18 years and over,91.7%,93.2%,90.6%,91.9%,93.6%,91.4%,93.8%,92.5%,91.1%,...,92.7%,94.4%,92.9%,91.7%,95.5%,91.5%,90.7%,93.3%,91.0%,92.2%
3,PERIOD OF SERVICE,,,,,,,,,,...,,,,,,,,,,
4,Gulf War (9/2001 or later) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
5,Gulf War (8/1990 to 8/2001) veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
6,Vietnam era veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
7,Korean War veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
8,World War II veterans,(X),(X),(X),(X),(X),(X),(X),(X),(X),...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
9,SEX,,,,,,,,,,...,,,,,,,,,,


In [211]:
# Keep only the rows that hold the county summary statistics needed downstream
# (nonveteran population figures):
#   - share of the group in each age bracket: 18 to 34, 35 to 54, 55 to 64, 65 to 74, 75 & over
#   - labor force participation rate
#   - unemployment rate
#   - share of the group with income below poverty level in the past 12 months
#   - share of the group with any disability
# The selection is positional, so the list below is tied to the 2019 workbook layout.
nonveteran_rows_2019 = [
    13, 14, 15, 16, 17,  # age brackets
    40,                  # labor force participation rate
    42,                  # unemployment rate
    45,                  # income in the past 12 months below poverty level
    49,                  # with any disability
]
initial_pa_nonveterans_aux_2019 = initial_pa_nonveterans_aux_2019.iloc[nonveteran_rows_2019]
initial_pa_nonveterans_aux_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
13,18 to 34 years,27.3%,31.4%,22.6%,25.5%,29.7%,27.3%,24.6%,26.7%,25.4%,...,29.0%,27.1%,29.3%,27.2%,38.2%,25.3%,23.9%,25.5%,23.6%,27.7%
14,35 to 54 years,30.9%,29.8%,30.5%,30.3%,32.4%,30.8%,33.2%,32.6%,29.5%,...,30.9%,33.8%,31.5%,29.9%,30.9%,33.4%,30.7%,31.4%,31.0%,33.1%
15,55 to 64 years,18.1%,17.5%,21.2%,19.2%,17.7%,17.9%,20.6%,19.4%,19.1%,...,20.0%,18.3%,17.1%,18.4%,14.5%,18.6%,19.2%,19.3%,20.2%,18.3%
16,65 to 74 years,14.2%,12.5%,14.7%,14.5%,11.6%,13.6%,12.8%,12.7%,14.9%,...,12.6%,11.9%,12.4%,13.0%,9.6%,13.3%,15.1%,13.7%,15.2%,12.6%
17,75 years and over,9.5%,8.8%,11.1%,10.4%,8.6%,10.4%,8.7%,8.7%,11.0%,...,7.4%,8.9%,9.7%,11.5%,6.8%,9.4%,11.1%,10.1%,10.0%,8.3%
40,Labor force participation rate,78.9%,80.6%,78.5%,79.6%,80.9%,78.6%,82.9%,76.6%,72.2%,...,74.6%,82.7%,79.3%,74.1%,73.0%,72.6%,70.2%,77.4%,78.6%,79.9%
42,Unemployment rate,1.8%,4.0%,3.1%,4.1%,4.9%,4.1%,3.8%,3.6%,4.0%,...,3.2%,3.3%,4.2%,5.0%,8.0%,4.8%,2.6%,4.9%,4.2%,3.7%
45,Income in the past 12 months below poverty level,6.9%,10.3%,9.3%,9.9%,8.5%,13.6%,5.2%,7.7%,12.8%,...,11.4%,5.8%,7.2%,10.9%,21.3%,11.3%,9.7%,9.7%,10.0%,8.7%
49,With any disability,15.7%,14.8%,17.8%,18.8%,15.1%,17.4%,11.3%,14.5%,20.6%,...,15.3%,11.2%,14.5%,18.3%,20.1%,18.6%,18.9%,18.5%,16.3%,13.8%


In [212]:
# Swap rows and columns so that each county becomes a row and each statistic a column;
# this keeps the process consistent and simplifies the later merges
initial_pa_nonveterans_aux_2019 = initial_pa_nonveterans_aux_2019.transpose()
initial_pa_nonveterans_aux_2019.head()


Unnamed: 0,13,14,15,16,17,40,42,45,49
Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
"Adams County, Pennsylvania",27.3%,30.9%,18.1%,14.2%,9.5%,78.9%,1.8%,6.9%,15.7%
"Allegheny County, Pennsylvania",31.4%,29.8%,17.5%,12.5%,8.8%,80.6%,4.0%,10.3%,14.8%
"Armstrong County, Pennsylvania",22.6%,30.5%,21.2%,14.7%,11.1%,78.5%,3.1%,9.3%,17.8%
"Beaver County, Pennsylvania",25.5%,30.3%,19.2%,14.5%,10.4%,79.6%,4.1%,9.9%,18.8%


In [213]:
# Move the county names out of the index and expose them as a regular 'County' column
initial_pa_nonveterans_aux_2019 = (
    initial_pa_nonveterans_aux_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_nonveterans_aux_2019.head()


Unnamed: 0,County,13,14,15,16,17,40,42,45,49
0,Unnamed: 0,18 to 34 years,35 to 54 years,55 to 64 years,65 to 74 years,75 years and over,Labor force participation rate,Unemployment rate,Income in the past 12 months below poverty level,With any disability
1,"Adams County, Pennsylvania",27.3%,30.9%,18.1%,14.2%,9.5%,78.9%,1.8%,6.9%,15.7%
2,"Allegheny County, Pennsylvania",31.4%,29.8%,17.5%,12.5%,8.8%,80.6%,4.0%,10.3%,14.8%
3,"Armstrong County, Pennsylvania",22.6%,30.5%,21.2%,14.7%,11.1%,78.5%,3.1%,9.3%,17.8%
4,"Beaver County, Pennsylvania",25.5%,30.3%,19.2%,14.5%,10.4%,79.6%,4.1%,9.9%,18.8%


In [214]:
# Inspect the column labels before renaming (the statistic columns are still integer positions)
column_labels_2019 = initial_pa_nonveterans_aux_2019.columns
print(column_labels_2019)


Index(['County', 13, 14, 15, 16, 17, 40, 42, 45, 49], dtype='object')


In [215]:
# Turn every column label into a string so the rename mapping can use string keys
initial_pa_nonveterans_aux_2019.columns = initial_pa_nonveterans_aux_2019.columns.map(str)
print(initial_pa_nonveterans_aux_2019.columns)


Index(['County', '13', '14', '15', '16', '17', '40', '42', '45', '49'], dtype='object')


In [216]:
# Replace every cell equal to 'N' with NaN across the whole DataFrame so later steps can treat it as missing
initial_pa_nonveterans_aux_2019 = initial_pa_nonveterans_aux_2019.replace('N', np.nan)


In [217]:
# Final cleaning pass for the 2019 nonveteran frame:
#   1. rename the positional columns to descriptive names,
#   2. drop the obsolete row 0 (it holds the original row labels, not a county) and renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. add a 'Year' column for future merges,
#   5. convert every percentage string to a float fraction (e.g. '27.3%' -> 0.273).
nonveteran_column_names_2019 = {
    '13': '% Nonveteran Population 18-34 Years Old',
    '14': '% Nonveteran Population 35-54 Years Old',
    '15': '% Nonveteran Population 55-64 Years Old',
    '16': '% Nonveteran Population 65-74 Years Old',
    '17': '% Nonveteran Population 75 Years Old & Over',
    '40': 'Nonveteran Labor Force Participation Rate (%)',
    '42': 'Nonveteran Unemployment Rate (%)',
    '45': '% Nonveteran Population With Income Below Poverty Level (Past 12 Months)',
    '49': '% Nonveteran Population With Any Disability',
}
initial_pa_nonveterans_aux_2019 = (
    initial_pa_nonveterans_aux_2019
    .rename(columns=nonveteran_column_names_2019)
    .drop(0)
    .reset_index(drop=True)
)
initial_pa_nonveterans_aux_2019['County'] = (
    initial_pa_nonveterans_aux_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()
)
# 'Year' is 2020 even though the data is from 2019: the notebook notes that 2019 was used
# because 2020 Census data was unavailable.
# NOTE(review): presumably this lets the rows join to 2020 records in later merges - confirm.
initial_pa_nonveterans_aux_2019.insert(0, 'Year', 2020)
# Every renamed column holds '%'-suffixed strings; convert them all the same way
for percent_column in nonveteran_column_names_2019.values():
    initial_pa_nonveterans_aux_2019[percent_column] = (
        initial_pa_nonveterans_aux_2019[percent_column].str.rstrip('%').astype(float) / 100
    )
initial_pa_nonveterans_aux_2019


Unnamed: 0,Year,County,% Nonveteran Population 18-34 Years Old,% Nonveteran Population 35-54 Years Old,% Nonveteran Population 55-64 Years Old,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability
0,2020,ADAMS,0.273,0.309,0.181,0.142,0.095,0.789,0.018,0.069,0.157
1,2020,ALLEGHENY,0.314,0.298,0.175,0.125,0.088,0.806,0.04,0.103,0.148
2,2020,ARMSTRONG,0.226,0.305,0.212,0.147,0.111,0.785,0.031,0.093,0.178
3,2020,BEAVER,0.255,0.303,0.192,0.145,0.104,0.796,0.041,0.099,0.188
4,2020,BERKS,0.297,0.324,0.177,0.116,0.086,0.809,0.049,0.085,0.151
5,2020,BLAIR,0.273,0.308,0.179,0.136,0.104,0.786,0.041,0.136,0.174
6,2020,BUCKS,0.246,0.332,0.206,0.128,0.087,0.829,0.038,0.052,0.113
7,2020,BUTLER,0.267,0.326,0.194,0.127,0.087,0.766,0.036,0.077,0.145
8,2020,CAMBRIA,0.254,0.295,0.191,0.149,0.11,0.722,0.04,0.128,0.206
9,2020,CARBON,0.234,0.345,0.196,0.143,0.082,0.785,0.057,0.087,0.18


**Fertility & Income Assistance Data Extraction, Transformation & Loading (2012, 2016 & 2019)**
-
-----------

**2012**
-

In [218]:
# Load the raw 2012 U.S. Census Bureau Fertility & Income Assistance workbook.
# A 60-row preview is shown because the next cell picks rows by position (up to position 37).
fertility_2012_workbook = "Resources/PA_Fertility_2012.xlsx"
initial_pa_fertility_2012 = pd.read_excel(fertility_2012_workbook)
initial_pa_fertility_2012.head(60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,...,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months
1,,"Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women",...,"Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women"
2,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
3,Women 15 to 50 years,57,46,40,60,53,61,46,55,54,...,42,44,48,54,54,49,57,44,33,40
4,15 to 19 years,82,11,38,40,13,35,2,48,32,...,0,5,11,0,48,17,41,20,4,30
5,20 to 34 years,100,83,90,121,109,115,98,90,111,...,111,75,79,134,73,84,131,97,74,76
6,35 to 50 years,14,19,5,15,20,22,23,32,12,...,6,32,35,6,29,31,5,11,11,16
7,RACE AND HISPANIC OR LATINO ORIGIN,,,,,,,,,,...,,,,,,,,,,
8,One race,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
9,White,58,45,40,55,55,60,48,57,53,...,40,41,52,55,44,49,54,42,33,42


In [219]:
# Keep only the four rows of interest, picked by their position in the raw sheet.
# All values are rates per 1,000 women with births in the past 12 months:
#   4  -> 15 to 19 years
#   5  -> 20 to 34 years
#   6  -> 35 to 50 years
#   37 -> Received public assistance income
fertility_2012_row_positions = [4, 5, 6, 37]
initial_pa_fertility_2012 = initial_pa_fertility_2012.iloc[fertility_2012_row_positions]
initial_pa_fertility_2012


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
4,15 to 19 years,82,11,38,40,13,35,2,48,32,...,0,5,11,0,48,17,41,20,4,30
5,20 to 34 years,100,83,90,121,109,115,98,90,111,...,111,75,79,134,73,84,131,97,74,76
6,35 to 50 years,14,19,5,15,20,22,23,32,12,...,6,32,35,6,29,31,5,11,11,16
37,Received public assistance income,99,184,155,26,174,84,53,176,378,...,88,128,79,121,107,122,390,57,62,157


In [220]:
# Swap rows and columns so each county becomes a row and each selected statistic a column,
# which keeps the layout consistent for the later merges
initial_pa_fertility_2012 = initial_pa_fertility_2012.transpose()
initial_pa_fertility_2012.head()


Unnamed: 0,4,5,6,37
Unnamed: 0,15 to 19 years,20 to 34 years,35 to 50 years,Received public assistance income
"Adams County, Pennsylvania",82,100,14,99
"Allegheny County, Pennsylvania",11,83,19,184
"Armstrong County, Pennsylvania",38,90,5,155
"Beaver County, Pennsylvania",40,121,15,26


In [221]:
# Move the county names out of the index into an ordinary column and call that column 'County'
initial_pa_fertility_2012 = (
    initial_pa_fertility_2012
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_fertility_2012.head()


Unnamed: 0,County,4,5,6,37
0,Unnamed: 0,15 to 19 years,20 to 34 years,35 to 50 years,Received public assistance income
1,"Adams County, Pennsylvania",82,100,14,99
2,"Allegheny County, Pennsylvania",11,83,19,184
3,"Armstrong County, Pennsylvania",38,90,5,155
4,"Beaver County, Pennsylvania",40,121,15,26


In [222]:
# Inspect the column labels before renaming (the statistic columns are still labelled by integer row position)
fertility_2012_labels = initial_pa_fertility_2012.columns
print(fertility_2012_labels)


Index(['County', 4, 5, 6, 37], dtype='object')


In [223]:
# Turn every column label into a string so the integer labels can be renamed as '4', '5', '6' and '37'
fertility_2012_str_labels = initial_pa_fertility_2012.columns.astype(str)
initial_pa_fertility_2012.columns = fertility_2012_str_labels
print(initial_pa_fertility_2012.columns)


Index(['County', '4', '5', '6', '37'], dtype='object')


In [224]:
# Swap every 'N' entry in the DataFrame for NaN so the values can be manipulated numerically later on
initial_pa_fertility_2012 = initial_pa_fertility_2012.replace('N', np.nan)


In [225]:
# Final cleaning pass for the 2012 fertility / public assistance DataFrame:
#   1. rename the positional columns ('4', '5', '6', '37') to descriptive names,
#   2. remove the obsolete first row (County == 'Unnamed: 0'), which holds the transposed row labels
#      rather than a county, then renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. prepend a corresponding 'Year' column for future merges,
#   5. convert the four rate columns to floats.
# The label row is filtered by value and 'Year' is inserted only when absent, so re-running this cell
# neither drops the first real county nor raises on an already-existing 'Year' column.
columns_to_floats = ['Birth Rate Per 1000 Women (15-19 Years Old)', 'Birth Rate Per 1000 Women (20-34 Years Old)',
                     'Birth Rate Per 1000 Women (35-50 Years Old)', 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)']
initial_pa_fertility_2012 = (
    initial_pa_fertility_2012
    .rename(columns={'4': 'Birth Rate Per 1000 Women (15-19 Years Old)',
                     '5': 'Birth Rate Per 1000 Women (20-34 Years Old)',
                     '6': 'Birth Rate Per 1000 Women (35-50 Years Old)',
                     '37': 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)'})
    .loc[lambda frame: frame['County'] != 'Unnamed: 0']
    .reset_index(drop=True)
)
initial_pa_fertility_2012['County'] = (
    initial_pa_fertility_2012['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()  # vectorised string method; a missing name stays NaN instead of raising
)
if 'Year' not in initial_pa_fertility_2012.columns:
    initial_pa_fertility_2012.insert(0, 'Year', 2012)
initial_pa_fertility_2012[columns_to_floats] = initial_pa_fertility_2012[columns_to_floats].astype(float)
initial_pa_fertility_2012


Unnamed: 0,Year,County,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,38.0,90.0,5.0,155.0
3,2012,BEAVER,40.0,121.0,15.0,26.0
4,2012,BERKS,13.0,109.0,20.0,174.0
5,2012,BLAIR,35.0,115.0,22.0,84.0
6,2012,BUCKS,2.0,98.0,23.0,53.0
7,2012,BUTLER,48.0,90.0,32.0,176.0
8,2012,CAMBRIA,32.0,111.0,12.0,378.0
9,2012,CARBON,,,,322.0


**2016**
-

In [226]:
# Load the raw 2016 U.S. Census Bureau Fertility & Income Assistance workbook.
# A 60-row preview is shown because the next cell picks rows by position (up to position 37).
fertility_2016_workbook = "Resources/PA_Fertility_2016.xlsx"
initial_pa_fertility_2016 = pd.read_excel(fertility_2016_workbook)
initial_pa_fertility_2016.head(60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,...,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months
1,,"Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women",...,"Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women"
2,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
3,Women 15 to 50 years,62,52,42,57,54,77,50,56,56,...,31,51,42,79,54,48,66,46,43,51
4,15 to 19 years,0,2,0,0,18,38,10,0,0,...,0,0,0,0,25,0,26,20,0,7
5,20 to 34 years,139,83,89,117,107,138,92,123,106,...,53,90,75,165,74,106,103,79,76,105
6,35 to 50 years,19,32,16,19,18,34,29,21,31,...,22,33,26,27,36,12,45,25,30,18
7,RACE AND HISPANIC OR LATINO ORIGIN,,,,,,,,,,...,,,,,,,,,,
8,One race,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
9,White,57,47,36,51,44,80,44,57,59,...,36,54,40,80,56,44,58,44,38,51


In [227]:
# Keep only the four rows of interest, picked by their position in the raw sheet.
# All values are rates per 1,000 women with births in the past 12 months:
#   4  -> 15 to 19 years
#   5  -> 20 to 34 years
#   6  -> 35 to 50 years
#   37 -> Received public assistance income
fertility_2016_row_positions = [4, 5, 6, 37]
initial_pa_fertility_2016 = initial_pa_fertility_2016.iloc[fertility_2016_row_positions]
initial_pa_fertility_2016


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
4,15 to 19 years,0,2,0,0,18,38,10,0,0,...,0,0,0,0,25,0,26,20,0,7
5,20 to 34 years,139,83,89,117,107,138,92,123,106,...,53,90,75,165,74,106,103,79,76,105
6,35 to 50 years,19,32,16,19,18,34,29,21,31,...,22,33,26,27,36,12,45,25,30,18
37,Received public assistance income,0,133,200,238,43,220,214,3,193,...,120,54,35,176,158,56,597,178,49,53


In [228]:
# Swap rows and columns so each county becomes a row and each selected statistic a column,
# which keeps the layout consistent for the later merges
initial_pa_fertility_2016 = initial_pa_fertility_2016.transpose()
initial_pa_fertility_2016.head()


Unnamed: 0,4,5,6,37
Unnamed: 0,15 to 19 years,20 to 34 years,35 to 50 years,Received public assistance income
"Adams County, Pennsylvania",0,139,19,0
"Allegheny County, Pennsylvania",2,83,32,133
"Armstrong County, Pennsylvania",0,89,16,200
"Beaver County, Pennsylvania",0,117,19,238


In [229]:
# Move the county names out of the index into an ordinary column and call that column 'County'
initial_pa_fertility_2016 = (
    initial_pa_fertility_2016
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_fertility_2016.head()


Unnamed: 0,County,4,5,6,37
0,Unnamed: 0,15 to 19 years,20 to 34 years,35 to 50 years,Received public assistance income
1,"Adams County, Pennsylvania",0,139,19,0
2,"Allegheny County, Pennsylvania",2,83,32,133
3,"Armstrong County, Pennsylvania",0,89,16,200
4,"Beaver County, Pennsylvania",0,117,19,238


In [230]:
# Inspect the column labels before renaming (the statistic columns are still labelled by integer row position)
fertility_2016_labels = initial_pa_fertility_2016.columns
print(fertility_2016_labels)


Index(['County', 4, 5, 6, 37], dtype='object')


In [231]:
# Turn every column label into a string so the integer labels can be renamed as '4', '5', '6' and '37'
fertility_2016_str_labels = initial_pa_fertility_2016.columns.astype(str)
initial_pa_fertility_2016.columns = fertility_2016_str_labels
print(initial_pa_fertility_2016.columns)


Index(['County', '4', '5', '6', '37'], dtype='object')


In [232]:
# Swap every 'N' entry in the DataFrame for NaN so the values can be manipulated numerically later on
initial_pa_fertility_2016 = initial_pa_fertility_2016.replace('N', np.nan)


In [233]:
# Final cleaning pass for the 2016 fertility / public assistance DataFrame:
#   1. rename the positional columns ('4', '5', '6', '37') to descriptive names,
#   2. remove the obsolete first row (County == 'Unnamed: 0'), which holds the transposed row labels
#      rather than a county, then renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. prepend a corresponding 'Year' column for future merges,
#   5. convert the four rate columns to floats.
# The label row is filtered by value and 'Year' is inserted only when absent, so re-running this cell
# neither drops the first real county nor raises on an already-existing 'Year' column.
columns_to_floats = ['Birth Rate Per 1000 Women (15-19 Years Old)', 'Birth Rate Per 1000 Women (20-34 Years Old)',
                     'Birth Rate Per 1000 Women (35-50 Years Old)', 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)']
initial_pa_fertility_2016 = (
    initial_pa_fertility_2016
    .rename(columns={'4': 'Birth Rate Per 1000 Women (15-19 Years Old)',
                     '5': 'Birth Rate Per 1000 Women (20-34 Years Old)',
                     '6': 'Birth Rate Per 1000 Women (35-50 Years Old)',
                     '37': 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)'})
    .loc[lambda frame: frame['County'] != 'Unnamed: 0']
    .reset_index(drop=True)
)
initial_pa_fertility_2016['County'] = (
    initial_pa_fertility_2016['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()  # vectorised string method; a missing name stays NaN instead of raising
)
if 'Year' not in initial_pa_fertility_2016.columns:
    initial_pa_fertility_2016.insert(0, 'Year', 2016)
initial_pa_fertility_2016[columns_to_floats] = initial_pa_fertility_2016[columns_to_floats].astype(float)
initial_pa_fertility_2016


Unnamed: 0,Year,County,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2016,ADAMS,0.0,139.0,19.0,0.0
1,2016,ALLEGHENY,2.0,83.0,32.0,133.0
2,2016,ARMSTRONG,0.0,89.0,16.0,200.0
3,2016,BEAVER,0.0,117.0,19.0,238.0
4,2016,BERKS,18.0,107.0,18.0,43.0
5,2016,BLAIR,38.0,138.0,34.0,220.0
6,2016,BUCKS,10.0,92.0,29.0,214.0
7,2016,BUTLER,0.0,123.0,21.0,3.0
8,2016,CAMBRIA,0.0,106.0,31.0,193.0
9,2016,CARBON,126.0,46.0,46.0,0.0


**2019**
-
Please note: 2020 United States Census Bureau data was unavailable; as such, data from the closest year prior to the election (2019) was utilized.


In [234]:
# Load the raw 2019 U.S. Census Bureau Fertility & Income Assistance workbook.
# A 60-row preview is shown because the next cell picks rows by position (up to position 37).
fertility_2019_workbook = "Resources/PA_Fertility_2019.xlsx"
initial_pa_fertility_2019 = pd.read_excel(fertility_2019_workbook)
initial_pa_fertility_2019.head(60)


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
0,,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,...,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months,Women with births in the past 12 months
1,,"Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women",...,"Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women","Rate per 1,000 women"
2,Label,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,...,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate,Estimate
3,Women 15 to 50 years,44,50,35,44,49,61,45,58,44,...,39,46,29,68,52,75,97,41,55,47
4,15 to 19 years,2,13,0,10,16,0,6,3,0,...,0,8,0,6,10,9,113,0,13,8
5,20 to 34 years,105,75,90,89,92,87,75,119,100,...,43,67,31,141,78,159,162,97,98,84
6,35 to 50 years,3,33,1,11,20,55,32,22,10,...,46,38,37,16,31,25,44,7,31,25
7,RACE AND HISPANIC OR LATINO ORIGIN,,,,,,,,,,...,,,,,,,,,,
8,One race,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
9,White,47,49,35,41,48,63,39,61,40,...,33,53,26,71,49,76,101,44,56,46


In [235]:
# Keep only the four rows of interest, picked by their position in the raw sheet.
# All values are rates per 1,000 women with births in the past 12 months:
#   4  -> 15 to 19 years
#   5  -> 20 to 34 years
#   6  -> 35 to 50 years
#   37 -> Received public assistance income
fertility_2019_row_positions = [4, 5, 6, 37]
initial_pa_fertility_2019 = initial_pa_fertility_2019.iloc[fertility_2019_row_positions]
initial_pa_fertility_2019


Unnamed: 0.1,Unnamed: 0,"Adams County, Pennsylvania","Allegheny County, Pennsylvania","Armstrong County, Pennsylvania","Beaver County, Pennsylvania","Berks County, Pennsylvania","Blair County, Pennsylvania","Bucks County, Pennsylvania","Butler County, Pennsylvania","Cambria County, Pennsylvania",...,"Monroe County, Pennsylvania","Montgomery County, Pennsylvania","Northampton County, Pennsylvania","Northumberland County, Pennsylvania","Philadelphia County, Pennsylvania","Schuylkill County, Pennsylvania","Somerset County, Pennsylvania","Washington County, Pennsylvania","Westmoreland County, Pennsylvania","York County, Pennsylvania"
4,15 to 19 years,2,13,0,10,16,0,6,3,0,...,0,8,0,6,10,9,113,0,13,8
5,20 to 34 years,105,75,90,89,92,87,75,119,100,...,43,67,31,141,78,159,162,97,98,84
6,35 to 50 years,3,33,1,11,20,55,32,22,10,...,46,38,37,16,31,25,44,7,31,25
37,Received public assistance income,80,148,0,134,146,202,108,0,273,...,0,64,38,100,136,64,102,110,17,162


In [236]:
# Swap rows and columns so each county becomes a row and each selected statistic a column,
# which keeps the layout consistent for the later merges
initial_pa_fertility_2019 = initial_pa_fertility_2019.transpose()
initial_pa_fertility_2019.head()


Unnamed: 0,4,5,6,37
Unnamed: 0,15 to 19 years,20 to 34 years,35 to 50 years,Received public assistance income
"Adams County, Pennsylvania",2,105,3,80
"Allegheny County, Pennsylvania",13,75,33,148
"Armstrong County, Pennsylvania",0,90,1,0
"Beaver County, Pennsylvania",10,89,11,134


In [237]:
# Move the county names out of the index into an ordinary column and call that column 'County'
initial_pa_fertility_2019 = (
    initial_pa_fertility_2019
    .reset_index()
    .rename(columns={'index': 'County'})
)
initial_pa_fertility_2019.head()


Unnamed: 0,County,4,5,6,37
0,Unnamed: 0,15 to 19 years,20 to 34 years,35 to 50 years,Received public assistance income
1,"Adams County, Pennsylvania",2,105,3,80
2,"Allegheny County, Pennsylvania",13,75,33,148
3,"Armstrong County, Pennsylvania",0,90,1,0
4,"Beaver County, Pennsylvania",10,89,11,134


In [238]:
# Inspect the column labels before renaming (the statistic columns are still labelled by integer row position)
fertility_2019_labels = initial_pa_fertility_2019.columns
print(fertility_2019_labels)


Index(['County', 4, 5, 6, 37], dtype='object')


In [239]:
# Turn every column label into a string so the integer labels can be renamed as '4', '5', '6' and '37'
fertility_2019_str_labels = initial_pa_fertility_2019.columns.astype(str)
initial_pa_fertility_2019.columns = fertility_2019_str_labels
print(initial_pa_fertility_2019.columns)


Index(['County', '4', '5', '6', '37'], dtype='object')


In [240]:
# Swap every 'N' entry in the DataFrame for NaN so the values can be manipulated numerically later on
initial_pa_fertility_2019 = initial_pa_fertility_2019.replace('N', np.nan)


In [241]:
# Final cleaning pass for the 2019 fertility / public assistance DataFrame:
#   1. rename the positional columns ('4', '5', '6', '37') to descriptive names,
#   2. remove the obsolete first row (County == 'Unnamed: 0'), which holds the transposed row labels
#      rather than a county, then renumber the index,
#   3. reduce 'County' to the upper-case county name only (removing ' County, Pennsylvania'),
#   4. prepend a corresponding 'Year' column for future merges,
#   5. convert the four rate columns to floats.
# NOTE: the 'Year' value is 2020 although the source file is the 2019 release. The other 2019 frames shown
# in this notebook carry Year 2020 as well, so the merge keys line up; presumably this labels the 2019
# stand-in data with the 2020 election year - confirm before changing.
# The label row is filtered by value and 'Year' is inserted only when absent, so re-running this cell
# neither drops the first real county nor raises on an already-existing 'Year' column.
columns_to_floats = ['Birth Rate Per 1000 Women (15-19 Years Old)', 'Birth Rate Per 1000 Women (20-34 Years Old)',
                     'Birth Rate Per 1000 Women (35-50 Years Old)', 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)']
initial_pa_fertility_2019 = (
    initial_pa_fertility_2019
    .rename(columns={'4': 'Birth Rate Per 1000 Women (15-19 Years Old)',
                     '5': 'Birth Rate Per 1000 Women (20-34 Years Old)',
                     '6': 'Birth Rate Per 1000 Women (35-50 Years Old)',
                     '37': 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)'})
    .loc[lambda frame: frame['County'] != 'Unnamed: 0']
    .reset_index(drop=True)
)
initial_pa_fertility_2019['County'] = (
    initial_pa_fertility_2019['County']
    .str.replace(' County, Pennsylvania', '', regex=False)
    .str.upper()  # vectorised string method; a missing name stays NaN instead of raising
)
if 'Year' not in initial_pa_fertility_2019.columns:
    initial_pa_fertility_2019.insert(0, 'Year', 2020)
initial_pa_fertility_2019[columns_to_floats] = initial_pa_fertility_2019[columns_to_floats].astype(float)
initial_pa_fertility_2019


Unnamed: 0,Year,County,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2020,ADAMS,2.0,105.0,3.0,80.0
1,2020,ALLEGHENY,13.0,75.0,33.0,148.0
2,2020,ARMSTRONG,0.0,90.0,1.0,0.0
3,2020,BEAVER,10.0,89.0,11.0,134.0
4,2020,BERKS,16.0,92.0,20.0,146.0
5,2020,BLAIR,0.0,87.0,55.0,202.0
6,2020,BUCKS,6.0,75.0,32.0,108.0
7,2020,BUTLER,3.0,119.0,22.0,0.0
8,2020,CAMBRIA,0.0,100.0,10.0,273.0
9,2020,CARBON,0.0,109.0,54.0,


**Combining All Derived Datasets/DataFrames for County Level Attribute Aggregation**
-
-----------

In [242]:
# Group the cleaned county attribute DataFrames by year. The cells that follow outer-merge each
# year's frames sequentially on 'County' and 'Year', producing one master DataFrame per year
# (2012, 2016 & 2019); the three masters are then concatenated vertically (axis = 0, the default).
dfs_2012 = [
    initial_pa_demographic_2012,
    initial_pa_education_2012,
    initial_pa_households_2012,
    initial_pa_families_2012,
    initial_pa_mc_families_2012,
    initial_pa_nonfamily_2012,
    initial_pa_total_aux_2012,
    initial_pa_veterans_aux_2012,
    initial_pa_nonveterans_aux_2012,
    initial_pa_fertility_2012,
]

dfs_2016 = [
    initial_pa_demographic_2016,
    initial_pa_education_2016,
    initial_pa_households_2016,
    initial_pa_families_2016,
    initial_pa_mc_families_2016,
    initial_pa_nonfamily_2016,
    initial_pa_total_aux_2016,
    initial_pa_veterans_aux_2016,
    initial_pa_nonveterans_aux_2016,
    initial_pa_fertility_2016,
]

dfs_2019 = [
    initial_pa_demographic_2019,
    initial_pa_education_2019,
    initial_pa_households_2019,
    initial_pa_families_2019,
    initial_pa_mc_families_2019,
    initial_pa_nonfamily_2019,
    initial_pa_total_aux_2019,
    initial_pa_veterans_aux_2019,
    initial_pa_nonveterans_aux_2019,
    initial_pa_fertility_2019,
]


In [243]:
# 2012 DataFrame merging

# Seed the running result with the first 2012 frame, then outer-merge every remaining
# frame onto it using the 'County' and 'Year' key columns
merged_dfs_2012 = dfs_2012[0]
for frame_2012 in dfs_2012[1:]:
    merged_dfs_2012 = merged_dfs_2012.merge(frame_2012, how='outer', on=['County', 'Year'])

# Show the merged 2012 DataFrame
merged_dfs_2012


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.944,0.02,0.005,0.01,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,0.48,0.52,0.473,0.527,0.831,0.145,0.007,0.036,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.986,0.014,0.006,0.003,...,0.108,0.095,0.75,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.927,0.076,0.005,0.007,...,0.109,0.096,0.786,0.083,0.1,0.162,40.0,121.0,15.0,26.0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.875,0.061,0.01,0.018,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
5,2012,BLAIR,0.488,0.512,0.48,0.52,0.977,0.024,0.007,0.012,...,0.106,0.093,0.747,0.074,0.121,0.179,35.0,115.0,22.0,84.0
6,2012,BUCKS,0.49,0.51,0.484,0.516,0.909,0.045,0.005,0.048,...,0.092,0.074,0.819,0.077,0.051,0.111,2.0,98.0,23.0,53.0
7,2012,BUTLER,0.496,0.504,0.488,0.512,0.976,0.017,0.004,0.013,...,0.094,0.075,0.784,0.061,0.094,0.131,48.0,90.0,32.0,176.0
8,2012,CAMBRIA,0.496,0.504,0.492,0.508,0.957,0.038,0.012,0.008,...,0.105,0.103,0.708,0.095,0.13,0.194,32.0,111.0,12.0,378.0
9,2012,CARBON,0.489,0.511,0.481,0.519,0.968,0.03,0.013,,...,0.112,0.091,0.767,0.075,0.085,0.19,,,,322.0


In [244]:
# 2016 DataFrame merging

# Seed the running result with the first 2016 frame, then outer-merge every remaining
# frame onto it using the 'County' and 'Year' key columns
merged_dfs_2016 = dfs_2016[0]
for frame_2016 in dfs_2016[1:]:
    merged_dfs_2016 = merged_dfs_2016.merge(frame_2016, how='outer', on=['County', 'Year'])

# Show the merged 2016 DataFrame
merged_dfs_2016


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2016,ADAMS,0.493,0.507,0.49,0.51,0.948,0.019,0.01,0.011,...,0.126,0.096,0.791,0.052,0.082,0.161,0.0,139.0,19.0,0.0
1,2016,ALLEGHENY,0.483,0.517,0.477,0.523,0.828,0.147,0.008,0.043,...,0.107,0.085,0.801,0.052,0.112,0.15,2.0,83.0,32.0,133.0
2,2016,ARMSTRONG,0.497,0.503,0.493,0.507,0.988,0.005,0.006,0.01,...,0.13,0.099,0.714,0.058,0.124,0.218,0.0,89.0,16.0,200.0
3,2016,BEAVER,0.487,0.513,0.482,0.518,0.935,0.074,0.005,0.008,...,0.122,0.096,0.778,0.044,0.073,0.162,0.0,117.0,19.0,238.0
4,2016,BERKS,0.493,0.507,0.486,0.514,0.892,0.086,0.04,0.018,...,0.103,0.08,0.784,0.06,0.117,0.154,18.0,107.0,18.0,43.0
5,2016,BLAIR,0.49,0.51,0.48,0.52,0.968,0.026,0.003,0.01,...,0.123,0.1,0.744,0.046,0.116,0.197,38.0,138.0,34.0,220.0
6,2016,BUCKS,0.491,0.509,0.485,0.515,0.9,0.049,0.006,0.054,...,0.113,0.08,0.803,0.037,0.063,0.13,10.0,92.0,29.0,214.0
7,2016,BUTLER,0.497,0.503,0.49,0.51,0.969,0.016,0.003,0.018,...,0.112,0.081,0.768,0.035,0.066,0.137,0.0,123.0,21.0,3.0
8,2016,CAMBRIA,0.491,0.509,0.486,0.514,0.964,0.04,0.012,0.008,...,0.132,0.104,0.714,0.078,0.144,0.193,0.0,106.0,31.0,193.0
9,2016,CARBON,0.505,0.495,0.5,0.5,0.963,0.031,,,...,0.124,0.094,0.763,0.062,0.124,0.188,126.0,46.0,46.0,0.0


In [245]:
# 2019 DataFrame merging

# Seed the running result with the first 2019 frame, then outer-merge every remaining
# frame onto it using the 'County' and 'Year' key columns
merged_dfs_2019 = dfs_2019[0]
for frame_2019 in dfs_2019[1:]:
    merged_dfs_2019 = merged_dfs_2019.merge(frame_2019, how='outer', on=['County', 'Year'])

# Show the merged 2019 DataFrame
merged_dfs_2019


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2020,ADAMS,0.494,0.506,0.487,0.513,0.944,0.03,0.007,0.012,...,0.142,0.095,0.789,0.018,0.069,0.157,2.0,105.0,3.0,80.0
1,2020,ALLEGHENY,0.484,0.516,0.478,0.522,0.821,0.147,0.006,0.048,...,0.125,0.088,0.806,0.04,0.103,0.148,13.0,75.0,33.0,148.0
2,2020,ARMSTRONG,0.501,0.499,0.498,0.502,0.981,0.021,,0.004,...,0.147,0.111,0.785,0.031,0.093,0.178,0.0,90.0,1.0,0.0
3,2020,BEAVER,0.483,0.517,0.479,0.521,0.929,0.084,0.005,0.009,...,0.145,0.104,0.796,0.041,0.099,0.188,10.0,89.0,11.0,134.0
4,2020,BERKS,0.492,0.508,0.487,0.513,0.824,0.075,0.005,0.017,...,0.116,0.086,0.809,0.049,0.085,0.151,16.0,92.0,20.0,146.0
5,2020,BLAIR,0.491,0.509,0.482,0.518,0.975,0.033,,0.004,...,0.136,0.104,0.786,0.041,0.136,0.174,0.0,87.0,55.0,202.0
6,2020,BUCKS,0.491,0.509,0.486,0.514,0.886,0.051,0.004,0.059,...,0.128,0.087,0.829,0.038,0.052,0.113,6.0,75.0,32.0,108.0
7,2020,BUTLER,0.491,0.509,0.486,0.514,0.976,0.017,0.004,0.019,...,0.127,0.087,0.766,0.036,0.077,0.145,3.0,119.0,22.0,0.0
8,2020,CAMBRIA,0.49,0.51,0.484,0.516,0.949,0.054,0.002,0.007,...,0.149,0.11,0.722,0.04,0.128,0.206,0.0,100.0,10.0,273.0
9,2020,CARBON,0.499,0.501,0.498,0.502,0.942,0.029,0.015,0.013,...,0.143,0.082,0.785,0.057,0.087,0.18,0.0,109.0,54.0,


In [246]:
# Stacking the 2012, 2016 & 2019 master DataFrames vertically into a single county attribute
# DataFrame; ignore_index=True gives the combined frame a fresh 0..n-1 row index
yearly_master_frames = [merged_dfs_2012, merged_dfs_2016, merged_dfs_2019]
combined_county_attributes = pd.concat(objs=yearly_master_frames, ignore_index=True)
combined_county_attributes


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.944,0.020,0.005,0.010,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.831,0.145,0.007,0.036,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.986,0.014,0.006,0.003,...,0.108,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.927,0.076,0.005,0.007,...,0.109,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.875,0.061,0.010,0.018,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.954,0.040,0.006,0.008,...,0.133,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.968,0.033,0.009,0.007,...,0.151,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.954,0.048,0.004,0.015,...,0.137,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.963,0.036,0.004,0.013,...,0.152,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0


In [247]:
# Confirming Data Types for our combined county attribute DataFrame
# (one dtype entry per column; the rendered Series is abbreviated because the frame is wide)
combined_county_attributes.dtypes

Year                                                                        int64
County                                                                     object
% Male                                                                    float64
% Female                                                                  float64
% Male > 18 Years Old                                                     float64
                                                                           ...   
% Nonveteran Population With Any Disability                               float64
Birth Rate Per 1000 Women (15-19 Years Old)                               float64
Birth Rate Per 1000 Women (20-34 Years Old)                               float64
Birth Rate Per 1000 Women (35-50 Years Old)                               float64
Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)    float64
Length: 103, dtype: object

In [248]:
# Listing the dtype of every column in full, because the default Series display is truncated
list(combined_county_attributes.dtypes)


[dtype('int64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 

**Combined County Attribute Table Preprocessing and Data Imputation**
-
-----------

In [249]:
# First, exploring the DataFrame for missing/'NaN'/Null values:
# tally how many entries in each column are actually populated
non_null_counts = combined_county_attributes.notna().sum()
non_null_counts


Year                                                                      120
County                                                                    120
% Male                                                                    120
% Female                                                                  120
% Male > 18 Years Old                                                     120
                                                                         ... 
% Nonveteran Population With Any Disability                               120
Birth Rate Per 1000 Women (15-19 Years Old)                               119
Birth Rate Per 1000 Women (20-34 Years Old)                               119
Birth Rate Per 1000 Women (35-50 Years Old)                               119
Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)    116
Length: 103, dtype: int64

In [250]:
# Describing our DataFrame regarding missing values.
# A column is complete when its non-null count equals the number of rows, so the expected
# count is taken from the DataFrame itself instead of being hard-coded (it is 120 for the
# 2012/2016/2019 data loaded in this notebook).
expected_row_count = len(combined_county_attributes)
columns_without_120_values = non_null_counts[non_null_counts != expected_row_count]
count_columns_without_120_values = len(columns_without_120_values)


# Report each incomplete column together with its number of populated rows
print(f"Columns without {expected_row_count} non-null values: ({count_columns_without_120_values} Total)")
print("------------------------------------------")
for column, count in columns_without_120_values.items():
    print(f"{column}: {count}")


Columns without 120 non-null values: (18 Total)
------------------------------------------
% American Indian and Alaska Native: 113
% Asian: 116
# of Married Couple Families: 101
% Married Couple Families <$10,000: 101
% Married Couple Families $10,000-$14,999: 101
% Married Couple Families $15,000-$24,999: 101
% Married Couple Families $25,000-$34,999: 101
% Married Couple Families $35,000-$49,999: 101
% Married Couple Families $50,000-$74,999: 101
% Married Couple Families $75,000-$99,999: 101
% Married Couple Families $100,000-$149,999: 101
% Married Couple Families $150,000-$199,999: 101
% Married Couple Families $200,000 or More: 101
Mean Married Couple Families Income ($): 43
Birth Rate Per 1000 Women (15-19 Years Old): 119
Birth Rate Per 1000 Women (20-34 Years Old): 119
Birth Rate Per 1000 Women (35-50 Years Old): 119
Received Public Assistance Income Rate Per 1000 Women (Past 12 Months): 116


---

It is important to note that this calculation indicates that, across our dataset/DataFrame, we have (18) columns/county-level attributes that possess 1 or more 'NaN' or Null values. Furthermore, from the list of columns/attributes above that fall into this category, we are able to determine how many values are missing from each and which original datasets the missing information is derived from:

**-DP05: % American Indian and Alaska Native & % Asian metrics as part of the baseline population demographic data**

- As these metrics are a small part of an important dataset that contains demographic data for all counties within this study AND the above output determines only (7) values are missing from the % American Indian and Alaska Native column and only (4) from the % Asian  column, we will impute these (11) missing values via Sklearn SimpleImputer with a mean strategy. We will then normalize the associated percentage columns.
    
**-S1901: 'Married Couple Families' data**

- While this table within the above mentioned dataset proves to have the most 'NaN'/Null values contained, the majority of columns missing data are only missing (19) county values (across 120) in regards to the % of Married Couple Families falling into the predefined income brackets. As the entire dataset contains referential data from other counties AND the potential for reference from the same county via a different year (2012, 2016, or 2019), we will impute the (19) missing values with Sklearn SimpleImputer and a mean strategy for each column. Although our model will already include these metrics for Families, Households and Nonfamily Households, by maintaining/keeping data that represents an entire population segment surveyed by the U.S. Census Bureau (Married Couple Families), we hope to achieve a more representative county sample size. We will also normalize the associated percentage columns.

- The most significant number of 'NaN'/Null values of (77) comes from the 'Mean Married Couple Families Income ($)' column via this same dataset table. As (77) missing values represents missing data from (77) out of (120) counties AND a mean income metric is included from the other three aforementioned surveyed groups, we will remove this column completely from our dataset/DataFrame rather than imputing (77) values.

- We will also Impute the (19) missing values for the '# of Married Couple Families' column via Sklearn SimpleImputer with a mean strategy. Associated column normalization is not required here as values aren't represented as percentages.

**-S1301: 'Fertility & Income Assistance' data**

- The above output shows the final four columns containing 'NaN'/Null values are 'Birth Rate Per 1000 Women (15-19 Years Old)', 'Birth Rate Per 1000 Women (20-34 Years Old)', 'Birth Rate Per 1000 Women (35-50 Years Old)' & 'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)' from the Fertility and Income Assistance dataset. These columns contain (1), (1), (1) and (4) missing county values respectively. As this is the primary attribute data associated with healthcare AND there's a very minuscule number of missing values, we will impute via Sklearn SimpleImputer with a mean strategy here as well. Associated column normalization is not required here as values aren't represented as percentages.


In [251]:
# DP05: fill the missing race-share values with the mean of their respective column
demographic_columns_to_impute = ['% American Indian and Alaska Native', '% Asian']

demographic_percentage_imputer = SimpleImputer(strategy='mean')
imputed_demographic_values = demographic_percentage_imputer.fit_transform(
    combined_county_attributes[demographic_columns_to_impute]
)
combined_county_attributes[demographic_columns_to_impute] = imputed_demographic_values

# Then rescale the four % Race Population columns row by row: each value is divided by the
# row's total across these columns, so the four shares sum to 1 for every row
demographic_percentage_columns_to_normalize = ['% White', '% Black or African American', '% American Indian and Alaska Native', '% Asian']
race_shares = combined_county_attributes[demographic_percentage_columns_to_normalize]
race_share_row_totals = race_shares.sum(axis=1)
combined_county_attributes[demographic_percentage_columns_to_normalize] = race_shares.div(race_share_row_totals, axis=0)

# Display
combined_county_attributes


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,...,0.108,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,...,0.109,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,...,0.133,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,...,0.151,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,...,0.137,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,...,0.152,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0


In [252]:
# S1901: mean-impute the Married Couple Families income-bracket percentage columns
economic_columns_to_impute = [
    '% Married Couple Families <$10,000',
    '% Married Couple Families $10,000-$14,999',
    '% Married Couple Families $15,000-$24,999',
    '% Married Couple Families $25,000-$34,999',
    '% Married Couple Families $35,000-$49,999',
    '% Married Couple Families $50,000-$74,999',
    '% Married Couple Families $75,000-$99,999',
    '% Married Couple Families $100,000-$149,999',
    '% Married Couple Families $150,000-$199,999',
    '% Married Couple Families $200,000 or More',
]

econmic_percentage_imputer = SimpleImputer(strategy='mean')
imputed_bracket_values = econmic_percentage_imputer.fit_transform(
    combined_county_attributes[economic_columns_to_impute]
)
combined_county_attributes[economic_columns_to_impute] = imputed_bracket_values

# Then rescale the income-bracket shares row by row: each value is divided by the row's
# total across the brackets, so the Married Couple Families shares sum to 1 for every row
bracket_shares = combined_county_attributes[economic_columns_to_impute]
bracket_row_totals = bracket_shares.sum(axis=1)
combined_county_attributes[economic_columns_to_impute] = bracket_shares.div(bracket_row_totals, axis=0)

# Display
combined_county_attributes


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,...,0.108,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,...,0.109,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,...,0.133,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,...,0.151,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,...,0.137,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,...,0.152,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0


In [253]:
# S1901 and S1301: mean-impute the remaining count/rate columns.
# These are not percentage shares, so no normalization follows the imputation.
numerical_columns_to_impute = [
    '# of Married Couple Families',
    'Birth Rate Per 1000 Women (15-19 Years Old)',
    'Birth Rate Per 1000 Women (20-34 Years Old)',
    'Birth Rate Per 1000 Women (35-50 Years Old)',
    'Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)',
]

numerical_imputer = SimpleImputer(strategy='mean')
imputed_numerical_values = numerical_imputer.fit_transform(
    combined_county_attributes[numerical_columns_to_impute]
)
combined_county_attributes[numerical_columns_to_impute] = imputed_numerical_values

# Display
combined_county_attributes


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,...,0.108,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,...,0.109,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,...,0.133,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,...,0.151,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,...,0.137,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,...,0.152,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0


In [254]:
# Dropping the underrepresented 'Mean Married Couple Families Income ($)' column, which has only
# (43) non-null values, i.e. (77) of the 120 rows are missing
underrepresented_columns = ['Mean Married Couple Families Income ($)']
final_imputed_county_attributes = combined_county_attributes.drop(columns=underrepresented_columns)
final_imputed_county_attributes


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,...,0.108,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,...,0.109,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,...,0.133,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,...,0.151,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,...,0.137,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,...,0.152,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0


In [255]:
# Finally, confirm the imputation process via per-column populated-value counts (all columns should = 120)
final_imputed_county_attributes.notna().sum()


Year                                                                      120
County                                                                    120
% Male                                                                    120
% Female                                                                  120
% Male > 18 Years Old                                                     120
                                                                         ... 
% Nonveteran Population With Any Disability                               120
Birth Rate Per 1000 Women (15-19 Years Old)                               120
Birth Rate Per 1000 Women (20-34 Years Old)                               120
Birth Rate Per 1000 Women (35-50 Years Old)                               120
Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)    120
Length: 102, dtype: int64

In [256]:
# Finally, confirm the imputation process. Fail loudly if any value is still missing instead of
# relying on a visual scan, then list the non-null count of every column in full
# (each entry should equal the number of rows, 120).
remaining_null_total = int(final_imputed_county_attributes.isna().sum().sum())
assert remaining_null_total == 0, f"{remaining_null_total} missing values remain after imputation"
final_imputed_county_attributes.count().tolist()


[120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120]

In [257]:
# Exporting the imputed county attribute dataset to CSV (row index is not written)
county_final_csv_path = 'Resources/County_Final.csv'
final_imputed_county_attributes.to_csv(county_final_csv_path, index=False)


**Joining County Outcome Table with County Attributes Table**
-
-----------

In [258]:
# Loading the cleaned county election party outcome table ('combined_county_results_clean.csv')
county_outcomes_csv_path = 'Resources/combined_county_results_clean.csv'
county_party_coutcomes = pd.read_csv(filepath_or_buffer=county_outcomes_csv_path)
county_party_coutcomes


Unnamed: 0,Year,County,winning_party D:0 R:1
0,2012,ADAMS,1
1,2012,ALLEGHENY,0
2,2012,ARMSTRONG,1
3,2012,BEAVER,1
4,2012,BERKS,1
...,...,...,...
115,2020,SCHUYLKILL,1
116,2020,SOMERSET,1
117,2020,WASHINGTON,1
118,2020,WESTMORELAND,1


In [259]:
# Joining county party outcome table with county attributes table on the (County, Year) key.
# validate='one_to_one' makes pandas raise a MergeError if either table repeats a
# (County, Year) pair, so a duplicated key cannot silently multiply rows in the joined dataset.
county_joined_with_outcomes = pd.merge(
    final_imputed_county_attributes,
    county_party_coutcomes,
    on=['County', 'Year'],
    how='outer',
    validate='one_to_one',
)
county_joined_with_outcomes


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months),winning_party D:0 R:1
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,...,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0,1
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,...,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0,0
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,...,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0,1
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,...,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0,1
4,2012,BERKS,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,...,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,...,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0,1
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,...,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0,1
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,...,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0,1
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,...,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0,1


In [260]:
# Exporting the complete county data (attributes + outcome) to CSV before any scaling is applied
# for Unsupervised/Supervised Machine Learning; the row index is not written
county_with_outcomes_csv_path = 'Resources/County_Final_With_Outcomes.csv'
county_joined_with_outcomes.to_csv(county_with_outcomes_csv_path, index=False)


**Unsupervised Learning Segmentation & Dataset Enrichment**
-
-----------

In [261]:
# Preparing the input for scaling: this attribute table does not carry the target/outcome variable
# (Winning Party - Democrat: 0 Republican: 1), and the 'County' column is removed because it
# holds string categorical values
final_imputed_county_attributes_no_county = final_imputed_county_attributes.drop('County', axis='columns')
final_imputed_county_attributes_no_county


Unnamed: 0,Year,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,% High School Graduate (18-24),...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,2012,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,0.361,...,0.104,0.081,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0
1,2012,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,0.257,...,0.091,0.086,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0
2,2012,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,0.479,...,0.108,0.095,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0
3,2012,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,0.291,...,0.109,0.096,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0
4,2012,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,0.367,...,0.089,0.077,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,0.427,...,0.133,0.094,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0
116,2020,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,0.520,...,0.151,0.111,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0
117,2020,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,0.378,...,0.137,0.101,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0
118,2020,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,0.326,...,0.152,0.100,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0


In [262]:
# Standardizing the final county attribute table WITHOUT the target/outcome variable
# (Winning Party - Democrat: 0 Republican: 1): the scaler is fitted on the table and then
# applied to that same table
# NOTE(review): 'Year' is still one of the columns here and is scaled like any other feature - confirm this is intended
scaler = StandardScaler()
scaler.fit(final_imputed_county_attributes_no_county)
scaled_final_imputed_county_attributes_no_county = scaler.transform(final_imputed_county_attributes_no_county)
scaled_final_imputed_county_attributes_no_county


array([[-1.22474487, -0.09955146,  0.09955146, ...,  0.1365429 ,
        -0.77104402, -0.22365943],
       [-1.22474487, -1.19385758,  1.19385758, ..., -0.48982056,
        -0.38356835,  0.63912695],
       [-1.22474487,  0.26521725, -0.26521725, ..., -0.2319062 ,
        -1.46850023,  0.34476454],
       ...,
       [ 1.22474487, -0.19074364,  0.19074364, ...,  0.02600817,
        -1.31350996, -0.11200472],
       [ 1.22474487, -0.19074364,  0.19074364, ...,  0.06285308,
         0.54637326, -1.05599453],
       [ 1.22474487,  0.26521725, -0.26521725, ..., -0.45297565,
         0.08140245,  0.41581753]])

In [263]:
# Wrapping the scaled array back into a DataFrame that reuses the attribute column names
scaled_column_labels = final_imputed_county_attributes_no_county.columns
scaled_final_imputed_county_attributes_no_county_df = pd.DataFrame(
    data=scaled_final_imputed_county_attributes_no_county,
    columns=scaled_column_labels,
)
scaled_final_imputed_county_attributes_no_county_df


Unnamed: 0,Year,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,% High School Graduate (18-24),...,% Nonveteran Population 65-74 Years Old,% Nonveteran Population 75 Years Old & Over,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months)
0,-1.224745,-0.099551,0.099551,-0.144893,0.144893,0.737739,-0.676706,-0.391053,-0.698111,-0.028583,...,-0.538015,-0.528495,1.018857,-0.181143,-0.762115,-0.285370,3.501633,0.136543,-0.771044,-0.223659
1,-1.224745,-1.193858,1.193858,-1.102662,1.102662,-0.845226,0.885859,0.042635,0.474216,-1.250192,...,-1.262802,-0.135074,0.596452,0.631861,0.112779,-0.609961,-0.100181,-0.489821,-0.383568,0.639127
2,-1.224745,0.265217,-0.265217,0.370828,-0.370828,0.875619,-0.760739,-0.184517,-1.036131,1.357474,...,-0.315004,0.573083,-0.177958,1.102548,0.222141,1.045454,1.269523,-0.231906,-1.468500,0.344765
3,-1.224745,-0.920281,0.920281,-0.807964,0.807964,0.195531,0.021416,-0.435633,-0.852992,-0.850820,...,-0.259251,0.651767,0.666853,1.016968,-0.379349,0.071681,1.370983,0.910286,-0.693549,-0.964641
4,-1.224745,-0.190744,0.190744,-0.292242,0.292242,0.135678,-0.127303,0.904968,-0.303308,0.041895,...,-1.374308,-0.843232,0.901522,1.530445,0.358843,-0.577502,0.001279,0.468147,-0.306073,0.537623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,1.224745,1.997869,-1.997869,1.844318,-1.844318,0.548088,-0.429840,-0.183065,-0.804447,0.746670,...,1.078818,0.494399,-0.741165,-0.480671,-0.023923,0.850700,-0.201640,2.310393,0.081402,-0.578924
116,1.224745,2.636214,-2.636214,2.654738,-2.654738,0.605455,-0.522596,0.529937,-0.853625,1.839070,...,2.082369,1.832029,-1.304372,-1.422044,-0.461370,0.948077,5.074256,2.420927,1.553810,-0.193208
117,1.224745,-0.190744,0.190744,-0.144893,0.144893,0.419843,-0.335854,-0.683798,-0.489127,0.171104,...,1.301829,1.045188,0.385249,-0.437881,-0.461370,0.818240,-0.658208,0.026008,-1.313510,-0.112005
118,1.224745,-0.190744,0.190744,-0.218568,0.218568,0.563051,-0.484327,-0.679053,-0.577641,-0.439701,...,2.138122,0.966504,0.666853,-0.737409,-0.379349,0.104140,0.001279,0.062853,0.546373,-1.055995


In [264]:
# Reduce the scaled feature matrix with PCA. Keeping 30 components retains
# a little over 95% of the variance (see the ratio printed by this cell).
pca = PCA(n_components=30, random_state=0)
scaled_pca_data = pca.fit_transform(scaled_final_imputed_county_attributes_no_county_df)

# Sum the per-component ratios to get the share of variance the projection keeps.
total_explained_variance = sum(pca.explained_variance_ratio_)
print(f'Explained Variance Ratio: {total_explained_variance}')


Explained Variance Ratio: 0.9530723089750718


In [265]:
# Wrap the PCA output in a DataFrame with one "PCA<n>" column per component.
# The labels are generated from the array's width rather than typed out by
# hand, so this cell keeps working if `n_components` is changed in the PCA cell.
pca_column_names = [f"PCA{i}" for i in range(1, scaled_pca_data.shape[1] + 1)]
scaled_pca_df = pd.DataFrame(scaled_pca_data, columns=pca_column_names)
scaled_pca_df


Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA21,PCA22,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,PCA30
0,-1.220543,-2.650239,-4.226980,2.832132,2.844367,-0.088180,-0.363029,-0.541284,-0.588251,2.020962,...,0.018784,-1.806501,-0.225332,-0.831585,1.523588,-0.618654,1.341579,0.023301,0.137271,0.660737
1,2.658206,5.462741,-1.965351,-2.013644,-1.300384,-2.486122,1.828332,3.548256,-2.624384,-0.521557,...,-0.572935,-0.117636,-0.580654,-0.440109,0.530133,0.822689,0.229181,-0.195736,-0.317751,0.400293
2,-8.958764,0.423010,-1.356897,-1.365840,-3.561745,0.582613,2.006504,-2.813614,0.624405,-0.885881,...,-0.695176,0.201331,0.335493,-0.985781,0.492323,0.003082,-1.340281,-0.380461,0.792108,-0.441984
3,-4.223753,-0.674939,-2.936526,-0.686120,-1.616674,-2.473770,0.616003,-0.086681,-1.149728,0.265770,...,-0.331090,0.126415,-0.848719,-0.352324,-0.070065,-0.841973,0.944907,-0.469644,0.117054,-0.918419
4,-2.730467,2.469756,-4.297313,1.787232,1.239753,1.029241,0.982666,0.216882,0.024914,0.897717,...,-0.343778,0.401807,0.487343,0.481445,0.185756,0.070464,0.190350,0.332882,0.519049,-0.077251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,-2.282243,-4.969901,2.059689,0.572913,1.143632,1.710334,1.016356,0.799515,0.759724,1.956680,...,-0.564521,0.954424,0.861859,-0.178525,-0.474849,-0.741148,0.171595,0.140371,1.309219,-0.352059
116,-5.347174,-5.195419,4.089941,-0.096863,0.074844,4.382755,3.432463,1.938479,-1.681425,1.155796,...,-1.204209,-0.433852,0.512079,-2.150850,-0.390801,0.575654,-0.727950,-0.879708,0.141714,-0.637493
117,5.511346,-2.518372,2.030535,-2.766796,-0.902097,-0.214796,-1.906393,-0.973719,0.607461,0.631745,...,-1.707474,-1.185151,-0.044671,0.224726,1.946552,0.393153,-0.633717,0.103932,0.531512,-0.035536
118,3.375545,-3.823598,1.693324,-2.719202,0.088765,-2.393045,0.494310,1.407597,-1.385893,0.546170,...,-1.499379,0.685008,0.012629,1.204141,1.003089,0.117718,0.974957,0.156658,0.437216,0.741619


In [266]:
# Elbow method: fit KMeans for k = 1..10 on the PCA-reduced data and record
# each fitted model's inertia so the curve can be plotted in the next cell.
#
# `n_init` is pinned to 10, the default in effect for these runs. scikit-learn
# is changing that default to 'auto', so leaving it implicit emits a
# FutureWarning on every fit and lets the inertia values drift with the
# installed library version.
k = list(range(1, 11))
inertia = []
for i in k:
    model = KMeans(n_clusters=i, n_init=10, random_state=0)
    model.fit(scaled_pca_df)
    inertia.append(model.inertia_)

elbow_data_pca = {
    "k": k,
    "inertia": inertia
}

df_elbow_data_pca = pd.DataFrame(elbow_data_pca)
df_elbow_data_pca


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,k,inertia
0,1,11551.236385
1,2,8559.832737
2,3,7458.897082
3,4,6472.896299
4,5,5991.340623
5,6,5360.024405
6,7,4945.464106
7,8,4714.697602
8,9,4414.210502
9,10,4295.491462


In [267]:
# Plot inertia against k to locate the elbow (read here as k = 4).
# `xticks=k` puts a tick on every candidate cluster count.
elbow_plot_pca = df_elbow_data_pca.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve Using PCA(30) w/ Scaled Data",
    xticks=k,
)
elbow_plot_pca


In [268]:
# Segment the PCA-reduced county data into the k = 4 clusters chosen from the
# elbow curve, then label every row with its cluster.
#
# `n_init` is pinned to 10, the default in effect for this run. scikit-learn
# is changing that default to 'auto', so leaving it implicit emits a
# FutureWarning and could change the exported segment labels after a library
# upgrade.
model = KMeans(n_clusters=4, n_init=10, random_state=0)
model.fit(scaled_pca_df)
segment_predictions = model.predict(scaled_pca_df)
print(segment_predictions)


  super()._check_params_vs_input(X, default_n_init=10)


[1 1 0 0 0 0 2 1 0 0 1 2 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 2 1 0 3 0 0
 1 0 1 1 1 0 1 1 0 2 1 0 0 1 2 0 1 0 1 1 2 0 0 1 0 0 1 0 1 1 0 0 0 1 2 1 0
 3 0 0 1 1 1 1 2 1 1 1 1 2 2 0 1 1 2 0 1 0 2 1 2 1 0 1 1 1 1 0 1 1 1 1 1 1
 2 2 0 3 0 0 1 1 1]


In [269]:
# Build the final county dataset: the outcome-labelled master frame plus a
# 'Segment' column holding each row's KMeans cluster label. `assign` returns a
# new frame, so `county_joined_with_outcomes` itself is left unmodified.
# NOTE(review): the labels are attached by position, so this assumes the rows
# of `county_joined_with_outcomes` are in the same order as the rows that were
# scaled and clustered — confirm against the upstream cells.
ultimate_county_df = county_joined_with_outcomes.assign(Segment=segment_predictions)
ultimate_county_df


Unnamed: 0,Year,County,% Male,% Female,% Male > 18 Years Old,% Female > 18 Years Old,% White,% Black or African American,% American Indian and Alaska Native,% Asian,...,Nonveteran Labor Force Participation Rate (%),Nonveteran Unemployment Rate (%),% Nonveteran Population With Income Below Poverty Level (Past 12 Months),% Nonveteran Population With Any Disability,Birth Rate Per 1000 Women (15-19 Years Old),Birth Rate Per 1000 Women (20-34 Years Old),Birth Rate Per 1000 Women (35-50 Years Old),Received Public Assistance Income Rate Per 1000 Women (Past 12 Months),winning_party D:0 R:1,Segment
0,2012,ADAMS,0.492,0.508,0.486,0.514,0.964249,0.020429,0.005107,0.010215,...,0.801,0.055,0.086,0.151,82.0,100.0,14.0,99.0,1,1
1,2012,ALLEGHENY,0.480,0.520,0.473,0.527,0.815505,0.142296,0.006869,0.035329,...,0.783,0.074,0.118,0.141,11.0,83.0,19.0,184.0,0,1
2,2012,ARMSTRONG,0.496,0.504,0.493,0.507,0.977205,0.013875,0.005946,0.002973,...,0.750,0.085,0.122,0.192,38.0,90.0,5.0,155.0,1,0
3,2012,BEAVER,0.483,0.517,0.477,0.523,0.913300,0.074877,0.004926,0.006897,...,0.786,0.083,0.100,0.162,40.0,121.0,15.0,26.0,1,0
4,2012,BERKS,0.491,0.509,0.484,0.516,0.907676,0.063278,0.010373,0.018672,...,0.796,0.095,0.127,0.142,13.0,109.0,20.0,174.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020,SCHUYLKILL,0.515,0.485,0.513,0.487,0.946429,0.039683,0.005952,0.007937,...,0.726,0.048,0.113,0.186,9.0,159.0,25.0,64.0,1,0
116,2020,SOMERSET,0.522,0.478,0.524,0.476,0.951819,0.032448,0.008850,0.006883,...,0.702,0.026,0.097,0.189,113.0,162.0,44.0,102.0,1,0
117,2020,WASHINGTON,0.491,0.509,0.486,0.514,0.934378,0.047013,0.003918,0.014691,...,0.774,0.049,0.097,0.185,0.0,97.0,7.0,110.0,1,1
118,2020,WESTMORELAND,0.491,0.509,0.485,0.515,0.947835,0.035433,0.003937,0.012795,...,0.786,0.042,0.100,0.163,13.0,98.0,31.0,17.0,1,1


In [270]:
# Persist the segment-enriched county dataset as CSV (without the index column);
# model training consumes this file from a separate notebook.
final_county_csv_path = 'Resources/County_Final_With_Outcomes_And_Segments.csv'
ultimate_county_df.to_csv(final_county_csv_path, index=False)
