# Data Cleanup and Analysis Code

In [9]:
import pandas as pd

In [403]:
# import requests
# query_url = "https://api.covidtracking.com/v2/us/daily.json"
# response = requests.get(query_url).json()

# Transform

### Table 1 : 'US_States'

In [10]:
# Read the US states reference table from disk.
states_df = pd.read_csv(filepath_or_buffer='Resources/US_States.csv')

# Remove leading/trailing whitespace from every text (object-dtype) column,
# one column at a time.
sdf_obj = states_df.select_dtypes(include=['object'])
for column_name in sdf_obj.columns:
    states_df[column_name] = sdf_obj[column_name].str.strip()


# LOAD US_States.csv

In [11]:
# Duplicate check on the state key.
# Compare how many non-null State_Fips values exist with how many distinct
# values exist; equal numbers mean no FIPS code is repeated.
States_Fips_List = states_df['State_Fips']
States_Table_Count, Unique_States_Count = (
    States_Fips_List.count(),
    States_Fips_List.nunique(),
)
print(States_Table_Count, Unique_States_Count)

# When both numbers match, the states table has no duplicate FIPS
# and is ready to be loaded into the PostgreSQL table.

# LOAD US_States.csv
states_df.to_csv(path_or_buf='Resources/Schema/Us_States.csv', index=False)

60 60


# Transform

### Table 2 : 'US_Counties'

In [12]:
# Read the raw US counties table from disk.
counties_df = pd.read_csv(filepath_or_buffer='Resources/US_Counties.csv')

In [4]:
# Some states share county names, so each county is uniquely identified by its County_FIPS,
# a unique ID defined by the Federal Information Processing Standards (FIPS).

In [13]:
# Duplicate check on the county key.
# Compare how many non-null County_Fips values exist with how many distinct
# values exist; equal numbers mean no FIPS code is repeated.
County_Fips_List = counties_df['County_Fips']
County_Table_Count, Unique_Counties_Count = (
    County_Fips_List.count(),
    County_Fips_List.nunique(),
)
print(County_Table_Count, Unique_Counties_Count)

# When both numbers match, the county table has no duplicate FIPS.
# Note: count() and nunique() both skip nulls, so this comparison alone
# does not prove the column is null-free.

3198 3198


In [14]:
# Remove leading/trailing whitespace from every text (object-dtype) column
# of the counties table, one column at a time.
df_obj = counties_df.select_dtypes(include=['object'])
for column_name in df_obj.columns:
    counties_df[column_name] = df_obj[column_name].str.strip()


In [15]:
# Attach each county's State_Fips by joining the county's 'State' value to the
# states table's 'Sabbr' column. The left join keeps every county row; a county
# whose 'State' has no match in 'Sabbr' would get a null State_Fips.
# The explicit .copy() makes county_table an independent frame, so the column
# assignments below do not write into a slice of the merge result
# (which is what triggers pandas' SettingWithCopyWarning).
county_table = (
    counties_df
    .merge(states_df, how='left', left_on='State', right_on='Sabbr')
    .loc[:, ['County_Fips', 'County', 'State_Fips']]
    .copy()
)

# Normalise county names to title case.
county_table['County'] = county_table['County'].str.title()

# Trim leading/trailing whitespace from the remaining text columns.
df_obj = county_table.select_dtypes(['object'])
county_table[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# county_table_strip is the name the later cells use; it refers to the same frame.
county_table_strip = county_table

# This table is ready for PostgreSQL
county_table_strip

Unnamed: 0,County_Fips,County,State_Fips
0,1000,Alabama,1
1,1001,Autauga County,1
2,1003,Baldwin County,1
3,1005,Barbour County,1
4,1007,Bibb County,1
...,...,...,...
3193,56045,Weston County,56
3194,66000,Guam,66
3195,69000,Northern Mariana Islands,69
3196,72000,Puerto Rico,72


# LOAD Us_Counties.csv

In [16]:

# LOAD Us_Counties.csv
# Write the cleaned county table (without the DataFrame index) to the schema folder.
county_table_strip.to_csv(path_or_buf='Resources/Schema/Us_Counties.csv', index=False)

# TRANSFORM

### Table 3 : 'US_Covid_Data'

In [30]:
# Read the raw county-level COVID data from disk.
covid_df = pd.read_csv(filepath_or_buffer='Resources/US_Covid_Data.csv')

In [31]:
# Check for nulls in any column: True means the column has at least one missing value.
covid_df.isnull().any(axis=0)

date      False
county    False
state     False
fips       True
cases     False
deaths     True
dtype: bool

In [32]:
# The 'deaths' column may contain nulls, but 'fips' may not.
# Confirm whether any record is missing its fips so those rows can be cleaned.
covid_df['fips'].isna().to_numpy().any()

True

In [33]:
# Records with a state but an unknown county (null fips) are kept rather than
# dropped; collect them in their own frame.
na_covid_df = covid_df.loc[covid_df['fips'].isnull()]

In [34]:
# Strip leading and trailing spaces from the 'state' column.
# The result is assigned back (via assign, which returns a new frame) so the
# cleaned values are the ones actually used as the merge key below; calling
# strip without storing the result leaves the column unchanged.
na_covid_df = na_covid_df.assign(state=na_covid_df['state'].str.strip())

# Look up a fips for each null-fips record by matching its state name against
# the 'County' column of the county table. The left join keeps every record;
# rows whose state name has no match end up with a null County_Fips.
na_covid_df = na_covid_df.merge(
    county_table_strip, how="left", left_on="state", right_on="County"
)

# Keep the original covid columns, with County_Fips standing in for fips.
na_covid_df_final = na_covid_df[['date', 'county', 'state', 'County_Fips', 'cases', 'deaths']]
na_covid_df_final


Unnamed: 0,date,county,state,County_Fips,cases,deaths
0,2020-03-01,New York City,New York,36000,1,0.0
1,2020-03-01,Unknown,Rhode Island,44000,2,0.0
2,2020-03-02,New York City,New York,36000,1,0.0
3,2020-03-02,Unknown,Rhode Island,44000,2,0.0
4,2020-03-03,New York City,New York,36000,2,0.0
...,...,...,...,...,...,...
14422,2021-08-06,Unknown,Puerto Rico,72000,5896,2598.0
14423,2021-08-06,Unknown,Rhode Island,44000,13011,8.0
14424,2021-08-06,Unknown,Tennessee,47000,8551,95.0
14425,2021-08-06,Unknown,Utah,49000,1343,25.0


In [35]:
# The null-fips records now carry the fips looked up from the county table
# (used for the state-level "unknown county" rows).
# Rename that column so it matches the Us_Covid_Table layout.
na_covid_df_final = na_covid_df_final.rename({'County_Fips': 'county_fips'}, axis='columns')

# True here would mean some record still has no fips after the lookup.
na_covid_df_final['county_fips'].isna().to_numpy().any()

False

In [36]:
# Covid records that already have a fips, with the column renamed to
# 'county_fips' in the same step.
non_na_covid_df = (
    covid_df
    .loc[covid_df['fips'].notnull()]
    .rename({'fips': 'county_fips'}, axis='columns')
)
non_na_covid_df

Unnamed: 0,date,county,state,county_fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0
...,...,...,...,...,...,...
1592417,2021-08-06,Sweetwater,Wyoming,56037.0,5056,45.0
1592418,2021-08-06,Teton,Wyoming,56039.0,3960,11.0
1592419,2021-08-06,Uinta,Wyoming,56041.0,2498,14.0
1592420,2021-08-06,Washakie,Wyoming,56043.0,949,26.0


In [37]:
# Stack the repaired null-fips records on top of the records that already had
# a fips, then report which columns still contain nulls.
vertical_stack = pd.concat(objs=[na_covid_df_final, non_na_covid_df], axis='index')
vertical_stack.isna().any()

date           False
county         False
state          False
county_fips    False
cases          False
deaths          True
dtype: bool

In [53]:
# Cast the county_fips column to integers.
vertical_stack['county_fips'] = vertical_stack['county_fips'].astype(int)


In [54]:

# Keep only the columns written to the covid table: date, county key and the
# two counts.
us_covid_table = vertical_stack.loc[:, ['date', 'county_fips', 'cases', 'deaths']]
us_covid_table

# This table is ready for loading

Unnamed: 0,date,county_fips,cases,deaths
0,2020-03-01,36000,1,0.0
1,2020-03-01,44000,2,0.0
2,2020-03-02,36000,1,0.0
3,2020-03-02,44000,2,0.0
4,2020-03-03,36000,2,0.0
...,...,...,...,...
1592417,2021-08-06,56037,5056,45.0
1592418,2021-08-06,56039,3960,11.0
1592419,2021-08-06,56041,2498,14.0
1592420,2021-08-06,56043,949,26.0


# LOAD US_Covid_Data.csv

In [61]:

# LOAD US_Covid_Data.csv
# Write the cleaned covid table (without the DataFrame index) to the schema folder.
us_covid_table.to_csv(path_or_buf='Resources/Schema/US_Covid_Data.csv', index=False)

# Transform

### Table 4 : 'US_Census_Data_2020'

In [62]:
# Read the 2020 census estimates; the file is decoded as latin-1.
population_data = pd.read_csv('Resources/US_Census_Data_2020.csv', encoding='latin-1')

# Keep only rows whose COUNTY code is greater than 0.
# NOTE(review): the dropped COUNTY == 0 rows are presumably the state-level
# summary rows — confirm against the census file layout.
# .copy() makes the filtered result an independent frame, so columns can be
# added or overwritten later without pandas' SettingWithCopyWarning.
population_data = population_data.loc[population_data['COUNTY'] > 0].copy()
population_data

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019,RNETMIG2020
1,50,3,6,1,1,Alabama,Autauga County,54571,54582,54761,...,6.236931,-5.971016,-3.773344,2.206640,-1.529706,4.954403,0.993228,-0.018021,3.486011,6.290545
2,50,3,6,1,3,Alabama,Baldwin County,182265,182263,183121,...,16.705437,17.670696,22.924288,20.300088,17.902273,21.436499,22.476720,24.846335,25.242507,26.401562
3,50,3,6,1,5,Alabama,Barbour County,27457,27454,27325,...,0.329254,-6.860371,-8.093425,-5.063857,-15.677998,-18.377839,-25.138734,-8.790155,-6.257064,0.649799
4,50,3,6,1,7,Alabama,Bibb County,22915,22904,22858,...,-4.912927,-3.789130,-5.800695,1.420612,1.286202,-0.841769,-3.235672,-7.271592,0.268980,-7.199262
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57372,...,0.348029,-1.597971,-0.277742,-1.997117,-1.303543,-1.217158,6.193186,0.242275,0.934175,1.192544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,50,4,8,56,37,Wyoming,Sweetwater County,43806,43806,43580,...,0.776433,15.410190,-4.433558,-12.751566,-13.455712,-17.688190,-20.936101,-15.589918,-10.452355,-9.510457
3190,50,4,8,56,39,Wyoming,Teton County,21294,21298,21298,...,-2.340824,2.322071,23.284369,12.672811,4.881876,1.035867,-1.543805,-13.120659,0.171505,0.383943
3191,50,4,8,56,41,Wyoming,Uinta County,21118,21121,21090,...,-17.908599,-4.151853,-10.624866,-15.022486,-10.381621,-11.424990,-18.658892,-14.135663,-8.840598,-2.177625
3192,50,4,8,56,43,Wyoming,Washakie County,8533,8528,8531,...,-12.837122,-3.084040,-1.307423,-19.048760,0.000000,-15.064998,-16.056321,-16.101642,-7.638447,-6.801848


In [63]:
# Build a 5-character county key: STATE zero-padded to 2 characters followed
# by COUNTY zero-padded to 3 characters (e.g. 1 and 1 -> '01001').
# The vectorised .str.zfill replaces a per-element apply, and assign() returns
# a new frame instead of writing columns into the filtered frame in place.
state_code = population_data['STATE'].astype('str').str.zfill(2)
county_code = population_data['COUNTY'].astype('str').str.zfill(3)

# NOTE(review): 'fips' here is a zero-padded string, while the covid table's
# county_fips is cast to int — confirm the target schema expects both forms.
population_data = population_data.assign(
    STATE=state_code,
    COUNTY=county_code,
    fips=state_code + county_code,
)
population_data


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019,RNETMIG2020,fips
1,50,3,6,01,001,Alabama,Autauga County,54571,54582,54761,...,-5.971016,-3.773344,2.206640,-1.529706,4.954403,0.993228,-0.018021,3.486011,6.290545,01001
2,50,3,6,01,003,Alabama,Baldwin County,182265,182263,183121,...,17.670696,22.924288,20.300088,17.902273,21.436499,22.476720,24.846335,25.242507,26.401562,01003
3,50,3,6,01,005,Alabama,Barbour County,27457,27454,27325,...,-6.860371,-8.093425,-5.063857,-15.677998,-18.377839,-25.138734,-8.790155,-6.257064,0.649799,01005
4,50,3,6,01,007,Alabama,Bibb County,22915,22904,22858,...,-3.789130,-5.800695,1.420612,1.286202,-0.841769,-3.235672,-7.271592,0.268980,-7.199262,01007
5,50,3,6,01,009,Alabama,Blount County,57322,57322,57372,...,-1.597971,-0.277742,-1.997117,-1.303543,-1.217158,6.193186,0.242275,0.934175,1.192544,01009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,50,4,8,56,037,Wyoming,Sweetwater County,43806,43806,43580,...,15.410190,-4.433558,-12.751566,-13.455712,-17.688190,-20.936101,-15.589918,-10.452355,-9.510457,56037
3190,50,4,8,56,039,Wyoming,Teton County,21294,21298,21298,...,2.322071,23.284369,12.672811,4.881876,1.035867,-1.543805,-13.120659,0.171505,0.383943,56039
3191,50,4,8,56,041,Wyoming,Uinta County,21118,21121,21090,...,-4.151853,-10.624866,-15.022486,-10.381621,-11.424990,-18.658892,-14.135663,-8.840598,-2.177625,56041
3192,50,4,8,56,043,Wyoming,Washakie County,8533,8528,8531,...,-3.084040,-1.307423,-19.048760,0.000000,-15.064998,-16.056321,-16.101642,-7.638447,-6.801848,56043


In [64]:
# Keep the county key plus the 2016-2020 population estimates, births and
# deaths, in that order (all years of one measure before the next measure).
census_measures = ('POPESTIMATE', 'BIRTHS', 'DEATHS')
census_years = range(2016, 2021)
census_columns = ['fips'] + [
    f'{measure}{year}' for measure in census_measures for year in census_years
]
population_data = population_data[census_columns]
population_data
# Table is ready to load

Unnamed: 0,fips,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,BIRTHS2016,BIRTHS2017,BIRTHS2018,BIRTHS2019,BIRTHS2020,DEATHS2016,DEATHS2017,DEATHS2018,DEATHS2019,DEATHS2020
1,01001,55302,55448,55533,55769,56145,675,667,651,592,606,547,574,563,552,582
2,01003,207787,212737,218071,223565,229287,2286,2313,2297,2322,2317,2021,2103,2325,2386,2543
3,01005,25828,25169,24887,24657,24589,284,276,273,248,250,279,297,335,322,334
4,01007,22590,22532,22300,22313,22136,299,267,243,238,249,243,251,314,232,266
5,01009,57487,57801,57770,57840,57879,675,681,675,673,665,651,721,718,658,697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,56037,44319,43663,43188,42917,42673,626,571,529,481,481,303,312,331,301,319
3190,56039,23255,23383,23261,23385,23497,245,244,255,211,225,97,78,72,90,124
3191,56041,20711,20449,20299,20196,20215,312,272,267,232,232,143,151,130,156,171
3192,56043,8180,8013,7886,7824,7760,101,61,88,81,73,78,101,86,81,85



# LOAD US_Census_Data.csv


In [65]:
# Write the trimmed census table (without the DataFrame index) to the schema folder.
population_data.to_csv(path_or_buf='Resources/Schema/US_Census_Data.csv', index=False)

# Transform

### Table 5 : 'WHO_Flu_data'

In [70]:
# Read the WHO/NREVSS clinical lab flu data from disk.
who_flu_df = pd.read_csv('Resources/WHO_NREVSS_Clinical_Labs.csv')

# Drop Region Type (reassign instead of inplace=True so the step is explicit).
who_flu_df = who_flu_df.drop(columns=['REGION TYPE'])

# Replace the 'X' placeholder with 0 in every measurement column, then convert
# the column to a numeric dtype. Without the conversion a column that contained
# 'X' stays object-typed (text values mixed with the replacement zeros).
# pd.to_numeric raises if any other non-numeric text is present, which surfaces
# unexpected placeholders instead of silently keeping them.
flu_value_columns = [
    'TOTAL SPECIMENS',
    'TOTAL A',
    'TOTAL B',
    'PERCENT POSITIVE',
    'PERCENT A',
    'PERCENT B',
]
for flu_column in flu_value_columns:
    who_flu_df[flu_column] = pd.to_numeric(who_flu_df[flu_column].replace('X', '0'))
who_flu_df


Unnamed: 0,REGION,YEAR,WEEK,TOTAL SPECIMENS,TOTAL A,TOTAL B,PERCENT POSITIVE,PERCENT A,PERCENT B
0,Alabama,2019,40,512,2,13,2.93,0.39,2.54
1,Alaska,2019,40,0,0,0,0,0,0
2,Arizona,2019,40,278,0,2,0.72,0,0.72
3,Arkansas,2019,40,89,0,0,0,0,0
4,California,2019,40,1776,18,10,1.58,1.01,0.56
...,...,...,...,...,...,...,...,...,...
5125,Wisconsin,2021,29,0,0,0,0,0,0
5126,Wyoming,2021,29,0,0,0,0,0,0
5127,Puerto Rico,2021,29,0,0,0,0,0,0
5128,Virgin Islands,2021,29,0,0,0,0,0,0
