# Data Cleanup and Analysis Code


In [1]:
import pandas as pd
import requests

In [2]:
# query_url = "https://api.covidtracking.com/v2/us/daily.json"
# response = requests.get(query_url).json()

# Transform

### Table 1 : 'US_Counties'

In [3]:
# Load the county lookup table (columns: County_Fips, County, State) from the local Resources folder.
counties_df = pd.read_csv('Resources/US_Counties.csv')
# Plain-text dump of the frame; pandas elides the middle rows when printing a long frame.
print(counties_df)

      County_Fips             County State
0            1000            ALABAMA   NaN
1            1001     Autauga County    AL
2            1003     Baldwin County    AL
3            1005     Barbour County    AL
4            1007        Bibb County    AL
...           ...                ...   ...
3189        56037  Sweetwater County    WY
3190        56039       Teton County    WY
3191        56041       Uinta County    WY
3192        56043    Washakie County    WY
3193        56045      Weston County    WY

[3194 rows x 3 columns]


In [4]:
# Drop every row that has a NaN in any column. In the printed listing of the raw file the
# state-level rows (e.g. "ALABAMA", County_Fips 1000) have NaN in State, so they are removed here.
counties_df = counties_df.dropna(how='any')

# Some states share county names, so a county is uniquely identified by its County_Fips code
# (FIPS = Federal Information Processing Standards) rather than by its name.
# The sorted frame is only displayed; the result is not assigned back to counties_df.
counties_df.sort_values(by='County')

Unnamed: 0,County_Fips,County,State
2357,45001,Abbeville County,SC
1132,22001,Acadia Parish,LA
2867,51001,Accomack County,VA
564,16001,Ada County,ID
1011,21001,Adair County,KY
...,...,...,...
113,4027,Yuma County,AZ
313,8125,Yuma County,CO
2819,48505,Zapata County,TX
2820,48507,Zavala County,TX


In [5]:
# Check for duplicate county FIPS codes.
# Compare the number of non-null County_Fips values with the number of distinct values;
# the two are equal only when no FIPS code is repeated.
County_Fips_List = counties_df['County_Fips']
County_Table_Count = County_Fips_List.count()
Unique_Counties_Count = County_Fips_List.nunique()
print(County_Table_Count, Unique_Counties_Count)

# Enforce the invariant instead of relying on a reader comparing the two printed numbers,
# so an input file with repeated FIPS codes stops the notebook at this cell.
assert County_Table_Count == Unique_Counties_Count, "Duplicate County_Fips values found in counties_df"

3143 3143


### Table 2 : 'US_States'

In [6]:
# Load the state lookup table (columns: State_Fips, Sabbr, Sname) and display it.
# No rows are dropped in this cell.
states_df =  pd.read_csv('Resources/US_States.csv')
states_df

Unnamed: 0,State_Fips,Sabbr,Sname
0,1,AL,Alabama
1,2,AK,Alaska
2,4,AZ,Arizona
3,5,AR,Arkansas
4,6,CA,California
5,8,CO,Colorado
6,9,CT,Connecticut
7,10,DE,Delaware
8,11,DC,District of Columbia
9,12,FL,Florida


In [7]:
# Check for duplicate state FIPS codes.
# Compare the number of non-null State_Fips values with the number of distinct values;
# the two are equal only when no FIPS code is repeated.
States_Fips_List = states_df['State_Fips']
States_Table_Count = States_Fips_List.count()
Unique_States_Count = States_Fips_List.nunique()
print(States_Table_Count, Unique_States_Count)

# Enforce the invariant instead of relying on a reader comparing the two printed numbers,
# so an input file with repeated FIPS codes stops the notebook at this cell.
assert States_Table_Count == Unique_States_Count, "Duplicate State_Fips values found in states_df"

51 51


### Table 3 : 'US_Census_Data'

In [8]:
# Census Data
# Read the 2020 census estimates file and list its column labels.
# NOTE(review): encoding='latin-1' is presumably needed because the file is not valid UTF-8 — confirm.

pop_census_df = pd.read_csv('Resources/US_Census_Data_2020.csv', encoding='latin-1')
pop_census_df.keys()

Index(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME',
       'CENSUS2010POP', 'ESTIMATESBASE2010', 'POPESTIMATE2010',
       ...
       'RNETMIG2011', 'RNETMIG2012', 'RNETMIG2013', 'RNETMIG2014',
       'RNETMIG2015', 'RNETMIG2016', 'RNETMIG2017', 'RNETMIG2018',
       'RNETMIG2019', 'RNETMIG2020'],
      dtype='object', length=180)

In [9]:
# Narrow the 180-column census frame to the state/county identifiers and names
# plus the yearly population estimates for 2015 through 2020.
pop_census_df_clean = pop_census_df[
    [
        'STATE',
        'COUNTY',
        'STNAME',
        'CTYNAME',
        'POPESTIMATE2015',
        'POPESTIMATE2016',
        'POPESTIMATE2017',
        'POPESTIMATE2018',
        'POPESTIMATE2019',
        'POPESTIMATE2020',
    ]
]

In [10]:
# Select the rows whose COUNTY code is 0 and count them.
# NOTE(review): COUNTY == 0 appears to mark state-level summary rows (the count is 51) — confirm.
pop_census_df_State_List = pop_census_df_clean[pop_census_df_clean['COUNTY'] == 0]
pop_census_df_State_List['STATE'].count()

51

In [11]:
# Reload the county lookup table. Reading the raw CSV back into counties_df would
# reintroduce the state-level rows whose State is NaN, so the same NaN filter used
# when the table was first cleaned is applied again here.
counties_df = pd.read_csv('Resources/US_Counties.csv').dropna(how='any')

In [12]:
# Load the county-level COVID data (columns: date, county, state, fips, cases, deaths) and display it.
# NOTE(review): fips and deaths render as floats (e.g. 1001.0), which suggests missing values in those columns — confirm.
us_covid_data = pd.read_csv('Resources/US_Covid_Data.csv')
us_covid_data

Unnamed: 0,date,county,state,fips,cases,deaths
0,2021-07-07,Autauga,Alabama,1001.0,7277,113.0
1,2021-07-07,Baldwin,Alabama,1003.0,22154,316.0
2,2021-07-07,Barbour,Alabama,1005.0,2354,60.0
3,2021-07-07,Bibb,Alabama,1007.0,2699,65.0
4,2021-07-07,Blount,Alabama,1009.0,7013,139.0
...,...,...,...,...,...,...
97389,2021-08-05,Sweetwater,Wyoming,56037.0,5048,45.0
97390,2021-08-05,Teton,Wyoming,56039.0,3950,11.0
97391,2021-08-05,Uinta,Wyoming,56041.0,2482,14.0
97392,2021-08-05,Washakie,Wyoming,56043.0,949,26.0


In [13]:

# Number of distinct non-null fips codes that appear in the COVID data.
us_covid_data['fips'].nunique()

3218

In [19]:
# Load the WHO/NREVSS clinical-lab flu file and display it.
# Some rows hold the literal "X" in the count and percentage columns (see the displayed output),
# so those columns cannot be treated as numeric until such rows are handled.
flu_df = pd.read_csv('Resources/WHO_NREVSS_Clinical_Labs.csv') 
flu_df

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,TOTAL SPECIMENS,TOTAL A,TOTAL B,PERCENT POSITIVE,PERCENT A,PERCENT B
0,States,Alabama,2019,40,512,2,13,2.93,0.39,2.54
1,States,Alaska,2019,40,X,X,X,X,X,X
2,States,Arizona,2019,40,278,0,2,0.72,0,0.72
3,States,Arkansas,2019,40,89,0,0,0,0,0
4,States,California,2019,40,1776,18,10,1.58,1.01,0.56
...,...,...,...,...,...,...,...,...,...,...
5125,States,Wisconsin,2021,29,X,X,X,X,X,X
5126,States,Wyoming,2021,29,X,X,X,X,X,X
5127,States,Puerto Rico,2021,29,0,0,0,0,0,0
5128,States,Virgin Islands,2021,29,X,X,X,X,X,X


In [21]:
# List the column labels of the flu frame.
flu_df.columns

Index(['REGION TYPE', 'REGION', 'YEAR', 'WEEK', 'TOTAL SPECIMENS', 'TOTAL A',
       'TOTAL B', 'PERCENT POSITIVE', 'PERCENT A', 'PERCENT B'],
      dtype='object')

In [30]:
# Index labels of the rows whose 'TOTAL A' holds the placeholder "X" instead of a count.
# In the displayed sample such rows have "X" in every measurement column.
AX_values = flu_df[flu_df['TOTAL A'] == "X"].index
# Rebind flu_df to a new frame instead of dropping in place, so the frame object an
# earlier cell loaded and displayed is not mutated; re-running this cell is a no-op.
flu_df = flu_df.drop(index=AX_values)
flu_df

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,TOTAL SPECIMENS,TOTAL A,TOTAL B,PERCENT POSITIVE,PERCENT A,PERCENT B,TOTAL FLU
0,States,Alabama,2019,40,512,2,13,2.93,0.39,2.54,213
2,States,Arizona,2019,40,278,0,2,0.72,0,0.72,02
3,States,Arkansas,2019,40,89,0,0,0,0,0,00
4,States,California,2019,40,1776,18,10,1.58,1.01,0.56,1810
5,States,Colorado,2019,40,458,2,0,0.44,0.44,0,20
...,...,...,...,...,...,...,...,...,...,...,...
5121,States,Vermont,2021,29,38,0,0,0,0,0,00
5122,States,Virginia,2021,29,232,0,0,0,0,0,00
5123,States,Washington,2021,29,326,0,0,0,0,0,00
5124,States,West Virginia,2021,29,90,0,0,0,0,0,00


In [41]:
# Total positive flu specimens per row: influenza A count plus influenza B count.
# Both columns are passed through pd.to_numeric before adding, so the sum is arithmetic
# (15 for 2 and 13) rather than string concatenation.
combined_flu = pd.to_numeric(flu_df["TOTAL A"]).add(pd.to_numeric(flu_df["TOTAL B"]))
# Store the sum as a new column on the flu frame, then show the resulting Series.
flu_df["TOTAL FLU"] = combined_flu
combined_flu

0       15
2        2
3        0
4       28
5        2
        ..
5121     0
5122     0
5123     0
5124     0
5127     0
Length: 4003, dtype: int64

In [42]:
# Display the flu frame to inspect it, including the 'TOTAL FLU' column.
flu_df

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,TOTAL SPECIMENS,TOTAL A,TOTAL B,PERCENT POSITIVE,PERCENT A,PERCENT B,TOTAL FLU
0,States,Alabama,2019,40,512,2,13,2.93,0.39,2.54,15
2,States,Arizona,2019,40,278,0,2,0.72,0,0.72,2
3,States,Arkansas,2019,40,89,0,0,0,0,0,0
4,States,California,2019,40,1776,18,10,1.58,1.01,0.56,28
5,States,Colorado,2019,40,458,2,0,0.44,0.44,0,2
...,...,...,...,...,...,...,...,...,...,...,...
5121,States,Vermont,2021,29,38,0,0,0,0,0,0
5122,States,Virginia,2021,29,232,0,0,0,0,0,0
5123,States,Washington,2021,29,326,0,0,0,0,0,0
5124,States,West Virginia,2021,29,90,0,0,0,0,0,0


In [44]:
# Keep only the columns needed downstream: where, when, how many specimens were
# tested, and the combined flu count.
clean_flu_df = flu_df[
    [
        'REGION',
        'YEAR',
        'WEEK',
        'TOTAL SPECIMENS',
        'TOTAL FLU',
    ]
]
clean_flu_df

Unnamed: 0,REGION,YEAR,WEEK,TOTAL SPECIMENS,TOTAL FLU
0,Alabama,2019,40,512,15
2,Arizona,2019,40,278,2
3,Arkansas,2019,40,89,0
4,California,2019,40,1776,28
5,Colorado,2019,40,458,2
...,...,...,...,...,...
5121,Vermont,2021,29,38,0
5122,Virginia,2021,29,232,0
5123,Washington,2021,29,326,0
5124,West Virginia,2021,29,90,0


In [48]:
# Replace the upper-case source headers with readable column names.
clean_flu_df = clean_flu_df.rename(
    columns={
        'REGION': 'State',
        'YEAR': 'Year',
        'WEEK': 'Week',
        'TOTAL SPECIMENS': 'Total tested specimen',
        'TOTAL FLU': 'Flu Total',
    }
)


Unnamed: 0,State,Year,Week,Total tested specimen,Flu Total
0,Alabama,2019,40,512,15
2,Arizona,2019,40,278,2
3,Arkansas,2019,40,89,0
4,California,2019,40,1776,28
5,Colorado,2019,40,458,2
...,...,...,...,...,...
5121,Vermont,2021,29,38,0
5122,Virginia,2021,29,232,0
5123,Washington,2021,29,326,0
5124,West Virginia,2021,29,90,0


In [49]:
# Title-case the specimen column so its capitalization matches the other headers.
clean_flu_df = clean_flu_df.rename(
    columns={
        'Total tested specimen': 'Total Tested Specimen',
    }
)

In [50]:
# Display the cleaned flu frame with its final column names.
clean_flu_df

Unnamed: 0,State,Year,Week,Total Tested Specimen,Flu Total
0,Alabama,2019,40,512,15
2,Arizona,2019,40,278,2
3,Arkansas,2019,40,89,0
4,California,2019,40,1776,28
5,Colorado,2019,40,458,2
...,...,...,...,...,...
5121,Vermont,2021,29,38,0
5122,Virginia,2021,29,232,0
5123,Washington,2021,29,326,0
5124,West Virginia,2021,29,90,0
