In [10]:
# Importing Relevant Dependencies
import pandas as pd
from matplotlib import pyplot as plt
import scipy.stats as st


cdcPath = "../../../COVID-19_Case_Surveillance_Public_Use_Data.csv"

In [16]:
# Create data frame from CSV file variable(path) using pandas .read_csv() method
cdcDf = pd.read_csv(cdcPath, low_memory=False)

# Print out first 10 rows for inspection
cdcDf.head(10)

Unnamed: 0,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Unknown,No,No
1,2020/11/14,2020/11/10,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
2,2020/11/19,2020/11/10,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
3,2020/11/14,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,No,Missing
4,2020/11/13,2020/11/10,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,Yes
5,2020/11/17,2020/11/10,2020/11/08,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,Missing,Missing
6,2020/11/14,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Missing,Missing,Missing
7,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,Missing,Missing
8,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Unknown,Unknown,No,Unknown
9,2020/11/17,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,Missing,Missing


In [17]:
# Dropping unnecessary columns
cutCdcDf = cdcDf.drop(['pos_spec_dt', 'onset_dt','icu_yn','death_yn'], axis=1)

In [18]:
# Checking values in Race and Ethnicity to identify possible invalid/missing values 
cdcDf['Race and ethnicity (combined)'].value_counts()

Unknown                                                 2781176
White, Non-Hispanic                                     2599410
Hispanic/Latino                                         1195739
Black, Non-Hispanic                                      736584
Missing                                                  601519
Multiple/Other, Non-Hispanic                             270509
Asian, Non-Hispanic                                      143080
American Indian/Alaska Native, Non-Hispanic               59842
Native Hawaiian/Other Pacific Islander, Non-Hispanic      17213
Name: Race and ethnicity (combined), dtype: int64

In [22]:
# Dropping missing or unknown data using conditional formatting
ethUnkDf = cutCdcDf[(cutCdcDf['Race and ethnicity (combined)'] != 'Missing') &
                    (cutCdcDf['Race and ethnicity (combined)'] != 'Unknown') ]
ethUnkDf.head()

Unnamed: 0,cdc_report_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,medcond_yn
0,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
1,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
2,2020/11/19,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
3,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing
4,2020/11/13,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes


In [24]:
# Re-Checking values in Race and Ethnicity for conditional formatting accuracy
ethUnkDf['Race and ethnicity (combined)'].value_counts()

White, Non-Hispanic                                     2599410
Hispanic/Latino                                         1195739
Black, Non-Hispanic                                      736584
Multiple/Other, Non-Hispanic                             270509
Asian, Non-Hispanic                                      143080
American Indian/Alaska Native, Non-Hispanic               59842
Native Hawaiian/Other Pacific Islander, Non-Hispanic      17213
Name: Race and ethnicity (combined), dtype: int64

In [25]:
# Checking values in the Age Groups
ethUnkDf['age_group'].value_counts()

20 - 29 Years    951360
30 - 39 Years    799407
40 - 49 Years    742803
50 - 59 Years    727465
60 - 69 Years    531308
10 - 19 Years    503431
70 - 79 Years    316264
80+ Years        254563
0 - 9 Years      185198
Unknown           10507
Name: age_group, dtype: int64

In [28]:
# Dropping unknown values from the age groups
unkAgeDf = ethUnkDf[ethUnkDf['age_group'] != 'Unknown']

# Checking for accuracy (removable)
##unkAgeDf['age_group'].value_counts()
unkAgeDf.head(10)

Unnamed: 0,cdc_report_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,medcond_yn
0,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
1,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
2,2020/11/19,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
3,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing
4,2020/11/13,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
5,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing
6,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Missing
7,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing
8,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Unknown,Unknown
9,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing


In [29]:
# Checking hospitilization values
unkAgeDf['hosp_yn'].value_counts()

No         2591276
Missing    1556663
Unknown     443552
Yes         420386
Name: hosp_yn, dtype: int64

In [38]:
# Dropping missing and unknown values from hospitilization
misHospDf = unkAgeDf[(unkAgeDf['hosp_yn'] != 'Missing') &
                     (unkAgeDf['hosp_yn'] != 'Unknown')]

misHospDf.head(10)

Unnamed: 0,cdc_report_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,medcond_yn
0,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
1,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
2,2020/11/19,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
4,2020/11/13,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
6,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Missing
11,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
13,2020/11/15,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Missing
14,2020/11/06,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
16,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
23,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes


In [33]:
# Checking values for medical conditions
misHospDf['medcond_yn'].value_counts()

Missing    1811865
Yes         579151
No          459692
Unknown     160954
Name: medcond_yn, dtype: int64

In [51]:
# Dropping missing and unknown values from conditions
misMedDf = misHospDf[(misHospDf['medcond_yn'] != 'Missing') &
                     (misHospDf['medcond_yn'] != 'Unknown')]

misMedDf.head(10)

Unnamed: 0,cdc_report_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,medcond_yn
0,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
1,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
2,2020/11/19,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
4,2020/11/13,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
11,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
14,2020/11/06,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
16,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
23,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
29,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
34,2020/11/02,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes


In [52]:
# Checking values in the sex column
misMedDf['sex'].value_counts()

Female     550587
Male       483751
Unknown      3403
Missing      1010
Other          83
Name: sex, dtype: int64

In [53]:
# Removing missing and unknown values from the sex column
misSexDf = misMedDf[(misMedDf['sex'] != 'Missing') &
                    (misMedDf['sex'] != 'Unknown') &
                    (misMedDf['sex'] != 'Other') ]

misSexDf.head()

Unnamed: 0,cdc_report_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,medcond_yn
0,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
1,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
2,2020/11/19,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
4,2020/11/13,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
11,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No


In [54]:
# Checking values in the status column
misSexDf['current_status'].value_counts()

Laboratory-confirmed case    991896
Probable Case                 42451
Name: current_status, dtype: int64

In [55]:
# Removing non-laboratory confirmed cases
cleanCdcDf = misSexDf[misSexDf['current_status'] != 'Probable Case']
cleanCdcDf.head(20)

Unnamed: 0,cdc_report_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,medcond_yn
0,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
1,2020/11/14,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
2,2020/11/19,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
4,2020/11/13,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
11,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No
14,2020/11/06,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
16,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
23,2020/11/17,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
29,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes
34,2020/11/02,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Yes


In [56]:
# Renaming Column Headers for Visualization and Analyis Purposes
rnCdcDf = cleanCdcDf.rename(columns={"cdc_report_dt": "CDC Report Date",
                                     "current_status": "Current Status",
                                     "sex": "Gender",
                                     "age_group": "Age Group",
                                     "Race and ethnicity (combined)": "Race/Ethnicity",
                                     "hosp_yn": "Hospitalization Status",
                                     "medcond_yn": "Pre- Existing Med Condition"})

# Setting index for as report date and sorting them from newest to oldest (can be switched if needed)
rnCdcDf.set_index('CDC Report Date', inplace=True)
rnCdcDf.sort_values("CDC Report Date", ascending= True)

Unnamed: 0_level_0,Current Status,Gender,Age Group,Race/Ethnicity,Hospitalization Status,Pre- Existing Med Condition
CDC Report Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020/01/01,Laboratory-confirmed case,Female,60 - 69 Years,"White, Non-Hispanic",No,Yes
2020/01/01,Laboratory-confirmed case,Male,40 - 49 Years,"Black, Non-Hispanic",Yes,No
2020/01/02,Laboratory-confirmed case,Male,30 - 39 Years,"White, Non-Hispanic",No,Yes
2020/01/09,Laboratory-confirmed case,Female,40 - 49 Years,"White, Non-Hispanic",No,No
2020/01/11,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,Yes
...,...,...,...,...,...,...
2020/11/19,Laboratory-confirmed case,Male,50 - 59 Years,"Asian, Non-Hispanic",No,Yes
2020/11/19,Laboratory-confirmed case,Female,40 - 49 Years,Hispanic/Latino,No,No
2020/11/19,Laboratory-confirmed case,Male,50 - 59 Years,"Asian, Non-Hispanic",No,Yes
2020/11/19,Laboratory-confirmed case,Female,40 - 49 Years,"White, Non-Hispanic",No,No
