# Data Quality Report 

## Import Relevant Modules

In [1]:
#Import the required packages

#Import package pandas for data analysis
import pandas as pd

#Import package numpy for numeric computing
import numpy as np

#Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#Import package seaborn for visualisation
import seaborn as sns

#For showing plots directly in the notebook run the command below
%matplotlib inline

## 1. Initial Data Investigation
##### In this section, a general overview of the data will be displayed to become familiar with the dataset.

In [7]:
# Reading from a csv file, into a data frame
df = pd.read_csv('covid19-cdc-13336431.csv')
# Show data frame first few rows
df.head(10)

Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/12/22,,,,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Missing,Missing,No,Missing
1,2020/07/30,2020/08/15,2020/08/12,2020/07/30,Laboratory-confirmed case,Female,40 - 49 Years,Hispanic/Latino,No,Unknown,No,No
2,2020/10/12,,,,Laboratory-confirmed case,Female,40 - 49 Years,Unknown,Unknown,Missing,No,Missing
3,2020/06/10,2020/06/10,,2020/06/10,Laboratory-confirmed case,Female,60 - 69 Years,Unknown,No,Missing,No,Missing
4,2020/06/26,,,,Laboratory-confirmed case,Female,60 - 69 Years,Unknown,Missing,Missing,No,Missing
5,2020/12/09,,,,Laboratory-confirmed case,Male,20 - 29 Years,Unknown,Unknown,Missing,No,Missing
6,2020/12/05,,,,Laboratory-confirmed case,Male,40 - 49 Years,Unknown,Missing,Missing,No,Missing
7,2020/09/21,2020/10/01,,2020/09/21,Laboratory-confirmed case,Male,30 - 39 Years,"White, Non-Hispanic",No,No,No,No
8,2020/03/27,2020/06/10,,2020/03/27,Laboratory-confirmed case,Male,30 - 39 Years,Unknown,Yes,Unknown,No,Yes
9,2021/01/14,2021/01/14,,,Laboratory-confirmed case,Female,50 - 59 Years,Unknown,Missing,Missing,No,Missing


In [8]:
# Show data frame first 5 rows
df.head(5)

Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/12/22,,,,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Missing,Missing,No,Missing
1,2020/07/30,2020/08/15,2020/08/12,2020/07/30,Laboratory-confirmed case,Female,40 - 49 Years,Hispanic/Latino,No,Unknown,No,No
2,2020/10/12,,,,Laboratory-confirmed case,Female,40 - 49 Years,Unknown,Unknown,Missing,No,Missing
3,2020/06/10,2020/06/10,,2020/06/10,Laboratory-confirmed case,Female,60 - 69 Years,Unknown,No,Missing,No,Missing
4,2020/06/26,,,,Laboratory-confirmed case,Female,60 - 69 Years,Unknown,Missing,Missing,No,Missing


In [4]:
# Show data frame last 5 rows
df.tail(5)

Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
9995,2020/04/24,2020/04/29,,2020/04/24,Laboratory-confirmed case,Female,60 - 69 Years,"White, Non-Hispanic",No,Missing,No,Missing
9996,2020/11/23,2020/11/27,,2020/11/23,Laboratory-confirmed case,Male,30 - 39 Years,"Asian, Non-Hispanic",No,Missing,No,Missing
9997,2021/01/15,,,,Laboratory-confirmed case,Female,20 - 29 Years,Unknown,Missing,Missing,No,Missing
9998,2020/12/21,2020/12/21,2020/12/21,,Laboratory-confirmed case,Female,50 - 59 Years,Unknown,Unknown,Unknown,No,Unknown
9999,2020/03/29,2020/04/05,2020/04/01,2020/04/02,Laboratory-confirmed case,Male,60 - 69 Years,"Multiple/Other, Non-Hispanic",No,Unknown,No,Unknown


In [5]:
# Check how many rows and columns this dataframe has
df.shape

(10000, 12)

In [6]:
#How many rows/instances?
df.shape[0]

10000

In [10]:
#How many columns/features?
df.shape[1]

12

In [17]:

df.info

<bound method DataFrame.info of      cdc_case_earliest_dt cdc_report_dt pos_spec_dt    onset_dt  \
0              2020/12/22           NaN         NaN         NaN   
1              2020/07/30    2020/08/15  2020/08/12  2020/07/30   
2              2020/10/12           NaN         NaN         NaN   
3              2020/06/10    2020/06/10         NaN  2020/06/10   
4              2020/06/26           NaN         NaN         NaN   
...                   ...           ...         ...         ...   
9995           2020/04/24    2020/04/29         NaN  2020/04/24   
9996           2020/11/23    2020/11/27         NaN  2020/11/23   
9997           2021/01/15           NaN         NaN         NaN   
9998           2020/12/21    2020/12/21  2020/12/21         NaN   
9999           2020/03/29    2020/04/05  2020/04/01  2020/04/02   

                 current_status     sex      age_group  \
0     Laboratory-confirmed case    Male  10 - 19 Years   
1     Laboratory-confirmed case  Female  40 - 4

In [9]:
# Show the data types in each column.
df.dtypes

cdc_case_earliest_dt       object
cdc_report_dt              object
pos_spec_dt                object
onset_dt                   object
current_status             object
sex                        object
age_group                  object
race_ethnicity_combined    object
hosp_yn                    object
icu_yn                     object
death_yn                   object
medcond_yn                 object
dtype: object

## 2. Data Cleaning and Preparation
#### This section will ensure data is prepared and cleaned by converting features to appropriate data types and looking for duplicate rows and columns and dropping any columns that are irrelevant or unnecessary

In [24]:
#Convert date time features to appropriate date time data types
df['cdc_case_earliest_dt'] = df['cdc_case_earliest_dt'].astype('datetime64[ns]')
df['cdc_report_dt'] = df['cdc_report_dt'].astype('datetime64[ns]')
df['pos_spec_dt'] = df['pos_spec_dt'].astype('datetime64[ns]')
df['onset_dt'] = df['onset_dt'].astype('datetime64[ns]')
df.dtypes

cdc_case_earliest_dt       datetime64[ns]
cdc_report_dt              datetime64[ns]
pos_spec_dt                datetime64[ns]
onset_dt                   datetime64[ns]
current_status                   category
sex                              category
age_group                        category
race_ethnicity_combined          category
hosp_yn                          category
icu_yn                           category
death_yn                         category
medcond_yn                       category
dtype: object

In [25]:
#Convert categorical features to appropriate category data types
df['current_status'] = df['current_status'].astype('category')
df['sex'] = df['sex'].astype('category')
df['age_group'] = df['age_group'].astype('category')
df['race_ethnicity_combined'] = df['race_ethnicity_combined'].astype('category')
df['hosp_yn'] = df['hosp_yn'].astype('category')
df['icu_yn'] = df['icu_yn'].astype('category')
df['death_yn'] = df['death_yn'].astype('category')
df['medcond_yn'] = df['medcond_yn'].astype('category')


df.dtypes

cdc_case_earliest_dt       datetime64[ns]
cdc_report_dt              datetime64[ns]
pos_spec_dt                datetime64[ns]
onset_dt                   datetime64[ns]
current_status                   category
sex                              category
age_group                        category
race_ethnicity_combined          category
hosp_yn                          category
icu_yn                           category
death_yn                         category
medcond_yn                       category
dtype: object