# CPSC 368 Data Cleaning Notebook (KNM Neighbours)

## Loading Data and Packages

In [1]:
import numpy as np
import pandas as pd

## KFF Data Cleaning

There are three KFF datasets: one for all adults aged 19-64, and one each for males and females aged 19-64. Each file is cleaned so that only the data table itself remains; the other miscellaneous information (the lines surrounding the table and the `Footnotes` column) is removed.

In [2]:
def _load_kff_table(csv_path):
    """Read one KFF export and return only its data table.

    The first two lines of the file are skipped, at most 53 data rows are
    read, and the 'Footnotes' column is dropped.
    """
    kff_table = pd.read_csv(csv_path, skiprows=2, nrows=53)
    return kff_table.drop(columns='Footnotes')


# One table per population group; all three files share the same layout.
KFF2019_adult_original = _load_kff_table('final_datasets_V1/KFF/kff_health_insurance_2019_adult.csv')
KFF2019_female_original = _load_kff_table('final_datasets_V1/KFF/kff_health_insurance_2019_female.csv')
KFF2019_male_original = _load_kff_table('final_datasets_V1/KFF/kff_health_insurance_2019_male.csv')

In [3]:
# Write each cleaned KFF table to the cleaned-data folder (row index omitted).
kff_exports = {
    "final_datasets_V1/cleaned/KFF2019_adult.csv": KFF2019_adult_original,
    "final_datasets_V1/cleaned/KFF2019_female.csv": KFF2019_female_original,
    "final_datasets_V1/cleaned/KFF2019_male.csv": KFF2019_male_original,
}
for cleaned_path, kff_table in kff_exports.items():
    kff_table.to_csv(cleaned_path, index=False)

In [4]:
# Inspect each KFF table: schema/dtypes, summary statistics, and per-column null counts.
for kff_table in (KFF2019_adult_original, KFF2019_female_original, KFF2019_male_original):
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() only renders a stray "None".
    kff_table.info()
    display(kff_table.describe())
    display(kff_table.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Location   53 non-null     object 
 1   Employer   53 non-null     float64
 2   Non-Group  53 non-null     float64
 3   Medicaid   53 non-null     float64
 4   Medicare   53 non-null     float64
 5   Military   53 non-null     float64
 6   Uninsured  53 non-null     float64
 7   Total      53 non-null     float64
dtypes: float64(7), object(1)
memory usage: 3.4+ KB


None

Unnamed: 0,Employer,Non-Group,Medicaid,Medicare,Military,Uninsured,Total
count,53.0,53.0,53.0,53.0,53.0,53.0,53.0
mean,0.615604,0.076075,0.149623,0.02166,0.018094,0.118849,1.0
std,0.066573,0.018413,0.063945,0.007643,0.008998,0.043453,0.0
min,0.305,0.036,0.061,0.009,0.005,0.044,1.0
25%,0.584,0.068,0.101,0.016,0.011,0.087,1.0
50%,0.618,0.073,0.141,0.02,0.018,0.114,1.0
75%,0.658,0.081,0.182,0.025,0.022,0.149,1.0
max,0.704,0.131,0.429,0.042,0.047,0.245,1.0


Location     0
Employer     0
Non-Group    0
Medicaid     0
Medicare     0
Military     0
Uninsured    0
Total        0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Location   53 non-null     object 
 1   Employer   53 non-null     float64
 2   Non-Group  53 non-null     float64
 3   Medicaid   53 non-null     float64
 4   Medicare   53 non-null     float64
 5   Military   52 non-null     float64
 6   Uninsured  53 non-null     float64
 7   Total      53 non-null     float64
dtypes: float64(7), object(1)
memory usage: 3.4+ KB


None

Unnamed: 0,Employer,Non-Group,Medicaid,Medicare,Military,Uninsured,Total
count,53.0,53.0,53.0,53.0,52.0,53.0,53.0
mean,0.610698,0.077642,0.169774,0.021377,0.016904,0.103717,1.0
std,0.06559,0.019097,0.067315,0.007425,0.01122,0.042867,0.0
min,0.306,0.035,0.067,0.006,0.004,0.027,1.0
25%,0.582,0.069,0.118,0.017,0.009,0.075,1.0
50%,0.614,0.074,0.164,0.02,0.0155,0.098,1.0
75%,0.657,0.084,0.198,0.026,0.021,0.126,1.0
max,0.699,0.138,0.451,0.038,0.065,0.232,1.0


Location     0
Employer     0
Non-Group    0
Medicaid     0
Medicare     0
Military     1
Uninsured    0
Total        0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Location   53 non-null     object 
 1   Employer   53 non-null     float64
 2   Non-Group  53 non-null     float64
 3   Medicaid   53 non-null     float64
 4   Medicare   53 non-null     float64
 5   Military   53 non-null     float64
 6   Uninsured  53 non-null     float64
 7   Total      53 non-null     float64
dtypes: float64(7), object(1)
memory usage: 3.4+ KB


None

Unnamed: 0,Employer,Non-Group,Medicaid,Medicare,Military,Uninsured,Total
count,53.0,53.0,53.0,53.0,53.0,53.0,53.0
mean,0.620698,0.074415,0.128623,0.022094,0.019377,0.134623,1.0
std,0.067915,0.018067,0.060703,0.008383,0.00786,0.045389,0.0
min,0.304,0.037,0.051,0.01,0.006,0.056,1.0
25%,0.586,0.065,0.083,0.017,0.013,0.101,1.0
50%,0.629,0.071,0.124,0.021,0.02,0.126,1.0
75%,0.667,0.078,0.159,0.026,0.025,0.167,1.0
max,0.717,0.123,0.404,0.047,0.046,0.259,1.0


Location     0
Employer     0
Non-Group    0
Medicaid     0
Medicare     0
Military     0
Uninsured    0
Total        0
dtype: int64

## U.S. Chronic Disease Indicators

The U.S. Chronic Disease Indicators dataset contains many types of data across a variety of topics. Given our topic questions, we filter the data to the topics 'Cardiovascular Disease' and 'Cancer', keeping only values reported per 100,000 and stratified by sex, age, or overall. Any NA values are filled with 0, under the assumption that the data could not be collected at all.

In [5]:
# Load the complete, unfiltered CDC export; filtering is done in a later cell.
uscdi_csv_path = 'final_datasets_V1/CDC/U.S._Chronic_Disease_Indicators.csv'
USCDI_original = pd.read_csv(uscdi_csv_path)

In [6]:
# Row filters: the two topics of interest, rate units expressed per 100,000,
# and only the Sex / Age / Overall stratification categories.
condition_0_topic = USCDI_original["Topic"].isin(['Cardiovascular Disease', 'Cancer'])
condition_0_dvu = USCDI_original["DataValueUnit"].isin(['cases per 100,000', 'per 100,000'])
condition_0_sc1 = USCDI_original["StratificationCategory1"].isin(["Sex", "Age", "Overall"])
uscdi_row_mask = condition_0_topic & condition_0_dvu & condition_0_sc1

# Columns carried into the cleaned dataset, in output order.
uscdi_keep_columns = [
    "YearStart", "YearEnd", "LocationDesc",
    "Topic", "Question", "DataValueUnit",
    "DataValueType", "DataValue",
    "StratificationCategory1", "Stratification1",
]

# NOTE(review): fillna(0) is applied to every kept column, not only DataValue,
# so a filled 0 cannot be told apart from a reported 0 afterwards.
USCDI_filter = USCDI_original.loc[uscdi_row_mask, uscdi_keep_columns].fillna(0)
USCDI_filter.to_csv('final_datasets_V1/cleaned/USCDI_filter.csv', index=False)

In [7]:
# Inspect the filtered table: schema/dtypes, summary statistics, null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only renders a stray "None".
USCDI_filter.info()
display(USCDI_filter.describe())
display(USCDI_filter.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 8592 entries, 115 to 274446
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   YearStart                8592 non-null   int64  
 1   YearEnd                  8592 non-null   int64  
 2   LocationDesc             8592 non-null   object 
 3   Topic                    8592 non-null   object 
 4   Question                 8592 non-null   object 
 5   DataValueUnit            8592 non-null   object 
 6   DataValueType            8592 non-null   object 
 7   DataValue                8592 non-null   float64
 8   StratificationCategory1  8592 non-null   object 
 9   Stratification1          8592 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 738.4+ KB


None

Unnamed: 0,YearStart,YearEnd,DataValue
count,8592.0,8592.0,8592.0
mean,2017.706006,2019.745112,128.980575
std,2.348503,0.718885,188.041781
min,2015.0,2019.0,0.0
25%,2015.0,2019.0,17.7
50%,2016.0,2020.0,47.7
75%,2020.0,2020.0,167.625
max,2021.0,2021.0,1456.5


YearStart                  0
YearEnd                    0
LocationDesc               0
Topic                      0
Question                   0
DataValueUnit              0
DataValueType              0
DataValue                  0
StratificationCategory1    0
Stratification1            0
dtype: int64