# CPSC 368 Data Cleaning Notebook (KNM Neighbours)

## Loading Data and Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
KFF2019_adult_original = pd.read_csv('final_datasets_V1/KFF/kff_health_insurance_2019_adult.csv', skiprows = 2, nrows = 53)
KFF2019_female_original = pd.read_csv('final_datasets_V1/KFF/kff_health_insurance_2019_female.csv', skiprows = 2, nrows = 53)
KFF2019_male_original = pd.read_csv('final_datasets_V1/KFF/kff_health_insurance_2019_male.csv', skiprows = 2, nrows = 53)

In [3]:
KFF2019_adult_original["Group"] = "All_Uninsured"
KFF2019_female_original["Group"] = "Female_Uninsured"
KFF2019_male_original["Group"] = "Male_Uninsured"

KFF2019 = pd.concat([KFF2019_female_original, KFF2019_male_original, KFF2019_adult_original], 
                    ignore_index=True)[["Location", "Group", "Uninsured"]]
KFF2019_new1 = KFF2019[KFF2019['Location'] != 'United States']

KFF2019_new = KFF2019_new1.pivot(index='Location', columns='Group', values='Uninsured').fillna(0)
KFF2019_new.index.name = 'Location'
KFF2019_new.reset_index(inplace=True)
KFF2019_new.rename(columns={'Group': 'Index'}, inplace=True)

KFF2019_new.shape

(52, 4)

In [4]:
KFF2019_new.to_csv("final_datasets_V1/cleaned/KFF2019_new.csv")
KFF2019_new.tail()

Group,Location,All_Uninsured,Female_Uninsured,Male_Uninsured
47,Virginia,0.112,0.094,0.131
48,Washington,0.094,0.082,0.106
49,West Virginia,0.099,0.08,0.117
50,Wisconsin,0.082,0.067,0.097
51,Wyoming,0.167,0.167,0.167


In [5]:
USCDI_original = pd.read_csv('final_datasets_V1/CDC/U.S._Chronic_Disease_Indicators.csv')

In [6]:
USCDI = USCDI_original[["YearStart","YearEnd","LocationDesc",
                        "Topic","Question","DataValueUnit", "DataValueType", "DataValue",
                        "StratificationCategory1", "Stratification1"]]
USCDI.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValueUnit,DataValueType,DataValue,StratificationCategory1,Stratification1
0,2019,2019,Arkansas,Diabetes,Diabetes among adults,%,Crude Prevalence,13.6,Sex,Male
1,2019,2019,Idaho,Diabetes,Diabetes among adults,%,Crude Prevalence,10.6,Sex,Male
2,2019,2019,Indiana,Sleep,Short sleep duration among high school students,%,Crude Prevalence,,Grade,Grade 12
3,2019,2019,Iowa,Asthma,"Asthma mortality among all people, underlying ...",Number,Number,54.0,Overall,Overall
4,2019,2019,Iowa,Asthma,Current asthma among adults,%,Crude Prevalence,10.3,Age,Age 18-44


### Impact by Sex 

In [7]:
condition_1_t = USCDI["Topic"] == "Cardiovascular Disease"
condition_1_q = USCDI["Question"] == "Coronary heart disease mortality among all people, underlying cause"
condition_1_dvu = USCDI["DataValueUnit"] == "cases per 100,000"
condition_1_syear = USCDI["YearStart"] == 2019
condition_1_eyear = USCDI["YearEnd"] == 2019
# condition_1_ld = USCDI["LocationDesc"].isin(["Texas", "Massachusetts"])

condition_1_sc1 = USCDI["StratificationCategory1"].isin(["Sex", "Age"])
condition_1_dvt = USCDI["DataValueType"] == "Crude Rate"

USCDI_sex = USCDI[condition_1_t 
    & condition_1_q
    & condition_1_dvu
    & condition_1_sc1
    & condition_1_dvt
    & condition_1_syear
    # condition_1_ld
][["LocationDesc", "DataValue", "StratificationCategory1", "Stratification1"]]
# USCDI_sex.rename(columns={'DataValue': 'CHDCasesPer100000'}, inplace=True)
USCDI_sex

Unnamed: 0,LocationDesc,DataValue,StratificationCategory1,Stratification1
14902,California,1.9,Age,Age 0-44
15188,Alaska,39.7,Sex,Female
15589,Alaska,307.9,Age,Age >=65
15903,Alabama,434.6,Age,Age >=65
16510,Alaska,72.3,Age,Age 45-64
...,...,...,...,...
108707,Vermont,109.3,Sex,Female
108768,Wisconsin,76.4,Age,Age 45-64
108917,Wyoming,70.1,Age,Age 45-64
109141,Washington,66.7,Sex,Female


In [8]:
# Only sex-based values 
filtered_df = USCDI_sex[USCDI_sex['StratificationCategory1'] == "Sex"]

# Sum up by location, then place male and female values into separate columns for each country
grouped = filtered_df.groupby(['LocationDesc', 'Stratification1'], as_index=False)['DataValue'].sum()
coronary_proportions = grouped.pivot(index='LocationDesc', columns='Stratification1', values='DataValue').fillna(0)
coronary_proportions.index.name = 'LocationDesc'
coronary_proportions.reset_index(inplace=True)

# Calculate proportion of female in uninsured population 
coronary_proportions['Frac_F'] = coronary_proportions['Female'] / (coronary_proportions['Female'] + coronary_proportions['Male'])
coronary_proportions.head()

Stratification1,LocationDesc,Female,Male,Frac_F
0,Alabama,79.1,122.8,0.391778
1,Alaska,39.7,74.5,0.347636
2,Arizona,81.4,128.5,0.387804
3,Arkansas,135.1,208.9,0.392733
4,California,73.7,111.6,0.397733


In [9]:
# Only age-based values from 0-64, sum them up
filtered_df_2 = USCDI_sex[
    (USCDI_sex['StratificationCategory1'] == "Age") & 
    (USCDI_sex['Stratification1'].isin(['Age 0-44', 'Age 45-64']))
]
result = filtered_df_2.groupby('LocationDesc', as_index=False)['DataValue'].sum()

# Merge CDC tables together on location 
merged_table = pd.merge(coronary_proportions, result, on='LocationDesc')

# Create CHDPercentage split by gender
merged_table["CHDPercentage"] = merged_table["DataValue"]/100000
merged_table["CHDPercentage_F"] = merged_table["CHDPercentage"] * merged_table["Frac_F"]
merged_table["CHDPercentage_M"] = merged_table["CHDPercentage"] * (1 - merged_table["Frac_F"])

merged_table.head()

Unnamed: 0,LocationDesc,Female,Male,Frac_F,DataValue,CHDPercentage,CHDPercentage_F,CHDPercentage_M
0,Alabama,79.1,122.8,0.391778,90.4,0.000904,0.000354,0.00055
1,Alaska,39.7,74.5,0.347636,72.3,0.000723,0.000251,0.000472
2,Arizona,81.4,128.5,0.387804,70.4,0.000704,0.000273,0.000431
3,Arkansas,135.1,208.9,0.392733,169.0,0.00169,0.000664,0.001026
4,California,73.7,111.6,0.397733,66.1,0.000661,0.000263,0.000398


In [10]:
CDC_CHD_new = merged_table[["LocationDesc", "Frac_F", "CHDPercentage", "CHDPercentage_F", "CHDPercentage_M"]]
CDC_CHD_new.to_csv("final_datasets_V1/cleaned/CDC_CHD_new.csv")
CDC_CHD_new.tail()

Unnamed: 0,LocationDesc,Frac_F,CHDPercentage,CHDPercentage_F,CHDPercentage_M
47,Virginia,0.399774,0.000728,0.000291,0.000437
48,Washington,0.375563,0.000618,0.000232,0.000386
49,West Virginia,0.44283,0.001275,0.000565,0.00071
50,Wisconsin,0.377507,0.000792,0.000299,0.000493
51,Wyoming,0.338676,0.000701,0.000237,0.000464


### Impact by State


In [11]:
condition_2_t = USCDI["Topic"] == "Cardiovascular Disease"
condition_2_q = USCDI["Question"] == "Coronary heart disease mortality among all people, underlying cause"
condition_2_dvu = USCDI["DataValueUnit"] == "cases per 100,000"
condition_2_sc1 = USCDI["StratificationCategory1"] == "Age"
condition_2_s1 = USCDI['Stratification1'].isin(['Age 0-44', 'Age 45-64'])
condition_2_dvt = USCDI["DataValueType"] == "Crude Rate"
condition_2_syear = USCDI["YearStart"] == 2019
condition_2_eyear = USCDI["YearEnd"] == 2019

USCDI_state = USCDI[condition_2_t & condition_2_q & condition_2_dvu & condition_2_sc1 & condition_2_dvt & condition_2_syear]
USCDI_state.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValueUnit,DataValueType,DataValue,StratificationCategory1,Stratification1
14902,2019,2019,California,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,1.9,Age,Age 0-44
15589,2019,2019,Alaska,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,307.9,Age,Age >=65
15903,2019,2019,Alabama,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,434.6,Age,Age >=65
16510,2019,2019,Alaska,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,72.3,Age,Age 45-64
16813,2019,2019,Arizona,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,68.1,Age,Age 45-64


### Impact by Disease

In [12]:
condition_3_t = USCDI_original["Topic"] == "Cancer"
condition_3_dvu = USCDI["DataValueUnit"] == "per 100,000"
condition_3_dvt = USCDI["DataValueType"] == "Age-adjusted Rate"
condition_3_sc1 = USCDI["StratificationCategory1"] == "Sex"
condition_3_ye = USCDI["YearEnd"] == 2019
# condition_3_ld = USCDI["LocationDesc"].isin(['Texas', 'Massachusetts'])
USCDI_disease = USCDI[
    condition_3_t  
    & condition_3_dvu 
    & condition_3_dvt 
    & condition_3_sc1 
    & condition_3_ye 
    # & condition_3_ld
].fillna(0)
USCDI_disease.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValueUnit,DataValueType,DataValue,StratificationCategory1,Stratification1
115,2015,2019,Arkansas,Cancer,"Invasive cancer (all sites combined), incidence","per 100,000",Age-adjusted Rate,548.3,Sex,Male
118,2015,2019,Alabama,Cancer,"Cervical cancer mortality among all females, u...","per 100,000",Age-adjusted Rate,0.0,Sex,Male
123,2015,2019,North Carolina,Cancer,"Prostate cancer mortality among all males, und...","per 100,000",Age-adjusted Rate,0.0,Sex,Female
129,2015,2019,Kentucky,Cancer,Invasive cancer (all sites combined) mortality...,"per 100,000",Age-adjusted Rate,226.6,Sex,Male
175,2015,2019,New York,Cancer,Lung and bronchial cancer mortality among all ...,"per 100,000",Age-adjusted Rate,38.0,Sex,Male


In [13]:
grouped_2 = USCDI_disease.groupby(['LocationDesc', 'Stratification1'], as_index=False)['DataValue'].sum()
cancer_vals = grouped_2.pivot(index='LocationDesc', columns='Stratification1', values='DataValue').fillna(0)
cancer_vals.index.name = 'LocationDesc'
cancer_vals.reset_index(inplace=True)

# Calculate proportion of female in uninsured population 
cancer_vals['Frac_F'] = cancer_vals["Female"]/100000
cancer_vals['Frac_M'] = cancer_vals["Male"]/100000
cancer_vals['Frac_total'] = cancer_vals["Frac_F"] + cancer_vals["Frac_M"]
cancer_vals.to_csv("final_datasets_V1/cleaned/KFF2019_new.csv")
cancer_vals.head()

Stratification1,LocationDesc,Female,Male,Frac_F,Frac_M,Frac_total
0,Alabama,616.3,829.2,0.006163,0.008292,0.014455
1,Alaska,608.4,693.1,0.006084,0.006931,0.013015
2,Arizona,543.7,638.5,0.005437,0.006385,0.011822
3,Arkansas,657.3,862.2,0.006573,0.008622,0.015195
4,California,557.4,644.0,0.005574,0.00644,0.012014
