# CPSC 368 Data Cleaning Notebook (KNM Neighbours)

## Loading Data and Packages

In [1]:
import numpy as np
import pandas as pd

## KFF Data Cleaning

There are 3 KFF datasets: one for all adults aged 19-64, and two for males and females aged 19-64. Each dataset has a corresponding `Group` column applied to them before they are joined on `Location`. Since our focus is on uninsured adults exclusively, only the `Uninsured` column of values are acquired for each individual dataset, which are then grouped by location to create the columns `All_Uninsured`, `Female_Uninsured`, and `Male_Uninsured`, corresponding to the proportion of uninsured individuals in each category for each country.

In [2]:
KFF2019_adult_original = pd.read_csv('final_datasets_V1/KFF/kff_health_insurance_2019_adult.csv', skiprows = 2, nrows = 53).drop(columns='Footnotes', axis=1)
KFF2019_female_original = pd.read_csv('final_datasets_V1/KFF/kff_health_insurance_2019_female.csv', skiprows = 2, nrows = 53).drop(columns='Footnotes', axis=1)
KFF2019_male_original = pd.read_csv('final_datasets_V1/KFF/kff_health_insurance_2019_male.csv', skiprows = 2, nrows = 53).drop(columns='Footnotes', axis=1)

KFF2019_adult_original.to_csv("final_datasets_V1/cleaned/KFF2019_adult.csv", index=False)
KFF2019_female_original.to_csv("final_datasets_V1/cleaned/KFF2019_female.csv", index=False)
KFF2019_male_original.to_csv("final_datasets_V1/cleaned/KFF2019_male.csv", index=False)

In [3]:
KFF2019_adult_original["Group"] = "All_Uninsured"
KFF2019_female_original["Group"] = "Female_Uninsured"
KFF2019_male_original["Group"] = "Male_Uninsured"

KFF2019 = pd.concat([KFF2019_female_original, KFF2019_male_original, KFF2019_adult_original], 
                    ignore_index=True)[["Location", "Group", "Uninsured"]]


KFF2019_new = KFF2019[KFF2019['Location'] != 'United States'].pivot(index='Location', columns='Group', values='Uninsured').fillna(0)
KFF2019_new.index.name = 'Location'
KFF2019_new.reset_index(inplace=True)
KFF2019_new.rename(columns={'Group': 'Index'}, inplace=True)

KFF2019_new.shape

(52, 4)

In [4]:
KFF2019_new.to_csv("final_datasets_V1/cleaned/KFF2019_new.csv", index=False)
KFF2019_new.tail()

Group,Location,All_Uninsured,Female_Uninsured,Male_Uninsured
47,Virginia,0.112,0.094,0.131
48,Washington,0.094,0.082,0.106
49,West Virginia,0.099,0.08,0.117
50,Wisconsin,0.082,0.067,0.097
51,Wyoming,0.167,0.167,0.167


## U.S. Chronic Disease Indicators

The U.S. Chronic Disease Indicators dataset contains many types of data for a variety of topics, and given our topic questions, we will create 2 datasets, one for coronary heart disease mortality, and another for the average of various cancer mortalities. The column `Has2019` is created to determine if the value is relevant to our questions, while `Range` is created to assist in providing the average data value `AvgDataValue` across the range of years, given that some values are obtained for a range greater than 1 year. 

In [5]:
USCDI_original = pd.read_csv('final_datasets_V1/CDC/U.S._Chronic_Disease_Indicators.csv')

In [6]:
condition_0_topic = USCDI_original["Topic"].isin(['Cardiovascular Disease', 'Cancer'])
condition_0_dvu = USCDI_original["DataValueUnit"].isin(['cases per 100,000', 'per 100,000'])
condition_0_sc1 = USCDI_original["StratificationCategory1"].isin(["Sex", "Age", "Overall"])
USCDI_filter = USCDI_original[condition_0_topic & condition_0_dvu & condition_0_sc1][["YearStart","YearEnd","LocationDesc",
                                                                                      "Topic","Question","DataValueUnit", 
                                                                                      "DataValueType", "DataValue",
                                                                                      "StratificationCategory1", "Stratification1"]]
USCDI_filter.to_csv('final_datasets_V1/cleaned/USCDI_filter.csv', index=False)

In [7]:
USCDI = USCDI_filter[["YearStart","YearEnd","LocationDesc",
                        "Topic","Question","DataValueUnit", "DataValueType", "DataValue",
                        "StratificationCategory1", "Stratification1"]].copy().fillna(0)
USCDI["Has2019"] = ((USCDI["YearStart"] <= 2019) & (USCDI["YearEnd"] >= 2019)).astype(int)
USCDI["Range"] = (USCDI["YearEnd"] - USCDI["YearStart"] + 1)
USCDI["AvgDataValue"] = USCDI["DataValue"]/USCDI["Range"]
USCDI.to_csv('final_datasets_V1/cleaned/USCDI.csv', index=False)
USCDI.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValueUnit,DataValueType,DataValue,StratificationCategory1,Stratification1,Has2019,Range,AvgDataValue
115,2015,2019,Arkansas,Cancer,"Invasive cancer (all sites combined), incidence","per 100,000",Age-adjusted Rate,548.3,Sex,Male,1,5,109.66
118,2015,2019,Alabama,Cancer,"Cervical cancer mortality among all females, u...","per 100,000",Age-adjusted Rate,0.0,Sex,Male,1,5,0.0
121,2015,2019,North Dakota,Cancer,"Prostate cancer mortality among all males, und...","per 100,000",Crude Rate,18.4,Overall,Overall,1,5,3.68
123,2015,2019,North Carolina,Cancer,"Prostate cancer mortality among all males, und...","per 100,000",Age-adjusted Rate,0.0,Sex,Female,1,5,0.0
129,2015,2019,Kentucky,Cancer,Invasive cancer (all sites combined) mortality...,"per 100,000",Age-adjusted Rate,226.6,Sex,Male,1,5,45.32


In [8]:
display(USCDI.info())
display(USCDI.describe())
display(USCDI.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 8592 entries, 115 to 274446
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   YearStart                8592 non-null   int64  
 1   YearEnd                  8592 non-null   int64  
 2   LocationDesc             8592 non-null   object 
 3   Topic                    8592 non-null   object 
 4   Question                 8592 non-null   object 
 5   DataValueUnit            8592 non-null   object 
 6   DataValueType            8592 non-null   object 
 7   DataValue                8592 non-null   float64
 8   StratificationCategory1  8592 non-null   object 
 9   Stratification1          8592 non-null   object 
 10  Has2019                  8592 non-null   int32  
 11  Range                    8592 non-null   int64  
 12  AvgDataValue             8592 non-null   float64
dtypes: float64(2), int32(1), int64(3), object(7)
memory usage: 906.2+ KB


None

Unnamed: 0,YearStart,YearEnd,DataValue,Has2019,Range,AvgDataValue
count,8592.0,8592.0,8592.0,8592.0,8592.0,8592.0
mean,2017.706006,2019.745112,128.980575,0.673184,3.039106,84.708527
std,2.348503,0.718885,188.041781,0.469077,1.999734,156.478349
min,2015.0,2019.0,0.0,0.0,1.0,0.0
25%,2015.0,2019.0,17.7,0.0,1.0,3.82
50%,2016.0,2020.0,47.7,1.0,5.0,35.02
75%,2020.0,2020.0,167.625,1.0,5.0,107.325
max,2021.0,2021.0,1456.5,1.0,5.0,1456.5


YearStart                  0
YearEnd                    0
LocationDesc               0
Topic                      0
Question                   0
DataValueUnit              0
DataValueType              0
DataValue                  0
StratificationCategory1    0
Stratification1            0
Has2019                    0
Range                      0
AvgDataValue               0
dtype: int64

### Coronary heart disease
For the coronary heart disease mortality dataset, the U.S. Chronic Disease Indicators dataset is filtered for the corresponding cases, with the common unit being `USCDI["DataValueUnit"] == 'cases per 100,000'` and with the stratification categories `Sex` and `Age`. Sex is used to estimate the proportion of each gender within each location. This is achieved by obtaining the sum of cases per 100,000 people for each location and gender, regardless of age, followed by calculating the proportion of female individuals present. Age is used to get the appropriate age group, with the closest achievable groups being the sum of cases per 100,000 people between `Age 0-44` and `Age 45-64`. Finally, the proportion of individuals that had coronary heart disease is calculated, along with the corresponding proportions for each gender, by dividing their values by 100000. The column `AvgDataValue` is renamed to `CHD_Deaths` to make future interpretation easier for users.

In [9]:
condition_1_topic = USCDI["Topic"] == "Cardiovascular Disease"
condition_1_question = USCDI["Question"] == "Coronary heart disease mortality among all people, underlying cause"
condition_1_dvu = USCDI["DataValueUnit"] == "cases per 100,000"
condition_1_2019 = USCDI["Has2019"] == 1
condition_1_sc1 = USCDI["StratificationCategory1"].isin(["Sex", "Age"])

USCDI_sex = USCDI[condition_1_topic 
    & condition_1_question
    & condition_1_dvu
    & condition_1_sc1
    & condition_1_2019
][["LocationDesc", "DataValueType", "AvgDataValue", "StratificationCategory1", "Stratification1"]]
USCDI_sex = USCDI_sex[USCDI_sex['LocationDesc'] != 'United States'] # Drop Rows where LocationDesc is equal to United States

# USCDI_sex.rename(columns={'DataValue': 'CHDCasesPer100000'}, inplace=True)
USCDI_sex

Unnamed: 0,LocationDesc,DataValueType,AvgDataValue,StratificationCategory1,Stratification1
14902,California,Crude Rate,1.9,Age,Age 0-44
15188,Alaska,Crude Rate,39.7,Sex,Female
15499,California,Age-adjusted Rate,110.6,Sex,Male
15589,Alaska,Crude Rate,307.9,Age,Age >=65
15903,Alabama,Crude Rate,434.6,Age,Age >=65
...,...,...,...,...,...
108707,Vermont,Crude Rate,109.3,Sex,Female
108768,Wisconsin,Crude Rate,76.4,Age,Age 45-64
108917,Wyoming,Crude Rate,70.1,Age,Age 45-64
109141,Washington,Crude Rate,66.7,Sex,Female


In [10]:
# Only sex-based values 
USCDI_sex_only = USCDI_sex[(USCDI_sex['StratificationCategory1'] == "Sex") & 
                           (USCDI_sex['DataValueType'] == "Age-adjusted Rate")]
# Sum up by location and sex, then place male and female values into separate columns for each country
USCDI_sex_groupsum = USCDI_sex_only.groupby(
    ['LocationDesc', 'Stratification1'], as_index=False
)['AvgDataValue'].sum()
coronary_proportions = USCDI_sex_groupsum.pivot(index='LocationDesc', columns='Stratification1', values='AvgDataValue').fillna(0)
display(coronary_proportions.head())

coronary_proportions.index.name = 'LocationDesc'
coronary_proportions.reset_index(inplace=True)

# Calculate proportion of female in coronary heart disease population 
coronary_proportions['Frac_F'] = coronary_proportions['Female'] / (coronary_proportions['Female'] + coronary_proportions['Male'])
coronary_proportions.head()

Stratification1,Female,Male
LocationDesc,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,55.7,109.6
Alaska,42.3,82.9
Arizona,56.4,108.0
Arkansas,94.5,181.5
California,55.2,110.6


Stratification1,LocationDesc,Female,Male,Frac_F
0,Alabama,55.7,109.6,0.336963
1,Alaska,42.3,82.9,0.337859
2,Arizona,56.4,108.0,0.343066
3,Arkansas,94.5,181.5,0.342391
4,California,55.2,110.6,0.332931


In [11]:
# Only age-based values from 0-64, sum them up
USCDI_age_only = USCDI_sex[
    (USCDI_sex['StratificationCategory1'] == "Age") 
     & (USCDI_sex['Stratification1'].isin(['Age 0-44', 'Age 45-64'])) 
     & (USCDI["DataValueType"] == "Crude Rate")
]
USCDI_age_sum = USCDI_age_only.groupby('LocationDesc', as_index=False)['AvgDataValue'].sum()
display(USCDI_age_sum.head())

# Merge CDI tables together on location 
USCDI_age_sex_prop = pd.merge(coronary_proportions, USCDI_age_sum, on='LocationDesc')

# Create AvgDataValue split by gender
USCDI_age_sex_prop.rename(columns={'AvgDataValue': 'CHD_Deaths'}, inplace=True)
USCDI_age_sex_prop["CHD_Deaths_F"] = USCDI_age_sex_prop["CHD_Deaths"] * USCDI_age_sex_prop["Frac_F"]
USCDI_age_sex_prop["CHD_Deaths_M"] = USCDI_age_sex_prop["CHD_Deaths"] * (1 - USCDI_age_sex_prop["Frac_F"])

# Create CHDPercentage split by gender
USCDI_age_sex_prop["CHDPercentage"] = USCDI_age_sex_prop["CHD_Deaths"]/100000
USCDI_age_sex_prop["CHDPercentage_F"] = USCDI_age_sex_prop["CHD_Deaths_F"]/100000
USCDI_age_sex_prop["CHDPercentage_M"] = USCDI_age_sex_prop["CHD_Deaths_M"]/100000

USCDI_age_sex_prop.head()

  USCDI_age_only = USCDI_sex[


Unnamed: 0,LocationDesc,AvgDataValue
0,Alabama,90.4
1,Alaska,72.3
2,Arizona,70.4
3,Arkansas,169.0
4,California,66.1


Unnamed: 0,LocationDesc,Female,Male,Frac_F,CHD_Deaths,CHD_Deaths_F,CHD_Deaths_M,CHDPercentage,CHDPercentage_F,CHDPercentage_M
0,Alabama,55.7,109.6,0.336963,90.4,30.461464,59.938536,0.000904,0.000305,0.000599
1,Alaska,42.3,82.9,0.337859,72.3,24.427236,47.872764,0.000723,0.000244,0.000479
2,Arizona,56.4,108.0,0.343066,70.4,24.151825,46.248175,0.000704,0.000242,0.000462
3,Arkansas,94.5,181.5,0.342391,169.0,57.86413,111.13587,0.00169,0.000579,0.001111
4,California,55.2,110.6,0.332931,66.1,22.006755,44.093245,0.000661,0.00022,0.000441


In [12]:
USCDI_CHD = USCDI_age_sex_prop[["LocationDesc", "Frac_F", "CHD_Deaths", "CHD_Deaths_F", "CHD_Deaths_M", "CHDPercentage", "CHDPercentage_F", "CHDPercentage_M"]]
USCDI_CHD.to_csv("final_datasets_V1/cleaned/USCDI_CHD.csv", index=False)

### Cancer
For the cancer dataset, the U.S. Chronic Disease Indicators dataset is filtered for the corresponding cases with data including 2019, with the common unit being `USCDI["DataValueUnit"] == 'per 100,000'` and with the stratification category `Sex`, as the category `Age` is not provided. The columns `Female` and `Male` are renamed to `Cancer_Deaths_F` and `Cancer_Deaths_M` respectively, to make interpretation easier for future users. The proportions of individuals that acquired some form of cancer are then calculated by dividing the corresponding values by 100000.

In [13]:
condition_3_topic = USCDI["Topic"] == "Cancer"
condition_3_dvu = USCDI["DataValueUnit"] == "per 100,000"
condition_3_dvt = USCDI["DataValueType"] == "Age-adjusted Rate"
condition_3_sc1 = USCDI["StratificationCategory1"] == "Sex"
condition_3_2019 = USCDI["Has2019"] == 1

USCDI_disease = USCDI[
    condition_3_topic  
    & condition_3_dvu 
    & condition_3_dvt 
    & condition_3_sc1 
    & condition_3_2019 
    # & condition_3_ld
][["LocationDesc", "AvgDataValue", "StratificationCategory1", "Stratification1"]]
USCDI_disease = USCDI_disease[USCDI_disease['LocationDesc'] != 'United States']

USCDI_disease.head()

Unnamed: 0,LocationDesc,AvgDataValue,StratificationCategory1,Stratification1
115,Arkansas,109.66,Sex,Male
118,Alabama,0.0,Sex,Male
123,North Carolina,0.0,Sex,Female
129,Kentucky,45.32,Sex,Male
138,Delaware,0.0,Sex,Male


In [14]:
USCDI_cancer_sum = USCDI_disease.groupby(['LocationDesc', 'Stratification1'], as_index=False)['AvgDataValue'].sum()
USCDI_cancer_pivot = USCDI_cancer_sum.pivot(index='LocationDesc', columns='Stratification1', values='AvgDataValue').fillna(0)
USCDI_cancer_pivot.index.name = 'LocationDesc'
USCDI_cancer_pivot.reset_index(inplace=True)

# Rename columns
USCDI_cancer_pivot.rename(columns={'Female': 'Cancer_Deaths_F'}, inplace=True)
USCDI_cancer_pivot.rename(columns={'Male': 'Cancer_Deaths_M'}, inplace=True)
USCDI_cancer_pivot['Cancer_Deaths'] = USCDI_cancer_pivot["Cancer_Deaths_F"] + USCDI_cancer_pivot["Cancer_Deaths_M"]

# Calculate proportions
USCDI_cancer_pivot['CancerPercentage_F'] = USCDI_cancer_pivot["Cancer_Deaths_F"]/100000
USCDI_cancer_pivot['CancerPercentage_M'] = USCDI_cancer_pivot["Cancer_Deaths_M"]/100000
USCDI_cancer_pivot['CancerPercentage'] = USCDI_cancer_pivot["Cancer_Deaths"]/100000
USCDI_cancer = USCDI_cancer_pivot[["LocationDesc", "Cancer_Deaths", "Cancer_Deaths_F", "Cancer_Deaths_M", "CancerPercentage", "CancerPercentage_F", "CancerPercentage_M"]]
USCDI_cancer.to_csv("final_datasets_V1/cleaned/USCDI_cancer.csv", index=False)
USCDI_cancer.head()

Stratification1,LocationDesc,Cancer_Deaths,Cancer_Deaths_F,Cancer_Deaths_M,CancerPercentage,CancerPercentage_F,CancerPercentage_M
0,Alabama,570.9,244.28,326.62,0.005709,0.002443,0.003266
1,Alaska,516.58,240.48,276.1,0.005166,0.002405,0.002761
2,Arizona,465.66,214.64,251.02,0.004657,0.002146,0.00251
3,Arkansas,597.84,259.3,338.54,0.005978,0.002593,0.003385
4,California,473.84,219.9,253.94,0.004738,0.002199,0.002539
