# CPSC 368 Data Cleaning Notebook (KNM Neighbours)

## Loading Data and Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## KFF Data Cleaning

There are 3 KFF datasets: one for all adults aged 19-64, and one each for males and females aged 19-64. A corresponding `Group` label is added to each dataset before they are combined. Since our focus is exclusively on uninsured adults, only the `Uninsured` column is kept from each dataset; the combined data is then pivoted by `Location` to create the columns `All_Uninsured`, `Female_Uninsured`, and `Male_Uninsured`, which hold the proportion of uninsured individuals in each category for each location.

In [2]:
# All three KFF files are read with the same options: the first two lines of
# each file are skipped and at most 53 data rows are parsed.
kff_read_options = {"skiprows": 2, "nrows": 53}

KFF2019_adult_original = pd.read_csv(
    'final_datasets_V1/KFF/kff_health_insurance_2019_adult.csv', **kff_read_options
)
KFF2019_female_original = pd.read_csv(
    'final_datasets_V1/KFF/kff_health_insurance_2019_female.csv', **kff_read_options
)
KFF2019_male_original = pd.read_csv(
    'final_datasets_V1/KFF/kff_health_insurance_2019_male.csv', **kff_read_options
)

In [3]:
# Tag each source frame so its rows can be told apart once they are stacked;
# these tags become the column names of the pivoted table below.
KFF2019_adult_original["Group"] = "All_Uninsured"
KFF2019_female_original["Group"] = "Female_Uninsured"
KFF2019_male_original["Group"] = "Male_Uninsured"

# Stack the three frames and keep only the columns the pivot needs.
KFF2019 = pd.concat(
    [KFF2019_female_original, KFF2019_male_original, KFF2019_adult_original],
    ignore_index=True,
)[["Location", "Group", "Uninsured"]]

# One row per location, one column per group; the national 'United States'
# row is excluded. After the pivot, 'Group' survives only as the *name of the
# columns axis* (it is not a column label), so rename(columns=...) cannot
# touch it -- rename_axis clears it instead, leaving a plain flat table.
KFF2019_new = (
    KFF2019[KFF2019["Location"] != "United States"]
    .pivot(index="Location", columns="Group", values="Uninsured")
    # NOTE(review): a missing value is recorded as 0 (i.e. 0% uninsured) -- confirm intended.
    .fillna(0)
    .rename_axis(None, axis="columns")
    .reset_index()
)

KFF2019_new.shape

(52, 4)

In [4]:
# Persist the cleaned KFF table (row index omitted) and preview its last rows.
kff_cleaned_path = "final_datasets_V1/cleaned/KFF2019_new.csv"
KFF2019_new.to_csv(kff_cleaned_path, index=False)
KFF2019_new.tail()

Group,Location,All_Uninsured,Female_Uninsured,Male_Uninsured
47,Virginia,0.112,0.094,0.131
48,Washington,0.094,0.082,0.106
49,West Virginia,0.099,0.08,0.117
50,Wisconsin,0.082,0.067,0.097
51,Wyoming,0.167,0.167,0.167


## U.S. Chronic Disease Indicators

The U.S. Chronic Disease Indicators dataset contains many types of data for a variety of topics. Given our topic questions, we will create 2 datasets: one for coronary heart disease mortality, and another for the average of various cancer mortalities. The column `Has2019` flags rows whose year range (`YearStart` to `YearEnd`) includes 2019, which determines whether the value is relevant to our questions. The column `Range` is the number of years that a row covers; since some values are reported for a range longer than 1 year, `DataValue` is divided by `Range` to give the average data value per year, `AvgDataValue`.

In [5]:
# Load the raw U.S. Chronic Disease Indicators CSV with pandas' default read options.
USCDI_original = pd.read_csv('final_datasets_V1/CDC/U.S._Chronic_Disease_Indicators.csv')

In [None]:
# Columns carried forward from the raw CDI table.
uscdi_columns = [
    "YearStart", "YearEnd", "LocationDesc",
    "Topic", "Question", "DataValueUnit", "DataValueType", "DataValue",
    "StratificationCategory1", "Stratification1",
]

# Derived columns:
#   Has2019      - True when the row's [YearStart, YearEnd] span includes 2019
#   Range        - number of years the row spans (inclusive)
#   AvgDataValue - DataValue divided by Range
# NOTE(review): fillna(0) is applied to every kept column, so a missing
# DataValue is carried forward as 0 -- confirm that is intended.
USCDI = (
    USCDI_original[uscdi_columns]
    .copy()
    .fillna(0)
    .assign(
        Has2019=lambda cdi: (cdi["YearStart"] <= 2019) & (cdi["YearEnd"] >= 2019),
        Range=lambda cdi: cdi["YearEnd"] - cdi["YearStart"] + 1,
        AvgDataValue=lambda cdi: cdi["DataValue"] / cdi["Range"],
    )
)

uscdi_newcols_path = 'final_datasets_V1/cleaned/U.S._Chronic_Disease_Indicators_newcols.csv'
USCDI.to_csv(uscdi_newcols_path, index=False)
USCDI.head()

### Coronary heart disease
For the coronary heart disease mortality dataset, the U.S. Chronic Disease Indicators dataset is filtered for the corresponding question, keeping rows whose unit is `cases per 100,000` (`USCDI["DataValueUnit"] == 'cases per 100,000'`) and whose stratification category is `Sex` or `Age`. Sex is used to estimate the share of each gender within each location: the cases per 100,000 people are summed for each location and gender, regardless of age, and the female fraction of that total is calculated. Age is used to select the appropriate age group; the closest achievable match is the sum of cases per 100,000 people for `Age 0-44` and `Age 45-64`. Finally, the proportion of individuals who died of coronary heart disease is calculated, along with the corresponding proportions for each gender, by dividing the values by 100000. The column `AvgDataValue` is renamed to `CHD_Deaths` to make future interpretation easier for users.

In [None]:
# Row filters selecting coronary heart disease mortality records that cover
# 2019, are expressed in cases per 100,000, and are stratified by sex or age.
condition_1_topic = USCDI["Topic"].eq("Cardiovascular Disease")
condition_1_question = USCDI["Question"].eq(
    "Coronary heart disease mortality among all people, underlying cause"
)
condition_1_dvu = USCDI["DataValueUnit"].eq("cases per 100,000")
condition_1_2019 = USCDI["Has2019"].eq(True)
condition_1_sc1 = USCDI["StratificationCategory1"].isin(["Sex", "Age"])

# The national 'United States' rows are excluded so only individual locations remain.
chd_not_national = USCDI["LocationDesc"].ne("United States")

chd_row_mask = (
    condition_1_topic
    & condition_1_question
    & condition_1_dvu
    & condition_1_sc1
    & condition_1_2019
    & chd_not_national
)
chd_columns = ["LocationDesc", "DataValueType", "AvgDataValue",
               "StratificationCategory1", "Stratification1"]

USCDI_sex = USCDI.loc[chd_row_mask, chd_columns]
USCDI_sex

In [None]:
# Keep the sex-stratified rows reported as age-adjusted rates.
is_sex_row = USCDI_sex['StratificationCategory1'].eq("Sex")
is_age_adjusted = USCDI_sex['DataValueType'].eq("Age-adjusted Rate")
USCDI_sex_only = USCDI_sex[is_sex_row & is_age_adjusted]

# Total AvgDataValue per (location, sex) pair ...
USCDI_sex_groupsum = (
    USCDI_sex_only
    .groupby(['LocationDesc', 'Stratification1'], as_index=False)['AvgDataValue']
    .sum()
)

# ... then spread into one row per location with a column per sex value.
sex_by_location = USCDI_sex_groupsum.pivot(
    index='LocationDesc', columns='Stratification1', values='AvgDataValue'
).fillna(0)
display(sex_by_location.head())

# Female share of the combined female + male value for each location.
coronary_proportions = sex_by_location.reset_index().assign(
    Frac_F=lambda rates: rates['Female'] / (rates['Female'] + rates['Male'])
)
coronary_proportions.head()

In [None]:
# Keep only the crude-rate rows for the two age bands covering ages 0-64.
# Every term of the mask is built from USCDI_sex -- the frame being filtered --
# so the mask shares its index and no implicit index alignment is needed.
age_row_mask = (
    (USCDI_sex['StratificationCategory1'] == "Age")
    & USCDI_sex['Stratification1'].isin(['Age 0-44', 'Age 45-64'])
    & (USCDI_sex['DataValueType'] == "Crude Rate")
)
USCDI_age_only = USCDI_sex[age_row_mask]

# Sum the two age bands into a single value per location.
USCDI_age_sum = USCDI_age_only.groupby('LocationDesc', as_index=False)['AvgDataValue'].sum()
display(USCDI_age_sum.head())

# Attach the summed age-band value to the per-location sex fractions.
# pd.merge defaults to an inner join, so a location missing from either
# table is dropped from the result.
USCDI_age_sex_prop = pd.merge(
    coronary_proportions, USCDI_age_sum, on='LocationDesc'
).rename(columns={'AvgDataValue': 'CHD_Deaths'})

# Split the per-100,000 value between the sexes using the female fraction.
USCDI_age_sex_prop["CHD_Deaths_F"] = USCDI_age_sex_prop["CHD_Deaths"] * USCDI_age_sex_prop["Frac_F"]
USCDI_age_sex_prop["CHD_Deaths_M"] = USCDI_age_sex_prop["CHD_Deaths"] * (1 - USCDI_age_sex_prop["Frac_F"])

# Convert each per-100,000 value to a fraction of the population.
# (Despite the column names, these are proportions, not values scaled to 0-100.)
USCDI_age_sex_prop["CHDPercentage"] = USCDI_age_sex_prop["CHD_Deaths"]/100000
USCDI_age_sex_prop["CHDPercentage_F"] = USCDI_age_sex_prop["CHD_Deaths_F"]/100000
USCDI_age_sex_prop["CHDPercentage_M"] = USCDI_age_sex_prop["CHD_Deaths_M"]/100000

USCDI_age_sex_prop.head()

In [None]:
# Columns exported for the coronary heart disease table, in output order.
chd_export_columns = [
    "LocationDesc", "Frac_F",
    "CHD_Deaths", "CHD_Deaths_F", "CHD_Deaths_M",
    "CHDPercentage", "CHDPercentage_F", "CHDPercentage_M",
]
USCDI_CHD = USCDI_age_sex_prop[chd_export_columns]

chd_cleaned_path = "final_datasets_V1/cleaned/USCDI_CHD.csv"
USCDI_CHD.to_csv(chd_cleaned_path, index=False)

### Cancer
For the cancer dataset, the U.S. Chronic Disease Indicators dataset is filtered for the corresponding cases with data including 2019, with the common unit being `USCDI["DataValueUnit"] == 'per 100,000'` and with the stratification category `Sex`, as the category `Age` is not provided. The columns `Female` and `Male` are renamed to `Cancer_Deaths_F` and `Cancer_Deaths_M` respectively, to make interpretation easier for future users. The proportions of individuals that acquired some form of cancer are then calculated by dividing the corresponding values by 100000.

In [None]:
# Row filters selecting sex-stratified, age-adjusted cancer records that cover
# 2019 and are expressed per 100,000.
condition_3_topic = USCDI["Topic"].eq("Cancer")
condition_3_dvu = USCDI["DataValueUnit"].eq("per 100,000")
condition_3_dvt = USCDI["DataValueType"].eq("Age-adjusted Rate")
condition_3_sc1 = USCDI["StratificationCategory1"].eq("Sex")
condition_3_2019 = USCDI["Has2019"].eq(True)

# The national 'United States' rows are excluded so only individual locations remain.
cancer_not_national = USCDI["LocationDesc"].ne("United States")

cancer_row_mask = (
    condition_3_topic
    & condition_3_dvu
    & condition_3_dvt
    & condition_3_sc1
    & condition_3_2019
    & cancer_not_national
)
cancer_columns = ["LocationDesc", "AvgDataValue", "StratificationCategory1", "Stratification1"]

USCDI_disease = USCDI.loc[cancer_row_mask, cancer_columns]

USCDI_disease.head()

In [None]:
# Total AvgDataValue per (location, sex) pair across all selected cancer rows.
USCDI_cancer_sum = (
    USCDI_disease
    .groupby(['LocationDesc', 'Stratification1'], as_index=False)['AvgDataValue']
    .sum()
)

# One row per location: the Female/Male totals become Cancer_Deaths_F/_M,
# their sum becomes Cancer_Deaths, and each per-100,000 value is divided by
# 100000 to give the matching CancerPercentage* column.
# NOTE(review): Cancer_Deaths adds the female and male per-100,000 values
# together -- confirm this is the intended combined measure.
USCDI_cancer_pivot = (
    USCDI_cancer_sum
    .pivot(index='LocationDesc', columns='Stratification1', values='AvgDataValue')
    .fillna(0)
    .reset_index()
    .rename(columns={'Female': 'Cancer_Deaths_F', 'Male': 'Cancer_Deaths_M'})
    .assign(
        Cancer_Deaths=lambda rates: rates["Cancer_Deaths_F"] + rates["Cancer_Deaths_M"],
        CancerPercentage_F=lambda rates: rates["Cancer_Deaths_F"] / 100000,
        CancerPercentage_M=lambda rates: rates["Cancer_Deaths_M"] / 100000,
        CancerPercentage=lambda rates: rates["Cancer_Deaths"] / 100000,
    )
)

# Columns exported for the cancer table, in output order.
cancer_export_columns = [
    "LocationDesc",
    "Cancer_Deaths", "Cancer_Deaths_F", "Cancer_Deaths_M",
    "CancerPercentage", "CancerPercentage_F", "CancerPercentage_M",
]
USCDI_cancer = USCDI_cancer_pivot[cancer_export_columns]

cancer_cleaned_path = "final_datasets_V1/cleaned/USCDI_cancer.csv"
USCDI_cancer.to_csv(cancer_cleaned_path, index=False)
USCDI_cancer.head()