# CPSC 368 Impact By Sex (KNM Neighbours)
## Import Data and Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt

In [2]:
USCDI_CHD = pd.read_csv("final_datasets_V1/cleaned/USCDI_CHD.csv")
KFF2019_new = pd.read_csv("final_datasets_V1/cleaned/KFF2019_new.csv")
total_data = pd.merge(USCDI_CHD, KFF2019_new, left_on='LocationDesc', right_on='Location', how='left')

## EDA
For this analysis, the focus will be on uninsured rates and coronary heart disease (CHD) mortality rates across males and females in Texas and Massachusetts. 

`facet_1` displays bars for `CHDPercentage` by location and sex, with the `CHDPercentage_M` values being  greater than the corresponding `CHDPercentage_F` value for both states. This supports existing research that indicates that CHD incidence and mortality rates have historically been higher in men than women between the ages 35 and 84, though the difference in morbidity between sexes decreases with age (Lerner, Kannel, 1986).

In [3]:
total_data_focus = total_data[(total_data['LocationDesc'].isin(['Texas', 'Massachusetts']))]

In [4]:
facet_1 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LocationDesc:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['CHDPercentage_F'].max(), total_data_focus['CHDPercentage_M'].max())]))
).repeat(
  row=['CHDPercentage_F', 'CHDPercentage_M',]
).properties(
    title="CHD Percentage for Females and Males by Location"
)
facet_1

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


`facet_2` displays bars for uninsurance rate by location and sex, with the `Male_Uninsured` values being  greater than the corresponding `Female_Uninsured` value for both states.

In [5]:
facet_2 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LocationDesc:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['Female_Uninsured'].max(), total_data_focus['Male_Uninsured'].max())]))
).repeat(
  row=['Female_Uninsured', 'Male_Uninsured',]
).properties(
    title="Percentage of Uninsured Individuals for Females and Males by Location"
)
facet_2

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


`plot_3` displays bars for the ratio of the percentage of uninsured individuals over the percentage of coronary heart disease (CHD) mortality rates by location and sex, with the `CHD_Uninsured_Ratio_F` values being lower than the corresponding `CHD_Uninsured_Ratio_M` value for both states. This implies that uninsured females are at a relatively lower risk of CHD mortality than uninsured males.

In [6]:
total_data_focus["CHD_Uninsured_Ratio_F"] = total_data_focus["CHDPercentage_F"] / total_data_focus["Female_Uninsured"]
total_data_focus["CHD_Uninsured_Ratio_M"] = total_data_focus["CHDPercentage_M"] / total_data_focus["Male_Uninsured"]

plot_3 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LocationDesc:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['CHD_Uninsured_Ratio_F'].max(), total_data_focus['CHD_Uninsured_Ratio_M'].max())]))
).repeat(
  row=['CHD_Uninsured_Ratio_F', 'CHD_Uninsured_Ratio_M',]
).properties(
    title="Ratio of CHD Mortality Percentage over Uninsured Percentage for Females and Males by Location"
)
plot_3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_data_focus["CHD_Uninsured_Ratio_F"] = total_data_focus["CHDPercentage_F"] / total_data_focus["Female_Uninsured"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_data_focus["CHD_Uninsured_Ratio_M"] = total_data_focus["CHDPercentage_M"] / total_data_focus["Male_Uninsured"]
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## References
- Lerner, D. J., & Kannel, W. B. (1986). Patterns of coronary heart disease morbidity and mortality in the sexes: a 26-year follow-up of the Framingham population. American heart journal, 111(2), 383–390. https://doi.org/10.1016/0002-8703(86)90155-9