# CPSC 368 Impact By Sex (KNM Neighbours)
## Import Data and Packages

In [1]:
import oracledb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt

For the coronary heart disease mortality dataset, the U.S. Chronic Disease Indicators dataset is filtered for the corresponding cases, with the common unit being `USCDI["DataValueUnit"] == 'cases per 100,000'` and with the stratification categories `Sex` and `Age`. 
- Sex is used to estimate the proportion of each gender within each location. This is achieved by obtaining the sum of cases per 100,000 people for each location and gender, regardless of age, followed by calculating the proportion of female individuals present.
- Age is used to get the appropriate age group, with the closest achievable groups being the sum of cases per 100,000 people between `Age 0-44` and `Age 45-64`.
- Finally, the proportion of individuals that had coronary heart disease is calculated, along with the corresponding proportions for each gender, by dividing their values by 100000.
- The column `AVGDATAVALUE` is renamed to `CHD_DEATHS` to make future interpretation easier for users.

In [2]:
# INPUT USER INFO HERE
CWL = "nick2003"
studentnum = "45695970"

In [3]:
# Acquire data
dsn = oracledb.makedsn("localhost", 1522, service_name="stu")
connection = oracledb.connect(user=f"ora_{CWL}", password=f"a{studentnum}", dsn=dsn)
cur = connection.cursor()

try:
    # Remove USCDI_CHD view only if it exists
    cur.execute("SELECT COUNT(*) FROM ALL_VIEWS WHERE VIEW_NAME = 'USCDI_CHD'")
    view_exists = cur.fetchone()[0]
    if view_exists:
        cur.execute("DROP VIEW USCDI_CHD CASCADE CONSTRAINTS")
    
    # Create USCDI_CHD
    cur.execute(
        """
        CREATE VIEW USCDI_CHD AS
            WITH CHD_Data AS (
                SELECT 
                    total."LOCATIONDESC" AS LOCATIONDESC,
                    CAST(female.DataValue / (female.DataValue + male.DataValue) AS DECIMAL(19, 18)) AS Frac_F,
                    CAST(total.DataValue AS DECIMAL(24, 18)) AS CHD_DEATHS
                FROM 
                    (SELECT "LOCATIONDESC", SUM("AVGDATAVALUE") as DataValue
                    FROM USCDI
                    WHERE "TOPIC" = 'Cardiovascular Disease'
                    AND "QUESTION" = 'Coronary heart disease mortality among all people, underlying cause'
                    AND "DATAVALUEUNIT" = 'cases per 100,000'
                    AND "STRATIFICATIONCATEGORY1" = 'Age'
                    AND "STRATIFICATION1" IN ('Age 0-44', 'Age 45-64')
                    AND "DATAVALUETYPE" = 'Crude Rate'
                    AND "HAS2019" = 1
                    GROUP BY "LOCATIONDESC") total
                JOIN
                    (SELECT "LOCATIONDESC", SUM("AVGDATAVALUE") as DataValue
                    FROM USCDI
                    WHERE "TOPIC" = 'Cardiovascular Disease'
                    AND "QUESTION" = 'Coronary heart disease mortality among all people, underlying cause'
                    AND "DATAVALUEUNIT" = 'cases per 100,000'
                    AND "STRATIFICATIONCATEGORY1" = 'Sex'
                    AND "STRATIFICATION1" = 'Female'
                    AND "DATAVALUETYPE" = 'Age-adjusted Rate'
                    AND "HAS2019" = 1
                    GROUP BY "LOCATIONDESC") female
                ON total."LOCATIONDESC" = female."LOCATIONDESC"
                JOIN
                    (SELECT "LOCATIONDESC", SUM("AVGDATAVALUE") as DataValue
                    FROM USCDI
                    WHERE "TOPIC" = 'Cardiovascular Disease'
                    AND "QUESTION" = 'Coronary heart disease mortality among all people, underlying cause'
                    AND "DATAVALUEUNIT" = 'cases per 100,000'
                    AND "STRATIFICATIONCATEGORY1" = 'Sex'
                    AND "STRATIFICATION1" = 'Male'
                    AND "DATAVALUETYPE" = 'Age-adjusted Rate'
                    AND "HAS2019" = 1
                    GROUP BY "LOCATIONDESC") male
                ON total."LOCATIONDESC" = male."LOCATIONDESC"
            )
            SELECT 
                CHD_Data.LOCATIONDESC,
                CHD_Data.FRAC_F,
                CHD_Data.CHD_DEATHS,
                CAST(CHD_Data.CHD_DEATHS * CHD_Data.FRAC_F AS DECIMAL(24, 18)) AS CHD_DEATHS_F,
                CAST(CHD_Data.CHD_DEATHS * (1 - CHD_Data.FRAC_F) AS DECIMAL(24, 18)) AS CHD_DEATHS_M,
                CAST(CHD_Data.CHD_DEATHS / 100000 AS DECIMAL(19, 18)) AS CHDPROP,
                CAST((CHD_Data.CHD_DEATHS * CHD_Data.FRAC_F) / 100000 AS DECIMAL(19, 18)) AS CHDPROP_F,
                CAST((CHD_Data.CHD_DEATHS * (1 - CHD_Data.FRAC_F)) / 100000 AS DECIMAL(19, 18)) AS CHDPROP_M
            FROM CHD_Data
        """
    )

    cur.execute(
        """
        SELECT
            uc.LOCATIONDESC,
            uc.FRAC_F,
            uc.CHD_DEATHS,
            uc.CHD_DEATHS_F,
            uc.CHD_DEATHS_M,
            uc.CHDPROP,
            uc.CHDPROP_F,
            uc.CHDPROP_M,
            kff.All_Uninsured,
            kff.Female_Uninsured,
            kff.Male_Uninsured
        FROM USCDI_CHD uc
        LEFT JOIN KFF2019_new kff
            ON uc.LOCATIONDESC = kff.LOCATION
        """
    )
    
    # Fetch SELECT results 
    rows = cur.fetchall()
    
    # Get column names from cursor
    columns = [desc[0] for desc in cur.description]
    
    # Create DataFrame
    total_data = pd.DataFrame(rows, columns=columns)
    
    print("Completed dataframe.")
except Exception as e:
    print(f"Error executing SQL query: {e}")
finally: 
    cur.close()
    connection.close()

Completed dataframe.


In [4]:
total_data.head()

Unnamed: 0,LOCATIONDESC,FRAC_F,CHD_DEATHS,CHD_DEATHS_F,CHD_DEATHS_M,CHDPROP,CHDPROP_F,CHDPROP_M,ALL_UNINSURED,FEMALE_UNINSURED,MALE_UNINSURED
0,Delaware,0.318102,63.1,20.072232,43.027768,0.000631,0.000201,0.00043,0.097,0.075,0.121
1,Georgia,0.334925,75.8,25.387312,50.412688,0.000758,0.000254,0.000504,0.189,0.17,0.211
2,Michigan,0.342314,102.1,34.950309,67.149691,0.001021,0.00035,0.000671,0.083,0.067,0.1
3,Florida,0.346479,81.0,28.064789,52.935211,0.00081,0.000281,0.000529,0.195,0.173,0.219
4,Idaho,0.310811,66.0,20.513514,45.486486,0.00066,0.000205,0.000455,0.16,0.155,0.164


In [5]:
# USCDI_CHD = pd.read_csv("final_datasets_V1/cleaned/USCDI_CHD.csv")
# KFF2019_new = pd.read_csv("final_datasets_V1/cleaned/KFF2019_new.csv")
# total_data = pd.merge(USCDI_CHD, KFF2019_new, left_on='LOCATIONDESC', right_on='Location', how='left')

## EDA
For this analysis, the focus will be on uninsured rates and coronary heart disease (CHD) mortality rates across males and females in Texas and Massachusetts. 

`facet_1` displays bars for `CHDPROP` by location and sex, with the `CHDPROP_M` values being  greater than the corresponding `CHDPROP_F` value for both states. This supports existing research that indicates that CHD incidence and mortality rates have historically been higher in men than women between the ages 35 and 84, though the difference in morbidity between sexes decreases with age (Lerner, Kannel, 1986).

In [6]:
total_data_focus = total_data[(total_data['LOCATIONDESC'].isin(['Texas', 'Massachusetts']))]

In [7]:
facet_1 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LOCATIONDESC:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          title='CHD Proportion',
          scale=alt.Scale(domain=[0,max(total_data_focus['CHDPROP_F'].max(), total_data_focus['CHDPROP_M'].max())])
         )
).repeat(
  row=['CHDPROP_F', 'CHDPROP_M',]
).properties(
    title="CHD Proportion for Females and Males by Location"
)
facet_1

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


`facet_2` displays bars for uninsurance rate by location and sex, with the `MALE_UNINSURED` values being  greater than the corresponding `FEMALE_UNINSURED` value for both states.

In [8]:
facet_2 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LOCATIONDESC:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          title='Uninsured Proportion',
          scale=alt.Scale(domain=[0,max(total_data_focus['FEMALE_UNINSURED'].max(), total_data_focus['MALE_UNINSURED'].max())])
         )
).repeat(
  row=['FEMALE_UNINSURED', 'MALE_UNINSURED',]
).properties(
    title="Percentage of Uninsured Individuals for Females and Males by Location"
)
facet_2

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


`plot_3` displays bars for the ratio of the percentage of uninsured individuals over the percentage of coronary heart disease (CHD) mortality rates by location and sex, with the `CHD_Uninsured_Ratio_F` values being lower than the corresponding `CHD_Uninsured_Ratio_M` value for both states. This implies that uninsured females are at a relatively lower risk of CHD mortality than uninsured males.

In [9]:
total_data_focus["CHD_Uninsured_Ratio_F"] = total_data_focus['CHDPROP_F'] / total_data_focus['FEMALE_UNINSURED']
total_data_focus["CHD_Uninsured_Ratio_M"] = total_data_focus['CHDPROP_M'] / total_data_focus['MALE_UNINSURED']

plot_3 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LOCATIONDESC:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['CHD_Uninsured_Ratio_F'].max(), total_data_focus['CHD_Uninsured_Ratio_M'].max())]))
).repeat(
  row=['CHD_Uninsured_Ratio_F', 'CHD_Uninsured_Ratio_M',]
).properties(
    title="Ratio of CHD Mortality Percentage over Uninsured Percentage for Females and Males by Location"
)
plot_3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_data_focus["CHD_Uninsured_Ratio_F"] = total_data_focus['CHDPROP_F'] / total_data_focus['FEMALE_UNINSURED']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_data_focus["CHD_Uninsured_Ratio_M"] = total_data_focus['CHDPROP_M'] / total_data_focus['MALE_UNINSURED']
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## References
- Lerner, D. J., & Kannel, W. B. (1986). Patterns of coronary heart disease morbidity and mortality in the sexes: a 26-year follow-up of the Framingham population. American heart journal, 111(2), 383–390. https://doi.org/10.1016/0002-8703(86)90155-9