# CPSC 368 Impact By Sex (KNM Neighbours)
## Import Data and Packages

In [6]:
import oracledb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt

For the coronary heart disease mortality dataset, the U.S. Chronic Disease Indicators dataset is filtered for the corresponding cases, with the common unit being `USCDI["DataValueUnit"] == 'cases per 100,000'` and with the stratification categories `Sex` and `Age`. 
- Sex is used to estimate the proportion of each gender within each location. This is achieved by obtaining the sum of cases per 100,000 people for each location and gender, regardless of age, followed by calculating the proportion of female individuals present.
- Age is used to get the appropriate age group, with the closest achievable groups being the sum of cases per 100,000 people between `Age 0-44` and `Age 45-64`.
- Finally, the proportion of individuals that had coronary heart disease is calculated, along with the corresponding proportions for each gender, by dividing their values by 100000.
- The column `AVGDATAVALUE` is renamed to `CHD_DEATHS` to make future interpretation easier for users.

In [7]:
# INPUT USER INFO HERE
CWL = "kshiao"
studentnum = "73239121"

In [45]:
# Acquire data
dsn = oracledb.makedsn("localhost", 1522, service_name="stu")
connection = oracledb.connect(user=f"ora_{CWL}", password=f"a{studentnum}", dsn=dsn)
cur = connection.cursor()

try:
    # Remove USCDI_CHD view only if it exists
    cur.execute("SELECT COUNT(*) FROM ALL_VIEWS WHERE VIEW_NAME = 'USCDI_CHD'")
    view_exists = cur.fetchone()[0]
    if view_exists:
        cur.execute("DROP VIEW USCDI_CHD CASCADE CONSTRAINTS")
    
    # Create USCDI_CHD
    cur.execute(
        """
        SELECT *
                    FROM USCDI_filter
                    WHERE "Topic" = 'Cardiovascular Disease'
                    AND "Question" = 'Coronary heart disease mortality among all people, underlying cause'
                    AND "DataValueUnit" = 'cases per 100,000'
                    AND "StratificationCategory1" = 'Age'
                    AND "Stratification1" IN ('Age 0-44', 'Age 45-64')
                    



        """
    )

    
    # Fetch SELECT results 
    rows = cur.fetchall()
    
    # Get column names from cursor
    columns = [desc[0] for desc in cur.description]
    
    # Create DataFrame
    test_data = pd.DataFrame(rows, columns=columns)
    
    print("Completed dataframe.")
except Exception as e:
    print(f"Error executing SQL query: {e}")
finally: 
    cur.close()
    connection.close()

Completed dataframe.


In [22]:
SELECT "LocationDesc", "DataValue" as DataValue
                    FROM USCDI_filter

                    AND "StratificationCategory1" = 'Age'
                    AND "Stratification1" IN ('Age 0-44', 'Age 45-64')
                    AND "DataValueType" = 'Crude Rate'

SyntaxError: invalid syntax (732418774.py, line 1)

In [46]:
test_data.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValueUnit,DataValueType,DataValue,StratificationCategory1,Stratification1
0,2019,2019,California,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,1.9,Age,Age 0-44
1,2019,2019,Alaska,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,72.3,Age,Age 45-64
2,2019,2019,Arizona,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,68.1,Age,Age 45-64
3,2019,2019,Alabama,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,86.0,Age,Age 45-64
4,2019,2019,Arkansas,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,6.2,Age,Age 0-44


In [44]:
test_data['StratificationCategory1'].unique()

array(['Age', 'Sex', 'Overall'], dtype=object)

In [8]:
# Acquire data
dsn = oracledb.makedsn("localhost", 1522, service_name="stu")
connection = oracledb.connect(user=f"ora_{CWL}", password=f"a{studentnum}", dsn=dsn)
cur = connection.cursor()

try:
    # Remove USCDI_CHD view only if it exists
    cur.execute("SELECT COUNT(*) FROM ALL_VIEWS WHERE VIEW_NAME = 'USCDI_CHD'")
    view_exists = cur.fetchone()[0]
    if view_exists:
        cur.execute("DROP VIEW USCDI_CHD CASCADE CONSTRAINTS")
    
    # Create USCDI_CHD
    cur.execute(
        """
        CREATE VIEW USCDI_CHD AS
            WITH CHD_Data AS (
                SELECT 
                    total."LocationDesc" AS LocationDesc,
                    CAST(female.DataValue / (female.DataValue + male.DataValue) AS DECIMAL(19, 18)) AS Frac_F,
                    CAST(total.DataValue AS DECIMAL(24, 18)) AS CHD_DEATHS
                FROM 
                    (SELECT "LocationDesc", "DataValue" as DataValue
                    FROM USCDI_filter
                    WHERE "Topic" = 'Cardiovascular Diseases'
                    AND "Question" = 'Death rate from coronary heart disease (CHD)'
                    AND "DataValueUnit" = 'cases per 100,000'
                    AND "StratificationCategory1" = 'Age'
                    AND "Stratification1" IN ('Age 0-44', 'Age 45-64')
                    AND "DataValueType" = 'Crude Rate') total
                JOIN
                    (SELECT "LocationDesc", "DataValue" as DataValue
                    FROM USCDI_filter
                    WHERE "Topic" = 'Cardiovascular Disease'
                    AND "Question" = 'Coronary heart disease mortality among all people, underlying cause'
                    AND "DataValueUnit" = 'cases per 100,000'
                    AND "StratificationCategory1" = 'Sex'
                    AND "Stratification1" = 'Female'
                    AND "DataValueType" = 'Age-adjusted Rate') female
                ON total."LocationDesc" = female."LocationDesc"
                JOIN
                    (SELECT "LocationDesc", "DataValue" as DataValue
                    FROM USCDI_filter
                    WHERE "Topic" = 'Cardiovascular Disease'
                    AND "Question" = 'Coronary heart disease mortality among all people, underlying cause'
                    AND "DataValueUnit" = 'cases per 100,000'
                    AND "StratificationCategory1" = 'Sex'
                    AND "Stratification1" = 'Male'
                    AND "DataValueType" = 'Age-adjusted Rate') male
                ON total."LocationDesc" = male."LocationDesc"
            )
            SELECT 
                CHD_Data.LocationDesc,
                CHD_Data.Frac_F,
                CHD_Data.CHD_DEATHS,
                CAST(CHD_Data.CHD_DEATHS * CHD_Data.Frac_F AS DECIMAL(24, 18)) AS CHD_DEATHS_F,
                CAST(CHD_Data.CHD_DEATHS * (1 - CHD_Data.Frac_F) AS DECIMAL(24, 18)) AS CHD_DEATHS_M,
                CAST(CHD_Data.CHD_DEATHS / 1000 AS DECIMAL(19, 18)) AS CHDPercentage,
                CAST((CHD_Data.CHD_DEATHS * CHD_Data.Frac_F) / 1000 AS DECIMAL(19, 18)) AS CHDPercentage_F,
                CAST((CHD_Data.CHD_DEATHS * (1 - CHD_Data.Frac_F)) / 1000 AS DECIMAL(19, 18)) AS CHDPercentage_M
            FROM CHD_Data
        """
    )

    cur.execute(
        """
        SELECT
            uc.LocationDesc,
            uc.Frac_F,
            uc.CHD_DEATHS,
            uc.CHD_DEATHS_F,
            uc.CHD_DEATHS_M,
            uc.CHDPercentage,
            uc.CHDPercentage_F,
            uc.CHDPercentage_M,
            kff.All_Uninsured,
            kff.Female_Uninsured,
            kff.Male_Uninsured
        FROM USCDI_CHD uc
        LEFT JOIN KFF2019_new kff
            ON uc.LocationDesc = kff.Location
        """
    )
    
    # Fetch SELECT results 
    rows = cur.fetchall()
    
    # Get column names from cursor
    columns = [desc[0] for desc in cur.description]
    
    # Create DataFrame
    total_data = pd.DataFrame(rows, columns=columns)
    
    print("Completed dataframe.")
except Exception as e:
    print(f"Error executing SQL query: {e}")
finally: 
    cur.close()
    connection.close()

Completed dataframe.


In [9]:
total_data.head()

Unnamed: 0,LOCATIONDESC,FRAC_F,CHD_DEATHS,CHD_DEATHS_F,CHD_DEATHS_M,CHDPERCENTAGE,CHDPERCENTAGE_F,CHDPERCENTAGE_M,ALL_UNINSURED,FEMALE_UNINSURED,MALE_UNINSURED


In [10]:
# USCDI_CHD = pd.read_csv("final_datasets_V1/cleaned/USCDI_CHD.csv")
# KFF2019_new = pd.read_csv("final_datasets_V1/cleaned/KFF2019_new.csv")
# total_data = pd.merge(USCDI_CHD, KFF2019_new, left_on='LOCATIONDESC', right_on='Location', how='left')

## EDA
For this analysis, the focus will be on uninsured rates and coronary heart disease (CHD) mortality rates across males and females in Texas and Massachusetts. 

`facet_1` displays bars for `CHDPERCENTAGE` by location and sex, with the `CHDPERCENTAGE_M` values being  greater than the corresponding `CHDPERCENTAGE_F` value for both states. This supports existing research that indicates that CHD incidence and mortality rates have historically been higher in men than women between the ages 35 and 84, though the difference in morbidity between sexes decreases with age (Lerner, Kannel, 1986).

In [11]:
total_data_focus = total_data[(total_data['LOCATIONDESC'].isin(['Texas', 'Massachusetts']))]

In [12]:
facet_1 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LOCATIONDESC:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['CHDPERCENTAGE_F'].max(), total_data_focus['CHDPERCENTAGE_M'].max())]))
).repeat(
  row=['CHDPERCENTAGE_F', 'CHDPERCENTAGE_M',]
).properties(
    title="CHD Percentage for Females and Males by Location"
)
facet_1

`facet_2` displays bars for uninsurance rate by location and sex, with the `MALE_UNINSURED` values being  greater than the corresponding `FEMALE_UNINSURED` value for both states.

In [13]:
facet_2 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LOCATIONDESC:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['FEMALE_UNINSURED'].max(), total_data_focus['MALE_UNINSURED'].max())]))
).repeat(
  row=['FEMALE_UNINSURED', 'MALE_UNINSURED',]
).properties(
    title="Percentage of Uninsured Individuals for Females and Males by Location"
)
facet_2

`plot_3` displays bars for the ratio of the percentage of uninsured individuals over the percentage of coronary heart disease (CHD) mortality rates by location and sex, with the `CHD_Uninsured_Ratio_F` values being lower than the corresponding `CHD_Uninsured_Ratio_M` value for both states. This implies that uninsured females are at a relatively lower risk of CHD mortality than uninsured males.

In [14]:
total_data_focus["CHD_Uninsured_Ratio_F"] = total_data_focus['CHDPERCENTAGE_F'] / total_data_focus['FEMALE_UNINSURED']
total_data_focus["CHD_Uninsured_Ratio_M"] = total_data_focus['CHDPERCENTAGE_M'] / total_data_focus['MALE_UNINSURED']

plot_3 = alt.Chart(total_data_focus).mark_bar().encode(
    alt.Y("LOCATIONDESC:N"),
    alt.X(alt.repeat('row'), 
          type='quantitative', 
          scale=alt.Scale(domain=[0,max(total_data_focus['CHD_Uninsured_Ratio_F'].max(), total_data_focus['CHD_Uninsured_Ratio_M'].max())]))
).repeat(
  row=['CHD_Uninsured_Ratio_F', 'CHD_Uninsured_Ratio_M',]
).properties(
    title="Ratio of CHD Mortality Percentage over Uninsured Percentage for Females and Males by Location"
)
plot_3

## References
- Lerner, D. J., & Kannel, W. B. (1986). Patterns of coronary heart disease morbidity and mortality in the sexes: a 26-year follow-up of the Framingham population. American heart journal, 111(2), 383–390. https://doi.org/10.1016/0002-8703(86)90155-9