In [2]:
# Import Dependencies
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm

In [4]:
# Location of the AQI extract, relative to this notebook
aqi_path = '../Resources/max_aqi_csv.csv'

# Load the AQI extract into a pandas DataFrame and preview the first rows
aqi_df = pd.read_csv(filepath_or_buffer=aqi_path)
aqi_df.head()

Unnamed: 0,State,State Name,Ozone,NO2,SO2,CO,PM 2.5
0,1,Alabama,0.045526,22.978,4.673446,0.345078,10.016438
1,4,Arizona,0.05724,38.92033,12.117582,0.684723,17.213333
2,5,Arkansas,0.043364,17.477596,1.180055,0.287533,9.809836
3,6,California,0.074551,43.77913,3.285256,0.620842,40.764417
4,8,Colorado,0.065656,43.903683,2.054972,0.437755,10.486665


In [5]:
# Keep only the named AQI columns; anything else (such as an extra index column) is dropped
aqi_columns = ['State', 'State Name', 'Ozone', 'NO2', 'SO2', 'CO', 'PM 2.5']
aqi_df = aqi_df.loc[:, aqi_columns]
aqi_df.head()

Unnamed: 0,State,State Name,Ozone,NO2,SO2,CO,PM 2.5
0,1,Alabama,0.045526,22.978,4.673446,0.345078,10.016438
1,4,Arizona,0.05724,38.92033,12.117582,0.684723,17.213333
2,5,Arkansas,0.043364,17.477596,1.180055,0.287533,9.809836
3,6,California,0.074551,43.77913,3.285256,0.620842,40.764417
4,8,Colorado,0.065656,43.903683,2.054972,0.437755,10.486665


In [13]:
# Location of the CDI extract, relative to this notebook
cdi_path = '../Resources/cdi_csv.csv'

# Load the CDI extract into a pandas DataFrame
cdi_df = pd.read_csv(filepath_or_buffer=cdi_path)

In [14]:
# Remove the extra index column that came in with the CSV.
# errors='ignore' keeps this cell safe to re-run: it reassigns cdi_df, so on a
# second execution the column is already gone and a plain drop raises KeyError.
cdi_df = cdi_df.drop(columns='Unnamed: 0', errors='ignore')

In [15]:
# Keep only rows whose stratification is 'Overall'; mixing in the other strata
# would make the grouped data incorrect
is_overall = cdi_df['StratificationCategory1'].eq('Overall')
overall_only = cdi_df.loc[is_overall]


In [16]:
# Narrow the frame to the relevant columns for readability
relevant_columns = [
    "LocationDesc",
    "Topic",
    "Question",
    "DataValueType",
    "DataValue",
    "LowConfidenceLimit",
    "HighConfidenceLimit",
]
overall_df = overall_only[relevant_columns]

In [17]:
# Order rows by state, then topic, so it is easy to check every state has the required data
overall_sorted = overall_df.sort_values(by=['LocationDesc', 'Topic'])

# Discard the DataValueType categories that are not helpful indicators here:
# raw counts ('Number') and the crude (non age-adjusted) rate/prevalence rows
excluded_value_types = ['Number', 'Crude Rate', 'Crude Prevalence']
keep_row = ~overall_sorted['DataValueType'].isin(excluded_value_types)

# Apply the filter and renumber the index from zero
overall_cleaned = overall_sorted[keep_row].reset_index(drop=True)

In [18]:
## Rename "LocationDesc" to "State Name" so it matches the key column in the AQI data
state_column_map = {'LocationDesc': 'State Name'}
overall_cleaned = overall_cleaned.rename(columns=state_column_map)

In [20]:
## Left-join the AQI columns onto the CDI rows by State Name (every CDI row is kept),
## then discard the State code column brought in from the AQI frame
aqi_cdi_df = (
    overall_cleaned
    .merge(aqi_df, how='left', on='State Name')
    .drop(columns='State')
)

In [53]:
## Build the frame used for the scatter plots by discarding incomplete rows.
## NOTE(review): with no `subset`, dropna removes a row when ANY column is
## missing, not only the AQI columns - confirm that is the intent.
complete_aqi_cdi = aqi_cdi_df.dropna(axis=0, how='any')
complete_aqi_cdi.head(50)

Unnamed: 0,State Name,Topic,Question,DataValueType,DataValue,LowConfidenceLimit,HighConfidenceLimit,Ozone,NO2,SO2,CO,PM 2.5
0,Alabama,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.0,8.0,10.2,0.03777,15.342895,1.093742,0.258789,8.052705
1,Alabama,Cardiovascular Disease,Mortality from total cardiovascular diseases,Age-adjusted Rate,312.5,308.1,317.0,0.03777,15.342895,1.093742,0.258789,8.052705
2,Alabama,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,8.9,7.9,10.0,0.03777,15.342895,1.093742,0.258789,8.052705
6,Arizona,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.7,8.9,10.5,0.048894,19.49976,1.637301,0.352748,9.111642
7,Arizona,Cardiovascular Disease,Mortality from total cardiovascular diseases,Age-adjusted Rate,195.9,193.1,198.8,0.048894,19.49976,1.637301,0.352748,9.111642
8,Arizona,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,5.4,4.8,6.0,0.048894,19.49976,1.637301,0.352748,9.111642
9,Arkansas,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.9,7.8,10.2,0.037988,12.01836,0.69924,0.28694,7.983347
10,Arkansas,Cardiovascular Disease,Mortality from total cardiovascular diseases,Age-adjusted Rate,284.1,278.7,289.5,0.037988,12.01836,0.69924,0.28694,7.983347
11,Arkansas,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,8.1,7.2,9.0,0.037988,12.01836,0.69924,0.28694,7.983347
12,California,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.3,8.2,10.5,0.045062,14.413923,0.59852,0.34556,13.600519


In [45]:
## Subset of complete_aqi_cdi whose Topic is COPD
is_copd = complete_aqi_cdi["Topic"].eq('Chronic Obstructive Pulmonary Disease')
copd_df = complete_aqi_cdi.loc[is_copd]
copd_df.head()

Unnamed: 0,State Name,Topic,Question,DataValueType,DataValue,LowConfidenceLimit,HighConfidenceLimit,Ozone,NO2,SO2,CO,PM 2.5
2,Alabama,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,8.9,7.9,10.0,0.03777,15.342895,1.093742,0.258789,8.052705
8,Arizona,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,5.4,4.8,6.0,0.048894,19.49976,1.637301,0.352748,9.111642
11,Arkansas,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,8.1,7.2,9.0,0.037988,12.01836,0.69924,0.28694,7.983347
14,California,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,5.1,4.3,6.0,0.045062,14.413923,0.59852,0.34556,13.600519
17,Colorado,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,Age-adjusted Prevalence,4.3,3.9,4.7,0.04851,12.40645,0.882243,0.287936,7.747488


In [46]:
## Subset of complete_aqi_cdi whose Topic is asthma
is_asthma = complete_aqi_cdi["Topic"].eq('Asthma')
asthma_df = complete_aqi_cdi.loc[is_asthma]
asthma_df.head()

Unnamed: 0,State Name,Topic,Question,DataValueType,DataValue,LowConfidenceLimit,HighConfidenceLimit,Ozone,NO2,SO2,CO,PM 2.5
0,Alabama,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.0,8.0,10.2,0.03777,15.342895,1.093742,0.258789,8.052705
6,Arizona,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.7,8.9,10.5,0.048894,19.49976,1.637301,0.352748,9.111642
9,Arkansas,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,8.9,7.8,10.2,0.037988,12.01836,0.69924,0.28694,7.983347
12,California,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.3,8.2,10.5,0.045062,14.413923,0.59852,0.34556,13.600519
15,Colorado,Asthma,Current asthma prevalence among adults aged >=...,Age-adjusted Prevalence,9.6,8.9,10.3,0.04851,12.40645,0.882243,0.287936,7.747488


In [1]:
## Subset of complete_aqi_cdi whose Topic is cardiovascular disease (CVD).
## NOTE(review): the saved output of this cell is a NameError from running it
## before complete_aqi_cdi was defined - run the notebook top to bottom.
is_cvd = complete_aqi_cdi["Topic"].eq('Cardiovascular Disease')
cvd_df = complete_aqi_cdi.loc[is_cvd]
cvd_df.head(30)

NameError: name 'complete_aqi_cdi' is not defined

In [59]:
## Ozone scatter plot over every row of complete_aqi_cdi, coloured by Topic

ozone_axis_labels = {'DataValue': 'Age-adjusted CDI Data', 'Ozone': 'Annual Ozone Values'}
ozone_scatter = px.scatter(
    complete_aqi_cdi,
    x='DataValue',
    y='Ozone',
    color='Topic',
    hover_name="State Name",
    title='Effect of Ozone on CVD, COPD, and Asthma',
    labels=ozone_axis_labels,
)
ozone_scatter.show()

In [118]:
## Define function to plot one parameter against one outcome
def plot_indiv_cdi(parameter, outcome, df):
    """Show a scatter of one air-quality parameter against one CDI outcome.

    Plots ``df['DataValue']`` on the x-axis and ``df[parameter]`` on the
    y-axis with an OLS trendline, and reports the R-squared of that
    trendline on a second line of the figure title.

    Parameters
    ----------
    parameter : str
        Name of the column in ``df`` to plot on the y-axis (e.g. ``'Ozone'``).
    outcome : str
        Display name of the health outcome; used only in the title.
    df : pandas.DataFrame
        Must contain the columns ``'DataValue'``, ``'State Name'`` and
        ``parameter``.

    Returns
    -------
    None
        The figure is rendered with ``show()`` and is not returned.
    """
    base_title = f'Effect of {parameter} on {outcome}'

    # The title is passed here (not only in update_layout below) so plotly
    # express lays the figure out with room for a title from the start.
    scatter = px.scatter(
        df,
        x='DataValue',
        y=parameter,
        hover_name="State Name",
        title=base_title,
        labels={'DataValue': 'Age-adjusted CDI Data', parameter: f'Annual {parameter} Values'},
        trendline='ols',
    )

    # Read R-squared from the regression plotly already fitted for the
    # trendline instead of fitting a second, separate OLS model. This keeps
    # the number in the title tied to the line that is actually drawn.
    fit_results = px.get_trendline_results(scatter)

    # The results frame is empty when plotly could not fit a trendline
    # (e.g. too few usable points); in that case leave the plain title.
    if not fit_results.empty:
        r_squared = fit_results['px_fit_results'].iloc[0].rsquared
        scatter.update_layout(title=f'{base_title}<br>R-squared: {r_squared:.2f}')

    scatter.show()


In [119]:
## Ozone and CVD: scatter with OLS trendline, R-squared shown in the title
plot_indiv_cdi('Ozone', "CVD", cvd_df)

In [120]:
## Ozone and COPD: scatter with OLS trendline, R-squared shown in the title
plot_indiv_cdi('Ozone', "COPD", copd_df)

In [121]:
## Ozone and Asthma: scatter with OLS trendline, R-squared shown in the title
plot_indiv_cdi('Ozone', "Asthma", asthma_df)

In [122]:
## NO2 and CVD: scatter with OLS trendline, R-squared shown in the title
plot_indiv_cdi('NO2', "CVD", cvd_df)

In [123]:
## NO2 and COPD: scatter with OLS trendline, R-squared shown in the title
plot_indiv_cdi('NO2', "COPD", copd_df)

In [124]:
## NO2 and Asthma: scatter with OLS trendline, R-squared shown in the title
plot_indiv_cdi('NO2', "Asthma", asthma_df)

In [125]:
## SO2 and CVD
plot_indiv_cdi('SO2', "CVD", cvd_df)

In [126]:
## SO2 and COPD
plot_indiv_cdi('SO2', "COPD", copd_df)

In [127]:
## SO2 and Asthma
plot_indiv_cdi('SO2', "Asthma", asthma_df)

In [128]:
## CO and CVD
plot_indiv_cdi('CO', "CVD", cvd_df)

In [129]:
## CO and COPD
plot_indiv_cdi('CO', "COPD", copd_df)

In [130]:
## CO and Asthma
plot_indiv_cdi('CO', "Asthma", asthma_df)

In [131]:
## PM 2.5 and CVD
plot_indiv_cdi('PM 2.5', "CVD", cvd_df)

In [132]:
## PM 2.5 and COPD
plot_indiv_cdi('PM 2.5', "COPD", copd_df)

In [133]:
## PM 2.5 and Asthma
plot_indiv_cdi('PM 2.5', "Asthma", asthma_df)