In [23]:
# Engineer the data into manageable chunks.

# Steps:

# 1. Drop rows that contain N/A values.
# 2. Restrict the years, e.g. keep only data from 2009-2019 (TODO: decide on the exact range).
# 3. Average the mortality rate per 100,000 people over rows that share the same values of 'US States', 'Year', 'Ethnicity', 'Gender', 'Heart Disease Type' and 'Age range'.



#Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as stats

In [2]:
# Load the "Rates and Trends in Heart Disease, 1999-2019" dataset and store
# the Year column as float.
path = 'Rates&Trends_In_Heart_Disease&Stroke_Rate_by_100_000_Demographic_Data_Clean.csv'
Heart_disease_df = pd.read_csv(path).astype({"Year": float})
Heart_disease_df.head()

Unnamed: 0,Year,LocationAbbr,Topic,Data_Value/100_000 People,Stratification1,Stratification2,Stratification3
0,1999.0,AL,All heart disease,,Ages 35-64 years,American Indian/Alaska Native,Overall
1,2013.0,AL,All heart disease,,Ages 35-64 years,American Indian/Alaska Native,Overall
2,2014.0,AL,All heart disease,,Ages 35-64 years,American Indian/Alaska Native,Overall
3,2005.0,AL,All heart disease,,Ages 35-64 years,American Indian/Alaska Native,Overall
4,2012.0,AL,All heart disease,,Ages 35-64 years,American Indian/Alaska Native,Overall


In [4]:
# Keep only complete rows: dropna() defaults to dropping every row that has
# a missing value in at least one column.
Clean_Heart_disease_null = Heart_disease_df.dropna()
Clean_Heart_disease_null.head()

Unnamed: 0,Year,LocationAbbr,Topic,Data_Value/100_000 People,Stratification1,Stratification2,Stratification3
53,2016.0,AL,All stroke,25.7,Ages 35-64 years,Overall,Overall
79,2011.0,AL,All stroke,29.5,Ages 35-64 years,Overall,Men
106,2017.0,AL,All stroke,33.6,Ages 35-64 years,Overall,Men
108,2017.0,AL,All heart disease,128.7,Ages 35-64 years,Overall,Overall
109,2016.0,AL,All heart disease,128.1,Ages 35-64 years,Overall,Overall


In [7]:
# Replace the raw column codes with descriptive names.
# NOTE(review): "LocationDesc" and "Data_Value_Unit" are not among the columns
# displayed for this frame; rename() skips labels it does not find by default,
# so those two entries appear to have no effect here.
column_labels = {
    "LocationAbbr": "US States",
    "LocationDesc": "US County",
    "Topic": "Heart Disease Type",
    "Data_Value_Unit": "Rate per 100,000 population",
    "Stratification1": "Age range",
    "Stratification2": "Ethnicity",
    "Stratification3": "Gender",
}
Heart_disease_rename = Clean_Heart_disease_null.rename(columns=column_labels)
Heart_disease_rename.head()

Unnamed: 0,Year,US States,Heart Disease Type,Data_Value/100_000 People,Age range,Ethnicity,Gender
53,2016.0,AL,All stroke,25.7,Ages 35-64 years,Overall,Overall
79,2011.0,AL,All stroke,29.5,Ages 35-64 years,Overall,Men
106,2017.0,AL,All stroke,33.6,Ages 35-64 years,Overall,Men
108,2017.0,AL,All heart disease,128.7,Ages 35-64 years,Overall,Overall
109,2016.0,AL,All heart disease,128.1,Ages 35-64 years,Overall,Overall


In [12]:
# List every distinct state abbreviation present in the dataset.
state_list = pd.unique(Heart_disease_rename['US States']).tolist()
state_list

['AL',
 'AK',
 'AZ',
 'AR',
 'CA',
 'CO',
 'CT',
 'DE',
 'FL',
 'DC',
 'GA',
 'HI',
 'ID',
 'IL']

In [20]:
# Average 'Data_Value/100_000 People' over all rows that share the same
# 'US States', 'Year', 'Ethnicity', 'Gender', 'Heart Disease Type' and
# 'Age range', returning a data frame with one row (the group mean) per
# combination of those values.
group_keys = ['US States', 'Year', 'Ethnicity', 'Gender', 'Heart Disease Type', 'Age range']

# The value column is selected explicitly so that only the rate is averaged.
# A bare .mean() aggregates every non-key column: any other numeric column
# would be averaged by accident, and a non-numeric one raises TypeError in
# pandas >= 2.0. as_index=False keeps the group keys as ordinary columns.
Averaged_grouped_df = Heart_disease_rename.groupby(group_keys, as_index=False)['Data_Value/100_000 People'].mean()

In [17]:
# Display the averaged table: one row per group with its mean rate.
Averaged_grouped_df

Unnamed: 0,US States,Year,Ethnicity,Gender,Heart Disease Type,Age range,Data_Value/100_000 People
0,AK,1999.0,American Indian/Alaska Native,Overall,All heart disease,Ages 35-64 years,84.886667
1,AK,1999.0,American Indian/Alaska Native,Overall,All heart disease,Ages 65 years and older,1433.755556
2,AK,1999.0,American Indian/Alaska Native,Overall,All stroke,Ages 35-64 years,23.786667
3,AK,1999.0,American Indian/Alaska Native,Overall,All stroke,Ages 65 years and older,680.877778
4,AK,1999.0,American Indian/Alaska Native,Overall,Cardiovascular disease (CVD),Ages 35-64 years,87.780000
...,...,...,...,...,...,...,...
22465,IL,2019.0,White,Overall,Cardiovascular disease (CVD),Ages 65 years and older,1561.057143
22466,IL,2019.0,White,Overall,Coronary heart disease (CHD),Ages 35-64 years,66.591429
22467,IL,2019.0,White,Overall,Coronary heart disease (CHD),Ages 65 years and older,577.260000
22468,IL,2019.0,White,Overall,Heart failure,Ages 35-64 years,18.165714


In [19]:
# Sanity check for the grouped averages: pull the raw rows belonging to one
# group so their mean can be compared by hand with the aggregated value.
df = Heart_disease_rename

# Column -> value pairs identifying the single group to inspect.
check_group = {
    'US States': 'AK',
    'Year': 1999.0,
    'Ethnicity': 'American Indian/Alaska Native',
    'Gender': 'Overall',
    'Heart Disease Type': 'All heart disease',
    'Age range': 'Ages 65 years and older',
}

# A row is kept only when every one of the conditions above holds.
row_matches = np.logical_and.reduce(
    [df[column] == wanted for column, wanted in check_group.items()]
)
filtered_df = df[row_matches]

filtered_df

Unnamed: 0,Year,US States,Heart Disease Type,Data_Value/100_000 People,Age range,Ethnicity,Gender
116782,1999.0,AK,All heart disease,1704.0,Ages 65 years and older,American Indian/Alaska Native,Overall
118450,1999.0,AK,All heart disease,1279.7,Ages 65 years and older,American Indian/Alaska Native,Overall
125189,1999.0,AK,All heart disease,1371.5,Ages 65 years and older,American Indian/Alaska Native,Overall
126878,1999.0,AK,All heart disease,1738.8,Ages 65 years and older,American Indian/Alaska Native,Overall
133615,1999.0,AK,All heart disease,1679.0,Ages 65 years and older,American Indian/Alaska Native,Overall
135246,1999.0,AK,All heart disease,1223.5,Ages 65 years and older,American Indian/Alaska Native,Overall
147030,1999.0,AK,All heart disease,1212.4,Ages 65 years and older,American Indian/Alaska Native,Overall
148718,1999.0,AK,All heart disease,1347.5,Ages 65 years and older,American Indian/Alaska Native,Overall
150398,1999.0,AK,All heart disease,1347.4,Ages 65 years and older,American Indian/Alaska Native,Overall


In [25]:
# Write the averaged table to CSV, leaving the row index out of the file.
output_csv = 'Averaged_Cleaned_filtered_data.csv'
Averaged_grouped_df.to_csv(output_csv, index=False)