# In [None]:
# import libraries
import codecademylib3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import chi2_contingency

# Read the heart-disease dataset from the CSV file in the working directory.
heart = pd.read_csv("heart_disease.csv")

# Show the first five rows as a quick look at the loaded columns.
print(heart.head(5))

# Predictors of heart disease
# thalach vs. heart_disease: one box per heart_disease category.
sns.boxplot(x=heart["heart_disease"], y=heart["thalach"])
plt.show()

# Split the thalach column by diagnosis label.
thalach_hd = heart.loc[heart["heart_disease"] == "presence", "thalach"]
thalach_no_hd = heart.loc[heart["heart_disease"] == "absence", "thalach"]

# Difference in group means (absence minus presence).
mean_no_hd = np.mean(thalach_no_hd)
mean_hd = np.mean(thalach_hd)
mean_diff = mean_no_hd - mean_hd
print("thalach mean difference:", mean_diff)

# Difference in group medians (absence minus presence).
median_no_hd = np.median(thalach_no_hd)
median_hd = np.median(thalach_hd)
median_diff = median_no_hd - median_hd
print("Thalach Median difference:", median_diff)

# Two-sample t-test on thalach.
#   Null:        the average thalach for a person with heart disease equals
#                the average thalach for a person without heart disease.
#   Alternative: the average thalach for a person with heart disease does not
#                equal the average thalach for a person without heart disease.
tstat, pval = ttest_ind(thalach_hd, thalach_no_hd)
print("P-value:", pval)

# age vs. heart_disease
plt.clf()  # clear the current figure before drawing the next plot
sns.boxplot(x=heart["heart_disease"], y=heart["age"])
plt.show()

# Split the age column by diagnosis label.
age_hd = heart.loc[heart["heart_disease"] == "presence", "age"]
age_no_hd = heart.loc[heart["heart_disease"] == "absence", "age"]

# Two-sample t-test comparing the ages of the two groups.
tstat, pval = ttest_ind(age_hd, age_no_hd)
print("P-value:", pval)

# Chest pain type (cp) vs. max heart rate (thalach)
plt.clf()  # clear the current figure before drawing the next plot
sns.boxplot(y=heart["thalach"], x=heart["cp"])
plt.show()

# thalach values for each of the four chest-pain categories.
thalach_typical = heart.loc[heart["cp"] == "typical angina", "thalach"]
thalach_asymptom = heart.loc[heart["cp"] == "asymptomatic", "thalach"]
thalach_nonangin = heart.loc[heart["cp"] == "non-anginal pain", "thalach"]
thalach_atypical = heart.loc[heart["cp"] == "atypical angina", "thalach"]

# One-way ANOVA across the four chest-pain groups.
#   Null:        people with typical angina, non-anginal pain, atypical
#                angina, and asymptomatic people all have the same average
#                thalach.
#   Alternative: those four groups do not all have the same average thalach.
cp_groups = (
    thalach_typical,
    thalach_asymptom,
    thalach_nonangin,
    thalach_atypical,
)
fstat, pval = f_oneway(*cp_groups)
print("P-value:", pval)

# Tukey's range test: pairwise comparison of thalach between every pair of
# cp categories, run with alpha = 0.05.
tukey_results = pairwise_tukeyhsd(
    endog=heart["thalach"],
    groups=heart["cp"],
    alpha=0.05,
)
print(tukey_results)

# Heart disease vs. chest pain type
# Contingency table: rows are cp categories, columns are heart_disease labels.
Xtab = pd.crosstab(index=heart["cp"], columns=heart["heart_disease"])

# Chi-square test on the contingency table.
#   Null:        there is NOT an association between chest pain type and
#                whether or not someone is diagnosed with heart disease.
#   Alternative: there is an association between chest pain type and
#                whether or not someone is diagnosed with heart disease.
chi2, pval, dof, expected = chi2_contingency(observed=Xtab)
print("P-value:", pval)

# Further exploration: sex vs. heart_disease
# Contingency table: rows are sex categories, columns are heart_disease labels.
Xtab_2 = pd.crosstab(index=heart["sex"], columns=heart["heart_disease"])

# Chi-square test on the contingency table.
#   Null:        there is NOT an association between sex and whether or not
#                someone is diagnosed with heart disease.
#   Alternative: there is an association between sex and whether or not
#                someone is diagnosed with heart disease.
chi2, pval, dof, expected = chi2_contingency(observed=Xtab_2)
print("P-value:", pval)