In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

## Data Loading



In [3]:
# Read the heart dataset CSV (path is relative to the notebook's working directory)
# into the `heart` DataFrame used by every later cell.
heart = pd.read_csv(filepath_or_buffer="Data/heart.csv")

## Data visualization



In [299]:
# use for final project
# Sort rows ascending on HeartDisease, then draw Cholesterol against MaxHR
# as bars coloured by HeartDisease.
sorted_heart = heart.sort_values("HeartDisease")
fig = px.bar(
    sorted_heart,
    x="MaxHR",
    y="Cholesterol",
    color="HeartDisease",
    title="Cholesterol and MaxHR's Effect on Heart Disease",
)
fig.show()

Interpretation: The bar graph shows that people with a lower max heart rate have a slightly higher chance of having heart disease. For example, a person who has a max heart rate of 180 is far more likely to not have heart disease than to have it, and a person with a max heart rate of 105 is more likely to have heart disease than to not have it. The graph also shows that people with higher cholesterol are more likely to be diagnosed with heart disease. 



In [296]:
# use for final project
# Bubble chart: Age (log-scaled x axis) against Cholesterol; bubble size is
# also driven by Cholesterol and colour is keyed to HeartDisease.
# NOTE(review): the discrete Pastel palette presumably only takes effect when
# HeartDisease is non-numeric (e.g. category) — confirm the dtype at this point.
fig = px.scatter(
    heart,
    x="Age",
    y="Cholesterol",
    size="Cholesterol",
    size_max=25,
    color="HeartDisease",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_name="Sex",
    log_x=True,
    title="Age and Cholesterol's Correlation to Heart Disease",
)

# Fix the figure at 1000x600 pixels before rendering.
fig.update_layout(width=1000, height=600)
fig.show()


Interpretation: This bubble chart shows the relationship between a person's age and their cholesterol level. A teal bubble means the person does not have heart disease; a yellow bubble means the person does have heart disease. There does not seem to be a correlation between age and cholesterol, because each age group contains people with widely varying cholesterol levels and a clear line of best fit cannot be drawn. 



In [300]:
# use for final project
# Correlate only the numeric columns. The frame also holds text columns
# (Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope); on pandas >= 2.0
# DataFrame.corr() defaults to numeric_only=False and raises on them, while
# older versions silently skipped them. Selecting numeric columns up front
# gives the same matrix on every version.
fig = px.imshow(
    heart.select_dtypes(include="number").corr(),
    color_continuous_scale='pubu',
    title="Correlation Between Different Parameters and Heart Disease",
)
fig.show()

Interpretation: This heat map shows the correlations between different variables such as age, resting blood pressure, cholesterol, fasting 
blood sugar, maximum heart rate, and oldpeak. For each combination of two variables, a block of color is shown. If the color is dark blue, the relationship between the two variables is strong. If the color is light blue, the relationship is weak. The diagonal line of dark blue squares is expected because each variable is being compared with itself \(e.g., age and age\), so the data is identical and the correlation is extremely strong. However, we can see that there is no correlation between age and maximum heart rate because the value indicated by the color is close to 0. 



In [269]:
# use for final project
# Box plot of RestingBP, one box per HeartDisease value.
fig = px.box(
    heart,
    y="RestingBP",
    color="HeartDisease",
    title="Resting BP vs Heart Disease",
)
fig.show()

Interpretation: This box plot shows that resting blood pressure is not a crucial factor concerning heart disease. Each plot has a similar median, so it is hard to differentiate whether or not the blood pressure will affect if a person will get heart disease. 



In [267]:
# use for final project
# Density contours of MaxHR against Age, one facet column per Sex value,
# with separate contour colours per HeartDisease value.
fig = px.density_contour(
    heart,
    x="Age",
    y="MaxHR",
    color="HeartDisease",
    facet_col="Sex",
    title="MaxHR vs Age",
)
fig.show()

Interpretation: The density contour shows the correlation between maximum heart rate and age for females and males. It is evident that in both graphs people who were not diagnosed with heart disease have a greater maximum heart rate than people who were diagnosed with heart disease. However, the maximum heart rates become similar as the age increases. 




In [305]:
# Density contours of MaxHR against Age for the whole dataset, drawn as
# filled bands with the contour level labels switched on.
fig = px.density_contour(
    heart,
    x="Age",
    y="MaxHR",
    title="MaxHR vs Age",
)
fig.update_traces(
    contours_coloring="fill",
    contours_showlabels=True,
)
fig.show()

Interpretation: This density contour shows different ages compared to the maximum heart rate. At the age of 52\-57, the max heart rate is at 124.5 which shows that 34 people at this age have a similar max heart rate.


## Data Cleaning



In [4]:
# Print the frame summary: index range, column names, non-null counts, dtypes and memory usage.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [216]:
# Count missing values per column (isna is the canonical name for isnull).
heart.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [236]:
# remove outlier
# Index label 449 is presumably the row located with the query below
# (RestingBP of 0 on a HeartDisease == 1 record) — verify against the data.
# heart[(heart['RestingBP'] == 0) & (heart['HeartDisease'] == 1)]
row_to_drop = [449]
# Rebind instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
heart = heart.drop(index=row_to_drop, errors="ignore")

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [251]:
# Store HeartDisease as a pandas categorical column instead of int64,
# then print the frame summary to confirm the new dtype.
heart["HeartDisease"] = heart["HeartDisease"].astype(dtype="category")
heart.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 917 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Age             917 non-null    int64   
 1   Sex             917 non-null    object  
 2   ChestPainType   917 non-null    object  
 3   RestingBP       917 non-null    int64   
 4   Cholesterol     917 non-null    int64   
 5   FastingBS       917 non-null    int64   
 6   RestingECG      917 non-null    object  
 7   MaxHR           917 non-null    int64   
 8   ExerciseAngina  917 non-null    object  
 9   Oldpeak         917 non-null    float64 
 10  ST_Slope        917 non-null    object  
 11  HeartDisease    917 non-null    category
dtypes: category(1), float64(1), int64(5), object(5)
memory usage: 119.3+ KB


<class 'pandas.core.frame.DataFrame'>
Int64Index: 917 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Age             917 non-null    int64   
 1   Sex             917 non-null    object  
 2   ChestPainType   917 non-null    object  
 3   RestingBP       917 non-null    int64   
 4   Cholesterol     917 non-null    int64   
 5   FastingBS       917 non-null    int64   
 6   RestingECG      917 non-null    object  
 7   MaxHR           917 non-null    int64   
 8   ExerciseAngina  917 non-null    object  
 9   Oldpeak         917 non-null    float64 
 10  ST_Slope        917 non-null    object  
 11  HeartDisease    917 non-null    category
dtypes: category(1), float64(1), int64(5), object(5)
memory usage: 119.3+ KB
