In [3]:
#creating imports
import pandas as pd
import numpy as np
import plotly.express as px

%pip install streamlit

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Load the cleaned heart-disease dataset (a CSV in the working directory) into a DataFrame.
heart_disease_data = pd.read_csv(filepath_or_buffer="heart_disease_cleaned.csv")

In [5]:
# Show the column labels of the freshly loaded DataFrame.
heart_disease_data.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [6]:
# Keep only the columns used in the rest of the notebook; all others are dropped.
# .copy() detaches the result from the original frame so later assignments
# do not trigger SettingWithCopyWarning.
columns_to_keep = ['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'num']
heart_disease_data = heart_disease_data.loc[:, columns_to_keep].copy()

print(heart_disease_data)

      id  age     sex        dataset               cp  trestbps  num
0      1   63    Male      Cleveland   typical angina    145.00    0
1      2   67    Male      Cleveland     asymptomatic    160.00    2
2      3   67    Male      Cleveland     asymptomatic    120.00    1
3      4   37    Male      Cleveland      non-anginal    130.00    0
4      5   41  Female      Cleveland  atypical angina    130.00    0
..   ...  ...     ...            ...              ...       ...  ...
914  916   54  Female  VA Long Beach     asymptomatic    127.00    1
915  917   62    Male  VA Long Beach   typical angina    143.06    0
916  918   55    Male  VA Long Beach     asymptomatic    122.00    2
917  919   58    Male  VA Long Beach     asymptomatic    150.81    0
918  920   62    Male  VA Long Beach  atypical angina    120.00    1

[919 rows x 7 columns]


In [7]:
# Replace the terse dataset column names with descriptive ones.
# Columns not listed here (age, sex) keep their current names.
descriptive_names = dict(
    id='patient_id',
    dataset='place_of_study',
    cp='chest_pain_type',
    trestbps='resting_blood_pressure',
    num='predicted_attribute',
)
heart_disease_data = heart_disease_data.rename(columns=descriptive_names)

In [8]:
# Translate the numeric codes in 'predicted_attribute' into readable severity labels.
severity_labels = {
    0: 'no heart disease',
    1: 'mild heart disease',
    2: 'moderate heart disease',
    3: 'severe heart disease',
    4: 'critical heart disease',
}

heart_disease_data['predicted_attribute'] = (
    heart_disease_data['predicted_attribute']
    .replace(severity_labels)  # values not present in the mapping are left unchanged
    .astype(str)               # force every entry to a Python string
)

In [9]:
# Display the DataFrame to check the renamed columns and the severity labels.
heart_disease_data

Unnamed: 0,patient_id,age,sex,place_of_study,chest_pain_type,resting_blood_pressure,predicted_attribute
0,1,63,Male,Cleveland,typical angina,145.00,no heart disease
1,2,67,Male,Cleveland,asymptomatic,160.00,moderate heart disease
2,3,67,Male,Cleveland,asymptomatic,120.00,mild heart disease
3,4,37,Male,Cleveland,non-anginal,130.00,no heart disease
4,5,41,Female,Cleveland,atypical angina,130.00,no heart disease
...,...,...,...,...,...,...,...
914,916,54,Female,VA Long Beach,asymptomatic,127.00,mild heart disease
915,917,62,Male,VA Long Beach,typical angina,143.06,no heart disease
916,918,55,Male,VA Long Beach,asymptomatic,122.00,moderate heart disease
917,919,58,Male,VA Long Beach,asymptomatic,150.81,no heart disease


In [10]:

# Summarise column dtypes and non-null counts for the prepared DataFrame.
heart_disease_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   patient_id              919 non-null    int64  
 1   age                     919 non-null    int64  
 2   sex                     919 non-null    object 
 3   place_of_study          919 non-null    object 
 4   chest_pain_type         919 non-null    object 
 5   resting_blood_pressure  919 non-null    float64
 6   predicted_attribute     919 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 50.4+ KB


In [11]:
# Count how many rows fall into each severity label.
heart_disease_data['predicted_attribute'].value_counts()

predicted_attribute
no heart disease          411
mild heart disease        265
moderate heart disease    109
severe heart disease      106
critical heart disease     28
Name: count, dtype: int64

In [12]:
# Write the prepared DataFrame to disk; index=False keeps the row index out of the file.
heart_disease_data.to_csv(path_or_buf='heart_disease_data.csv', index=False)

In [13]:
# Read the saved CSV back in, replacing the in-memory frame with the on-disk version,
# and print it.
heart_disease_data = pd.read_csv(filepath_or_buffer='heart_disease_data.csv')
print(heart_disease_data)

     patient_id  age     sex place_of_study  chest_pain_type  \
0             1   63    Male      Cleveland   typical angina   
1             2   67    Male      Cleveland     asymptomatic   
2             3   67    Male      Cleveland     asymptomatic   
3             4   37    Male      Cleveland      non-anginal   
4             5   41  Female      Cleveland  atypical angina   
..          ...  ...     ...            ...              ...   
914         916   54  Female  VA Long Beach     asymptomatic   
915         917   62    Male  VA Long Beach   typical angina   
916         918   55    Male  VA Long Beach     asymptomatic   
917         919   58    Male  VA Long Beach     asymptomatic   
918         920   62    Male  VA Long Beach  atypical angina   

     resting_blood_pressure     predicted_attribute  
0                    145.00        no heart disease  
1                    160.00  moderate heart disease  
2                    120.00      mild heart disease  
3              

DATA VISUALISATION

In [14]:
# Scatter plot: resting blood pressure against age, one colour per sex.
# Chest pain type and place of study are added to the hover tooltip.
fig = px.scatter(
    heart_disease_data,
    x='age',
    y='resting_blood_pressure',
    color='sex',
    hover_data=['chest_pain_type', 'place_of_study'],
    title='Age vs Blood Pressure',
)
fig.show()

In [15]:
# Pie chart: share of each sex in the dataset.
# Build a two-column frame ('sex', 'count') from the value counts; naming the
# index and the value column explicitly gives the same column names regardless
# of what value_counts() calls them.
heart_disease_data_pie = (
    heart_disease_data['sex']
    .value_counts()
    .rename_axis('sex')
    .reset_index(name='count')
)
fig = px.pie(
    heart_disease_data_pie,
    names='sex',
    values='count',
    title='Distribution of Sex',
)
fig.show()

In [16]:
# Histogram of age split by sex, with a box plot drawn in the margin.
fig = px.histogram(
    heart_disease_data,
    x='age',
    color='sex',
    nbins=20,
    marginal='box',
    hover_data=['chest_pain_type', 'place_of_study'],
    title='Age distribution',
)
fig.show()

In [17]:
# Heatmap: number of patients per (chest pain type, heart disease severity).

# Count rows for every (chest pain type, sex, severity) combination.
heart_disease_counts = heart_disease_data.groupby(['chest_pain_type', 'sex', 'predicted_attribute']).size().reset_index(name='count')

# Explicit display order for both axes. pivot_table sorts its index and columns
# alphabetically, so passing hand-written axis labels to px.imshow without
# reordering the data first attaches the labels to the wrong rows/columns
# (e.g. the "No Heart Disease" label would sit on the "critical heart disease" counts).
# NOTE(review): assumes these lists cover every value present in the data;
# a value not listed here would be dropped by the reindex below - confirm.
chest_pain_order = ['typical angina', 'atypical angina', 'non-anginal', 'asymptomatic']
severity_order = [
    'no heart disease',
    'mild heart disease',
    'moderate heart disease',
    'severe heart disease',
    'critical heart disease',
]

# Sum over sex to get one count per (chest pain type, severity) cell, then put
# rows and columns into the display order. A category absent from the data
# becomes an all-zero row/column instead of shifting the remaining labels.
heatmap_data = (
    heart_disease_counts
    .pivot_table(index='chest_pain_type', columns='predicted_attribute', values='count', aggfunc='sum', fill_value=0)
    .reindex(index=chest_pain_order, columns=severity_order, fill_value=0)
)

fig = px.imshow(
    heatmap_data,
    labels=dict(x="Heart Disease Severity", y="Chest Pain Type", color="Count"),
    # Axis labels are derived from the same lists used to order the data,
    # so label and data order cannot drift apart.
    x=[label.title() for label in severity_order],
    y=[label.title() for label in chest_pain_order],
    title="Distribution of Heart Disease Severity by Chest Pain Type"
)

fig.show()
