In [None]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

import warnings

warnings.filterwarnings('ignore')

In [None]:
heart_attack_df = pd.read_csv("heart_attack_prediction_dataset.csv")

In [None]:
heart_attack_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [None]:
heart_attack_df.describe()

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk
count,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0,8763.0
mean,53.707977,259.877211,75.021682,0.652288,0.492982,0.896839,0.501426,0.598083,10.014284,0.495835,0.498345,5.469702,5.99369,158263.181901,28.891446,417.677051,3.489672,7.023508,0.358211
std,21.249509,80.863276,20.550948,0.476271,0.499979,0.304186,0.500026,0.490313,5.783745,0.500011,0.500026,2.859622,3.466359,80575.190806,6.319181,223.748137,2.282687,1.988473,0.479502
min,18.0,120.0,40.0,0.0,0.0,0.0,0.0,0.0,0.002442,0.0,0.0,1.0,0.001263,20062.0,18.002337,30.0,0.0,4.0,0.0
25%,35.0,192.0,57.0,0.0,0.0,1.0,0.0,0.0,4.981579,0.0,0.0,3.0,2.998794,88310.0,23.422985,225.5,2.0,5.0,0.0
50%,54.0,259.0,75.0,1.0,0.0,1.0,1.0,1.0,10.069559,0.0,0.0,5.0,5.933622,157866.0,28.768999,417.0,3.0,7.0,0.0
75%,72.0,330.0,93.0,1.0,1.0,1.0,1.0,1.0,15.050018,1.0,1.0,8.0,9.019124,227749.0,34.324594,612.0,5.0,9.0,1.0
max,90.0,400.0,110.0,1.0,1.0,1.0,1.0,1.0,19.998709,1.0,1.0,10.0,11.999313,299954.0,39.997211,800.0,7.0,10.0,1.0


In [None]:
#Splitting Blood Pressure to Systolic & Diastolic
heart_attack_df[['Systolic_BP', 'Diastolic_BP']] = heart_attack_df['Blood Pressure'].str.split('/', expand=True)

#Convert into numeric values
heart_attack_df['Systolic_BP'] = pd.to_numeric(heart_attack_df['Systolic_BP'])
heart_attack_df['Diastolic_BP'] = pd.to_numeric(heart_attack_df['Diastolic_BP'])

In [None]:
cols_to_drop = ['Patient ID', 'Country', 'Continent', 'Hemisphere', 'Sedentary Hours Per Day', 'Income', 'Triglycerides', 'Diet', 'Blood Pressure']
heart_attack_df.drop(cols_to_drop, axis=1, inplace=True)

In [None]:
heart_attack_df.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,Stress Level,BMI,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic_BP,Diastolic_BP
0,67,Male,208,72,0,0,1,0,0,4.168189,0,0,9,31.251233,0,6,0,158,88
1,21,Male,389,98,1,1,1,1,1,1.813242,1,0,1,27.194973,1,7,0,165,93
2,21,Female,324,72,1,0,0,0,0,2.078353,1,1,9,28.176571,4,4,0,174,99
3,84,Male,383,73,1,1,1,0,1,9.82813,1,0,9,36.464704,3,4,0,163,100
4,66,Male,318,93,1,1,1,1,0,5.804299,1,0,6,21.809144,1,5,0,91,88


In [None]:
heart_attack_df['Heart Attack Risk'].unique()

array([0, 1])

In [None]:
labelencoder = LabelEncoder()
heart_attack_df['Sex_Cat'] = labelencoder.fit_transform(heart_attack_df['Sex'])

In [None]:
heart_attack_df.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,Stress Level,BMI,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic_BP,Diastolic_BP,Sex_Cat
0,67,Male,208,72,0,0,1,0,0,4.168189,0,0,9,31.251233,0,6,0,158,88,1
1,21,Male,389,98,1,1,1,1,1,1.813242,1,0,1,27.194973,1,7,0,165,93,1
2,21,Female,324,72,1,0,0,0,0,2.078353,1,1,9,28.176571,4,4,0,174,99,0
3,84,Male,383,73,1,1,1,0,1,9.82813,1,0,9,36.464704,3,4,0,163,100,1
4,66,Male,318,93,1,1,1,1,0,5.804299,1,0,6,21.809144,1,5,0,91,88,1


In [None]:
cols_to_drop_2 = ['Sex']
heart_attack_df.drop(cols_to_drop_2, axis=1, inplace=True)

In [None]:
heart_attack_df.head()

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,Stress Level,BMI,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic_BP,Diastolic_BP,Sex_Cat
0,67,208,72,0,0,1,0,0,4.168189,0,0,9,31.251233,0,6,0,158,88,1
1,21,389,98,1,1,1,1,1,1.813242,1,0,1,27.194973,1,7,0,165,93,1
2,21,324,72,1,0,0,0,0,2.078353,1,1,9,28.176571,4,4,0,174,99,0
3,84,383,73,1,1,1,0,1,9.82813,1,0,9,36.464704,3,4,0,163,100,1
4,66,318,93,1,1,1,1,0,5.804299,1,0,6,21.809144,1,5,0,91,88,1


In [None]:
heart_attack_df.isnull().sum() / len(heart_attack_df) * 100

Age                                0.0
Cholesterol                        0.0
Heart Rate                         0.0
Diabetes                           0.0
Family History                     0.0
Smoking                            0.0
Obesity                            0.0
Alcohol Consumption                0.0
Exercise Hours Per Week            0.0
Previous Heart Problems            0.0
Medication Use                     0.0
Stress Level                       0.0
BMI                                0.0
Physical Activity Days Per Week    0.0
Sleep Hours Per Day                0.0
Heart Attack Risk                  0.0
Systolic_BP                        0.0
Diastolic_BP                       0.0
Sex_Cat                            0.0
dtype: float64

In [None]:
z_scores = np.abs(stats.zscore(heart_attack_df.select_dtypes(include=np.number)))
outliers = heart_attack_df[(z_scores > 3).any(axis=1)]
print(f'Number of outliers: {len(outliers)}')

Number of outliers: 0


Positive/Negative class percentage

In [None]:
heart_risk_counts = heart_attack_df['Heart Attack Risk'].value_counts()
positive_percentage = heart_risk_counts.sum() / len(heart_attack_df) * 100

negative_percentage = 100 - positive_percentage

print(f"Positive class percentage: {positive_percentage:.2f}%")
print(f"Negative class percentage: {negative_percentage:.2f}%")

Positive class percentage: 100.00%
Negative class percentage: 0.00%


Split the data

In [None]:
from sklearn.model_selection import train_test_split
heart_attack_df.head()

target_col = ['Heart Attack Risk']
y = heart_attack_df[target_col]
X = heart_attack_df.drop(target_col, axis=1)

In [None]:
print(X)

      Age  Cholesterol  Heart Rate  Diabetes  Family History  Smoking  \
0      67          208          72         0               0        1   
1      21          389          98         1               1        1   
2      21          324          72         1               0        0   
3      84          383          73         1               1        1   
4      66          318          93         1               1        1   
...   ...          ...         ...       ...             ...      ...   
8758   60          121          61         1               1        1   
8759   28          120          73         1               0        0   
8760   47          250         105         0               1        1   
8761   36          178          60         1               0        1   
8762   25          356          75         1               1        0   

      Obesity  Alcohol Consumption  Exercise Hours Per Week  \
0           0                    0                 4.168189 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

Training set

In [None]:
print(X_train)

      Age  Cholesterol  Heart Rate  Diabetes  Family History  Smoking  \
2963   77          346          74         0               0        1   
599    59          287          88         0               1        1   
45     33          185          79         0               1        1   
1444   74          352          89         0               0        1   
1652   29          260          50         1               0        1   
...   ...          ...         ...       ...             ...      ...   
5734   48          334          43         1               0        1   
5191   68          188          69         0               0        1   
5390   32          169          84         1               0        0   
860    85          285          54         1               1        1   
7270   89          240          53         1               1        1   

      Obesity  Alcohol Consumption  Exercise Hours Per Week  \
2963        0                    1                 2.679789 

In [None]:
from sklearn.metrics import accuracy_score

y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_test)
print(f"Decision tree accuracy: {accuracy:.3f}")

Decision tree accuracy: 1.000


In [None]:
print(y_pred[:10])
print(y_test[:10])

[1 0 0 1 0 1 0 0 1 1]
      Heart Attack Risk
1226                  0
7903                  1
1559                  1
3621                  1
7552                  0
1467                  1
5488                  1
222                   0
3317                  0
3127                  1


In [None]:
example_row = X_test.iloc[0]
prediction = dt.predict([example_row])
print(f"Prediction: {prediction[0]}")

Prediction: 1


Data Mining Testing

In [None]:
user_input = pd.DataFrame({'Age':[24],
                           'Cholesterol': [300],
                           'Heart Rate': [100],
                           'Diabetes': [1],
                           'Family History': [0],
                           'Smoking': [1],
                           'Obesity':[0],
                           'Alcohol Consumption':[1],
                           'Exercise Hours Per Week':[2.000000],
                           'Previous Heart Problems':[0],
                           'Medication Use':[1],
                           'Stress Level':[6],
                           'BMI':[36.464704],
                           'Physical Activity Days Per Week':[3],
                           'Sleep Hours Per Day':[8],
                           'Systolic_BP':[130],
                           'Diastolic_BP':[90],
                           'Sex_Cat':[1]
                           })

'''user_input['Sex'] = labelencoder.fit_transform(user_input['Sex'])
user_input.drop(cols_to_drop_2, axis=1, inplace=True)'''

prediction = dt.predict(user_input)
print(f"Prediction for example row: {prediction[0]}")

Prediction for example row: 0
