In [1]:
#from google.colab import files
#files.upload()

Saving heart.csv to heart.csv


{'heart.csv': b'Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease\n40,M,ATA,140,289,0,Normal,172,N,0,Up,0\n49,F,NAP,160,180,0,Normal,156,N,1,Flat,1\n37,M,ATA,130,283,0,ST,98,N,0,Up,0\n48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1\n54,M,NAP,150,195,0,Normal,122,N,0,Up,0\n39,M,NAP,120,339,0,Normal,170,N,0,Up,0\n45,F,ATA,130,237,0,Normal,170,N,0,Up,0\n54,M,ATA,110,208,0,Normal,142,N,0,Up,0\n37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1\n48,F,ATA,120,284,0,Normal,120,N,0,Up,0\n37,F,NAP,130,211,0,Normal,142,N,0,Up,0\n58,M,ATA,136,164,0,ST,99,Y,2,Flat,1\n39,M,ATA,120,204,0,Normal,145,N,0,Up,0\n49,M,ASY,140,234,0,Normal,140,Y,1,Flat,1\n42,F,NAP,115,211,0,ST,137,N,0,Up,0\n54,F,ATA,120,273,0,Normal,150,N,1.5,Flat,0\n38,M,ASY,110,196,0,Normal,166,N,0,Flat,1\n43,F,ATA,120,201,0,Normal,165,N,0,Up,0\n60,M,ASY,100,248,0,Normal,125,N,1,Flat,1\n36,M,ATA,120,267,0,Normal,160,N,3,Flat,1\n43,F,TA,100,223,0,Normal,142,N,0,Up,0\n44,M,ATA,120,184,0

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Different classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Preprocessing and training
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

RANDOM_STATE = 42

In [3]:
# Load the data
df = pd.read_csv('heart.csv')

From the EDA we realised that dropping the rows with missing values of **Cholesterol** column was the best strategy to proceed with.

In [4]:
# Replace zeros in 'Cholesterol' and 'RestingBP' with Nan.
df['Cholesterol'] = df['Cholesterol'].replace({0:np.nan})
df['RestingBP'] = df['RestingBP'].replace({0:np.nan})

# Drop the rows with missing values of Cholesterol.
df.dropna(subset = ['Cholesterol'], inplace=True)

Now Split the data into training and testing before making any changes.

In [5]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_STATE)

# Print the shapes of train and test sets
print(f'Shape of train set : {df_train.shape}')
print(f'Shape of test set : {df_test.shape}')

Shape of train set : (671, 12)
Shape of test set : (75, 12)


In [6]:
# Separate features and target variable.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# Print the shapes of all the dataframes
print("Training :")
print(f'Shape of X_train : {X_train.shape}')
print(f'Shape of y_train : {y_train.shape}\n')
print('Testing :')
print(f'Shape of X_test : {X_test.shape}')
print(f'Shape of y_test : {y_test.shape}\n')

Training :
Shape of X_train : (671, 11)
Shape of y_train : (671,)

Testing :
Shape of X_test : (75, 11)
Shape of y_test : (75,)



In [7]:
X_train

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
72,52,M,ASY,120.0,182.0,0,Normal,150,N,0.0,Flat
210,48,M,ASY,106.0,263.0,1,Normal,110,N,0.0,Flat
896,47,M,NAP,130.0,253.0,0,Normal,179,N,0.0,Up
589,74,M,NAP,140.0,237.0,1,Normal,94,N,0.0,Flat
63,46,M,ASY,120.0,277.0,0,Normal,125,Y,1.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...
71,44,M,ATA,130.0,215.0,0,Normal,135,N,0.0,Up
106,48,F,ASY,120.0,254.0,0,ST,110,N,0.0,Up
270,45,M,ASY,120.0,225.0,0,Normal,140,N,0.0,Up
607,53,M,ASY,144.0,300.0,1,ST,128,Y,1.5,Flat


### Transform and scale the features

In [11]:
# Create column transformers
oe = OrdinalEncoder(categories=[['M', 'F'], ['TA', 'ATA', 'NAP', 'ASY'], ['N', 'Y'], ['Up', 'Flat', 'Down']])
ohe = OneHotEncoder(drop='first', sparse_output=False)

# Column transformer for encoding categorical columns.
encoder = ColumnTransformer(
    [
        ('oe', oe, [1, 2, 8, 10]),
        ('ohe', ohe, [6])
    ],
remainder='passthrough')

In [10]:
# Create a preprocessing pipeline.
preprocessor = Pipeline(
    [
        ('encoder', encoder),
        ('scaler', StandardScaler())
    ]
)

## Training

- As this is a medical setting we don't want to miss any cases where the patient has a heart disease, so the metric that we must focus on would be 'Recall' this tells us of all the people having heart disease how many did we identify.

### 1. Random Forest

In [50]:
full_pipeline_rf = Pipeline(
    [
       ('preprocessor', preprocessor),
       ('model', RandomForestClassifier(criterion='log_loss', max_depth=90))
    ]
)

params = {
    'model__n_estimators': [20, 30, 50],
    #'model__max_depth': [30, 50, 75, 90, 100],
    'model__min_samples_split': [2, 10],
}

grid = GridSearchCV(
    full_pipeline_rf,
    cv=5,
    param_grid=params,
    n_jobs=-1,
    scoring='accuracy',
    refit=True
)

The max_depth=90 part is commented because, upon experimenting with different depths apart from GridSearchCV, better results were observed with 90 as the depth.

In [51]:
grid.fit(X_train, y_train)

In [55]:
# Extract the best model.
best_rf = grid.best_estimator_

In [56]:
# Random forest metrics.

train_preds = best_rf.predict(X_train)
test_preds = best_rf.predict(X_test)

print(f'Training accuracy : {accuracy_score(train_preds, y_train)*100}%')
print(f'Testing accuracy : {accuracy_score(test_preds, y_test)*100}%\n')
print('Classification report:')
print(classification_report(y_test, test_preds))

Training accuracy : 94.63487332339791%
Testing accuracy : 89.33333333333333%

Classification report:
              precision    recall  f1-score   support

           0       0.85      0.94      0.89        35
           1       0.94      0.85      0.89        40

    accuracy                           0.89        75
   macro avg       0.90      0.90      0.89        75
weighted avg       0.90      0.89      0.89        75

