from google.colab import files

# Prompt for a file upload (heart.csv is read further down). The returned dict
# of file name -> raw bytes is bound to a name so that the file contents are
# not echoed as this cell's output.
uploaded = files.upload()

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Different classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Preprocessing and training
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Metrics
from sklearn.metrics import accuracy_score, classification_report

# Seed to pass as `random_state` to stochastic components (e.g. the SVC
# further down) so that their results can be reproduced.
RANDOM_STATE = 42

In [None]:
# Load the data. The relative path means heart.csv must be in the current
# working directory.
df = pd.read_csv('heart.csv')

From the EDA we realised that dropping the rows with missing values in the **Cholesterol** column was the best strategy to proceed with.

In [None]:
# Zeros in 'Cholesterol' and 'RestingBP' are recoded as NaN, then every row
# whose Cholesterol is missing is discarded. Rows with a missing RestingBP are
# not dropped by this step.
df = df.assign(
    Cholesterol=df['Cholesterol'].replace({0: np.nan}),
    RestingBP=df['RestingBP'].replace({0: np.nan}),
).dropna(subset=['Cholesterol'])

Now split the data into training and testing sets before making any further changes.

In [None]:
# Hold out 20% of the rows for testing. Seeding the shuffle with RANDOM_STATE
# makes the split identical on every run, so the scores reported further down
# are reproducible and comparable between models.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

# Print the shapes of train and test sets
print(f'Shape of train set : {df_train.shape}')
print(f'Shape of test set : {df_test.shape}')

Shape of train set : (596, 12)
Shape of test set : (150, 12)


In [None]:
# Features are every column except the last one; the target is the last column.
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]

# Report the shape of each piece, one group per split.
print(
    "Training :",
    f'Shape of X_train : {X_train.shape}',
    f'Shape of y_train : {y_train.shape}\n',
    sep='\n',
)
print(
    'Testing :',
    f'Shape of X_test : {X_test.shape}',
    f'Shape of y_test : {y_test.shape}\n',
    sep='\n',
)

Training :
Shape of X_train : (596, 11)
Shape of y_train : (596,)

Testing :
Shape of X_test : (150, 11)
Shape of y_test : (150,)



In [None]:
# Preview the first few training rows instead of rendering the whole frame.
X_train.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
791,51,M,ASY,140.0,298.0,0,Normal,122,Y,4.2,Flat
109,39,M,ATA,190.0,241.0,0,Normal,106,N,0.0,Up
846,39,M,ASY,118.0,219.0,0,Normal,140,N,1.2,Flat
781,50,F,ASY,110.0,254.0,0,LVH,159,N,0.0,Up
691,45,M,ASY,104.0,208.0,0,LVH,148,Y,3.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...
834,44,M,ATA,120.0,220.0,0,Normal,170,N,0.0,Up
771,55,M,ASY,140.0,217.0,0,Normal,111,Y,5.6,Down
805,54,M,ASY,140.0,239.0,0,Normal,160,N,1.2,Up
117,59,F,ASY,130.0,338.0,1,ST,130,Y,1.5,Flat


### Transform and scale the features

In [None]:
# Explicit category order for each ordinally-encoded column, listed in the
# same order as the column positions handed to the transformer below.
ordinal_categories = [
    ['M', 'F'],                     # position 1: Sex
    ['TA', 'ATA', 'NAP', 'ASY'],    # position 2: ChestPainType
    ['N', 'Y'],                     # position 8: ExerciseAngina
    ['Up', 'Flat', 'Down'],         # position 10: ST_Slope
]
oe = OrdinalEncoder(categories=ordinal_categories)

# One-hot encoder returning a dense array, with the first level dropped.
ohe = OneHotEncoder(sparse_output=False, drop='first')

# Encode the categorical columns (selected by position); all remaining
# columns are passed through unchanged.
encoder = ColumnTransformer(
    transformers=[
        ('oe', oe, [1, 2, 8, 10]),
        ('ohe', ohe, [6]),          # position 6: RestingECG
    ],
    remainder='passthrough',
)

In [None]:
# Preprocessing pipeline: encode the categorical columns first, then
# standardise every column coming out of the encoder.
preprocessing_steps = [
    ('encoder', encoder),
    ('scaler', StandardScaler()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

## Training

- As this is a medical setting, we don't want to miss any case where the patient has heart disease, so the metric to focus on is **recall**: of all the people who have heart disease, what fraction did we identify.

### 1. Random Forest

In [None]:
# Random forest behind the shared preprocessing steps. random_state pins the
# bootstrap sampling and feature sub-sampling so that the grid search selects
# the same model, with the same scores, on every run.
full_pipeline_rf = Pipeline(
    [
       ('preprocessor', preprocessor),
       ('model', RandomForestClassifier(
           criterion='log_loss',
           max_depth=90,
           random_state=RANDOM_STATE,
       ))
    ]
)

# Hyper-parameters explored by the grid search (3 x 2 = 6 candidates).
params = {
    'model__n_estimators': [20, 30, 50],
    #'model__max_depth': [30, 50, 75, 90, 100],
    'model__min_samples_split': [2, 10],
}

# 5-fold cross-validated search over `params`, run on all available cores;
# refit=True retrains the best candidate on the full training set.
# NOTE(review): candidates are ranked by accuracy, whereas the notes in this
# section single out recall as the metric to focus on -- confirm this is intended.
grid = GridSearchCV(
    full_pipeline_rf,
    cv=5,
    param_grid=params,
    n_jobs=-1,
    scoring='accuracy',
    refit=True
)

`max_depth` is commented out of the parameter grid and fixed at 90 because, in experiments run separately from GridSearchCV, better results were observed with a depth of 90.

In [None]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Extract the best model: the estimator the search selected and refitted.
# This must run before `grid` is rebound by a later search.
best_rf = grid.best_estimator_

In [None]:
# Random forest metrics.

train_preds = best_rf.predict(X_train)
test_preds = best_rf.predict(X_test)

# Metric functions take (y_true, y_pred) in that order. Accuracy is symmetric,
# so the numbers are unaffected, but the order matters as soon as an
# asymmetric metric (precision, recall) is substituted.
print(f'Training accuracy : {accuracy_score(y_train, train_preds)*100:.2f}%')
print(f'Testing accuracy : {accuracy_score(y_test, test_preds)*100:.2f}%\n')
print('Classification report:')
print(classification_report(y_test, test_preds))

Training accuracy : 94.13%
Testing accuracy : 87.33%

Classification report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.88        76
           1       0.89      0.85      0.87        74

    accuracy                           0.87       150
   macro avg       0.87      0.87      0.87       150
weighted avg       0.87      0.87      0.87       150



### 2. XGBoostClassifier

In [None]:
# XGBoost classifier behind the shared preprocessing steps. random_state makes
# the row sub-sampling (subsample < 1 is part of the grid below) repeatable,
# so the search selects the same model on every run.
# NOTE(review): device='cuda' presumably assumes a GPU runtime -- confirm.
full_pipeline_xgb = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', XGBClassifier(
            tree_method='hist',
            eval_metric='logloss',
            device='cuda',
            random_state=RANDOM_STATE,
        ))
    ]
)

# Hyper-parameters explored by the grid search (2 x 2 x 3 x 3 x 2 = 72 candidates).
params = {
    'model__n_estimators' : [40, 50],
    'model__booster' : ['gbtree', 'dart'],
    'model__max_depth' : [4, 5, 6],
    'model__learning_rate' : [0.05, 0.1, 0.15],
    'model__subsample' : [0.7, 1]
}

# 5-fold cross-validated search ranked by accuracy; refit=True retrains the
# best candidate on the full training set.
grid = GridSearchCV(
    full_pipeline_xgb,
    cv =5,
    param_grid=params,
    refit=True,
    scoring='accuracy'
)

In [None]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Extract the best XGB model: the estimator the search selected and refitted.
best_xgb = grid.best_estimator_

In [None]:
# Evaluate the XGBoost model on both splits.
train_preds = best_xgb.predict(X_train)
test_preds = best_xgb.predict(X_test)

# Metric functions take (y_true, y_pred) in that order. The '%' suffix matches
# the way the other models' accuracies are reported.
print(f'Training accuracy : {accuracy_score(y_train, train_preds)*100:.2f}%')
print(f'Testing accuracy : {accuracy_score(y_test, test_preds)*100:.2f}%\n')
print(classification_report(y_test, test_preds))

Training accuracy : 95.81
Testing accuracy : 85.33

              precision    recall  f1-score   support

           0       0.82      0.91      0.86        76
           1       0.89      0.80      0.84        74

    accuracy                           0.85       150
   macro avg       0.86      0.85      0.85       150
weighted avg       0.86      0.85      0.85       150



### 3. SVC

In [None]:
# RBF-kernel support vector classifier with probability estimates enabled,
# placed behind the shared preprocessing steps.
svc_model = SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE)
full_pipeline_svc = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', svc_model),
    ]
)

# Search space: 4 gamma values x 4 C values = 16 candidates.
params_grid = {
    'model__gamma': [1, 0.1, 0.01, 0.001],
    'model__C': [0.1, 1, 10, 100],
}

# 5-fold cross-validated search ranked by accuracy; the best candidate is
# refitted on the full training set.
grid = GridSearchCV(
    estimator=full_pipeline_svc,
    param_grid=params_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
)

In [None]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Extract the best SVC model: the estimator the search selected and refitted.
best_svc = grid.best_estimator_

In [None]:
# Evaluate the SVC model on both splits.
train_preds = best_svc.predict(X_train)
test_preds = best_svc.predict(X_test)

# Metric functions take (y_true, y_pred) in that order. Accuracy is symmetric,
# so the numbers are unaffected, but the order matters for asymmetric metrics.
print(f'Training accuracy : {accuracy_score(y_train, train_preds)*100:.2f}%')
print(f'Testing accuracy : {accuracy_score(y_test, test_preds)*100:.2f}%\n')
print(classification_report(y_test, test_preds))

Training accuracy : 88.42%
Testing accuracy : 84.67%

              precision    recall  f1-score   support

           0       0.83      0.88      0.85        76
           1       0.87      0.81      0.84        74

    accuracy                           0.85       150
   macro avg       0.85      0.85      0.85       150
weighted avg       0.85      0.85      0.85       150



In [None]:
# Persist the tuned random-forest and XGBoost estimators to disk.
# best_svc is not saved here. `joblib` is imported in the import cell
# at the top of the notebook.
joblib.dump(best_rf, "random_forest.pkl")
joblib.dump(best_xgb, "xgb.pkl")

['xgb.pkl']