# Importing the libraries

In [1]:
import pandas as pd

# Importing the dataset

In [2]:
df = pd.read_csv("../data/clean/heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,NumMajorVessels,Thalassemia,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# X-y split

In [4]:
X = df.drop(columns="target", axis=1)
y = df['target']

# Feature scaling

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()
scaler.fit(X_train)

filename = "standard_scaler.pkl"
with open("../scalers/"+filename, "wb") as file:
    pickle.dump(st, file)

X_train_np = st.transform(X_train)
X_test_np = st.transform(X_test)

X_train_df = pd.DataFrame(X_train_np, columns=X_train.columns, index=X_train.index)
X_test_df = pd.DataFrame(X_test_np, columns=X_test.columns, index=X_test.index)

display(X_train_df)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,NumMajorVessels,Thalassemia
132,-0.682633,0.685210,-0.980054,-0.403153,0.486281,-0.425879,-1.024877,0.717007,-0.718139,-0.447524,-0.715539,-0.705536,1.118119
202,0.403989,0.685210,-0.980054,-0.171406,0.206315,-0.425879,-1.024877,-0.836627,1.392489,1.768032,-0.715539,1.248879,1.118119
196,-1.008619,-1.459407,-0.004050,-0.055533,-0.260295,-0.425879,-1.024877,1.105415,-0.718139,-0.358902,-0.715539,-0.705536,-0.545256
75,-1.117282,0.685210,-0.004050,-0.634899,-0.521596,-0.425879,0.860584,0.889633,-0.718139,-0.890635,0.991836,-0.705536,-0.545256
176,0.838637,-1.459407,-0.980054,1.103201,-0.073651,-0.425879,0.860584,0.199129,1.392489,0.350076,-0.715539,-0.705536,-0.545256
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,1.381948,0.685210,-0.980054,-0.634899,-0.204301,-0.425879,0.860584,-3.382861,-0.718139,-0.004413,-0.715539,-0.705536,-0.545256
71,0.947300,0.685210,-0.980054,0.523834,-1.137521,-0.425879,-1.024877,-0.232436,1.392489,2.654254,0.991836,1.248879,1.118119
106,0.621313,-1.459407,1.947959,1.103201,-0.148308,-0.425879,0.860584,0.932789,-0.718139,-0.093035,0.991836,-0.705536,-0.545256
270,1.381948,-1.459407,0.971955,1.219074,0.542274,-0.425879,0.860584,0.975946,-0.718139,-0.890635,0.991836,0.271672,-0.545256


# Machine Learning Models

## Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

In [15]:
log = LogisticRegression(solver='liblinear')
log.fit(X_train,y_train)

filename = "logistic_model.pkl"
with open("../models/"+filename, "wb") as file:
    pickle.dump(st, file)

In [16]:
y_test_pred = log.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [188]:
test_accuracy = accuracy_score(y_test,y_test_pred)
print(f"The accuracy of the model in the TEST set is: {test_accuracy: .2f}")

The accuracy of the model in the TEST set is:  0.57


In [189]:
test_precision = precision_score(y_test,y_test_pred)
print(f"The precision of the model in the TEST set is: {test_precision: .2f}")

The precision of the model in the TEST set is:  0.56


In [190]:
test_recall = recall_score(y_test, y_test_pred)
print(f"The recall of the model in the TEST set is: {test_recall: .2f}")

The recall of the model in the TEST set is:  0.48


In [168]:
test_f1 = f1_score(y_test, y_test_pred)
print(f"The f1 score of the model in the TEST set is: {test_f1: .2f}")

The f1 score of the model in the TEST set is:  0.84


In [169]:
test_report = classification_report(y_test, y_test_pred)
print("The classification report of the model in the TEST set is: ",test_report)

The classification report of the model in the TEST set is:                precision    recall  f1-score   support

           0       0.89      0.78      0.83        32
           1       0.79      0.90      0.84        29

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61



## KNeighbors Classifier

In [55]:
from sklearn.neighbors import KNeighborsClassifier

In [60]:
score = []
for k in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    filename = "knn_" + str(k) + ".pkl"
    with open("../models/"+filename, "wb") as file:
        pickle.dump(st, file)
    y_test_pred = knn.predict(X_test)
    score.append(accuracy_score(y_test,y_test_pred))

In [58]:
score

[0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.5245901639344263,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374,
 0.47540983606557374]

In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [183]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
y_test_pred = knn.predict(X_test)
test_accuracy = accuracy_score(y_test,y_test_pred)
print(f"The accuracy of the model in the TEST set is: {test_accuracy: .2f}")

The accuracy of the model in the TEST set is:  0.57


In [184]:
test_precision= precision_score(y_test,y_test_pred)
print(f"The precision of the model in the TEST set is: {test_precision: .2f}")

The precision of the model in the TEST set is:  0.56


In [185]:
test_recall = recall_score(y_test, y_test_pred)
print(f"The recall of the model in the TEST set is: {test_recall: .2f}")

The recall of the model in the TEST set is:  0.48


In [186]:
test_f1 = f1_score(y_test, y_test_pred)
print(f"The f1 score of the model in the TEST set is: {test_f1: .2f}")

The f1 score of the model in the TEST set is:  0.52


In [187]:
test_report = classification_report(y_test, y_test_pred)
print("The classification report of the model in the TEST set is: ",test_report)

The classification report of the model in the TEST set is:                precision    recall  f1-score   support

           0       0.58      0.66      0.62        32
           1       0.56      0.48      0.52        29

    accuracy                           0.57        61
   macro avg       0.57      0.57      0.57        61
weighted avg       0.57      0.57      0.57        61



## Support vector classifier

In [96]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],          # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient
    'kernel': ['rbf', 'linear']      # Kernel type
}

# Initialize the SVC
svc = svm.SVC()

# Set up GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_svc = grid_search.best_estimator_

# Save the best model to a file
filename = "Support_vector_classifier_best.pkl"
with open("../models/" + filename, "wb") as file:
    pickle.dump(best_svc, file)

# Predict using the best model
y_test_pred = best_svc.predict(X_test)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [181]:
test_accuracy = accuracy_score(y_test,y_test_pred)
print(f"The accuracy of the model in the TEST set is: {test_accuracy: .2f}")

The accuracy of the model in the TEST set is:  0.57


In [180]:
test_precision = precision_score(y_test,y_test_pred)
print(f"The precision of the model in the TEST set is: {test_precision: .2f}")

The precision of the model in the TEST set is:  0.56


In [89]:
test_recall = recall_score(y_test, y_test_pred)
print(f"The recall of the model in the TEST set is: {test_recall: .2f}")

The recall of the model in the TEST set is:  0.83


In [90]:
test_f1 = f1_score(y_test, y_test_pred)
print(f"The f1 score of the model in the TEST set is: {test_f1: .2f}")

The f1 score of the model in the TEST set is:  0.75


In [182]:
test_report = classification_report(y_test, y_test_pred)
print("The classification report of the model in the TEST set is: ",test_report)

The classification report of the model in the TEST set is:                precision    recall  f1-score   support

           0       0.58      0.66      0.62        32
           1       0.56      0.48      0.52        29

    accuracy                           0.57        61
   macro avg       0.57      0.57      0.57        61
weighted avg       0.57      0.57      0.57        61



## Decision Tree Classifier

In [124]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['gini', 'entropy']
}

# Initialize GridSearchCV
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit=True, verbose=2, cv=5)

# Fit GridSearchCV
grid.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid.best_params_
best_tree = grid.best_estimator_

# Predict on the test set
y_test_pred = best_tree.predict(X_test)

# Save the best model
filename = "Decision_tree_classifier_best.pkl"
with open("../models/"+filename, "wb") as file:
    pickle.dump(best_tree, file)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max

In [107]:
test_accuracy = accuracy_score(y_test,y_test_pred)
print(f"The accuracy of the model in the TEST set is: {test_accuracy: .2f}")

The accuracy of the model in the TEST set is:  0.75


In [104]:
test_precision = precision_score(y_test,y_test_pred)
print(f"The precision of the model in the TEST set is: {test_precision: .2f}")

The precision of the model in the TEST set is:  0.74


In [105]:
test_recall = recall_score(y_test, y_test_pred)
print(f"The recall of the model in the TEST set is: {test_recall: .2f}")

The recall of the model in the TEST set is:  0.69


In [108]:
test_f1 = f1_score(y_test, y_test_pred)
print(f"The f1 score of the model in the TEST set is: {test_f1: .2f}")

The f1 score of the model in the TEST set is:  0.75


In [109]:
test_report = classification_report(y_test, y_test_pred)
print("The classification report of the model in the TEST set is: ",test_report)

The classification report of the model in the TEST set is:                precision    recall  f1-score   support

           0       0.79      0.72      0.75        32
           1       0.72      0.79      0.75        29

    accuracy                           0.75        61
   macro avg       0.76      0.76      0.75        61
weighted avg       0.76      0.75      0.75        61



## Random Forest Classifier

In [110]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Initialize GridSearchCV
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, verbose=2, cv=5)

# Fit GridSearchCV
grid.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid.best_params_
best_forest = grid.best_estimator_

# Predict on the test set
y_test_pred = best_forest.predict(X_test)

# Save the best model
filename = "Random_forest_classifier_best.pkl"
with open("../models/"+filename, "wb") as file:
    pickle.dump(best_forest, file)


Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot

In [119]:
test_accuracy = accuracy_score(y_test,y_test_pred)
print(f"The accuracy of the model in the TEST set is: {test_accuracy: .2f}")

The accuracy of the model in the TEST set is:  0.72


In [120]:
test_precision = precision_score(y_test,y_test_pred)
print(f"The precision of the model in the TEST set is: {test_precision: .2f}")

The precision of the model in the TEST set is:  0.71


In [121]:
test_recall = recall_score(y_test, y_test_pred)
print(f"The recall of the model in the TEST set is: {test_recall: .2f}")

The recall of the model in the TEST set is:  0.69


In [122]:
test_f1 = f1_score(y_test, y_test_pred)
print(f"The f1 score of the model in the TEST set is: {test_f1: .2f}")

The f1 score of the model in the TEST set is:  0.70


In [123]:
test_report = classification_report(y_test, y_test_pred)
print("The classification report of the model in the TEST set is: ",test_report)

The classification report of the model in the TEST set is:                precision    recall  f1-score   support

           0       0.73      0.75      0.74        32
           1       0.71      0.69      0.70        29

    accuracy                           0.72        61
   macro avg       0.72      0.72      0.72        61
weighted avg       0.72      0.72      0.72        61



## Gradient boosting classifier

In [125]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

# Initialize GridSearchCV
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, refit=True, verbose=2, cv=5)

# Fit GridSearchCV
grid.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid.best_params_
best_gb = grid.best_estimator_

# Predict on the test set
y_test_pred = best_gb.predict(X_test)

# Save the best model
filename = "Gradient_boosting_classifier_best.pkl"
with open("../models/"+filename, "wb") as file:
    pickle.dump(best_gb, file)


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.9; total time

In [126]:
test_accuracy = accuracy_score(y_test,y_test_pred)
print(f"The accuracy of the model in the TEST set is: {test_accuracy: .2f}")

The accuracy of the model in the TEST set is:  0.84


In [127]:
test_precision = precision_score(y_test,y_test_pred)
print(f"The precision of the model in the TEST set is: {test_precision: .2f}")

The precision of the model in the TEST set is:  0.79


In [None]:
test_recall = recall_score(y_test, y_test_pred)
print(f"The recall of the model in the TEST set is: {test_recall: .2f}")

In [128]:
test_f1 = f1_score(y_test, y_test_pred)
print(f"The f1 score of the model in the TEST set is: {test_f1: .2f}")

The f1 score of the model in the TEST set is:  0.84


In [129]:
test_report = classification_report(y_test, y_test_pred)
print("The classification report of the model in the TEST set is: ",test_report)

The classification report of the model in the TEST set is:                precision    recall  f1-score   support

           0       0.89      0.78      0.83        32
           1       0.79      0.90      0.84        29

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

