## Model Building and Evaluation of Model

In [1]:
#Load libraries
import pandas as pd
import numpy as np
import joblib
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import randint
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
#Getting data from the cleaned dataset
df = pd.read_csv('data/heart_attack_prediction_cleaned_dataset.csv')

In [3]:
#Making a copy of the dataframe
df_copy = df.copy()

#Displaying first 20 rows of data to check if it is the cleaned dataset
print(df_copy.head(20))

    Age  Sex  Cholesterol  Heart Rate  Diabetes  Family History  Smoking  \
0    67    1          208          72         0               0        1   
1    21    1          389          98         1               1        1   
2    21    0          324          72         1               0        0   
3    84    1          383          73         1               1        1   
4    66    1          318          93         1               1        1   
5    54    0          297          48         1               1        1   
6    90    1          358          84         0               0        1   
7    84    1          220         107         0               0        1   
8    20    1          145          68         1               0        1   
9    43    0          248          55         0               1        1   
10   73    0          373          97         1               1        1   
11   71    1          374          70         1               1        1   
12   77    1

In [4]:
y = df_copy['Heart Attack Risk'].to_numpy()

del df_copy['Heart Attack Risk']

x = df_copy.to_numpy()

In [5]:
#Shuffle and split dataset into 0.7 Train Size and 0.3 test size
size = 0.30
seed = 7
X_train, X_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=size,random_state = seed)
                                                

### Declaring Models to be Trained

In [6]:
#Models to be trained
LR = LogisticRegression(solver='lbfgs', multi_class='auto',max_iter=1000)
LDA = LinearDiscriminantAnalysis()
KNN = KNeighborsClassifier()
DTC = DecisionTreeClassifier()
RF = RandomForestClassifier()
ADA = AdaBoostClassifier()
GB = GradientBoostingClassifier()

### Training the Models

In [7]:
#Training the models
LR.fit(X_train,y_train)
LDA.fit(X_train,y_train)
KNN.fit(X_train,y_train)
DTC.fit(X_train,y_train)
RF.fit(X_train,y_train)
ADA.fit(X_train,y_train)
GB.fit(X_train,y_train)

GradientBoostingClassifier()

### Evaluating Model with Test Data

In [8]:
#Evaluating model with test data

# Models to be trained
models = [LR, LDA, KNN, DTC, RF, ADA, GB]
model_names = ['LR', 'LDA', 'KNN', 'DTC', 'RF', 'ADA', 'GB']

# Dictionary to store accuracy scores
accuracy_scores = {}

# Loop over models
for model, name in zip(models, model_names):
    # Evaluate the model on test data
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the accuracy for each model
    print(f'{name}: {accuracy}')

    # Store accuracy score in the dictionary
    accuracy_scores[name] = accuracy

# Find the best model
best_model = max(accuracy_scores, key=accuracy_scores.get)
best_accuracy = accuracy_scores[best_model]

print(f'The best model for test data is {best_model} with an accuracy of {best_accuracy}')

LR: 0.6317991631799164
LDA: 0.6317991631799164
KNN: 0.5621909471281856
DTC: 0.5450741726892354
RF: 0.6249524534043363
ADA: 0.6276150627615062
GB: 0.6298972993533662
The best model for test data is LR with an accuracy of 0.6317991631799164


### Evaluating Model with Training Data

In [9]:
#Evaluating model with train data

# Models to be trained
models = [LR, LDA, KNN, DTC, RF, ADA, GB]
model_names = ['LR', 'LDA', 'KNN', 'DTC', 'RF', 'ADA', 'GB']

# Dictionary to store accuracy scores
accuracy_scores_train = {}

# Loop over models
for model, name in zip(models, model_names):
    # Evaluate the model on training data
    y_pred_train = model.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    
    # Print the accuracy for each model on training data
    print(f'{name}: {accuracy_train}')

    # Store accuracy score in the dictionary
    accuracy_scores_train[name] = accuracy_train

# Find the best model for training data
best_model_train = max(accuracy_scores_train, key=accuracy_scores_train.get)
best_accuracy_train = accuracy_scores_train[best_model_train]

print(f'The best model for training data is {best_model_train} with an accuracy of {best_accuracy_train}')

LR: 0.6460710792305184
LDA: 0.6460710792305184
KNN: 0.7243234431040104
DTC: 1.0
RF: 1.0
ADA: 0.649983697424193
GB: 0.6713400717313336
The best model for training data is DTC with an accuracy of 1.0


#### In the evaluation of various classification models, it is evident that the performance, as measured by accuracy, is consistently higher on the training data compared to the test data. This discrepancy suggests a potential issue of overfitting, where models excel in learning the training data but struggle to generalize effectively to new, unseen data.

### Viewing Feature Importance on RandomForestClassifier Model

In [10]:
# These are the feature labels from our data set
feature_labels = np.array(['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week', 'Previous Heart Problems', 'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides', 'Sleep Hours Per Day', 'Systolic', 'Diastolic', 'Diet_Average', 'Diet_Healthy', 'Diet_Unhealthy'])

# Create a numpy array based on the model's feature importances
importance = RF.feature_importances_

# Sort the feature labels based on the feature importance rankings from the model
feature_indexes_by_importance = importance.argsort()

# Print each feature label, from most important to least important (reverse order)
for index in feature_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))

Smoking - 0.59%
Diet_Unhealthy - 1.04%
Diet_Healthy - 1.05%
Diet_Average - 1.08%
Sex - 1.10%
Diabetes - 1.16%
Obesity - 1.27%
Alcohol Consumption - 1.28%
Previous Heart Problems - 1.28%
Family History - 1.29%
Medication Use - 1.31%
Sleep Hours Per Day - 4.07%
Stress Level - 4.58%
Diastolic - 6.94%
Age - 7.36%
Heart Rate - 7.52%
Systolic - 7.55%
Cholesterol - 7.95%
Triglycerides - 8.13%
Exercise Hours Per Week - 8.20%
BMI - 8.35%
Income - 8.43%
Sedentary Hours Per Day - 8.47%


#### From the Percentages, We can see that Smoking, Diet_Unhealthy, Diet_Average, Diet_Healthy, Sex, Obesity, Diabetes, Medication Use, Alcohol Consumption, Family History and Previous Heart Problems are Features that are less important when finding out a person's Heart Attack Risk

<!-- ### Performing GridSearch on Logistic Regression based on its performance in terms of accuracy on both the training and test datasets. LR demonstrates a balanced accuracy score, making it a suitable candidate for optimization -->

### Doing GridSearch on DecisionTreeClassifier, the worst Model with Test Data

In [11]:
# Define the parameter grid for DecisionTreeClassifier
param_grid_dt = {
    'criterion': ['entropy', 'gini'],
    'splitter': ['random'],
    'max_depth': [3, 2, 1, 10],
    'min_samples_split': [2, 3, 4, 6],
    'min_samples_leaf': [1, 3, 5, 8]
}

# Create the DecisionTreeClassifier model
dt_model = DecisionTreeClassifier()

# Create GridSearchCV object for DecisionTreeClassifier
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the DecisionTreeClassifier model to the data
grid_search_dt.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_dt = grid_search_dt.best_params_
best_score_dt = grid_search_dt.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_dt = accuracy_score(y_test, grid_search_dt.predict(X_test))

print('Best Parameters for Decision Tree Classifier:', best_params_dt)
print('Best Accuracy Score for Decision Tree Classifier:', best_score_dt)

print('GridSearch Decision Tree Classifier with Test Data', accuracy_score_grid_search_dt)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best Parameters for Decision Tree Classifier: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 8, 'min_samples_split': 4, 'splitter': 'random'}
Best Accuracy Score for Decision Tree Classifier: 0.6463973324505319
GridSearch Decision Tree Classifier with Test Data 0.6317991631799164


### Doing GridSearch on LogisticRegression, the best Model with Test Data

In [12]:
# Define the parameter grid for Logistic Regression
param_grid_lr = {
     'penalty': ['l2'],  # 'l1' is removed since lbfgs does not support it
    'tol': [1e-3, 1e-4],
    'C': [0.01, 0.1, 1],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 200, 300],
    'random_state': [None, 42]
}

# Create the Logistic Regression model
lr_model = LogisticRegression()

# Create GridSearchCV object for Logistic Regression
grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=param_grid_lr, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the Logistic Regression model to the data
grid_search_lr.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_lr = grid_search_lr.best_params_
best_score_lr = grid_search_lr.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_lr = accuracy_score(y_test, grid_search_lr.predict(X_test))

print('Best Parameters for Logistic Regression:', best_params_lr)
print('Best Accuracy Score for Logistic Regression:', best_score_lr)

print('GridSearch Logistic Regression with Test Data', accuracy_score_grid_search_lr)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best Parameters for Logistic Regression: {'C': 0.01, 'class_weight': None, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.001}
Best Accuracy Score for Logistic Regression: 0.64607106817647
GridSearch Logistic Regression with Test Data 0.6317991631799164


### Doing GridSearch on LinearDiscriminantAnalysis, another best Model with Test Data

In [13]:
# Define the parameter grid for LDA
param_grid_lda = {
    'solver': ['svd', 'lsqr', 'eigen'],
    'n_components': [None, 1],
    'tol': [1e-4, 1e-3, 1e-2],
}

# Create the LDA model
lda_model = LinearDiscriminantAnalysis()

# Create GridSearchCV object for LDA
grid_search_lda = GridSearchCV(estimator=lda_model, param_grid=param_grid_lda, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the LDA model to the data
grid_search_lda.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_lda = grid_search_lda.best_params_
best_score_lda = grid_search_lda.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_lda = accuracy_score(y_test, grid_search_lda.predict(X_test))

print('Best Parameters for Linear Discriminant Analysis:', best_params_lda)
print('Best Accuracy Score for Linear Discriminant Analysis:', best_score_lda)

print('GridSearch LDA with Test Data', accuracy_score_grid_search_lda)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters for Linear Discriminant Analysis: {'n_components': None, 'solver': 'svd', 'tol': 0.0001}
Best Accuracy Score for Linear Discriminant Analysis: 0.64607106817647
GridSearch LDA with Test Data 0.6317991631799164


### Doing GridSearch on KNeighborsClassifier

In [14]:
# Define the parameter grid for KNeighborsClassifier
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# Create the KNeighborsClassifier model
knn_model = KNeighborsClassifier()

# Create GridSearchCV object for KNeighborsClassifier
grid_search_knn = GridSearchCV(estimator=knn_model, param_grid=param_grid_knn, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the KNeighborsClassifier model to the data
grid_search_knn.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_knn = accuracy_score(y_test, grid_search_knn.predict(X_test))

print('Best Parameters for KNeighborsClassifier:', best_params_knn)
print('Best Accuracy Score for KNeighborsClassifier:', best_score_knn)

print('GridSearch KNeighborsClassifier with Test Data', accuracy_score_grid_search_knn)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Parameters for KNeighborsClassifier: {'algorithm': 'auto', 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}
Best Accuracy Score for KNeighborsClassifier: 0.5906425704413076
GridSearch KNeighborsClassifier with Test Data 0.5891974134651959


### Doing GridSearch on RandomForestClassifier

In [22]:
# Define the parameter grid for RandomForestClassifier
param_grid_rf = {
    'n_estimators': [65, 70, 75],  # Number of trees in the forest
    'max_features': ['sqrt'],  # Number of features to consider at each split
    'max_depth': [9, 10, 11],  # Maximum depth of the trees
    'min_samples_split': [5, 6, 7],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 3]  # Minimum number of samples required to be at a leaf node
}


# Create the RandomForestClassifier model
rf_model = RandomForestClassifier()

# Create GridSearchCV object for RandomForestClassifier
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the RandomForestClassifier model to the data
grid_search_rf.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_rf = accuracy_score(y_test, grid_search_rf.predict(X_test))

print('Best Parameters for RandomForestClassifier:', best_params_rf)
print('Best Accuracy Score for RandomForestClassifier:', best_score_rf)

print('GridSearch RandomForestClassifier with Test Data', accuracy_score_grid_search_rf)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters for RandomForestClassifier: {'max_depth': 11, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 65}
Best Accuracy Score for RandomForestClassifier: 0.6463973324505318
GridSearch RandomForestClassifier with Test Data 0.6321795359452264


### Doing GridSearch on AdaBoostClassifier

In [16]:
# Define the parameter grid for AdaBoostClassifier
param_grid_ada = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

# Create the AdaBoostClassifier model
ada_model = AdaBoostClassifier()

# Create GridSearchCV object for AdaBoostClassifier
grid_search_ada = GridSearchCV(estimator=ada_model, param_grid=param_grid_ada, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the AdaBoostClassifier model to the data
grid_search_ada.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_ada = grid_search_ada.best_params_
best_score_ada = grid_search_ada.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_ada = accuracy_score(y_test, grid_search_ada.predict(X_test))

print('Best Parameters for AdaBoostClassifier:', best_params_ada)
print('Best Accuracy Score for AdaBoostClassifier:', best_score_ada)

print('GridSearch AdaBoostClassifier with Test Data', accuracy_score_grid_search_ada)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters for AdaBoostClassifier: {'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 50}
Best Accuracy Score for AdaBoostClassifier: 0.64607106817647
GridSearch AdaBoostClassifier with Test Data 0.6317991631799164


### Doing GridSearch on GradientBoostingClassifier

In [17]:
# Define the parameter grid for GradientBoostingClassifier
param_grid_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GradientBoostingClassifier model
gb_model = GradientBoostingClassifier()

# Create GridSearchCV object for GradientBoostingClassifier
grid_search_gb = GridSearchCV(estimator=gb_model, param_grid=param_grid_gb, scoring='accuracy', cv=5, n_jobs=-1, verbose=100)

# Fit the GradientBoostingClassifier model to the data
grid_search_gb.fit(X_train, y_train)

# Get the best parameters and the corresponding accuracy score
best_params_gb = grid_search_gb.best_params_
best_score_gb = grid_search_gb.best_score_

# Testing GridSearch Model with Test Data
accuracy_score_grid_search_gb = accuracy_score(y_test, grid_search_gb.predict(X_test))

print('Best Parameters for GradientBoostingClassifier:', best_params_gb)
print('Best Accuracy Score for GradientBoostingClassifier:', best_score_gb)

print('GridSearch GradientBoostingClassifier with Test Data', accuracy_score_grid_search_gb)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters for GradientBoostingClassifier: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Accuracy Score for GradientBoostingClassifier: 0.64607106817647
GridSearch GradientBoostingClassifier with Test Data 0.6317991631799164


## Saving the Best Trained Model

In [24]:
joblib.dump(grid_search_rf, "models/model_rf.pk1")

['models/model_rf.pk1']