In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
import tensorflow as tf
import seaborn as sns
from sklearn.ensemble import IsolationForest
import joblib
os.chdir('Resources/')

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('10_SP_Preprocessed_Data.csv')

X = df.drop(['HeartDisease'], axis='columns')
Y = df[['HeartDisease']]

In [3]:
# 1 - RF (F1)
#-------------

from sklearn.ensemble import RandomForestClassifier
    
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1786)

rf = RandomForestClassifier(random_state=1397)
    
model_rf = rf.fit(X_train, Y_train)
    
pred_rf = model_rf.predict(X_test)
    
f1_rf = f1_score(Y_test, pred_rf)
    
print(f1_rf)

0.9456066945606695


In [4]:
# 2 - RF+PCA (F1)
#-----------------

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import joblib

pca = PCA(n_components=0.95, random_state=1745)
rf_model_pca = RandomForestClassifier(random_state=1745, n_estimators=100)

X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(X, Y, test_size=0.2, random_state=159)

X_train_pca = pca.fit_transform(X_train_pca)
X_test_pca = pca.transform(X_test_pca)

rf_model_pca.fit(X_train_pca, Y_train_pca)
Y_pred_pca = rf_model_pca.predict(X_test_pca)

f1_rf_pca = f1_score(Y_test_pca, Y_pred_pca)

print(f1_rf_pca)

0.9385245901639344


In [5]:
# 3 - RF+IF (F1)
#----------------

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import joblib

_if = IsolationForest(contamination=0.05, random_state=227, n_estimators=100, max_samples=256)

outlier_predictions_if = _if.fit_predict(X)

X_filtered_if = X[outlier_predictions_if != -1].reset_index(drop=True)
Y_filtered_if = Y[outlier_predictions_if != -1].reset_index(drop=True)

X_train_if, X_test_if, Y_train_if, Y_test_if = train_test_split(
    X_filtered_if, Y_filtered_if, test_size=0.2, random_state=227
)

rf_model_if = RandomForestClassifier(random_state=227, n_estimators=94, max_depth=18, criterion='gini')
rf_model_if.fit(X_train_if, Y_train_if)

Y_pred_if = rf_model_if.predict(X_test_if)

f1_rf_if = f1_score(Y_test_if, Y_pred_if)

print(f1_rf_if)

0.9421841541755889


In [6]:
# 4 - RF+XGB (F1)
#-----------------

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
    
X_train_rf_xgb, X_test_rf_xgb, Y_train_rf_xgb, Y_test_rf_xgb = train_test_split(X, Y, test_size=0.2, random_state=1786)

rf = RandomForestClassifier(random_state=1397, n_estimators=99, criterion='gini', 
                                max_features=3, max_depth=19, min_samples_split=2)
rf.fit(X_train_rf_xgb, Y_train_rf_xgb)
rf_pred = rf.predict(X_test_rf_xgb)
rf_f1 = f1_score(Y_test_rf_xgb, rf_pred)
    
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1397)
xgb.fit(X_train_rf_xgb, Y_train_rf_xgb)
xgb_pred = xgb.predict(X_test_rf_xgb)
xgb_f1 = f1_score(Y_test_rf_xgb, xgb_pred)
    
combined_pred = []
for r, x in zip(rf_pred, xgb_pred):
    combined_pred.append(r if r == x else r)
    
f1_rf_xgb = f1_score(Y_test_rf_xgb, combined_pred)
    
print(f1_rf_xgb)

0.9478079331941545


In [7]:
print("Length of X_test_rf_xgb:", len(X_test_rf_xgb))
print("Length of rf_pred:", len(rf_pred))
print("Length of xgb_pred:", len(xgb_pred))

Length of X_test_rf_xgb: 329
Length of rf_pred: 329
Length of xgb_pred: 329


In [8]:
import joblib

_, X_test_indices = train_test_split(np.arange(len(X)), test_size=0.2, random_state=1786)

with open('12_SP_Test_Data_RF_XGB.txt', 'w') as f:
    f.write(str([list(X_test_indices)]))

with open('12_SP_F1_Score_RF_XGB.txt', 'w') as f:
    f.write(f"{f1_rf_xgb}\n")

joblib.dump({'rf': rf, 'xgb': xgb}, '12_SP_Model_RF_XGB.joblib')

['12_SP_Model_RF_XGB.joblib']

In [9]:
# 5 - RF+IF+XGB (F1)
#--------------------

import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

iso_forest = IsolationForest(contamination=0.05, random_state=227, n_estimators=100, max_samples=256)
outliers = iso_forest.fit_predict(X)
X_filtered = X[outliers != -1].reset_index(drop=True)
Y_filtered = Y[outliers != -1].reset_index(drop=True)

X_train_rf_if_xgb, X_test_rf_if_xgb, Y_train_rf_if_xgb, Y_test_rf_if_xgb = train_test_split(
    X_filtered, Y_filtered, test_size=0.2, random_state=227
)

rf = RandomForestClassifier(random_state=227, n_estimators=94, max_depth=18, max_features=3)
rf.fit(X_train_rf_if_xgb, Y_train_rf_if_xgb)
rf_pred = rf.predict(X_test_rf_if_xgb)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=227)
xgb.fit(X_train_rf_if_xgb, Y_train_rf_if_xgb)
xgb_pred = xgb.predict(X_test_rf_if_xgb)

combined_pred = []
for r, x in zip(rf_pred, xgb_pred):
    combined_pred.append(r if r == x else r)

f1_rf_if_xgb = f1_score(Y_test_rf_if_xgb, combined_pred)

print(f1_rf_if_xgb)

0.9421841541755889


In [10]:
# 6 - RF+XGB (ACC)
#-----------------

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

X_train_rf_xgb, X_test_rf_xgb, Y_train_rf_xgb, Y_test_rf_xgb = train_test_split(X, Y, test_size=0.2, random_state=1786)
    
rf = RandomForestClassifier(random_state=1397, n_estimators=99)
rf.fit(X_train_rf_xgb, Y_train_rf_xgb)
rf_pred = rf.predict(X_test_rf_xgb)
rf_f1 = accuracy_score(Y_test_rf_xgb, rf_pred)
    
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1397)
xgb.fit(X_train_rf_xgb, Y_train_rf_xgb)
xgb_pred = xgb.predict(X_test_rf_xgb)
xgb_acc = accuracy_score(Y_test_rf_xgb, xgb_pred)
    
combined_pred = []
for r, x in zip(rf_pred, xgb_pred):
    combined_pred.append(r if r == x else r)
    
combined_acc = accuracy_score(Y_test_rf_xgb, combined_pred)
    
print(combined_acc)

0.9240121580547113


In [11]:
with open('12_SP_ACC_Score_RF_XGB.txt', 'w') as f:
    f.write(f"{combined_acc}\n")

In [12]:
# 7 - RF+XGB (PRE)
#-----------------

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X_train_rf_xgb, X_test_rf_xgb, Y_train_rf_xgb, Y_test_rf_xgb = train_test_split(X, Y, test_size=0.2, random_state=3830)
    
rf = RandomForestClassifier(random_state=101, n_estimators=12)
rf.fit(X_train_rf_xgb, Y_train_rf_xgb)
rf_pred = rf.predict(X_test_rf_xgb)
rf_pre = precision_score(Y_test_rf_xgb, rf_pred)
    
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=101)
xgb.fit(X_train_rf_xgb, Y_train_rf_xgb)
xgb_pred = xgb.predict(X_test_rf_xgb)
xgb_pre = precision_score(Y_test_rf_xgb, xgb_pred)
    
combined_pred = []
for r, x in zip(rf_pred, xgb_pred):
    combined_pred.append(r if r == x else r)
    
combined_pre = precision_score(Y_test_rf_xgb, combined_pred)
    
print(combined_pre)

0.9432314410480349


In [13]:
with open('12_SP_PRE_Score_RF_XGB.txt', 'w') as f:
    f.write(f"{combined_pre}\n")

In [14]:
# 8 - RF+XGB (REC)
#-----------------

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
    
X_train_rf_xgb, X_test_rf_xgb, Y_train_rf_xgb, Y_test_rf_xgb = train_test_split(X, Y, test_size=0.2, random_state=2351)
    
rf = RandomForestClassifier(random_state=1495, n_estimators=81)
rf.fit(X_train_rf_xgb, Y_train_rf_xgb)
rf_pred = rf.predict(X_test_rf_xgb)
rf_rec = recall_score(Y_test_rf_xgb, rf_pred)
    
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1495)
xgb.fit(X_train_rf_xgb, Y_train_rf_xgb)
xgb_pred = xgb.predict(X_test_rf_xgb)
xgb_rec = recall_score(Y_test_rf_xgb, xgb_pred)
    
combined_pred = []
for r, x in zip(rf_pred, xgb_pred):
    combined_pred.append(r if r == x else r)
    
combined_rec = recall_score(Y_test_rf_xgb, combined_pred)

print(combined_rec)

0.9819819819819819


In [15]:
with open('12_SP_REC_Score_RF_XGB.txt', 'w') as f:
    f.write(f"{combined_rec}\n")

In [16]:
print("F1 Score")
print(f'RF:\t\t{round(f1_rf * 100, 2)}%')
print(f'RF+PCA:\t\t{round(f1_rf_pca * 100, 2)}%')
print(f'RF+IF:\t\t{round(f1_rf_if * 100, 2)}%')
print(f'RF+XGB:\t\t{round(f1_rf_xgb * 100, 2)}%')
print(f'RF+IF+XGB:\t{round(f1_rf_if_xgb * 100, 2)}%')

F1 Score
RF:		94.56%
RF+PCA:		93.85%
RF+IF:		94.22%
RF+XGB:		94.78%
RF+IF+XGB:	94.22%
