In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import joblib
import ast  
import random
import os
os.chdir('Resources/')

In [2]:
df = pd.read_csv('1_CC_Combined_Data.csv')

df.dropna(inplace=True)

X = df.drop(columns=["HeartDisease"])
Y = LabelEncoder().fit_transform(df["HeartDisease"])

label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
# 1.1 - RF+XGB (F1)
#------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=67)

rf = RandomForestClassifier(random_state=20, n_estimators=72)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=774)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

combined_f1 = f1_score(y_test, combined_pred)

print(combined_f1)

0.9336099585062241


In [4]:
# 1.2 - RF+XGB (ACC)
#-------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=2)

rf = RandomForestClassifier(random_state=612, n_estimators=60)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=612)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

combined_acc = accuracy_score(y_test, combined_pred)

print(combined_acc)

0.9164490861618799


In [5]:
# 1.3 - RF+XGB (PRE)
#--------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=15)

rf = RandomForestClassifier(random_state=855, n_estimators=2)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=855)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

combined_pre = precision_score(y_test, combined_pred)

print(combined_pre)


0.9649122807017544


In [6]:
# 1.4 - RF+XGB (REC)
#--------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=530)

rf = RandomForestClassifier(random_state=46, n_estimators=87)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=46)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

combined_rec = recall_score(y_test, combined_pred)

print(combined_rec)

0.96875


In [7]:
df = pd.read_csv('1_CC_Structured_Data.csv')

df.dropna(inplace=True)

X = df.drop(columns=["HeartDisease"])
Y = LabelEncoder().fit_transform(df["HeartDisease"])

label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# 2.1 - RF+XGB (F1)
#------------------

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=2423)

rf = RandomForestClassifier(random_state=419, n_estimators=97)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=419)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

structured_f1 = f1_score(y_test, combined_pred)

print(structured_f1)

0.9409090909090909


In [9]:
# 2.2 - RF+XGB (ACC)
#-------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=284)

rf = RandomForestClassifier(random_state=188, n_estimators=92)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=188)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

structured_acc = accuracy_score(y_test, combined_pred)

print(structured_acc)


0.9148936170212766


In [10]:
# 2.3 - RF+XGB (PRE)
#--------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=1237)

rf = RandomForestClassifier(random_state=435, n_estimators=99)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=435)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

structured_pre = precision_score(y_test, combined_pred)

print(structured_pre)

0.9288702928870293


In [11]:
# 2.4 - RF+XGB (REC)
#--------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=1219)

rf = RandomForestClassifier(random_state=12, n_estimators=87)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=12)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

assert len(rf_pred) == len(xgb_pred) == len(y_test)

combined_pred = [r if r == x else r for r, x in zip(rf_pred, xgb_pred)]

structured_rec = recall_score(y_test, combined_pred)

print(structured_rec)

0.9813953488372092


In [12]:
file_path_f1 = "13_CC_F1_Score_RF_XGB.txt"
with open(file_path_f1, "r") as f:
    encrypted_f1 = float(f.read().strip())

file_path_acc = "13_CC_ACC_Score_RF_XGB.txt"
with open(file_path_acc, "r") as f:
    encrypted_acc = float(f.read().strip())

file_path_pre = "13_CC_PRE_Score_RF_XGB.txt"
with open(file_path_pre, "r") as f:
    encrypted_pre = float(f.read().strip())

file_path_rec = "13_CC_REC_Score_RF_XGB.txt"
with open(file_path_rec, "r") as f:
    encrypted_rec = float(f.read().strip())

In [13]:
print("F1 Score")
print(f'Combined Dataset:\t{round(combined_f1 * 100, 2)}%')
print(f'Structured Dataset:\t{round(structured_f1 * 100, 2)}%')
print(f'Encrypted Dataset:\t{round(encrypted_f1 * 100, 2)}%')

print("\nAccuracy")
print(f'Combined Dataset:\t{round(combined_acc * 100, 2)}%')
print(f'Structured Dataset:\t{round(structured_acc * 100, 2)}%')
print(f'Encrypted Dataset:\t{round(encrypted_acc * 100, 2)}%')

print("\nPrecision")
print(f'Combined Dataset:\t{round(combined_pre * 100, 2)}%')
print(f'Structured Dataset:\t{round(structured_pre * 100, 2)}%')
print(f'Encrypted Dataset:\t{round(encrypted_pre * 100, 2)}%')

print("\nRecall")
print(f'Combined Dataset:\t{round(combined_rec * 100, 2)}%')
print(f'Structured Dataset:\t{round(structured_rec * 100, 2)}%')
print(f'Encrypted Dataset:\t{round(encrypted_rec * 100, 2)}%')

F1 Score
Combined Dataset:	93.36%
Structured Dataset:	94.09%
Encrypted Dataset:	94.78%

Accuracy
Combined Dataset:	91.64%
Structured Dataset:	91.49%
Encrypted Dataset:	92.4%

Precision
Combined Dataset:	96.49%
Structured Dataset:	92.89%
Encrypted Dataset:	94.32%

Recall
Combined Dataset:	96.88%
Structured Dataset:	98.14%
Encrypted Dataset:	98.2%
