In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import joblib
import ast  
import random
import os
# Switch into the Resources directory so that the relative file names used by
# the later cells (the CSV and score files) are looked up there.
# The guard makes this cell safe to re-run in a live kernel: without it a
# second execution would look for Resources/Resources and raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != 'Resources':
    os.chdir('Resources/')

In [2]:
# Load the dataset and discard every row that contains a missing value.
# (A fresh frame is bound instead of mutating in place, so the cell is
# idempotent when re-run.)
df = pd.read_csv('1_CC_Combined_Data.csv').dropna()

# Features are every column except the target.
X = df.drop(columns=["HeartDisease"])

# Keep the fitted target encoder so the integer labels in Y can be mapped back
# to the original class values (target_encoder.classes_ / inverse_transform).
target_encoder = LabelEncoder()
Y = target_encoder.fit_transform(df["HeartDisease"])

# Integer-encode each object-typed column with its OWN encoder. Refitting one
# shared encoder per column would leave only the last column's mapping
# available afterwards; the dict keeps all of them, keyed by column name.
# `label_encoder` ends up bound to the last column's encoder, as before.
label_encoders = {
    col: LabelEncoder() for col in X.select_dtypes(include=['object']).columns
}
for col, label_encoder in label_encoders.items():
    X[col] = label_encoder.fit_transform(X[col])

# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split, so held-out rows contribute to the scaling statistics.
# Left unchanged here because the later cells split X_scaled directly.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
# 1.1 - RF (F1)
#------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=775,
)

# Fit the forest and score its held-out predictions with F1.
# NOTE(review): the split seed, forest seed and tree count used here are
# specific to this cell; the other metric cells use different values, so the
# reported metrics do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=96, random_state=983).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

combined_f1 = f1_score(y_true=y_test, y_pred=rf_pred)
print(combined_f1)

0.9315068493150684


In [4]:
# 1.2 - RF (ACC)
#---------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=775,
)

# Fit the forest and score its held-out predictions with accuracy.
# NOTE(review): the forest seed and tree count used here are specific to this
# cell; the other metric cells use different values, so the reported metrics
# do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=91, random_state=875).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

combined_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print(combined_acc)

0.9178082191780822


In [5]:
# 1.3 - RF (PRE)
#---------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score

# Hold out 20% of the current X_scaled / Y for evaluation.
# NOTE(review): this cell splits with seed 2046 while the other metric cells
# that precede the structured-data load split with seed 775, so precision is
# measured on a different test set - confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=2046,
)

# Fit the forest and score its held-out predictions with precision.
rf = RandomForestClassifier(n_estimators=95, random_state=1458).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

combined_pre = precision_score(y_true=y_test, y_pred=rf_pred)
print(combined_pre)


0.9414634146341463


In [6]:
# 1.4 - RF (REC)
#---------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=775,
)

# Fit the forest and score its held-out predictions with recall.
# NOTE(review): the forest seed and tree count used here are specific to this
# cell; the other metric cells use different values, so the reported metrics
# do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=41, random_state=950).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

combined_rec = recall_score(y_true=y_test, y_pred=rf_pred)
print(combined_rec)

0.9669811320754716


In [7]:
# Load the dataset and discard every row that contains a missing value.
# (A fresh frame is bound instead of mutating in place, so the cell is
# idempotent when re-run.) From here on df / X / Y / X_scaled refer to this
# file rather than to whatever an earlier cell loaded.
df = pd.read_csv('1_CC_Structured_Data.csv').dropna()

# Features are every column except the target.
X = df.drop(columns=["HeartDisease"])

# Keep the fitted target encoder so the integer labels in Y can be mapped back
# to the original class values (target_encoder.classes_ / inverse_transform).
target_encoder = LabelEncoder()
Y = target_encoder.fit_transform(df["HeartDisease"])

# Integer-encode each object-typed column with its OWN encoder. Refitting one
# shared encoder per column would leave only the last column's mapping
# available afterwards; the dict keeps all of them, keyed by column name.
# `label_encoder` ends up bound to the last column's encoder, as before.
label_encoders = {
    col: LabelEncoder() for col in X.select_dtypes(include=['object']).columns
}
for col, label_encoder in label_encoders.items():
    X[col] = label_encoder.fit_transform(X[col])

# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split, so held-out rows contribute to the scaling statistics.
# Left unchanged here because the later cells split X_scaled directly.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# 2.1 - RF (F1)
#--------------

# NOTE(review): this cell has no imports of its own; train_test_split and the
# other names it uses must already have been imported by an earlier cell.

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=2593,
)

# Fit the forest and score its held-out predictions with F1.
# NOTE(review): the split seed, forest seed and tree count used here are
# specific to this cell; the other metric cells use different values, so the
# reported metrics do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=100, random_state=3006).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

structured_f1 = f1_score(y_true=y_test, y_pred=rf_pred)
print(structured_f1)

0.9362637362637363


In [9]:
# 2.2 - RF (ACC)
#---------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=376,
)

# Fit the forest and score its held-out predictions with accuracy.
# NOTE(review): the split seed, forest seed and tree count used here are
# specific to this cell; the other metric cells use different values, so the
# reported metrics do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=98, random_state=1045).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

structured_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print(structured_acc)


0.9205479452054794


In [10]:
# 2.3 - RF (PRE)
#---------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=2596,
)

# Fit the forest and score its held-out predictions with precision.
# NOTE(review): the split seed, forest seed and tree count used here are
# specific to this cell; the other metric cells use different values, so the
# reported metrics do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=19, random_state=1917).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

structured_pre = precision_score(y_true=y_test, y_pred=rf_pred)
print(structured_pre)

0.9409090909090909


In [11]:
# 2.4 - RF (REC)
#----------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score

# Hold out 20% of the current X_scaled / Y for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=2724,
)

# Fit the forest and score its held-out predictions with recall.
# NOTE(review): the split seed, forest seed and tree count used here are
# specific to this cell; the other metric cells use different values, so the
# reported metrics do not all come from one common model - confirm intended.
rf = RandomForestClassifier(n_estimators=75, random_state=113).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

structured_rec = recall_score(y_true=y_test, y_pred=rf_pred)
print(structured_rec)

0.9597989949748744


In [12]:
def _read_score(path):
    """Return the single floating-point score stored in the text file at ``path``.

    Leading/trailing whitespace (such as a final newline) is stripped before
    the content is parsed.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the content is not a valid float; the message names the
            offending file so a bad score file is easy to locate.
    """
    with open(path, "r") as fh:
        raw = fh.read().strip()
    try:
        return float(raw)
    except ValueError as exc:
        raise ValueError(
            f"{path}: expected a single numeric score, got {raw!r}"
        ) from exc


# Score files for the "encrypted" dataset, one number per file.
# NOTE(review): these values are not computed in this notebook - presumably
# they are written by a separate run; confirm they are up to date.
file_path_f1 = "13_CC_F1_Score_RF.txt"
file_path_acc = "13_CC_ACC_Score_RF.txt"
file_path_pre = "13_CC_PRE_Score_RF.txt"
file_path_rec = "13_CC_REC_Score_RF.txt"

encrypted_f1 = _read_score(file_path_f1)
encrypted_acc = _read_score(file_path_acc)
encrypted_pre = _read_score(file_path_pre)
encrypted_rec = _read_score(file_path_rec)

In [13]:
# One row per metric: (heading, combined value, structured value, encrypted value).
# Every heading after the first starts with a newline so the printed sections
# are separated by a blank line.
metric_rows = [
    ("F1 Score", combined_f1, structured_f1, encrypted_f1),
    ("\nAccuracy", combined_acc, structured_acc, encrypted_acc),
    ("\nPrecision", combined_pre, structured_pre, encrypted_pre),
    ("\nRecall", combined_rec, structured_rec, encrypted_rec),
]

# Print each metric as a percentage rounded to two decimal places.
for heading, combined, structured, encrypted in metric_rows:
    print(heading)
    print(f'Combined Dataset:\t{round(combined * 100, 2)}%')
    print(f'Structured Dataset:\t{round(structured * 100, 2)}%')
    print(f'Encrypted Dataset:\t{round(encrypted * 100, 2)}%')

F1 Score
Combined Dataset:	93.15%
Structured Dataset:	93.63%
Encrypted Dataset:	92.86%

Accuracy
Combined Dataset:	91.78%
Structured Dataset:	92.05%
Encrypted Dataset:	91.78%

Precision
Combined Dataset:	94.15%
Structured Dataset:	94.09%
Encrypted Dataset:	93.3%

Recall
Combined Dataset:	96.7%
Structured Dataset:	95.98%
Encrypted Dataset:	97.47%
