In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import random
import joblib
import os
# Switch the working directory so the relative data-file paths used below
# (CSV / TXT inputs) resolve inside Resources/.
# NOTE(review): this is a relative chdir, so re-running the cell after it has
# already succeeded looks for Resources/ inside Resources/ and will fail
# unless such a nested directory exists.
os.chdir('Resources/')

from custom_tuple_scaler import CustomTupleScaler

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import joblib
import ast  # To safely parse the list from file

# Read the structured dataset into a DataFrame.
df = pd.read_csv('1_CC_Structured_Data.csv')

# Read the stored test row indices. The file holds a Python literal, which
# ast.literal_eval parses without executing arbitrary code.
with open("12_SP_Test_Data_RF_IF.txt", "r") as indices_file:
    raw_indices = indices_file.read()
test_indices_list = ast.literal_eval(raw_indices.strip())

# Read the previously stored F1 scores: every line mentioning
# "F1-score for fold" contributes the number found after its first colon.
with open("13_CC_F1_Scores_RF_IF.txt", "r") as scores_file:
    stored_f1_scores = [
        float(score_line.split(":")[1].strip())
        for score_line in scores_file
        if "F1-score for fold" in score_line
    ]
# Columns holding categorical values (the target column is included).
categorical_cols = ["Sex", "ChestPainType", "FastingBloodSugar", "RestingECG", "ExerciseAngina", "ST_Slope", "HeartDisease"]

# One dedicated encoder per categorical column, kept so the integer codes
# can be mapped back to the original labels later.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# Replace each categorical column in place with its integer codes.
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Target is the encoded HeartDisease column; features are everything else.
y = df["HeartDisease"]
X = df.drop(columns=["HeartDisease"])

# Load hyperparameters for the RandomForestClassifier.
# Every line containing "RandomForestClassifier" is expected to carry a dict
# literal after the "Hyperparameters:" marker; if several lines match, the
# last one wins.
hyperparameters = {}
with open("13_CC_Hyperparameters.txt", "r") as hp_file:
    for line in hp_file:
        if "RandomForestClassifier" in line:
            params = line.split("Hyperparameters:")[1].strip()
            # Parse with ast.literal_eval instead of eval: the text comes from
            # a file on disk, and literal_eval only accepts Python literals
            # (dicts, numbers, strings, None, ...) so it cannot execute code.
            # NOTE(review): if the stored dict ever contains non-literal
            # values (e.g. numpy scalar reprs) this raises ValueError and the
            # file should be regenerated with plain Python values.
            hyperparameters["RandomForestClassifier"] = ast.literal_eval(params)

# Extract hyperparameters for RandomForestClassifier
# (raises KeyError if the file contained no matching line)
rf_params = hyperparameters["RandomForestClassifier"]

# Initialize the RandomForest model; only random_state and n_estimators are
# taken from the stored hyperparameters, everything else uses sklearn defaults.
rf_model = RandomForestClassifier(random_state=rf_params["random_state"], n_estimators=rf_params["n_estimators"])

# Store calculated F1 scores (as percentages), one entry per fold
calculated_f1_scores = []

# Iterate over each fold's test row positions (folds are numbered from 1)
for i, test_indices in enumerate(test_indices_list, start=1):
    # Discard positions past the end of the DataFrame
    test_indices = [idx for idx in test_indices if idx < len(df)]

    # Held-out rows for this fold, selected by position
    df_test = df.iloc[test_indices]

    # Separate features and target
    X_test, y_test = df_test.drop(columns=["HeartDisease"]), df_test["HeartDisease"]

    # Train set is every row not in the current test set. DataFrame.drop
    # removes rows by index *label*, while the test rows above were picked by
    # *position*; translate positions to labels so both always refer to the
    # same rows, even if df does not carry a default 0..n-1 index.
    df_train = df.drop(df.index[test_indices])
    X_train, y_train = df_train.drop(columns=["HeartDisease"]), df_train["HeartDisease"]

    # Scale the data using StandardScaler, fitted on the train rows only so
    # no test-set statistics leak into training
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)  # Fit on the train data
    X_test_scaled = scaler.transform(X_test)  # Transform the test data

    # Train the RandomForest model (the same estimator object is re-fitted
    # for every fold)
    rf_model.fit(X_train_scaled, y_train)

    # Predict on the test set
    y_pred = rf_model.predict(X_test_scaled)

    # Calculate F1-score
    f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for class imbalance
    calculated_f1_scores.append(f1 * 100)  # Convert to percentage

    # Path associated with this fold's model.
    # NOTE(review): nothing is written here - the model is NOT saved by this
    # loop. Confirm whether a joblib.dump was intended before adding one, as
    # it would overwrite any existing file at this path.
    model_path = f'12_SP_Model_RF_IF_{i}.joblib'

# Compare stored vs calculated F1 scores, fold by fold.
# Iterate over the folds that actually exist instead of assuming exactly 5,
# so fewer folds no longer raise IndexError.
print("\n--- F1 Score Comparison ---")
if len(calculated_f1_scores) != len(stored_f1_scores):
    # Pairing stops at the shorter list; make the mismatch visible.
    print(f"Warning: {len(calculated_f1_scores)} calculated vs {len(stored_f1_scores)} stored F1 scores; comparing the common folds only.")
for i, (calculated, stored) in enumerate(zip(calculated_f1_scores, stored_f1_scores)):
    print(f"Fold {i+1}: Original Dataset: {calculated:.2f}%, Encrypted Dataset: {stored:.2f}%")



--- F1 Score Comparison ---
Fold 1: Original Dataset: 96.61%, Encrypted Dataset: 97.07%
Fold 2: Original Dataset: 97.09%, Encrypted Dataset: 95.45%
Fold 3: Original Dataset: 98.22%, Encrypted Dataset: 97.10%
Fold 4: Original Dataset: 98.38%, Encrypted Dataset: 98.11%
Fold 5: Original Dataset: 97.73%, Encrypted Dataset: 96.08%
