In [1]:
import os
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import psutil
from concrete.ml.sklearn import XGBClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_score, recall_score,
                             f1_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample


In [None]:
# Load the pre-split dataset from disk: `y` holds the target variable
# (the disease, i.e. the first column of the original table) and `X` holds
# every remaining column (the symptoms).
y = joblib.load('../data/raw/raw_concrete_y.pkl')
X = joblib.load('../data/raw/raw_concrete_X.pkl')


In [6]:
# Identify classes represented by a single sample and build a boolean mask
# that keeps only the rows belonging to the remaining classes.
class_counts = y.value_counts()
rare_classes = class_counts.loc[class_counts < 2].index.tolist()
mask = ~y.isin(rare_classes)
X_filtered, y_filtered = X[mask], y[mask]

# Re-encode the surviving labels as consecutive integers starting at 0.
# The fitted encoder is kept in `le` so `le.classes_` stays available.
le = LabelEncoder().fit(y_filtered)
y_encoded = le.transform(y_filtered)

In [None]:


# Perform split with valid stratification: the single-sample classes were
# filtered out earlier, so every remaining class can appear in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Subsample 30% of the training set while preserving class proportions.
# `resample` defaults to replace=True (bootstrap sampling), which would put
# duplicate rows into the "subsample"; replace=False draws distinct rows.
X_train_sub, y_train_sub = resample(
    X_train, y_train,
    replace=False,
    n_samples=int(0.3 * len(X_train)),
    random_state=42,
    stratify=y_train,
)
# Fail here, with a clear message, rather than inside model.fit().
assert len(X_train_sub) == len(y_train_sub), (
    f"Subsampled features/labels differ in length: "
    f"{len(X_train_sub)} vs {len(y_train_sub)}"
)

# Persist the full (not subsampled) training features.
joblib.dump(X_train, '../data/raw/X_train_XGB.pkl')

['../data/raw/X_train_XGB.pkl']

In [None]:
# Get training model time (covers model construction and fitting).
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments the way
# time.time() differences can.
start_train = time.perf_counter()
model = XGBClassifier(
    n_estimators=10,       # FHE-optimized (balance accuracy/circuit size)
    max_depth=3,           # Critical for FHE performance
    learning_rate=0.1,
    n_bits=3,              # Quantization bits (FHE requirement)
    objective="multi:softmax",
    num_class=len(le.classes_),
    tree_method="hist"     # Essential for large datasets
)
model.fit(X_train_sub, y_train_sub)
training_time = time.perf_counter() - start_train

# Get Resource Utilisation after training: resident set size of this kernel
# process (includes everything loaded so far, not only the model).
process = psutil.Process(os.getpid())
memory_usage = process.memory_info().rss / 1024 ** 2  # Convert bytes to MB

ValueError: Found input variables with inconsistent numbers of samples: [98770, 197540]

In [None]:
# Get prediction time for plaintext (non-encrypted) inference on the test set.
# perf_counter() is used because it is monotonic and intended for measuring
# elapsed intervals.
start_pred = time.perf_counter()
y_pred = model.predict(X_test)
prediction_time = time.perf_counter() - start_pred

In [None]:
# Get evaluation metrics for the plaintext predictions.
# Weighted averaging accounts for class imbalance; zero_division=0 silences
# the warning for classes that never appear in the predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
class_report = classification_report(y_test, y_pred, zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)

# str(ndarray) abbreviates arrays larger than numpy's print threshold with
# "...", which would silently drop most of a many-class confusion matrix.
# Raising the threshold to the array size writes every entry.
conf_matrix_text = np.array2string(conf_matrix, threshold=conf_matrix.size)

output_file = "../results/XGB_plain.txt"
# Make sure the results directory exists so the report is not lost to a
# FileNotFoundError after training and prediction have already run.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    f.write("Plaintext Evaluation Metrics for XGB Model:\n")
    f.write("-------------------\n")
    f.write(f"Training Time      : {training_time:.4f} seconds\n")
    f.write(f"Prediction Time    : {prediction_time:.4f} seconds\n")
    f.write(f"Memory Usage       : {memory_usage:.2f} MB\n")
    f.write(f"Accuracy           : {accuracy:.4f}\n")
    f.write(f"Precision          : {precision:.4f}\n")
    f.write(f"Recall             : {recall:.4f}\n")
    f.write(f"F1 Score           : {f1:.4f}\n")
    f.write("\nClassification Report:\n")
    f.write(class_report + "\n")
    f.write("Confusion Matrix:\n")
    f.write(conf_matrix_text + "\n")

print(f"Evaluation results have been written to '{output_file}'.")


In [None]:
# Compile Model For FHE Inference and time the compilation.
# NOTE(review): the model was fitted on X_train_sub, but the full X_train is
# passed to compile() here -- confirm that this is the intended input set.
print("Compiling model for FHE inference...")
# perf_counter() is monotonic, so the interval is unaffected by system clock
# adjustments during a long compilation.
start_compile = time.perf_counter()
model.compile(X_train)
compile_time = time.perf_counter() - start_compile
print(f"Compilation Time: {compile_time:.4f} seconds")

In [None]:
# Save model by serialising it to JSON via the model's own dump() method.
# `Path` comes from the notebook's import cell.
# NOTE(review): confirm whether dump() includes the compiled FHE circuit or
# only the model itself; the file name suggests the former.
fhe_model_path = Path('../models/compiled_XGB_model.json')
# Create the target directory if needed so the save cannot fail on a fresh
# checkout that has no ../models folder yet.
fhe_model_path.parent.mkdir(parents=True, exist_ok=True)
with fhe_model_path.open('w') as f:
    model.dump(f)

In [None]:
# Get prediction time on Encrypted Data (fhe="execute" runs inference on the
# whole test set in FHE mode).
# NOTE: this overwrites the plaintext `prediction_time` measured earlier, so
# re-running the plaintext report cell after this one would report the FHE
# timing instead.
# perf_counter() is monotonic, which matters for long-running measurements.
start_pred = time.perf_counter()
y_pred_fhe = model.predict(X_test, fhe="execute")
prediction_time = time.perf_counter() - start_pred

In [None]:
# Get evaluation metrics for the FHE predictions.
# Weighted averaging accounts for class imbalance; zero_division=0 silences
# the warning for classes that never appear in the predictions.
accuracy_plain_fhe = accuracy_score(y_test, y_pred_fhe)
precision_plain_fhe = precision_score(y_test, y_pred_fhe, average='weighted', zero_division=0)
recall_plain_fhe = recall_score(y_test, y_pred_fhe, average='weighted', zero_division=0)
f1_plain_fhe = f1_score(y_test, y_pred_fhe, average='weighted', zero_division=0)
class_report_plain_fhe = classification_report(y_test, y_pred_fhe, zero_division=0)
conf_matrix_plain_fhe = confusion_matrix(y_test, y_pred_fhe)

# str(ndarray) abbreviates arrays larger than numpy's print threshold with
# "...", which would silently drop most of a many-class confusion matrix.
# Raising the threshold to the array size writes every entry.
conf_matrix_fhe_text = np.array2string(
    conf_matrix_plain_fhe, threshold=conf_matrix_plain_fhe.size
)

output_file = "../results/XGB_encrypted.txt"
# Make sure the results directory exists so the report is not lost to a
# FileNotFoundError after the FHE run has already completed.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    f.write("FHE XGB Evaluation Metrics for Encrypted Model:\n")
    f.write("-----------------------------------------------\n")
    # Training time and memory usage are the values captured in the training
    # cell; only the prediction time is specific to the FHE run.
    f.write(f"Training Time      : {training_time:.4f} seconds\n")
    f.write(f"Prediction Time    : {prediction_time:.4f} seconds\n")
    f.write(f"Memory Usage       : {memory_usage:.2f} MB\n")
    f.write(f"Accuracy           : {accuracy_plain_fhe:.4f}\n")
    f.write(f"Precision          : {precision_plain_fhe:.4f}\n")
    f.write(f"Recall             : {recall_plain_fhe:.4f}\n")
    f.write(f"F1 Score           : {f1_plain_fhe:.4f}\n")
    f.write("\nClassification Report:\n")
    f.write(class_report_plain_fhe + "\n")
    f.write("Confusion Matrix:\n")
    f.write(conf_matrix_fhe_text + "\n")

print(f"FHE evaluation results saved to {output_file}")