In [44]:
import os
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import psutil
import torch
import torch.nn as nn
from concrete.ml.deployment import FHEModelDev
from concrete.ml.sklearn import LogisticRegression as ConcreteLogisticRegression
from opacus import PrivacyEngine
from sklearn.linear_model import LogisticRegression as SKlearnLogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

In [45]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process.

    Returns:
        float: RSS in units of 1024**2 bytes (reported as "MB" elsewhere in
        this notebook).
    """
    bytes_per_mb = 1024 ** 2
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / bytes_per_mb

In [46]:
# Logistic regression expressed as a single linear layer
class LogisticRegression(nn.Module):
    """Linear classifier that maps feature vectors to per-class logits.

    Args:
        num_features: Size of each input feature vector.
        num_classes: Number of output logits (one per class).
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_features=num_features, out_features=num_classes)

    def forward(self, x):
        """Return raw logits for ``x``; no softmax is applied here."""
        # CrossEntropyLoss consumes unnormalised logits, so return them as-is.
        logits = self.linear(x)
        return logits

In [47]:
# Read the raw CSV and separate the feature columns from the 'diseases' target
df = pd.read_csv('../data/raw/dataset_1/newdataset.csv')

y = df['diseases']
X = df.drop(columns='diseases')

# Encode the 'diseases' column as integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Convert to numpy arrays: float32 features, int64 labels
X_train, X_test = (
    frame.to_numpy().astype(np.float32) for frame in (X_train, X_test)
)
y_train, y_test = (
    labels.astype(np.int64) for labels in (y_train, y_test)
)

In [48]:
# Wrap the training arrays as tensors and serve them in shuffled batches of 128
train_features = torch.from_numpy(X_train)
train_labels = torch.from_numpy(y_train)
train_dataset = TensorDataset(train_features, train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)

In [49]:
# Build the classifier: one input per feature column, one output per encoded class
n_input_features = X_train.shape[1]
n_output_classes = len(le.classes_)
model = LogisticRegression(
    num_features=n_input_features,
    num_classes=n_output_classes,
)
optimizer = torch.optim.SGD(model.parameters(), lr=0.2)

# Differential-privacy settings. Only noise_multiplier and max_grad_norm are
# passed to make_private below; delta is kept here for the epsilon query.
PRIVACY_PARAMS = dict(
    noise_multiplier=0.7,
    max_grad_norm=1.0,
    delta=1e-5,
)

# Wrap model, optimizer and data loader with the Opacus privacy engine
privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=PRIVACY_PARAMS["noise_multiplier"],
    max_grad_norm=PRIVACY_PARAMS["max_grad_norm"],
)

In [50]:
# Train the DP-wrapped model and report the privacy budget spent after each epoch
NUM_EPOCHS = 15
# The loss module holds no per-batch state, so build it once instead of on
# every batch.
criterion = nn.CrossEntropyLoss()

# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during a long training run.
start_train = time.perf_counter()
model.train()
for epoch in range(NUM_EPOCHS):
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    # Calculate privacy spending
    epsilon = privacy_engine.get_epsilon(PRIVACY_PARAMS["delta"])
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | ε = {epsilon:.2f}")

training_time = time.perf_counter() - start_train
# Process RSS sampled right after training (current value, not a peak)
memory_usage = get_memory_usage()

Epoch 1/15 | ε = 0.45
Epoch 2/15 | ε = 0.55
Epoch 3/15 | ε = 0.63
Epoch 4/15 | ε = 0.69
Epoch 5/15 | ε = 0.74
Epoch 6/15 | ε = 0.79
Epoch 7/15 | ε = 0.83
Epoch 8/15 | ε = 0.88
Epoch 9/15 | ε = 0.92
Epoch 10/15 | ε = 0.96
Epoch 11/15 | ε = 0.99
Epoch 12/15 | ε = 1.03
Epoch 13/15 | ε = 1.06
Epoch 14/15 | ε = 1.09
Epoch 15/15 | ε = 1.13


In [51]:
# Convert to scikit-learn model
sk_model = SKlearnLogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1,          # The fit below is only a placeholder, keep it cheap
    warm_start=False,
    penalty=None         # Match PyTorch's no regularization
)

# Placeholder fit so scikit-learn populates its fitted-state attributes.
# Use one training row per class: fitting on the first two rows alone raises
# a ValueError whenever those two rows happen to share a label.
first_row_per_class = np.unique(y_train, return_index=True)[1]
sk_model.fit(X_train[first_row_per_class], y_train[first_row_per_class])

# Transfer learned parameters. Copy them so the scikit-learn model does not
# alias the live torch parameters (detach().numpy() shares memory with them).
sk_model.coef_ = model.linear.weight.detach().cpu().numpy().copy()  # Shape: (n_classes, n_features)
sk_model.intercept_ = model.linear.bias.detach().cpu().numpy().copy()
sk_model.classes_ = np.unique(y_encoded)

In [52]:
# Time plaintext inference on the held-out test set.
# perf_counter is a monotonic, high-resolution clock, which suits an interval
# that may be only a few milliseconds long.
start_pred = time.perf_counter()
y_pred = sk_model.predict(X_test)
prediction_time = time.perf_counter() - start_pred


In [53]:
# Save Artifact (joblib is imported in the notebook's import cell)
plaintext_model_path = '../models/final_plaintext_dp_logistic_model.pkl'
# Create the target directory first so a fresh checkout does not fail here
os.makedirs(os.path.dirname(plaintext_model_path), exist_ok=True)
joblib.dump(sk_model, plaintext_model_path)

['../models/final_plaintext_dp_logistic_model.pkl']

In [54]:
# Get evaluation metrics
# Keyword arguments shared by the three support-weighted scores
weighted_kwargs = dict(average='weighted', zero_division=0)
accuracy_plain = accuracy_score(y_test, y_pred)
precision_plain = precision_score(y_test, y_pred, **weighted_kwargs)
recall_plain = recall_score(y_test, y_pred, **weighted_kwargs)
f1_plain = f1_score(y_test, y_pred, **weighted_kwargs)
class_report_plain = classification_report(y_test, y_pred, zero_division=0)
conf_matrix_plain = confusion_matrix(y_test, y_pred)

output_file = "../results/lr_dp_plain.txt"
# Create the results directory if needed so the write below cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
    f.write("Plaintext Evaluation Metrics for LR DP Model:\n")
    f.write("-----------------------------------------------\n")
    f.write(f"Training Time      : {training_time:.4f} seconds\n")
    f.write(f"Prediction Time    : {prediction_time:.4f} seconds\n")
    f.write(f"Memory Usage       : {memory_usage:.2f} MB\n")
    f.write(f"Accuracy           : {accuracy_plain:.4f}\n")
    f.write(f"Precision          : {precision_plain:.4f}\n")
    f.write(f"Recall             : {recall_plain:.4f}\n")
    f.write(f"F1 Score           : {f1_plain:.4f}\n")
    f.write("\nClassification Report:\n")
    f.write(class_report_plain + "\n")
    f.write("Confusion Matrix:\n")
    f.write(str(conf_matrix_plain) + "\n")

print(f"Plaintext evaluation results saved to {output_file}")

Plaintext evaluation results saved to ../results/lr_dp_plain.txt


In [55]:
# Compile Model For FHE Inference
print("Converting model for FHE inference...")
cml_model = ConcreteLogisticRegression.from_sklearn_model(sk_model, X_train, n_bits=8)
# Only compile() is timed; the conversion above is excluded. perf_counter is
# monotonic, so a wall-clock adjustment during the compile cannot distort it.
start_compile = time.perf_counter()
cml_model.compile(X_train)
compile_time = time.perf_counter() - start_compile
print(f"Compilation Time: {compile_time:.4f} seconds")

Converting model for FHE inference...
Compilation Time: 215.2022 seconds


In [56]:
# Save Artifacts (Path is imported in the notebook's import cell)
fhe_model_path = Path('../models/compiled_dp_lr_model.json')
# Create the target directory first so a fresh checkout does not fail here
fhe_model_path.parent.mkdir(parents=True, exist_ok=True)
with fhe_model_path.open('w') as f:
    cml_model.dump(f)

In [57]:
# Perform FHE inference on the held-out test set and time it with the
# monotonic perf_counter clock
start_pred_fhe = time.perf_counter()
y_pred_fhe = cml_model.predict(X_test, fhe="execute")
prediction_time_fhe = time.perf_counter() - start_pred_fhe

In [60]:
# Get FHE evaluation metrics
# Keyword arguments shared by the three support-weighted scores
weighted_kwargs_fhe = dict(average='weighted', zero_division=0)
accuracy_fhe = accuracy_score(y_test, y_pred_fhe)
precision_fhe = precision_score(y_test, y_pred_fhe, **weighted_kwargs_fhe)
recall_fhe = recall_score(y_test, y_pred_fhe, **weighted_kwargs_fhe)
f1_fhe = f1_score(y_test, y_pred_fhe, **weighted_kwargs_fhe)
class_report_fhe = classification_report(y_test, y_pred_fhe, zero_division=0)
conf_matrix_fhe = confusion_matrix(y_test, y_pred_fhe)

# Sample the process RSS here instead of reusing `memory_usage`, which was
# captured right after training and says nothing about the FHE stage.
# This is the current RSS at report time, not a peak.
memory_usage_fhe = get_memory_usage()

# Save FHE results
output_file_fhe = "../results/lr_dp_fhe.txt"
# Create the results directory if needed so the write below cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(output_file_fhe), exist_ok=True)
with open(output_file_fhe, "w") as f:
    f.write("FHE Evaluation Metrics for LR DP Model:\n")
    f.write("-----------------------------------------------\n")
    f.write(f"Compile Time      : {compile_time:.4f} seconds\n")
    f.write(f"Prediction Time    : {prediction_time_fhe:.4f} seconds\n")
    f.write(f"Memory Usage       : {memory_usage_fhe:.2f} MB\n")
    f.write(f"Accuracy           : {accuracy_fhe:.4f}\n")
    f.write(f"Precision          : {precision_fhe:.4f}\n")
    f.write(f"Recall             : {recall_fhe:.4f}\n")
    f.write(f"F1 Score           : {f1_fhe:.4f}\n")
    f.write("\nClassification Report:\n")
    f.write(class_report_fhe + "\n")
    f.write("Confusion Matrix:\n")
    f.write(str(conf_matrix_fhe) + "\n")

print(f"FHE evaluation results saved to {output_file_fhe}")

FHE evaluation results saved to ../results/lr_dp_fhe.txt


In [59]:
# Export the compiled model through Concrete ML's FHEModelDev
# (FHEModelDev is imported in the notebook's import cell)
encrypted_model_path = "../data/encrypted/DP/"
os.makedirs(encrypted_model_path, exist_ok=True)

# NOTE(review): FHEModelDev.save() may refuse to write into a directory that
# already holds artifacts from a previous run — confirm, and clear the
# directory before re-running this cell if so.
dev = FHEModelDev(path_dir=encrypted_model_path, model=cml_model)
dev.save()