# Import Libraries

In [118]:
import shutil
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score, classification_report
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from concrete.ml.sklearn import LogisticRegression
from concrete.ml.deployment import FHEModelClient, FHEModelServer, FHEModelDev


# Load data

In [119]:
# Load the CSV file
data = pd.read_csv('./data/extracted_data.csv')

# Display the first few rows
print(data.head())

# Check for missing values and drop them
data.dropna(inplace=True)

# Ensure labels are integers
data['phishing'] = data['phishing'].astype(int)


   phishing                                               text
0         0  Tana:\n\nNone of the CPs on today's list are a...
1         1  Commentary\n\n\n\nIt is time to refinance!\n\n...
2         1  FROM MRS. JOCELYN MOMODU.\nNO 102 16TH STREET,...
3         0  Vince\n\nThanks for the update - especially yo...
4         0  I'm getting these messages and I'm not sure wh...


# Split data into training and testing sets

In [120]:
X = data['text']
y = data['phishing']

# Split data into training and testing sets
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


# Feature extraction - text vectorization


In [121]:
# Server side: Initialize and fit the vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
vectorizer.fit(X_train_text)


In [122]:
# Client side: Transform the training and testing data
X_train_vectors = vectorizer.transform(X_train_text)
X_test_vectors = vectorizer.transform(X_test_text)


# Quantize the Features for Homomorphic Encryption
Since homomorphic encryption operates on integers, we'll quantize the TF-IDF features.

In [123]:
# Client side: Scale features to [0, 1] using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train_vectors.toarray())
X_test_scaled = scaler.transform(X_test_vectors.toarray())

# Quantize features by scaling up, then cast to float32
scale_factor = 2 ** 7  # Adjust the scaling factor as needed
X_train_quantized = (X_train_scaled * scale_factor).astype(np.float32) # need to use float32 because float8 is not supported.
X_test_quantized = (X_test_scaled * scale_factor).astype(np.float32)


### Train classification model

In [124]:
# Initialize the model with appropriate parameters
model = LogisticRegression(n_bits=7)  # n_bits should match the quantization bits

# Train the model on quantized data
model.fit(X_train_quantized, y_train)


### Evaluate model

In [125]:
# Make predictions on the test set
y_pred = model.predict(X_test_quantized)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Performance:
Accuracy: 0.9780
Recall: 0.9760
F1 Score: 0.9745

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1706
           1       0.97      0.98      0.97      1294

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000



### Compile and save the model for HE

In [126]:
# Compile the model with representative data (subset of training data)
representative_data = X_train_quantized[:100]
model.compile(representative_data)


<concrete.fhe.compilation.circuit.Circuit at 0x7f69753d74c0>

In [127]:
model_directory = './model'

# Create a backup if the directory exists
if os.path.exists(model_directory):
    backup_directory = model_directory + '_backup'
    if os.path.exists(backup_directory):
        shutil.rmtree(backup_directory)
    shutil.copytree(model_directory, backup_directory)

# Clear the path
shutil.rmtree(model_directory, ignore_errors=True)


# Setup the development environment
dev = FHEModelDev(path_dir=model_directory, model=model)
dev.save()

# Simulate encrypted prediction

### initialize fhe model server and client

In [128]:
# Initialize FHEModelClient and FHEModelServer
client = FHEModelClient(model_directory)
serialized_evaluation_keys = client.get_serialized_evaluation_keys()

server = FHEModelServer(model_directory)
server.load()


### user input

In [129]:
# Prepare the sample input
sample_text = X_test_text.iloc[0]
sample_label = y_test.iloc[0]

# Transform and scale the sample text
sample_vector = vectorizer.transform([sample_text]).toarray()
sample_vector_scaled = scaler.transform(sample_vector)
#sample_vector_quantized = (sample_vector_scaled * scale_factor).astype(np.uint32)

# Encrypt the quantized data
encrypted_input = client.quantize_encrypt_serialize(sample_vector_scaled)


### server prediction

In [130]:
# Server processes the encrypted data
encrypted_result = server.run(encrypted_input, serialized_evaluation_keys)

### client decrypts the results

In [131]:
# Client decrypts the result
result = client.deserialize_decrypt_dequantize(encrypted_result)

# result

In [132]:
# Convert probabilities to label by selecting the class with the highest probability
predicted_label = np.argmax(result)

In [133]:
print(f"Decrypted Prediction Probabilities: {result}")
print(f"Predicted Label: {predicted_label}")
print(f"Actual Label: {sample_label}")

Decrypted Prediction Probabilities: [[0.3966567 0.6033433]]
Predicted Label: 1
Actual Label: 1
