In [1]:
import time
import os
import psutil
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_score, recall_score,
                             f1_score)

In [2]:
# Read the raw symptom/disease table from disk.
df = pd.read_csv('../data/raw/dataset_1/newdataset.csv')

# The 'diseases' column is the prediction target; every remaining column
# is kept as a feature (symptom).
y = df['diseases']
X = df.drop(columns='diseases')

# Persist the untouched features and labels so they can be reloaded
# without re-parsing the CSV.
joblib.dump(X, '../data/raw/raw_X.pkl')
joblib.dump(y, '../data/raw/raw_y.pkl')

['../data/raw/raw_y.pkl']

In [3]:
# Map each disease name to an integer class id. The fitted encoder is kept
# in `le` so the ids can be translated back to names later.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [4]:
# Measure how long it takes to build and fit the model.
# time.perf_counter() is used instead of time.time(): it is monotonic and
# high-resolution, so the measured duration cannot be distorted by system
# clock adjustments or coarse wall-clock granularity.
start_train = time.perf_counter()
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the pinned version before upgrading.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
training_time = time.perf_counter() - start_train  # elapsed seconds

# Resource utilisation after training: resident set size (RSS) of the whole
# notebook process at this moment, converted from bytes to MB. This is a
# point-in-time snapshot of the process (loaded data included), not the
# peak reached during fitting.
process = psutil.Process(os.getpid())
memory_usage = process.memory_info().rss / 1024 ** 2  # Convert bytes to MB

In [5]:
# Measure inference time over the held-out test set.
# time.perf_counter() is used instead of time.time(): prediction is a short
# interval that is later reported to four decimals, so a monotonic,
# high-resolution clock is needed for the figure to be meaningful.
start_pred = time.perf_counter()
y_pred = model.predict(X_test)
prediction_time = time.perf_counter() - start_pred  # elapsed seconds

In [6]:
# Get evaluation metrics.
# Precision/recall/F1 are support-weighted averages over all classes;
# zero_division=0 scores a class with no predicted (or no true) samples as 0
# instead of emitting a warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# Per-class rows are labelled with the encoded integer ids, since y_test and
# y_pred hold encoded labels.
class_report = classification_report(y_test, y_pred, zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)

output_file = "../results/lr_baseline.txt"
# Create the results directory if it is missing, so a fresh checkout does not
# fail with FileNotFoundError after the model has already been trained.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines([
        "Plaintext LR Evaluation Metrics:\n",
        "-------------------\n",
        f"Training Time      : {training_time:.4f} seconds\n",
        f"Prediction Time    : {prediction_time:.4f} seconds\n",
        f"Memory Usage       : {memory_usage:.2f} MB\n",
        f"Accuracy           : {accuracy:.4f}\n",
        f"Precision          : {precision:.4f}\n",
        f"Recall             : {recall:.4f}\n",
        f"F1 Score           : {f1:.4f}\n",
        "\nClassification Report:\n",
        class_report + "\n",
        "Confusion Matrix:\n",
        # NOTE(review): str() of a large array may be abbreviated with "..."
        # by numpy's print options -- confirm the full matrix is not needed.
        str(conf_matrix) + "\n",
    ])

print(f"Evaluation results have been written to '{output_file}'.")

Evaluation results have been written to '../results/lr_baseline.txt'.
