In [None]:
# HIGGS Dataset – Logistic Regression vs Neural Network (Full Pipeline)

This notebook runs the full experiment:

1. Load the HIGGS dataset
2. Preprocess data (check missing values, duplicates, scaling)
3. Train Logistic Regression across multiple training sample sizes
4. Train Neural Network (MLPClassifier) across the same sample sizes
5. Generate:
   - Performance vs sample size plots
   - Training time vs sample size plots
   - ROC and PR curves for both models
   - Comparison plots (F1 + Training Time)


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    roc_curve,
    precision_recall_curve
)

# Configuration
# The defaults are the original assignment locations. Set HIGGS_DATA_PATH /
# HIGGS_OUTPUT_DIR in the environment to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "HIGGS_DATA_PATH",
    r"C:\MSAI\Machine Learning\Assignment 3\HIGGS.csv",
)
OUTPUT_DIR = os.environ.get(
    "HIGGS_OUTPUT_DIR",
    r"C:\MSAI\Machine Learning\Assignment 3",
)

# Percentages of the training split used in the sample-size experiments.
PERCENTAGES = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]

print("Current Working Directory:", os.getcwd())
print("Data path:", DATA_PATH)
print("Output dir:", OUTPUT_DIR)


In [None]:
def evaluate_classifier(clf, X_train, y_train, X_test, y_test):
    """
    Fit ``clf`` on the training data, predict on the test data, and score it.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels passed to ``clf.fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall`` and ``f1`` (computed from
        the test-set predictions) plus ``train_time``, the number of seconds
        spent inside ``clf.fit``.
    """
    import time

    # perf_counter() is monotonic and high-resolution, so the measured
    # duration cannot be distorted by system clock adjustments the way
    # time.time() can.
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = clf.predict(X_test)

    # zero_division=0 reports 0 instead of warning when a denominator is zero.
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "train_time": train_time,
    }


In [None]:
## 1. Load and Explore Data


In [None]:
# header=None: the first line of the file is treated as data and the columns
# receive integer labels.
df = pd.read_csv(DATA_PATH, header=None)
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
df.head(n=5)


In [None]:
# Data-quality checks: missing cells, duplicated rows, and class balance.

# Missing values (per-column counts, then the grand total)
missing_counts = df.isna().sum()
print(f"Total missing values: {missing_counts.sum()}")

# Duplicates (rows identical to an earlier row)
duplicate_count = df.duplicated().sum()
print(f"Number of duplicated rows: {duplicate_count}")

# Class balance of column 0, first as raw counts and then as proportions
for heading, as_fraction in (("counts", False), ("proportions", True)):
    print(f"\nClass distribution ({heading}):")
    print(df[0].value_counts(normalize=as_fraction))


In [None]:
# Descriptive statistics, transposed so each original column becomes a row,
# written out as plain text.
summary = df.describe().transpose()
summary_path = os.path.join(OUTPUT_DIR, "higgs_summary.txt")
with open(summary_path, mode="w") as f:
    # end="" keeps the file content exactly equal to the rendered table.
    print(summary.to_string(), file=f, end="")

print(f"Summary file saved at: {summary_path}")



In [None]:
## 2. Train–Test Split and Scaling


In [None]:
# Column 0 is the target; every remaining column is a feature.
Y = df[0]
X = df.iloc[:, 1:]  # 28 features

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Scaling statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling complete!")
print(f"Scaled train shape: {X_train_scaled.shape}")
print(f"Scaled test shape: {X_test_scaled.shape}")

# Number of training rows; the sample-size experiments take fractions of this.
n_train = len(X_train_scaled)


In [None]:
## 3. Logistic Regression – Experiments Across Sample Sizes


In [None]:
# Train logistic regression on growing prefixes of the training split and
# record test-set metrics plus fit time for each size.
results_lr = []

for p in PERCENTAGES:
    frac = p / 100.0
    n_samples = int(n_train * frac)

    X_sub = X_train_scaled[:n_samples]
    y_sub = y_train.iloc[:n_samples]

    print(f"[LR] Training with {p}% of training data ({n_samples} samples)")

    # saga is a stochastic solver; fixing random_state makes each run
    # reproducible and matches the seed used for the full-data LR model.
    lr_clf = LogisticRegression(
        max_iter=1000,
        solver="saga",
        n_jobs=-1,
        random_state=42
    )

    metrics = evaluate_classifier(lr_clf, X_sub, y_sub, X_test_scaled, y_test)
    metrics["percentage"] = p
    metrics["n_samples"] = n_samples

    results_lr.append(metrics)

# One row per training size.
results_lr_df = pd.DataFrame(results_lr)
results_lr_df


In [None]:
# Persist the logistic regression results table as plain text.
lr_report_path = os.path.join(OUTPUT_DIR, "LogisticRegressionReport.txt")
with open(lr_report_path, mode="w") as f:
    # end="" keeps the file content exactly equal to the rendered table.
    print(results_lr_df.to_string(), file=f, end="")

print(f"Logistic Regression report saved at: {lr_report_path}")


In [None]:
# Logistic regression test metrics as a function of training-set percentage.
fig, ax = plt.subplots(figsize=(10, 6))
for column, label in (
    ("accuracy", "Accuracy"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("f1", "F1-score"),
):
    ax.plot(results_lr_df["percentage"], results_lr_df[column], marker="o", label=label)
ax.set_xlabel("Training Data Used (%)")
ax.set_ylabel("Score")
ax.set_title("Logistic Regression Performance vs Sample Size (HIGGS)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Logistic regression fit time as a function of training-set percentage.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(results_lr_df["percentage"], results_lr_df["train_time"], marker="o")
ax.set_xlabel("Training Data Used (%)")
ax.set_ylabel("Training Time (seconds)")
ax.set_title("Logistic Regression Training Time vs Sample Size (HIGGS)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Fit logistic regression on the full training split, then draw its ROC and
# precision-recall curves on the test split.
lr_full = LogisticRegression(max_iter=1000, solver="saga", n_jobs=-1, random_state=42)
lr_full.fit(X_train_scaled, y_train)

# Second probability column; presumably the positive class (assumes 0/1 labels).
y_score_lr = lr_full.predict_proba(X_test_scaled)[:, 1]

# ROC
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)
roc_auc_lr = roc_auc_score(y_test, y_score_lr)

# PRC
prec_lr, rec_lr, _ = precision_recall_curve(y_test, y_score_lr)
ap_lr = average_precision_score(y_test, y_score_lr)

print(f"LR ROC AUC: {roc_auc_lr:.4f}")
print(f"LR AP (PR AUC): {ap_lr:.4f}")

# ROC figure, with the diagonal drawn as the random-classifier reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_lr, tpr_lr, label=f"LR (AUC = {roc_auc_lr:.3f})")
ax.plot([0, 1], [0, 1], 'k--', label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve – Logistic Regression on HIGGS")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Precision-recall figure; the dashed line is the mean of y_test.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(rec_lr, prec_lr, label=f"LR (AP = {ap_lr:.3f})")
baseline = y_test.mean()
ax.hlines(baseline, 0, 1, colors='k', linestyles='--',
          label=f"Baseline (pos rate = {baseline:.2f})")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision–Recall Curve – Logistic Regression on HIGGS")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
## 4. Neural Network (MLP) – Experiments Across Sample Sizes


In [None]:
# Train the MLP on growing prefixes of the training split and record test-set
# metrics plus fit time for each size.
results_nn = []

# Hyperparameters shared by every run; a fresh estimator is built per size.
nn_params = dict(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    max_iter=1000,
    random_state=42,
)

for p in PERCENTAGES:
    frac = p / 100.0
    n_samples = int(n_train * frac)

    X_sub, y_sub = X_train_scaled[:n_samples], y_train.iloc[:n_samples]

    print(f"[NN] Training with {p}% of training data ({n_samples} samples)")

    nn_clf = MLPClassifier(**nn_params)

    metrics = evaluate_classifier(nn_clf, X_sub, y_sub, X_test_scaled, y_test)
    # Appended after the metric keys so the column order stays the same.
    metrics.update(percentage=p, n_samples=n_samples)

    results_nn.append(metrics)

# One row per training size.
results_nn_df = pd.DataFrame(results_nn)
results_nn_df


In [None]:
# Persist the neural network results table as plain text.
nn_report_path = os.path.join(OUTPUT_DIR, "NeuralNetworkReport.txt")
with open(nn_report_path, mode="w") as f:
    # end="" keeps the file content exactly equal to the rendered table.
    print(results_nn_df.to_string(), file=f, end="")

print(f"Neural Network report saved at: {nn_report_path}")


In [None]:
# Neural network test metrics as a function of training-set percentage.
fig, ax = plt.subplots(figsize=(10, 6))
for column, label in (
    ("accuracy", "Accuracy"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("f1", "F1-score"),
):
    ax.plot(results_nn_df["percentage"], results_nn_df[column], marker="o", label=label)
ax.set_xlabel("Training Data Used (%)")
ax.set_ylabel("Score")
ax.set_title("Neural Network Performance vs Sample Size (HIGGS)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Neural network fit time as a function of training-set percentage.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(results_nn_df["percentage"], results_nn_df["train_time"], marker="o")
ax.set_xlabel("Training Data Used (%)")
ax.set_ylabel("Training Time (seconds)")
ax.set_title("Neural Network Training Time vs Sample Size (HIGGS)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Fit the MLP on the full training split, then overlay its ROC and
# precision-recall curves on the logistic regression curves computed earlier
# (fpr_lr, tpr_lr, rec_lr, prec_lr, roc_auc_lr, ap_lr).
print("Training full Neural Network model on all training data for ROC/PR curves...")
nn_full = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    max_iter=1000,
    random_state=42,
)
nn_full.fit(X_train_scaled, y_train)

# Second probability column; presumably the positive class (assumes 0/1 labels).
y_score_nn = nn_full.predict_proba(X_test_scaled)[:, 1]

# ROC
fpr_nn, tpr_nn, _ = roc_curve(y_test, y_score_nn)
roc_auc_nn = roc_auc_score(y_test, y_score_nn)

# PRC
prec_nn, rec_nn, _ = precision_recall_curve(y_test, y_score_nn)
ap_nn = average_precision_score(y_test, y_score_nn)

print(f"NN ROC AUC: {roc_auc_nn:.4f}")
print(f"NN AP (PR AUC): {ap_nn:.4f}")

# Combined ROC
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_lr, tpr_lr, label=f"LR (AUC = {roc_auc_lr:.3f})")
ax.plot(fpr_nn, tpr_nn, label=f"NN (AUC = {roc_auc_nn:.3f})")
ax.plot([0, 1], [0, 1], 'k--', label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve – HIGGS (Logistic Regression vs Neural Network)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Combined PRC; the dashed line is the mean of y_test.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(rec_lr, prec_lr, label=f"LR (AP = {ap_lr:.3f})")
ax.plot(rec_nn, prec_nn, label=f"NN (AP = {ap_nn:.3f})")
baseline = y_test.mean()
ax.hlines(baseline, 0, 1, colors='k', linestyles='--',
          label=f"Baseline (pos rate = {baseline:.2f})")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision–Recall Curve – HIGGS (Logistic Regression vs Neural Network)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
## 5. LR vs NN – F1 and Training Time Comparison


In [None]:
# F1-score of both models at each training-set percentage.
fig, ax = plt.subplots(figsize=(10, 6))
for results_df, label in ((results_lr_df, "LR F1"), (results_nn_df, "NN F1")):
    ax.plot(results_df["percentage"], results_df["f1"], marker="o", label=label)
ax.set_xlabel("Training Data Used (%)")
ax.set_ylabel("F1-score")
ax.set_title("F1-score vs Training Size – LR vs NN (HIGGS)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Fit time of both models at each training-set percentage.
fig, ax = plt.subplots(figsize=(10, 6))
for results_df, label in (
    (results_lr_df, "LR Train Time"),
    (results_nn_df, "NN Train Time"),
):
    ax.plot(results_df["percentage"], results_df["train_time"], marker="o", label=label)
ax.set_xlabel("Training Data Used (%)")
ax.set_ylabel("Training Time (seconds)")
ax.set_title("Training Time vs Training Size – LR vs NN (HIGGS)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
