In [None]:
# ===============================
# Malware Classification Pipeline (Stable & Warning-Free) + XGBoost + KNN
# ===============================

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

# ------------------------------------------------
# 1. Load Dataset
# ------------------------------------------------
# The CSV is pipe-delimited; `delimiter` is pandas' alias for `sep`.
df = pd.read_csv(filepath_or_buffer="MalwareData.csv", delimiter="|")

# Quick sanity check: dimensions followed by the first five rows.
print("Shape:", df.shape)
print(df.head(n=5))

# ------------------------------------------------
# 2. Prepare Data
# ------------------------------------------------
# Target vector: the "legitimate" column of the loaded frame.
y = df["legitimate"]

# Columns excluded from the feature matrix.
drop_cols = [
    "Name", "md5", "CheckSum", "LoaderFlags", "BaseOfData",
    "Machine", "NumberOfRvaAndSizes",
    "MajorOperatingSystemVersion", "MinorOperatingSystemVersion",
    "MajorImageVersion", "MinorImageVersion",
    "MajorSubsystemVersion", "MinorSubsystemVersion",
    "MajorLinkerVersion", "MinorLinkerVersion",
    "Subsystem", "DllCharacteristics"
]

# errors="ignore" tolerates drop_cols entries that are absent from the frame.
X = df.drop(columns=drop_cols + ["legitimate"], errors="ignore")
# Cells that cannot be parsed as numbers become NaN (errors="coerce").
X = X.apply(pd.to_numeric, errors="coerce")

# Map +/-Inf and NaN to 0, then switch to a plain float array so rows can be
# selected by position below. (After this replace/fillna there is nothing left
# for np.nan_to_num to clean, so an explicit conversion is used instead.)
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
X = X.to_numpy(dtype=np.float64)

# ------------------------------------------------
# 3. Train-Test Split (performed BEFORE fitting any transform)
# ------------------------------------------------
# Split row positions first: the scaler and PCA must learn their statistics
# from training rows only, otherwise information about the test rows leaks
# into the features every model is trained and evaluated on.
row_ids = np.arange(len(y))
train_idx, test_idx = train_test_split(
    row_ids, test_size=0.2, stratify=y, random_state=42
)

# --- Scaling for stability (fitted on training rows only) ---
scaler = StandardScaler()
scaler.fit(X[train_idx])
X_scaled = scaler.transform(X)

# --- PCA to prevent matmul overflow (fitted on training rows only) ---
pca = PCA(n_components=0.80, random_state=42)
pca.fit(X_scaled[train_idx])
X_pca = pca.transform(X_scaled)

print("PCA reduced dimensions:", X_pca.shape[1])

X_train, X_test = X_pca[train_idx], X_pca[test_idx]
# .iloc because train_idx/test_idx are row positions, not index labels.
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print("\nTraining samples:", len(y_train))
print("Testing samples:", len(y_test))

# ------------------------------------------------
# 4. Models (Defaults)
# ------------------------------------------------
# Every estimator that accepts a seed gets random_state=42 so repeated runs
# (and any later re-use of the fitted objects) give the same numbers.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Linear SVM": LinearSVC(C=1.0, max_iter=5000, random_state=42),
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        solver="adam",
        early_stopping=True,
        learning_rate_init=0.001,
        max_iter=300,
        random_state=42
    ),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(
        n_estimators=150,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        tree_method="hist",
        random_state=42
    )
}

# name -> [accuracy, precision, recall, f1, roc_auc] measured on the test split
results_default = {}

print("\n==================== DEFAULT MODEL RESULTS ====================")

for name, model in models.items():
    print(f"\n===== {name} =====")

    # cross_val_score fits clones, so `model` itself is still unfitted here.
    cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
    print("5-Fold Validation Accuracy:", cv_score)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score. Prefer class-1 probabilities; models
    # without predict_proba (e.g. LinearSVC) expose decision_function, whose
    # non-thresholded values roc_auc_score accepts just as well.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None
    roc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results_default[name] = [acc, prec, rec, f1, roc]

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC-AUC: {roc}")

# ------------------------------------------------
# 5. Hyperparameter Tuning (SAFE)
# ------------------------------------------------
print("\n==================== HYPERPARAMETER TUNING ====================")

# Search space per model; keys must match the keys of `tuned_models`.
param_grids = {
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 20, 30]
    },
    "Linear SVM": {
        "C": [0.5, 1, 5]
    },
    "Neural Network": {
        "hidden_layer_sizes": [(16,), (16, 8)],
        "learning_rate_init": [0.001, 0.0005],
        "max_iter": [300]
    },
    "KNN": {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"]
    },
    "XGBoost": {
        "n_estimators": [100, 150],
        "max_depth": [4, 5],
        "learning_rate": [0.05, 0.1]
    }
}

# Base estimators handed to GridSearchCV. Any parameter that also appears in
# the matching grid above is overridden during the search.
tuned_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Linear SVM": LinearSVC(max_iter=5000, random_state=42),
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(32, 16),
        activation="tanh",
        solver="adam",
        learning_rate_init=0.0005,
        early_stopping=True,
        max_iter=400,
        max_fun=12000,
        random_state=42
    ),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        tree_method="hist",
        random_state=42
    )
}

# name -> [accuracy, precision, recall, f1, roc_auc] measured on the test split
results_tuned = {}
# name -> refitted best estimator, kept so later sections can reuse it
# instead of repeating the whole grid search.
best_models = {}

for name, model in tuned_models.items():
    print(f"\n===== Tuning {name} =====")

    gs = GridSearchCV(
        model,
        param_grids[name],
        cv=3,
        scoring="accuracy",
        n_jobs=-1
    )

    gs.fit(X_train, y_train)
    print("Best Parameters:", gs.best_params_)

    best_model = gs.best_estimator_
    best_models[name] = best_model
    y_pred = best_model.predict(X_test)

    # ROC-AUC needs a continuous score. Prefer class-1 probabilities; models
    # without predict_proba (e.g. LinearSVC) expose decision_function, whose
    # non-thresholded values roc_auc_score accepts just as well.
    if hasattr(best_model, "predict_proba"):
        y_score = best_model.predict_proba(X_test)[:, 1]
    elif hasattr(best_model, "decision_function"):
        y_score = best_model.decision_function(X_test)
    else:
        y_score = None
    roc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results_tuned[name] = [acc, prec, rec, f1, roc]

    print(f"Tuned Accuracy: {acc:.4f}")
    print(f"Tuned Precision: {prec:.4f}")
    print(f"Tuned Recall: {rec:.4f}")
    print(f"Tuned F1-score: {f1:.4f}")
    print(f"Tuned ROC-AUC: {roc}")

# ------------------------------------------------
# 6. Summary Tables
# ------------------------------------------------
# Row labels follow the order in which each results list was built.
print("\n==================== DEFAULT MODEL SUMMARY ====================")
summary_default = pd.DataFrame(
    results_default,
    index=["Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"]
)
print(summary_default)

print("\n==================== TUNED MODEL SUMMARY ====================")
summary_tuned = pd.DataFrame(
    results_tuned,
    index=["Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"]
)
print(summary_tuned)


def _show_confusion_matrix(estimator, title, cmap):
    """Plot the test-set confusion matrix of an already fitted estimator.

    Args:
        estimator: Fitted classifier exposing ``predict``.
        title: Figure title.
        cmap: Matplotlib colormap used for the matrix cells.
    """
    cm = confusion_matrix(y_test, estimator.predict(X_test))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(cmap=cmap)
    plt.title(title)
    plt.show()


# ------------------------------------------------
# 7. Confusion Matrices for Default Models
# ------------------------------------------------
print("\n==================== CONFUSION MATRICES ====================")

# The estimators in `models` were fitted in place by the default-evaluation
# loop, so they are reused as-is: refitting would only repeat the training
# cost and could yield matrices that disagree with the metrics printed there.
for name, model in models.items():
    print(f"\n{name} Confusion Matrix")
    _show_confusion_matrix(model, f"{name} - Default Model", plt.cm.Blues)

# ------------------------------------------------
# 8. Confusion Matrices for Tuned Models
# ------------------------------------------------
# Reuse the best estimators stored during tuning rather than running every
# grid search a second time.
for name, best_model in best_models.items():
    print(f"\n{name} Confusion Matrix (Tuned)")
    _show_confusion_matrix(best_model, f"{name} - Tuned Model", plt.cm.Oranges)

# ------------------------------------------------
# 9. Comparison Plot (Accuracy, F1, ROC-AUC)
# ------------------------------------------------
# Order in which each results list stores its values:
# [accuracy, precision, recall, f1, roc_auc]. A metric's position must be
# looked up here, NOT in metrics_to_plot, otherwise the "F1-score" panel would
# show precision and the "ROC-AUC" panel would show recall.
metric_rows = ["Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"]
metrics_to_plot = ["Accuracy", "F1-score", "ROC-AUC"]

model_names = list(models)
x = np.arange(len(model_names))  # for bar positions
width = 0.35  # width of each bar; default/tuned bars sit side by side

fig, ax = plt.subplots(1, len(metrics_to_plot), figsize=(18, 5))

for axis, metric in zip(ax, metrics_to_plot):
    row = metric_rows.index(metric)
    default_vals = [results_default[m][row] for m in model_names]
    tuned_vals = [results_tuned[m][row] for m in model_names]

    axis.bar(x - width/2, default_vals, width, label="Default")
    axis.bar(x + width/2, tuned_vals, width, label="Tuned")
    axis.set_xticks(x)
    axis.set_xticklabels(model_names, rotation=45, ha="right")
    axis.set_ylabel(metric)
    axis.set_title(f"{metric} Comparison")
    axis.legend()

plt.tight_layout()
plt.show()


Shape: (138047, 57)
           Name                               md5  Machine  \
0   memtest.exe  631ea355665f28d4707448e442fbf5b8      332   
1       ose.exe  9d10f99a6712e28f8acd5641e3a7ea6b      332   
2     setup.exe  4d92f518527353c0db88a70fddcfd390      332   
3      DW20.EXE  a41e524f8d45f0074fd07805ff0c9b12      332   
4  dwtrig20.exe  c87e561258f2f8650cef999bf643a731      332   

   SizeOfOptionalHeader  Characteristics  MajorLinkerVersion  \
0                   224              258                   9   
1                   224             3330                   9   
2                   224             3330                   9   
3                   224              258                   9   
4                   224              258                   9   

   MinorLinkerVersion  SizeOfCode  SizeOfInitializedData  \
0                   0      361984                 115712   
1                   0      130560                  19968   
2                   0      517120         