
# Malware Detection — Combined Dataset Training & Evaluation

This notebook trains models on `data/EDA/Cleaned_combined_malware_dataset.csv` (PE header + process stats).  
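Target column: **`legitimate`** (1 = benign, 0 = malware). The headline precision, recall, and F1 values are computed for class 1 (benign), scikit-learn's default positive label; the per-class report printed for each model also covers the malware class.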

Models and outputs:
- **Classification:** Logistic Regression (scaled), SVM RBF (scaled), Random Forest (constrained)
- **Clustering:** K-Means (k=2), DBSCAN
- **Outputs:** ROC curves (LR & SVM), RF feature importances, metrics table (`artifacts/classifier_metrics_combined.csv`)


In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, roc_curve, ConfusionMatrixDisplay
)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Where the input lives and where generated artifacts are written
ART_DIR = "artifacts"
PLOT_DIR = os.path.join(ART_DIR, "plots")
DATA_PATH = os.path.join("data", "EDA", "Cleaned_combined_malware_dataset.csv")

# Creating the nested plot folder also creates its parent artifacts folder
os.makedirs(PLOT_DIR, exist_ok=True)

print("Reading:", DATA_PATH)


In [None]:

# Load the combined dataset from the configured CSV path
df = pd.read_csv(DATA_PATH)

# Columns with a blank header, or one starting with "unnamed" (any case),
# are treated as accidental and removed
drop_un = [
    c
    for c in df.columns
    if str(c).strip() == "" or str(c).lower().startswith("unnamed")
]
if len(drop_un) > 0:
    df = df.drop(labels=drop_un, axis="columns")
    print("Dropped accidental columns:", drop_un)

# Fail fast when the label column is missing, listing what was found instead
has_label = "legitimate" in df.columns
if not has_label:
    found_columns = str(list(df.columns))
    raise ValueError("Expected 'legitimate' column in combined dataset. Found: " + found_columns)

# Quick summary of what was loaded: shape, first ten column names, first rows
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:10], "...")
display(df.head())


In [None]:

# Separate the integer label from the predictor columns
y = df["legitimate"].astype(int)
X = df.drop(columns=["legitimate"]).copy()

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Standardised copies for LR/SVM; the scaler is fitted on the training rows
# only and then applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train:", X_train.shape, "| Test:", X_test.shape)


In [None]:

def report_classifier(name, y_true, y_pred, y_prob=None, pos_label=1):
    """Print a classification report and return headline metrics for one model.

    Parameters
    ----------
    name : str
        Label printed in the header and stored under the ``"model"`` key.
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Hard class predictions for the same samples.
    y_prob : array-like, optional
        Scores passed straight to ``roc_auc_score`` (the callers in this
        notebook pass ``predict_proba(...)[:, 1]``). When omitted, ROC AUC is
        reported as NaN.
    pos_label : int, default 1
        Class that precision, recall and F1 are computed for. The default
        matches scikit-learn's own default, so existing calls are unchanged.
        With this notebook's encoding (1 = benign, 0 = malware) pass ``0`` to
        obtain malware-centric precision/recall/F1. Accuracy, ROC AUC and the
        printed per-class report do not depend on this argument.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``.
    """
    print(f"\n=== {name} ===")
    # Per-class breakdown; covers both classes regardless of pos_label
    print(classification_report(y_true, y_pred))
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    # AUC needs continuous scores; without them there is nothing to rank
    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else float("nan")
    print(f"Accuracy={acc:.5f}, Precision={prec:.5f}, Recall={rec:.5f}, F1={f1:.5f}, ROC AUC={auc:.5f}")
    return {"model": name, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "roc_auc": auc}


In [None]:

# Logistic Regression (scaled)
# The SAGA solver shuffles the data while optimising, so the seed is pinned to
# keep the fitted coefficients and the reported metrics reproducible on rerun
# (same seed as the other estimators in this notebook).
logreg = LogisticRegression(max_iter=5000, solver="saga", n_jobs=-1, random_state=42)
logreg.fit(X_train_scaled, y_train)
y_pred_lr = logreg.predict(X_test_scaled)
# Second predict_proba column: probability of the higher label (1, given 0/1 labels)
y_prob_lr = logreg.predict_proba(X_test_scaled)[:, 1]
m_lr = report_classifier("Logistic Regression (scaled)", y_test, y_pred_lr, y_prob_lr)

# ROC curve, with the diagonal as the chance-level reference
fpr, tpr, _ = roc_curve(y_test, y_prob_lr)
plt.figure()
plt.plot(fpr, tpr, label="LogReg ROC")
plt.plot([0,1],[0,1],'--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(PLOT_DIR, "roc_logreg_combined.png"), dpi=160, bbox_inches="tight")
plt.show()


In [None]:

# SVM RBF (scaled)
# probability=True enables predict_proba, which the ROC curve below needs
svm = SVC(kernel="rbf", probability=True, random_state=42).fit(X_train_scaled, y_train)
y_prob_svm = svm.predict_proba(X_test_scaled)[:, 1]
y_pred_svm = svm.predict(X_test_scaled)
m_svm = report_classifier("SVM RBF (scaled)", y_test, y_pred_svm, y_prob_svm)

# ROC curve drawn on an explicit figure/axes pair, with the chance diagonal
fpr, tpr, _ = roc_curve(y_test, y_prob_svm)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="SVM ROC")
ax.plot([0, 1], [0, 1], '--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - SVM (RBF)")
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(PLOT_DIR, "roc_svm_combined.png"), dpi=160, bbox_inches="tight")
plt.show()


In [None]:

# Random Forest (constrained)
# Trained on the unscaled features; depth and leaf size are capped
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)
m_rf = report_classifier("Random Forest (constrained)", y_test, y_pred_rf, y_prob_rf)

# Feature importances: indices of the 20 largest values, in descending order
importances = rf.feature_importances_
idx = np.argsort(importances)[::-1][:20]
top_feats = X.columns[idx].tolist()

# Bar chart of the selected importances, labelled with the feature names
fig, ax = plt.subplots(figsize=(9, 5))
ax.bar(range(len(idx)), importances[idx])
ax.set_xticks(range(len(idx)))
ax.set_xticklabels(top_feats, rotation=60, ha='right')
ax.set_title("Random Forest Feature Importance (Top 20) — Combined Dataset")
fig.tight_layout()
fig.savefig(os.path.join(PLOT_DIR, "rf_importances_combined.png"), dpi=160, bbox_inches="tight")
plt.show()

# Same ranking as plain text
print("\nTop 20 RF features:")
for feat, score in zip(top_feats, importances[idx]):
    print(f"{feat:35s} {score:.4f}")


In [None]:

# Clustering
print("\n--- Clustering ---")

# Clustering runs on every row (no train/test split), so a separate scaler
# is fitted on the full feature matrix
scaler_km = StandardScaler()
X_all_scaled = scaler_km.fit_transform(X)

# K-Means
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
labels_km = kmeans.fit_predict(X_all_scaled)
sil_km = silhouette_score(X_all_scaled, labels_km)
# ARI is invariant to how cluster ids are numbered, so they can be compared
# with the true labels directly
ari_km = adjusted_rand_score(y, labels_km)
print(f"K-Means (k=2): Silhouette={sil_km:.5f}, ARI={ari_km:.5f}")

# DBSCAN
dbscan = DBSCAN(eps=0.8, min_samples=15, n_jobs=-1)
labels_db = dbscan.fit_predict(X_all_scaled)
# DBSCAN marks noise points with -1; they are excluded from both scores
valid = labels_db != -1
# Silhouette is undefined with fewer than two clusters, so fall back to NaN
if valid.sum() > 1 and len(set(labels_db[valid])) > 1:
    sil_db = silhouette_score(X_all_scaled[valid], labels_db[valid])
    ari_db = adjusted_rand_score(y[valid], labels_db[valid])
else:
    sil_db, ari_db = float("nan"), float("nan")
unique, counts = np.unique(labels_db, return_counts=True)
# Cast NumPy scalars to built-in ints so the summary prints as {-1: 12, 0: 340}
# rather than NumPy 2's {np.int64(-1): np.int64(12), ...}
print("DBSCAN clusters:", {int(label): int(count) for label, count in zip(unique, counts)})
print(f"DBSCAN (non-noise): Silhouette={sil_db:.5f}, ARI={ari_db:.5f}")


In [None]:

# Collect the per-model metric dicts into one table (one row per classifier)
metrics = pd.DataFrame(data=[m_lr, m_svm, m_rf])

# Make sure the artifacts folder exists, then write the table without the index
os.makedirs(ART_DIR, exist_ok=True)
csv_path = os.path.join(ART_DIR, "classifier_metrics_combined.csv")
metrics.to_csv(path_or_buf=csv_path, index=False)
print("Saved metrics to:", csv_path)

# Last expression: render the table as the cell output
metrics



## Wrap-up

- **Best supervised model** is typically **SVM (RBF)** or **constrained Random Forest** on this combined dataset.
- ROC curves and feature importances are saved to `artifacts/plots/`.
- Metrics table saved to `artifacts/classifier_metrics_combined.csv` for your report appendix.
- Clustering results are included for the second ML category requirement; tune DBSCAN's `eps`/`min_samples` if desired.
