In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

# Read the preprocessed dataset from its hard-coded location on disk.
file_path = r"C:\Users\nandu\OneDrive\Desktop\processed_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Sanity check: report the frame's dimensions and the distinct raw values
# found in the 'Label' column before any cleaning is applied.
dataset_shape = df.shape
raw_label_values = pd.unique(df['Label'])
print("Dataset loaded successfully.")
print("Dataset shape:", dataset_shape)
print("Label unique values:", raw_label_values)

# Drop rows whose label is missing; a row without a label cannot be used for
# supervised training or evaluation.
df.dropna(subset=['Label'], inplace=True)

# Fail early with a clear message instead of an obscure error further down
# when no labelled rows remain.
if df.empty:
    raise ValueError("ERROR: no rows with a non-missing 'Label' remain. Please check the dataset.")

# Parse labels as float first so values such as "1.0" are accepted.
numeric_labels = df['Label'].astype(float)

# The NaN check has to happen BEFORE the integer cast: text like "nan" parses
# to a float NaN, and NaN cannot be represented as an int.
if numeric_labels.isnull().sum() > 0:
    raise ValueError("ERROR: 'Label' column still contains NaN values. Please check the dataset.")

# Convert labels to integers (0 = Benign, 1 = Attack).
# Assign the whole column with df[...] rather than df.loc[:, ...]: the .loc
# form writes the values into the existing float column and leaves its dtype
# unchanged, so the labels would silently stay 0.0 / 1.0.
df['Label'] = numeric_labels.astype(int)

# Separate the target column from the predictor columns.
y = df['Label']
X = df.drop('Label', axis=1)

# Debugging: feature-matrix dimensions and per-class row counts.
feature_shape = X.shape
class_counts = y.value_counts()
print("Processed dataset shape:", feature_shape)
print("Target variable stats:", class_counts)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Debugging: confirm that features and targets line up in both partitions.
for tag, features, target in (
    ("Train size:", X_train, y_train),
    ("Test size:", X_test, y_test),
):
    print(tag, features.shape, target.shape)

# Define models with **Decision Tree regularization** to limit overfitting.
# Every estimator that uses randomness is seeded with the same value as the
# train/test split (42) so the reported metrics are reproducible between runs.
models = {
    "Logistic Regression": LogisticRegression(),
    # max_depth / min_samples_split cap tree growth (pre-pruning).
    "Decision Tree (Pruned)": DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    # probability=True enables predict_proba, which the ROC curves need; the
    # seed fixes the internal shuffling used to calibrate those probabilities.
    "SVM": SVC(probability=True, random_state=42),
}

# One metrics row per model, plus ROC data for models that expose probabilities.
metrics_list = []
roc_curves = {}

# Key of the positive ("Attack") class inside the classification report.
# Built from the data so it matches whether labels are stored as 1 or 1.0.
# It does not depend on the model, so it is computed once outside the loop.
attack_class = str(y_test.max())

# Train and evaluate models using **cross-validation** for robustness
for name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Normalize features
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Positive-class probabilities, only for estimators that provide them.
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Cross-validate the whole pipeline, not the bare model. This way the
    # scaler is re-fitted inside every fold and the CV score describes the
    # same scaled model that is evaluated on the test set. cross_val_score
    # works on clones, so the pipeline fitted above is left untouched.
    cv_score = np.mean(cross_val_score(pipeline, X_train, y_train, cv=5))

    # Debugging: Verify predictions
    print(f"{name} - Accuracy: {np.mean(y_pred == y_test):.4f}, CV Score: {cv_score:.4f}")

    # Confusion matrix, saved as one PNG per model
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Benign", "Attack"], yticklabels=["Benign", "Attack"])
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(f"conf_matrix_{name.lower().replace(' ', '_')}.png")
    plt.close()  # Release the figure so figures do not pile up across models

    # ROC curve & AUC score (kept for the combined plot drawn after the loop)
    if y_pred_proba is not None:
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        auc_score = auc(fpr, tpr)
        roc_curves[name] = (fpr, tpr, auc_score)

    # Store evaluation metrics; precision/recall/F1 are those of the attack class
    report = classification_report(y_test, y_pred, output_dict=True)
    attack_metrics = report[attack_class]

    metrics_list.append([
        name,
        report["accuracy"],
        attack_metrics["precision"],
        attack_metrics["recall"],
        attack_metrics["f1-score"],
        cv_score,  # Mean 5-fold cross-validation score on the training set
    ])

# Overlay the stored ROC curve of every model on a single figure.
fig, ax = plt.subplots(figsize=(8, 6))
for model_name, curve in roc_curves.items():
    false_pos_rate, true_pos_rate, area = curve
    curve_label = f"{model_name} (AUC: {area:.2f})"
    ax.plot(false_pos_rate, true_pos_rate, label=curve_label)

# Dashed diagonal: the ROC of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for Binary Classification Models")
ax.legend()

# Write the figure to disk and release it.
fig.savefig("roc_curves.png")
plt.close(fig)

# Tabulate the collected per-model metrics and write them to a CSV file
# (one row per model, no index column).
metric_columns = [
    "Model",
    "Accuracy",
    "Precision",
    "Recall",
    "F1-Score",
    "Cross-Validation Score",
]
metrics_df = pd.DataFrame(data=metrics_list, columns=metric_columns)
metrics_df.to_csv("model_comparison.csv", index=False)

print("Model training & evaluation complete! Metrics saved to 'model_comparison.csv'. ROC and Confusion Matrix plots saved.")


In [None]:
PS C:\Users\nandu\OneDrive\Desktop> python binary_models.py
Dataset loaded successfully.
Dataset shape: (1044751, 15)
Label unique values: [0. 1.]
Processed dataset shape: (1044751, 14)
Target variable stats: Label
0.0    663808
1.0    380943
Name: count, dtype: int64
Train size: (835800, 14) (835800,)
Test size: (208951, 14) (208951,)
Logistic Regression - Accuracy: 0.9997, CV Score: 0.9996
Decision Tree (Pruned) - Accuracy: 1.0000, CV Score: 1.0000
Random Forest - Accuracy: 1.0000, CV Score: 1.0000
K-Nearest Neighbors - Accuracy: 1.0000, CV Score: 1.0000
SVM - Accuracy: 0.9998, CV Score: 0.9998
Model training & evaluation complete! Metrics saved to 'model_comparison.csv'. ROC and Confusion Matrix plots saved.