In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import itertools
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB 
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

LGR_Classifier = OneVsRestClassifier(
    LogisticRegression(
        solver='lbfgs',
        max_iter=2000,
        class_weight='balanced',
        random_state=42
    )
)

RF_Classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

SVM_Classifier = LinearSVC(
    C=1.5,
    class_weight='balanced',
    max_iter=20000,
    tol=1e-3,
    random_state=42
)

KNN_Classifier = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=-1
)

BNB_Classifier = BernoulliNB()

DTC_Classifier = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=42
)


In [2]:
# Read the dataset
data = pd.read_csv('/kaggle/input/cicddos-dataset/cicddos2019_dataset.csv')

# ===============================
# Increased dataset usage (reduces overfitting)
# ===============================
SAMPLE_RATIO = 0.25

sample_size = int(SAMPLE_RATIO * len(data))
rows = data.sample(n=sample_size, random_state=42)

print(f"Using {SAMPLE_RATIO*100}% of dataset: {len(rows)} samples")

print("Original dataset size:", len(rows))

# Split the dataset into training and testing sets
X = rows.drop(columns=['Label'])
y = rows['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set size:", len(X_train), len(y_train))
print("Testing set size:", len(X_test), len(y_test))

Using 25.0% of dataset: 107842 samples
Original dataset size: 107842
Training set size: 75489 75489
Testing set size: 32353 32353


In [3]:
# ===============================
# Scale numerical features (explicit dtype casting)
# ===============================
scaler = StandardScaler()
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Explicitly convert to float to avoid dtype warnings
X_train[numerical_cols] = X_train[numerical_cols].astype(float)
X_test[numerical_cols]  = X_test[numerical_cols].astype(float)

X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols]  = scaler.transform(X_test[numerical_cols])

In [4]:
# # One-hot encode the target variable
# onehotencoder = OneHotEncoder()
# y_train_encoded = onehotencoder.fit_transform(y_train.values.reshape(-1,1)).toarray()
# y_test_encoded = onehotencoder.transform(y_test.values.reshape(-1,1)).toarray()

# ===============================
# Encode target labels (single source of truth)
# ===============================
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded  = label_encoder.transform(y_test)


In [5]:
from sklearn.preprocessing import LabelEncoder

# Identify string columns
string_cols = X_train.select_dtypes(include=['object']).columns

# Encode string columns
for col in string_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [6]:
# Assuming you already have `LabelEncoder` instance `label_encoder` for encoding the target variable

# Encode target variables
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Evaluate models
models = [
    ('Naive Bayes', BNB_Classifier),
    ('Decision Tree', DTC_Classifier),
    ('KNN', KNN_Classifier),
    ('Logistic Regression (OvR)', LGR_Classifier),
    ('Random Forest', RF_Classifier),
    ('SVM', SVM_Classifier)
]


from sklearn.model_selection import StratifiedKFold

for name, model in models:
    print()
    print(f"================ {name} =================")
    print()

    # ---- Cross-validation (skip SVM) ----
    if name != 'SVM':
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        cv_scores = cross_val_score(
            model,
            X_train,
            y_train_encoded,
            cv=skf,
            scoring='f1_macro'
        )
        print("CV F1-Macro (Mean ± Std):")
        print(f"{cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print()
    else:
        print("CV skipped for SVM due to computational constraints")
        print()

    # ---- Train once ----
    model.fit(X_train, y_train_encoded)

    # ---- Test evaluation ----
    y_pred = model.predict(X_test)

    print("Test Accuracy:")
    print(metrics.accuracy_score(y_test_encoded, y_pred))
    print()

    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test_encoded, y_pred))
    print()

    print("Classification Report:")
    print(metrics.classification_report(y_test_encoded, y_pred))




CV F1-Macro (Mean ± Std):
0.3908 ± 0.0105

Test Accuracy:
0.8537384477482768

Confusion Matrix:
[[7073    0    0    0    2    8    0    0    0    0   35   15   84    0
     0  100   21    1]
 [   1    7    0    0    6    0    0    0    0  221   37    2    0    0
     2    0    0    0]
 [   0    0    0   98    1    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    0    0  452    0    0    0    3    0    0    0    0    0    0
     0    0    0    0]
 [   9    0    0 1241 7344   43   19  427    0    0    6    1    3    0
     0    0    0   14]
 [   0    0    0   13    0   14    0    0    0    0   14    0    0    0
     0    0    0    0]
 [   0    0    0  206    0    1    0    0    0    0    1    0    0    0
     0    0    0    0]
 [   0    0    0   29    4    1    3  731    0    0   10    0    0    0
     0    0    0    0]
 [   0    0    0    0    0    0    0    0    0  142    0    0    0    0
     0    0    0    0]
 [   0    0    0    0    0    0    0    0    0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      0.96      0.98      7339
           1       0.37      0.03      0.05       276
           2       0.00      0.00      0.00        99
           3       0.22      0.99      0.36       455
           4       1.00      0.81      0.89      9107
           5       0.21      0.34      0.26        41
           6       0.00      0.00      0.00       208
           7       0.63      0.94      0.75       778
           8       0.00      0.00      0.00       142
           9       0.47      0.98      0.64       644
          10       0.20      0.90      0.33        41
          11       0.01      0.13      0.01        47
          12       0.86      0.79      0.82      3687
          13       1.00      0.95      0.98      7478
          14       0.88      0.94      0.91      1344
          15       0.04      0.01      0.02       657
          16       0.00      0.00      0.00         5
          17       0.00    



In [7]:
# Assuming you already have `LabelEncoder` instance `label_encoder` for encoding the target variable

# Encode target variables
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

from sklearn.model_selection import StratifiedKFold

for name, model in models:
    print()
    print(f"================ {name} =================")
    print()

    # ---- Stratified 10-Fold CV ----
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train_encoded,
        cv=skf,
        scoring='f1_macro'
    )

    print("CV F1-Macro (Mean ± Std):")
    print(f"{cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print()

    # ---- Fit once on full training set ----
    model.fit(X_train, y_train_encoded)

    # ---- Test evaluation ----
    y_pred = model.predict(X_test)

    print("Test Accuracy:")
    print(metrics.accuracy_score(y_test_encoded, y_pred))
    print()

    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test_encoded, y_pred))
    print()

    print("Classification Report:")
    print(
    metrics.classification_report(
        y_test_encoded,
        y_pred,
        zero_division=0,
        output_dict=False
    )
)




CV F1-Macro (Mean ± Std):
0.3908 ± 0.0105

Test Accuracy:
0.8537384477482768

Confusion Matrix:
[[7073    0    0    0    2    8    0    0    0    0   35   15   84    0
     0  100   21    1]
 [   1    7    0    0    6    0    0    0    0  221   37    2    0    0
     2    0    0    0]
 [   0    0    0   98    1    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    0    0  452    0    0    0    3    0    0    0    0    0    0
     0    0    0    0]
 [   9    0    0 1241 7344   43   19  427    0    0    6    1    3    0
     0    0    0   14]
 [   0    0    0   13    0   14    0    0    0    0   14    0    0    0
     0    0    0    0]
 [   0    0    0  206    0    1    0    0    0    0    1    0    0    0
     0    0    0    0]
 [   0    0    0   29    4    1    3  731    0    0   10    0    0    0
     0    0    0    0]
 [   0    0    0    0    0    0    0    0    0  142    0    0    0    0
     0    0    0    0]
 [   0    0    0    0    0    0    0    0    0 



CV F1-Macro (Mean ± Std):
0.7112 ± 0.0263

Test Accuracy:
0.9614255246808642

Confusion Matrix:
[[7230    8    3    4    0    3    0    1    4    0    0    3    0    2
     0    0   81    0]
 [   0  234    0    0    0    0    0    0    0    0   22   19    0    0
     0    1    0    0]
 [   0    0   32    0    0    0   64    0    3    0    0    0    0    0
     0    0    0    0]
 [   0    0    2  431    0    0   22    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    0    0   15 9082    2    0    1    0    0    5    0    1    0
     0    0    1    0]
 [   0    0    0   19    0    1    1    0    0    0   20    0    0    0
     0    0    0    0]
 [   0    0    9    2    0    0  194    0    1    0    2    0    0    0
     0    0    0    0]
 [   0    0    0    1    0    0    1  728    0   39    0    0    0    0
     1    0    8    0]
 [   0    0    0    0    0    0    0    0  138    4    0    0    0    0
     0    0    0    0]
 [   0    7    0    0    0    0    0    0   35  5



In [8]:
# PREDICTING FOR TEST DATA
pred_knn = KNN_Classifier.predict(X_test)
pred_NB = BNB_Classifier.predict(X_test)
pred_log = LGR_Classifier.predict(X_test)
pred_dt = DTC_Classifier.predict(X_test)

print("KNN: ", pred_knn)
print("BNB: ", pred_NB)
print("LGR: ", pred_log)
print("DTC: ", pred_dt)

KNN:  [12  4 13 ...  0  4 13]
BNB:  [11  4 13 ...  0  4 13]
LGR:  [12  4 13 ...  0  4 13]
DTC:  [12  4 13 ...  0  4 13]
