In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import (roc_auc_score, f1_score, matthews_corrcoef, recall_score)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('/home/asavari/Downloads/Threats (1).csv')

In [4]:
data_cleaned = data.drop(columns=['id'], errors='ignore')
label_encoder = LabelEncoder()
data_cleaned['attack_cat_encoded'] = label_encoder.fit_transform(data_cleaned['attack_cat'])
data_cleaned = data_cleaned.drop(columns=['attack_cat'], errors='ignore')

X = data_cleaned.drop(columns=['attack_cat_encoded'], errors='ignore')
y = data_cleaned['attack_cat_encoded']

print("Classes in target variable:", label_encoder.classes_)

Classes in target variable: ['Analysis' 'Backdoor' 'DoS' 'Exploits' 'Fuzzers' 'Generic' 'Normal'
 'Reconnaissance' 'Shellcode' 'Worms']


In [5]:
X = X.fillna(0)
categorical_columns = X.select_dtypes(include=['object']).columns
X = X.drop(columns=[col for col in categorical_columns if X[col].nunique() > 20], errors='ignore')

In [6]:
X_encoded = pd.get_dummies(X, columns=X.select_dtypes(include=['object']).columns)

smote = SMOTE(random_state=42, sampling_strategy='auto') 
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [8]:
# bagging 
base_tree = DecisionTreeClassifier(random_state=42, class_weight='balanced', max_depth=8, min_samples_split=20)
bagging_model = BaggingClassifier(estimator=base_tree, n_estimators=25, n_jobs=-1, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bag = bagging_model.predict(X_test)

In [9]:
print("\nBagging Classifier Performance: ")
print("F1 Score:", f1_score(y_test, y_pred_bag, average="weighted"))
print("Recall Rate:", recall_score(y_test, y_pred_bag, average="weighted"))
print("MCC:", matthews_corrcoef(y_test, y_pred_bag))
print("AUC:", roc_auc_score(y_test, bagging_model.predict_proba(X_test), multi_class="ovr"))
cv_scores_bag = cross_val_score(bagging_model, X_resampled, y_resampled, cv=3, scoring='f1_weighted')
print("Mean CV F1 Score:", cv_scores_bag.mean())


Bagging Classifier Performance: 
F1 Score: 0.6932714182681371
Recall Rate: 0.7048875
MCC: 0.6843917371987307
AUC: 0.9428096508768329
Mean CV F1 Score: 0.6551834080921956


In [10]:
# boosting
boosting_model = AdaBoostClassifier(estimator=base_tree, n_estimators=25, algorithm='SAMME',random_state=42)
boosting_model.fit(X_train, y_train)
y_pred_boost = boosting_model.predict(X_test)

In [11]:
print("\nBoosting Classifier Performance: ")
print("F1 Score:", f1_score(y_test, y_pred_boost, average="weighted"))
print("Recall Rate:", recall_score(y_test, y_pred_boost, average="weighted"))
print("MCC:", matthews_corrcoef(y_test, y_pred_boost))
print("AUC:", roc_auc_score(y_test, boosting_model.predict_proba(X_test), multi_class="ovr"))
cv_scores_boost = cross_val_score(boosting_model, X_resampled, y_resampled, cv=3, scoring='f1_weighted')
print("Mean CV F1 Score:", cv_scores_boost.mean())


Boosting Classifier Performance: 
F1 Score: 0.7494291766866138
Recall Rate: 0.7412
MCC: 0.715369992814789
AUC: 0.9419260310889885
Mean CV F1 Score: 0.702493375964306
