In [4]:
import os
import warnings
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
from utils.constant import *
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report

warnings.filterwarnings('ignore')

# Dataset

In [5]:
# Names of the CSV files found in the dataset directory, in lexicographic order
df_sets = sorted(
    file_name
    for file_name in os.listdir(DATASET_DIRECTORY)
    if file_name.endswith('.csv')
)

# Split: the first file is used for training, the file at index 15 for testing.
# Slices (rather than indexing) keep both as lists and yield an empty list
# instead of raising when the directory holds fewer files.
training_sets = df_sets[0:1]
test_sets = df_sets[15:16]

# Feature scaler shared by the training and evaluation cells
scaler = StandardScaler()

# Dataset Processing

In [6]:
# Load every training file, then combine them with a single concat.
# (Growing a frame inside the loop with the private `DataFrame._append`
# re-copies all accumulated rows on every iteration.)
# os.path.join works whether or not DATASET_DIRECTORY ends with a separator.
# pd.concat raises ValueError if `training_sets` is empty.
train_frames = [
    pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    for train_set in tqdm(training_sets)
]
df = pd.concat(train_frames, ignore_index=True)

# Fit scaler once, on all training rows. `fit` discards any previous state,
# so fitting per file inside the loop would keep only the last file's
# statistics while the transform below is applied to every row.
scaler.fit(df[FEATURES])

# Scale
df[FEATURES] = scaler.transform(df[FEATURES])

# Encode labels (raises KeyError for a label that is missing from ATTACKS)
df[LABELS] = df[LABELS].apply(lambda x: ATTACKS[x])

100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


# Model

In [7]:
# Base classifiers that make up the ensemble
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
logistic_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)

# Soft-voting ensemble over the three base classifiers, as (name, estimator) pairs
base_estimators = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('logistic', logistic_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Train the ensemble on the scaled features and encoded labels
ensemble_model.fit(df[FEATURES], df[LABELS])

# Evaluation

In [8]:
# Load every test file, then combine them with a single concat.
# (Avoids the private `DataFrame._append`, which re-copies all accumulated
# rows on each call.) os.path.join works whether or not DATASET_DIRECTORY
# ends with a separator. pd.concat raises ValueError if `test_sets` is empty.
test_frames = [
    pd.read_csv(os.path.join(DATASET_DIRECTORY, test_set))
    for test_set in test_sets
]
df_test = pd.concat(test_frames, ignore_index=True)

# Scale with the already-fitted scaler; it is only applied here, not refitted
df_test[FEATURES] = scaler.transform(df_test[FEATURES])

# Encode labels (raises KeyError for a label that is missing from ATTACKS)
df_test[LABELS] = df_test[LABELS].apply(lambda x: ATTACKS[x])

# Make predictions on the test data
y_true = df_test[LABELS]
y_pred = ensemble_model.predict(df_test[FEATURES])

# Evaluate the ensemble model's performance
accuracy = accuracy_score(y_true, y_pred)
confusion = confusion_matrix(y_true, y_pred)
classification_rep = classification_report(y_true, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9913232507844809
Confusion Matrix:
 [[23309     0     0 ...     0     0     0]
 [    0 23457     0 ...     0     0     0]
 [    0     1 23334 ...     0     0     0]
 ...
 [    0     0     0 ...     2     0     0]
 [    0     0     0 ...     0     0     0]
 [    0     0     0 ...     0     0    14]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23313
           1       1.00      1.00      1.00     23462
           2       1.00      1.00      1.00     23360
           3       1.00      1.00      1.00     30995
           4       1.00      1.00      1.00     26133
           5       1.00      1.00      1.00     41621
           6       1.00      1.00      1.00     20651
           7       1.00      1.00      1.00      1606
           8       1.00      1.00      1.00      1643
           9       1.00      1.00      1.00      2560
          10       0.94      0.96      0.95       139
          11   