In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [4]:
import xgboost as xgb

In [5]:
normal_df = pd.read_csv('./data/CTU13_Normal_Traffic.csv')
attack_df = pd.read_csv('./data/CTU13_Attack_Traffic.csv')

In [6]:
df = pd.concat([normal_df, attack_df],ignore_index=False)
df.head()

Unnamed: 0.1,Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Act Data Pkts,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,891,0,2,0,348,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,0
1,1,20835,0,2,0,266,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,0
2,2,545,0,2,0,442,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,0
3,3,83892,0,2,0,194,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,0
4,4,608,0,2,0,480,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92212 entries, 0 to 38897
Data columns (total 59 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         92212 non-null  int64  
 1   Flow Duration      92212 non-null  int64  
 2   Tot Fwd Pkts       92212 non-null  int64  
 3   Tot Bwd Pkts       92212 non-null  int64  
 4   TotLen Fwd Pkts    92212 non-null  int64  
 5   TotLen Bwd Pkts    92212 non-null  int64  
 6   Fwd Pkt Len Max    92212 non-null  int64  
 7   Fwd Pkt Len Min    92212 non-null  int64  
 8   Fwd Pkt Len Mean   92212 non-null  float64
 9   Fwd Pkt Len Std    92212 non-null  float64
 10  Bwd Pkt Len Max    92212 non-null  int64  
 11  Bwd Pkt Len Min    92212 non-null  int64  
 12  Bwd Pkt Len Mean   92212 non-null  float64
 13  Bwd Pkt Len Std    92212 non-null  float64
 14  Flow Byts/s        92212 non-null  float64
 15  Flow Pkts/s        92212 non-null  float64
 16  Flow IAT Mean      922

In [8]:
'''
duration, total packets, total bytes, source bytes
corresponds to
duration: flow duration
total packets: total fwd pkts + total bwd pkts
total bytes: Fwd Init Win bytes + Bwd Init Win bytes
source bytes: Fwd Init Win bytes
'''

'\nduration, total packets, total bytes, source bytes\ncorresponds to\nduration: flow duration\ntotal packets: total fwd pkts + total bwd pkts\ntotal bytes: Fwd Init Win bytes + Bwd Init Win bytes\nsource bytes: Fwd Init Win bytes\n'

In [9]:
def createPacketsSumColumn(row):
    return row['Tot Fwd Pkts'] + row['Tot Bwd Pkts']

def createTotalBytesColumn(row):
    return row['Flow Byts/s'] * row['Flow Duration']

In [10]:
df['totalPackets'] = df.apply(createPacketsSumColumn, axis=1)
df['totalBytes'] = df.apply(createTotalBytesColumn, axis=1)
df['sourceBytes'] = df['Init Bwd Win Byts']

In [11]:
feature_df = df[['Flow Duration', 'totalPackets', 'totalBytes', 'sourceBytes','Label']].copy()

In [13]:
X = feature_df.drop('Label', axis=1)
y = feature_df.Label

In [14]:
'''
XGB how to Source: 
https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390
'''
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, X.select_dtypes(include="number").columns),
    ]
)

In [15]:
xgb_cl = xgb.XGBClassifier()
print(type(xgb_cl))

<class 'xgboost.sklearn.XGBClassifier'>


In [16]:
X_processed = full_processor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, stratify=y, random_state=42, test_size=0.20
)

In [17]:
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)
preds = xgb_cl.predict(X_test)

In [18]:
print(accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

0.9930054763324838
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10663
           1       0.99      0.99      0.99      7780

    accuracy                           0.99     18443
   macro avg       0.99      0.99      0.99     18443
weighted avg       0.99      0.99      0.99     18443



In [19]:
X_preds = xgb_cl.predict(X_train)
print(accuracy_score(y_train, X_preds))
print(classification_report(y_train, X_preds))

0.9959332510946333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     42651
           1       1.00      0.99      1.00     31118

    accuracy                           1.00     73769
   macro avg       1.00      1.00      1.00     73769
weighted avg       1.00      1.00      1.00     73769



In [20]:
pickle.dump(xgb_cl, open('xgbmodel.pkl', "wb"))