In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [6]:
import xgboost as xgb

In [35]:
df = pd.read_csv('./data/Friday-WorkingHours-Morningpcap_ISCX.csv')

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191033 entries, 0 to 191032
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             191033 non-null  int64  
 1    Flow Duration                191033 non-null  int64  
 2    Total Fwd Packets            191033 non-null  int64  
 3    Total Backward Packets       191033 non-null  int64  
 4   Total Length of Fwd Packets   191033 non-null  int64  
 5    Total Length of Bwd Packets  191033 non-null  int64  
 6    Fwd Packet Length Max        191033 non-null  int64  
 7    Fwd Packet Length Min        191033 non-null  int64  
 8    Fwd Packet Length Mean       191033 non-null  float64
 9    Fwd Packet Length Std        191033 non-null  float64
 10  Bwd Packet Length Max         191033 non-null  int64  
 11   Bwd Packet Length Min        191033 non-null  int64  
 12   Bwd Packet Length Mean       191033 non-nul

# Preprocessing

In [37]:
df = df.rename(columns={' Total Fwd Packets': 'Tot Fwd Pkts',
                   ' Total Backward Packets': 'Tot Bwd Pkts', 
                   'Flow Bytes/s': 'Flow Byts/s', 
                   ' Flow Duration':'Flow Duration',
                    ' Init_Win_bytes_backward': 'Init Bwd Win Byts',
                    ' Label': 'Label'})
df.replace('Label')

Unnamed: 0,Destination Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3268,112740690,32,16,6448,1152,403,0,201.5,204.724205,...,32,3.594286e+02,1.199802e+01,380,343,16100000.0,4.988048e+05,16400000,15400000,BENIGN
1,389,112740560,32,16,6448,5056,403,0,201.5,204.724205,...,32,3.202857e+02,1.574499e+01,330,285,16100000.0,4.987937e+05,16400000,15400000,BENIGN
2,0,113757377,545,0,0,0,0,0,0.0,0.000000,...,0,9.361829e+06,7.324646e+06,18900000,19,12200000.0,6.935824e+06,20800000,5504997,BENIGN
3,5355,100126,22,0,616,0,28,28,28.0,0.000000,...,32,0.000000e+00,0.000000e+00,0,0,0.0,0.000000e+00,0,0,BENIGN
4,0,54760,4,0,0,0,0,0,0.0,0.000000,...,0,0.000000e+00,0.000000e+00,0,0,0.0,0.000000e+00,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191028,53,61452,4,2,180,354,45,45,45.0,0.000000,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000e+00,0,0,BENIGN
191029,53,171,2,2,80,272,40,40,40.0,0.000000,...,32,0.000000e+00,0.000000e+00,0,0,0.0,0.000000e+00,0,0,BENIGN
191030,53,222,2,2,90,354,45,45,45.0,0.000000,...,32,0.000000e+00,0.000000e+00,0,0,0.0,0.000000e+00,0,0,BENIGN
191031,123,16842,1,1,48,48,48,48,48.0,0.000000,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000e+00,0,0,BENIGN


In [38]:
df['Label'].unique()

array(['BENIGN', 'Bot'], dtype=object)

In [39]:
df = df.replace({'Label': {"BENIGN": 0, "Bot": 1}})

In [40]:
'''
duration, total packets, total bytes, source bytes
corresponds to
duration: flow duration
total packets: total fwd pkts + total bwd pkts
total bytes: Fwd Init Win bytes + Bwd Init Win bytes
source bytes: Fwd Init Win bytes
'''

'\nduration, total packets, total bytes, source bytes\ncorresponds to\nduration: flow duration\ntotal packets: total fwd pkts + total bwd pkts\ntotal bytes: Fwd Init Win bytes + Bwd Init Win bytes\nsource bytes: Fwd Init Win bytes\n'

In [41]:
def createPacketsSumColumn(row):
    return row['Tot Fwd Pkts'] + row['Tot Bwd Pkts']

def createTotalBytesColumn(row):
    return row['Flow Byts/s'] * row['Flow Duration']

In [42]:
df['totalPackets'] = df.apply(createPacketsSumColumn, axis=1)
df['totalBytes'] = df.apply(createTotalBytesColumn, axis=1)
df['sourceBytes'] = df['Init Bwd Win Byts']

  """


In [43]:
feature_df = df[['Flow Duration', 'totalPackets', 'totalBytes', 'sourceBytes','Label']].copy()

In [45]:
X = feature_df.drop('Label', axis=1)
y = feature_df.Label

In [46]:
'''
XGB how to Source: 
https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390
'''
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, X.select_dtypes(include="number").columns),
    ]
)

In [47]:
X_processed = full_processor.fit_transform(X)

In [48]:
xgb_loaded = pickle.load(open('xgbmodel.pkl', "rb"))

In [49]:
preds = xgb_loaded.predict(X_processed)

In [51]:
print('Accuracy', accuracy_score(y, preds))
print(classification_report(y, preds))

Accuracy 0.11623122706548084
              precision    recall  f1-score   support

           0       0.99      0.11      0.19    189067
           1       0.01      0.95      0.02      1966

    accuracy                           0.12    191033
   macro avg       0.50      0.53      0.11    191033
weighted avg       0.98      0.12      0.19    191033

