    This Source Code Form is subject to the terms of the Mozilla Public
    License, v. 2.0. If a copy of the MPL was not distributed with this
    file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Load and preprocess dataset

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from pathlib import Path

In [5]:
dsn = 'NF-BoT-IoT.csv'
dataset = Path('datasets/netflow-v1') / dsn
ignore_cols = {'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Label'}

In [7]:
df = pd.read_csv(dataset, usecols=lambda x: x not in ignore_cols)
df.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Attack
0,52670,53,17,5.212,71,126,1,1,0,4294966,Benign
1,49160,4444,6,0.0,217753000,199100,4521,4049,24,4176249,Theft
2,3456,80,17,0.0,8508021,8918372,9086,9086,0,4175916,Benign
3,80,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,Benign
4,80,80,6,7.0,8374706,0,9086,0,0,4175916,Benign


In [39]:
trns = ColumnTransformer([('cat', OrdinalEncoder(), ['Attack', 'TCP_FLAGS', 'PROTOCOL'])],
                         remainder=StandardScaler())
data = trns.fit_transform(df)
trns.get_feature_names_out()

array(['cat__Attack', 'cat__TCP_FLAGS', 'cat__PROTOCOL',
       'remainder__L4_SRC_PORT', 'remainder__L4_DST_PORT',
       'remainder__L7_PROTO', 'remainder__IN_BYTES',
       'remainder__OUT_BYTES', 'remainder__IN_PKTS',
       'remainder__OUT_PKTS', 'remainder__FLOW_DURATION_MILLISECONDS'],
      dtype=object)

# Train classifier model

In [98]:
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [65]:
y, X = np.hsplit(data, [1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
explained_model = XGBClassifier(n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic')
explained_model.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=True);

[0]	validation_0-mlogloss:0.49620
[1]	validation_0-mlogloss:0.37980
[2]	validation_0-mlogloss:0.35006


In [97]:
explained_model.score(X_test, y_test)

0.8329528411931345

# Train explanation model