In [1]:
%load_ext autoreload
%autoreload 2

# Loading Data

In [2]:
import numpy as np
import pandas as pd
from prettytable import PrettyTable

In [3]:
# Locations of the two anonymised DDoS CSV exports, relative to this notebook.
path_attacks = '../data/2024-adatelemzo-msc/ddos_data.attacks.anonimized.csv'
path_vectors = '../data/2024-adatelemzo-msc/ddos_data.vectors.anonimized.csv'

# Read both files into DataFrames (attacks first, then vectors).
csv_data_attacks, csv_data_vectors = (
    pd.read_csv(csv_path) for csv_path in (path_attacks, path_vectors)
)

# Show the attacks schema as a one-row table:
# column names form the header, the matching dtypes form the single row.
table = PrettyTable()
table.field_names = list(csv_data_attacks.columns)
table.add_row(csv_data_attacks.dtypes)
print(table)

+-----------+--------+-----------+-------------+-------------+--------------+------------------+--------------+------------+----------------+---------------------+------------+----------+----------------+--------+
| Attack ID |  Card  | Victim IP | Port number | Attack code | Detect count | Significant flag | Packet speed | Data speed | Avg packet len | Avg source IP count | Start time | End time | Whitelist flag |  Type  |
+-----------+--------+-----------+-------------+-------------+--------------+------------------+--------------+------------+----------------+---------------------+------------+----------+----------------+--------+
|   int64   | object |   object  |    int64    |    object   |    int64     |      int64       |    int64     |   int64    |     int64      |        int64        |   object   |  object  |     int64      | object |
+-----------+--------+-----------+-------------+-------------+--------------+------------------+--------------+------------+----------------+---

In [4]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column so the whole frame becomes numeric.
#
# Each column gets its own encoder, stored in `label_encoders` under the column
# name, so the original values can be recovered later with
# `label_encoders[column].inverse_transform(...)`. `label_encoder` is still bound
# afterwards (to the encoder of the last object column processed, or to an
# unfitted encoder when there is none), as it was before.
#
# NOTE(review): per the schema printed above, 'Start time', 'End time' and
# 'Victim IP' are object columns, so they are encoded as plain category ids here
# -- confirm that is intended for the timestamp columns.
label_encoders = {}
label_encoder = LabelEncoder()
for category, dtype in csv_data_attacks.dtypes.items():
    # pandas' own dtype test; np.dtypes.ObjectDType only exists in NumPy >= 1.25.
    if pd.api.types.is_object_dtype(dtype):
        label_encoder = LabelEncoder()
        csv_data_attacks[category] = label_encoder.fit_transform(csv_data_attacks[category])
        label_encoders[category] = label_encoder

# Re-print the schema to confirm that every column is now numeric.
table = PrettyTable()
table.field_names = csv_data_attacks.columns.tolist()
table.add_row(csv_data_attacks.dtypes)
print(table)

+-----------+-------+-----------+-------------+-------------+--------------+------------------+--------------+------------+----------------+---------------------+------------+----------+----------------+-------+
| Attack ID |  Card | Victim IP | Port number | Attack code | Detect count | Significant flag | Packet speed | Data speed | Avg packet len | Avg source IP count | Start time | End time | Whitelist flag |  Type |
+-----------+-------+-----------+-------------+-------------+--------------+------------------+--------------+------------+----------------+---------------------+------------+----------+----------------+-------+
|   int64   | int64 |   int64   |    int64    |    int64    |    int64     |      int64       |    int64     |   int64    |     int64      |        int64        |   int64    |  int64   |     int64      | int64 |
+-----------+-------+-----------+-------------+-------------+--------------+------------------+--------------+------------+----------------+------------

***

# WandB init

In [5]:
import time, datetime

def stamp_to_ms(T):
    """Convert a ``YYYY-MM-DDTHH:MM:SS`` timestamp string to a Unix timestamp.

    The string is parsed into a naive ``datetime`` and its time tuple is passed
    to ``time.mktime``, which interprets it as local time.

    NOTE(review): despite the name, ``time.mktime`` returns seconds (as a
    float), not milliseconds -- confirm which unit is intended.

    Raises:
        ValueError: if ``T`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(T, "%Y-%m-%dT%H:%M:%S")
    return time.mktime(parsed.timetuple())

In [6]:
from sklearn.model_selection import train_test_split

# Split the encoded frame into features (every column but the last) and the
# target (the last column, 'Type' in the schema printed above).
data = csv_data_attacks.iloc[:, :-1]
target = csv_data_attacks.iloc[:, -1]

# Names of the feature columns only, so they line up one-to-one with `data`
# (taking them from the full frame would also include the target column).
feature_names = data.columns.to_numpy()
# Distinct class values in sorted order; a set has no guaranteed ordering.
labels = np.unique(csv_data_attacks['Type'])

test_size = 0.2
# A fixed seed makes the split, and every metric derived from it, repeatable
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=test_size, random_state=42
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
# Scratch check of the estimator's class: the class object itself is what
# type() returns for an instance, so no throw-away instance is needed.
RandomForestClassifier

sklearn.ensemble._forest.RandomForestClassifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split.
# random_state is fixed so the fitted model (and everything logged from it) is
# the same on every run; the remaining hyperparameters are the library defaults.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Hyperparameters as a plain dict, passed to wandb.init as the run config below.
model_params = model.get_params()

In [9]:
import wandb
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

# Open a new wandb run whose initial config is the model's hyperparameters.
wandb.init(config=model_params, project='Halado_Adatelemzes_Labor')

# Record the split settings in the same run config.
split_config = {
    "test_size": test_size,
    "train_len": len(X_train),
    "test_len": len(X_test),
}
wandb.config.update(split_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlakybalint-wandb[0m ([33mdoktorloktor[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [27]:
# Predict on the held-out split and count how many predictions equal the true label.
y_pred = model.predict(X_test)

# Compare plain arrays so the check is purely positional.
matches = np.count_nonzero(y_test.to_numpy() == y_pred)
matches

26954

In [28]:
# Predictions and class probabilities for the held-out split.
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# Per-feature importances and their ranking, most important first.
# Not used by the plotting calls below; kept for interactive inspection.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Log additional visualisations to wandb.
plot_class_proportions(y_train, y_test, labels)
plot_learning_curve(model, X_train, y_train)
plot_roc(y_test, y_probas, labels)
plot_precision_recall(y_test, y_probas, labels)
# Pass the training columns' names so the importance chart is labelled by
# feature name instead of by column position.
plot_feature_importances(model, X_train.columns.tolist())

In [None]:
# End the wandb run that was started with wandb.init earlier in this notebook.
wandb.finish()