In [1]:
%load_ext autoreload
%autoreload 2

# Loading Data

In [2]:
from dataloading import *

In [3]:
PATH_DATA_TRAIN         = '../data/preprocessed-v2/train_vectors_scaled.csv'
PATH_DATA_VALIDATION    = '../data/preprocessed-v2/validation_vectors_scaled.csv'
PATH_DATA_TEST          = '../data/preprocessed-v2/test_vectors_scaled.csv'
TEST_SIZE = .2
UNIQUE_COLUMNS = False

(training_data_df, training_target_df) = get_all_data(
    path_all_vectors=PATH_DATA_TRAIN, unique=UNIQUE_COLUMNS, remove_columns=['is_synthetic'])

(validation_data_df, validation_target_df) = get_all_data(
    path_all_vectors=PATH_DATA_VALIDATION, unique=UNIQUE_COLUMNS)

(test_data_df, test_target_df) = get_all_data(
    path_all_vectors=PATH_DATA_TEST, unique=UNIQUE_COLUMNS)

In [4]:
# Assert inputs are of same length
assert training_data_df.columns.to_list() == validation_data_df.columns.to_list() == test_data_df.columns.to_list()
assert training_target_df.columns.to_list() == validation_target_df.columns.to_list() == test_target_df.columns.to_list()

In [5]:
print((training_data_df.columns.to_list()))
print((validation_data_df.columns.to_list()))
print((test_data_df.columns.to_list()))

['Avg packet len', 'CHARGEN', 'CLDAP', 'CoAP', 'DNS', 'Data speed', 'Generic UDP', 'High volume traffic', 'IPv4 fragmentation', 'NTP', 'Packet speed', 'Port number', 'RDP', 'SNMP', 'SSDP', 'SYN Attack', 'Significant flag', 'Source IP count', 'Suspicious traffic', 'TCP Anomaly', 'is_weekday', 'other_attack_codes', 'time_of_day', 'victim IP num']
['Avg packet len', 'CHARGEN', 'CLDAP', 'CoAP', 'DNS', 'Data speed', 'Generic UDP', 'High volume traffic', 'IPv4 fragmentation', 'NTP', 'Packet speed', 'Port number', 'RDP', 'SNMP', 'SSDP', 'SYN Attack', 'Significant flag', 'Source IP count', 'Suspicious traffic', 'TCP Anomaly', 'is_weekday', 'other_attack_codes', 'time_of_day', 'victim IP num']
['Avg packet len', 'CHARGEN', 'CLDAP', 'CoAP', 'DNS', 'Data speed', 'Generic UDP', 'High volume traffic', 'IPv4 fragmentation', 'NTP', 'Packet speed', 'Port number', 'RDP', 'SNMP', 'SSDP', 'SYN Attack', 'Significant flag', 'Source IP count', 'Suspicious traffic', 'TCP Anomaly', 'is_weekday', 'other_attack

In [6]:
X_train = training_data_df.to_numpy()
X_validation = validation_data_df.to_numpy()
X_test = test_data_df.to_numpy()

y_train = training_target_df.to_numpy().squeeze(1)
y_validation = validation_target_df.to_numpy().squeeze(1)
y_test = test_target_df.to_numpy().squeeze(1)

In [7]:
print('y_train.shape: ', y_train.shape)
print('y_validation.shape: ', y_validation.shape)
print('y_test.shape: ', y_test.shape)

y_train.shape:  (1346829,)
y_validation.shape:  (1247266,)
y_test.shape:  (1233449,)


***

# Model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# MODEL INIT
model = RandomForestClassifier(
    verbose=True,
    max_depth=20,
    max_features=0.7,
    n_estimators=100,
    bootstrap=True,
    oob_score=False,
    )

In [9]:
# TRAINING

_ = model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.9min


In [11]:
from collections import defaultdict

count = defaultdict(int)
for tree in model.estimators_:
    # print(tree.get_depth(), end=', ')
    count[tree.get_depth()] += 1
count = dict(count)
for item, key in count.items():
    print(f'{key} trees have length {item}')

100 trees have length 20


In [12]:
import numpy as np

# get predictions
y_pred = model.predict(X_validation)

matches = np.count_nonzero(y_validation == y_pred)
print(f'Accuracy: {100 * matches / len(y_validation)} %')

Accuracy: 0.008017535954639988 %


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [13]:
from prettytable import PrettyTable

info_data = list(training_data_df.columns.values)

table = PrettyTable()
table.field_names = ["Property", "Importance (%)"]

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices:
    imp = 100 * importances[i]

    if imp > 0:
        if int(imp) == 0:
            imp = '<1 %'
        else:
            imp = f'{int(imp)} %'
    else:
        imp = '0 %'


    table.add_row([info_data[i], imp])

print(table)

+---------------------+----------------+
|       Property      | Importance (%) |
+---------------------+----------------+
|    victim IP num    |      35 %      |
|    Avg packet len   |      15 %      |
|   Source IP count   |      10 %      |
|     time_of_day     |      8 %       |
|     Port number     |      8 %       |
|      Data speed     |      6 %       |
|  Suspicious traffic |      4 %       |
| High volume traffic |      3 %       |
|   Significant flag  |      3 %       |
|     Packet speed    |      1 %       |
|      is_weekday     |      <1 %      |
|        CLDAP        |      <1 %      |
|         DNS         |      <1 %      |
|      SYN Attack     |      <1 %      |
|     Generic UDP     |      <1 %      |
|         NTP         |      <1 %      |
|  IPv4 fragmentation |      <1 %      |
|         SNMP        |      <1 %      |
|         CoAP        |      <1 %      |
|         SSDP        |      <1 %      |
|     TCP Anomaly     |      <1 %      |
|       CHARGEN 

# WanDB init

In [14]:
assert False, "WanDB not yet needed."

AssertionError: WanDB not yet needed.

In [None]:
import wandb

# start a new wandb run and add your model hyperparameters
wandb.init(project='Halado_Adatelemzes_Labor', config=model.get_params())

# Add additional configs to wandb
wandb.config.update({"test_size" : TEST_SIZE,
                    "train_len" : len(X_train),
                    "test_len" : len(X_validation)})

In [None]:
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

y_probas = model.predict_proba(X_validation)

# log additional visualisations to wandb
plot_class_proportions(y_train, y_validation, info_data)
# plot_learning_curve(model, X_train, y_train)
plot_roc(y_validation, y_probas, info_data)
plot_precision_recall(y_validation, y_probas, info_data)
plot_feature_importances(model)

In [None]:
# Finish the wandb run
wandb.finish()