In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Loading Data

In [None]:
from dataloading import *

In [None]:
# Lowest-ranked rows of a feature-importance table (same layout as the table
# printed later in this notebook; presumably copied from an earlier run):
#
# +---------------------+----------------+
# |       Property      | Importance (%) |
# +---------------------+----------------+
# |......................................|
# |        CLDAP        |      <1 %      |
# |      is_weekday     |      <1 %      |
# |         DNS         |      <1 %      |
# |      SYN Attack     |      <1 %      |
# |     Generic UDP     |      <1 %      |
# |         NTP         |      <1 %      |
# |  IPv4 fragmentation |      <1 %      |
# |         CoAP        |      <1 %      |
# |         SNMP        |      <1 %      |
# |         SSDP        |      <1 %      |
# |     TCP Anomaly     |      <1 %      |
# |       CHARGEN       |      <1 %      |
# |  other_attack_codes |      <1 %      |
# |         RDP         |      0 %       |
# +---------------------+----------------+

# Candidate columns to drop, in the order they appear in the table above.
LOW_IMPORTANCE_FEATURES = [
    'CLDAP',
    'is_weekday',
    'DNS',
    'SYN Attack',
    'Generic UDP',
    'NTP',
    'IPv4 fragmentation',
    'CoAP',
    'SNMP',
    'SSDP',
    'TCP Anomaly',
    'CHARGEN',
    'other_attack_codes',
    'RDP'
]

# Explicit switch instead of silently overwriting the list with [] on the next
# line. False keeps every feature, which is what the notebook did before.
DROP_LOW_IMPORTANCE_FEATURES = False

# Columns handed to get_all_data(remove_columns=...) for every split.
irrelevants = list(LOW_IMPORTANCE_FEATURES) if DROP_LOW_IMPORTANCE_FEATURES else []

In [None]:
# CSV files holding the scaled vectors, one file per split.
PATH_DATA_TRAIN = '../data/preprocessed-v2/train_vectors_scaled.csv'
PATH_DATA_VALIDATION = '../data/preprocessed-v2/validation_vectors_scaled.csv'
PATH_DATA_TEST = '../data/preprocessed-v2/test_vectors_scaled.csv'
PATH_DATA_GEN = '../data/preprocessed-v2/generalisation_vectors_scaled.csv'

TEST_SIZE = .2  # not used by the loading calls below
UNIQUE_COLUMNS = False  # forwarded to get_all_data(unique=...)

# Each call is unpacked into a (data, target) pair.
# Only the training call additionally removes 'is_synthetic'
# (presumably only the training file carries that column -- TODO confirm).
training_data_df, training_target_df = get_all_data(
    path_all_vectors=PATH_DATA_TRAIN,
    unique=UNIQUE_COLUMNS,
    remove_columns=['is_synthetic'] + irrelevants,
)

validation_data_df, validation_target_df = get_all_data(
    path_all_vectors=PATH_DATA_VALIDATION,
    unique=UNIQUE_COLUMNS,
    remove_columns=irrelevants,
)

test_data_df, test_target_df = get_all_data(
    path_all_vectors=PATH_DATA_TEST,
    unique=UNIQUE_COLUMNS,
    remove_columns=irrelevants,
)

gen_data_df, gen_target_df = get_all_data(
    path_all_vectors=PATH_DATA_GEN,
    unique=UNIQUE_COLUMNS,
    remove_columns=irrelevants,
)

In [None]:
# Every split must expose exactly the same column names, in the same order,
# as the training split -- for the data frames and for the target frames.
assert all(
    split_df.columns.to_list() == training_data_df.columns.to_list()
    for split_df in (validation_data_df, test_data_df, gen_data_df)
)
assert all(
    split_df.columns.to_list() == training_target_df.columns.to_list()
    for split_df in (validation_target_df, test_target_df, gen_target_df)
)

In [None]:
# Print the column names of each split, one list per line, for visual comparison.
for split_df in (training_data_df, validation_data_df, test_data_df, gen_data_df):
    print(split_df.columns.to_list())

In [None]:
# Data frames converted to NumPy arrays.
X_train, X_validation, X_test, X_gen = (
    frame.to_numpy()
    for frame in (training_data_df, validation_data_df, test_data_df, gen_data_df)
)

# Target frames converted to NumPy arrays with axis 1 squeezed away
# (NumPy only allows this when that axis has length 1).
y_train, y_validation, y_test, y_gen = (
    frame.to_numpy().squeeze(axis=1)
    for frame in (training_target_df, validation_target_df, test_target_df, gen_target_df)
)

In [None]:
# Report the shape of each X array.
for caption, matrix in (
    ('X_train.shape: ', X_train),
    ('X_validation.shape: ', X_validation),
    ('X_test.shape: ', X_test),
    ('X_gen.shape: ', X_gen),
):
    print(caption, matrix.shape)

In [None]:
# Report the shape of each y array.
for caption, vector in (
    ('y_train.shape: ', y_train),
    ('y_validation.shape: ', y_validation),
    ('y_test.shape: ', y_test),
    ('y_gen.shape: ', y_gen),
):
    print(caption, vector.shape)

***

# Model

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Fixed seed so the forest's randomness (bootstrap sampling, per-split feature
# sub-sampling) is repeatable across "Restart Kernel -> Run All".
RANDOM_STATE = 42

# MODEL INIT
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features=0.7,
    bootstrap=True,
    oob_score=False,
    verbose=True,
    random_state=RANDOM_STATE,
    )

# Alternative estimator, kept disabled:
# model = GradientBoostingClassifier(
#     n_estimators=100,
#     learning_rate=1.0,
#     max_depth=1,
#     random_state=0)

In [None]:
# TRAINING
# Fit the model on the training arrays. The return value of fit() is bound to
# `_` rather than left as the cell's final expression, so it is not echoed.

_ = model.fit(X_train, y_train)

In [None]:
# from collections import defaultdict

# count = defaultdict(int)
# for tree in model.estimators_:
#     print(tree.get_depth(), end=', ')
#     count[tree.get_depth()] += 1
# count = dict(count)
# for item, key in count.items():
#     print(f'{key} trees have length {item}')

In [None]:
import numpy as np

def test(dataset:np.ndarray, target:np.ndarray):
    # get predictions
    y_pred = model.predict(dataset)

    matches = np.count_nonzero(target == y_pred)
    print(f'Accuracy: {100 * matches / len(target)} %')


# Score each split in turn: train, validation, test, generalisation.
for features, expected in (
    (X_train, y_train),
    (X_validation, y_validation),
    (X_test, y_test),
    (X_gen, y_gen),
):
    test(features, expected)

In [None]:
import joblib

# Write the model to a file in the current working directory.
MODEL_PATH = "random_forest_85%gen.joblib"
joblib.dump(model, MODEL_PATH)
# To restore a previously saved model instead (only load files you trust):
# model = joblib.load("random_forest.joblib")

In [None]:
from prettytable import PrettyTable

# Column names of the training frame; looked up below by the same index as
# model.feature_importances_ (assumes both share the same order -- TODO confirm).
info_data = list(training_data_df.columns.values)
importances = model.feature_importances_
# Indices sorted by descending importance.
indices = np.argsort(importances)[::-1]

table = PrettyTable()
table.field_names = ["Property", "Importance (%)"]

for i in indices:
    percent = 100 * importances[i]

    # Whole percentages are truncated; anything positive but below 1 is shown
    # as '<1 %', and non-positive values as '0 %'.
    if not percent > 0:
        imp = '0 %'
    elif int(percent) == 0:
        imp = '<1 %'
    else:
        imp = f'{int(percent)} %'

    table.add_row([info_data[i], imp])

print(table)

# Weights & Biases (wandb) init

In [None]:
# Always fails on purpose; presumably meant to stop a "Run All" here so the
# wandb cells below are not executed.
assert False, "WanDB not yet needed."

In [None]:
import wandb

# Open a new wandb run whose config is the model's hyperparameters.
wandb.init(
    project='Halado_Adatelemzes_Labor',
    config=model.get_params(),
)

# Extra run metadata added on top of the hyperparameters.
# NOTE(review): "test_len" is filled from X_validation, not X_test -- confirm
# this is intended.
extra_config = {
    "test_size": TEST_SIZE,
    "train_len": len(X_train),
    "test_len": len(X_validation),
}
wandb.config.update(extra_config)

In [None]:
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

# Class probabilities on the validation split, used by the curve plots below.
y_probas = model.predict_proba(X_validation)

# The `labels` argument of the class-level plots names the target classes,
# not the input features, so derive it from the fitted model rather than
# from the feature-name list.
# NOTE(review): wandb appears to index `labels` by integer class value; this
# assumes classes are encoded 0..k-1 -- confirm against the target encoding.
class_labels = [str(class_value) for class_value in model.classes_]

# log additional visualisations to wandb
plot_class_proportions(y_train, y_validation, class_labels)
# plot_learning_curve(model, X_train, y_train)
plot_roc(y_validation, y_probas, class_labels)
plot_precision_recall(y_validation, y_probas, class_labels)
# Feature names belong here, so the importance chart shows column names.
plot_feature_importances(model, info_data)

In [None]:
# Finish the wandb run (the one opened by wandb.init above)
wandb.finish()