In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Loading Data

In [None]:
import numpy as np
import pandas as pd
from prettytable import PrettyTable

In [None]:
# Read both CSV inputs; each path is kept next to the frame it produces.
path_attacks = '../data/all_vectors.csv'
csv_data_attacks = pd.read_csv(path_attacks)

path_vectors = '../data/synthetic_vectors.csv'
csv_data_vectors = pd.read_csv(path_vectors)

# Print a one-row table: column names as the header, column dtypes as the row.
table = PrettyTable()
table.field_names = csv_data_attacks.columns.tolist()
table.add_row(csv_data_attacks.dtypes.tolist())
print(table)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every text (object/string) column of csv_data_attacks in place.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer codes can later be mapped back to the original values with
# label_encoders[column].classes_ or .inverse_transform(...).
label_encoder = LabelEncoder()  # ends up bound to the most recently fitted encoder
label_encoders = {}
for category, dtype in csv_data_attacks.dtypes.items():
    # pandas' dtype helpers do not depend on np.dtypes (NumPy >= 1.25 only)
    # and also recognise pandas' dedicated string dtypes.
    if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
        label_encoder = LabelEncoder()
        csv_data_attacks[category] = label_encoder.fit_transform(csv_data_attacks[category])
        label_encoders[category] = label_encoder

# Print the dtypes again so the effect of the encoding is visible.
table = PrettyTable()
table.field_names = csv_data_attacks.columns.tolist()
table.add_row(csv_data_attacks.dtypes)
print(table)

***

# WandB init

In [None]:
import time, datetime

def stamp_to_ms(T):
    """Convert a ``YYYY-MM-DDTHH:MM:SS`` timestamp string to epoch time.

    The string is parsed with ``datetime.datetime.strptime`` and the resulting
    time tuple is passed to ``time.mktime``, which interprets it as local time.

    NOTE(review): despite the ``_ms`` suffix, ``time.mktime`` returns
    *seconds* since the epoch (as a float), not milliseconds -- confirm what
    callers expect before relying on the unit.

    Args:
        T: Timestamp string in the exact format ``%Y-%m-%dT%H:%M:%S``.

    Returns:
        float: Seconds since the epoch for the given local time.

    Raises:
        ValueError: If ``T`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(T, "%Y-%m-%dT%H:%M:%S")
    return time.mktime(parsed.timetuple())

In [None]:
from sklearn.model_selection import train_test_split

# Split the encoded frame into features (every column but the last) and the
# target (the last column).
data = csv_data_attacks.iloc[:, :-1]
target = csv_data_attacks.iloc[:, -1]

# Feature names are taken from `data` so they line up one-to-one with the
# feature columns; the target column is deliberately excluded.
feature_names = data.columns.to_numpy()
# np.unique returns the distinct class values in sorted order, which is
# deterministic (iteration order of a set is arbitrary).
# NOTE(review): assumes 'Type' is the target (last) column -- confirm.
labels = np.unique(csv_data_attacks['Type'])

test_size = 0.2
random_state = 42  # fixed seed so the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=test_size, random_state=random_state
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest settings gathered in one mapping so they are easy to scan.
rf_hyperparams = {
    "n_estimators": 1,
    "max_depth": 3,
    "max_features": 0.1,
    "bootstrap": True,
    "oob_score": False,
    "verbose": True,
}

# Build the classifier from those settings and fit it on the training split.
model = RandomForestClassifier(**rf_hyperparams)
model.fit(X_train, y_train)

# Parameter dict of the estimator as reported by get_params().
model_params = model.get_params()

In [None]:
from collections import defaultdict
# Tally how many estimators of the fitted forest reach each depth
# (mapping: depth -> number of trees).
cnt = {}
for estimator in model.estimators_:
    depth = estimator.get_depth()
    cnt[depth] = cnt.get(depth, 0) + 1

# Report the tally, one line per distinct depth, in first-seen order.
for depth, n_trees in cnt.items():
    print(f'{n_trees} trees have length {depth}')

In [None]:
import wandb

# Open a new wandb run, passing the model parameters as the run config.
wandb.init(project='Halado_Adatelemzes_Labor', config=model_params)

# Record the split settings alongside the model parameters.
split_config = {
    "test_size": test_size,
    "train_len": len(X_train),
    "test_len": len(X_test),
}
wandb.config.update(split_config)

In [None]:
# Predict a class for every row of the held-out split.
y_pred = model.predict(X_test)

# Accuracy: share of test rows whose prediction equals the true value, in percent.
matches = int((y_test == y_pred).sum())
accuracy_pct = 100 * matches / len(y_test)
print(f'Accuracy: {accuracy_pct} %')

# Class probabilities and feature importances, with feature indices ordered
# from most to least important.
y_probas = model.predict_proba(X_test)
importances = model.feature_importances_
indices = np.flip(np.argsort(importances))

In [None]:
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

# Log additional visualisations to the active wandb run, using the fitted
# model, the train/test splits and the predicted probabilities from the
# cells above.
# NOTE(review): `labels` presumably has to list the classes in the same order
# as the columns of `y_probas` -- confirm against the wandb.sklearn docs.
plot_class_proportions(y_train, y_test, labels)
plot_learning_curve(model, X_train, y_train)
plot_roc(y_test, y_probas, labels)
plot_precision_recall(y_test, y_probas, labels)
plot_feature_importances(model)

In [None]:
# Finish the wandb run that was started with wandb.init(...)
wandb.finish()