# Note:
    Main idea is to train CatBoost Classificator to predict class based on encoded features of an event.

# Import libraries

In [12]:
from catboost import CatBoostClassifier, Pool

import torch


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Import data

In [2]:
df_train = pd.read_parquet("../data/train_data_features.parquet")
df_test = pd.read_parquet("../data/test_data_features.parquet")
df_event_classes = pd.read_parquet("../data/event_classes_by_optics_50")

In [3]:
df_train.head()

Unnamed: 0,CLIENT_IP,EVENT_ID,HEADER_pattern,BOT_BOOL,CURL_HEAD_BOOL,WGET_HEAD_BOOL,SYMBOL_@,LENGTH_OF_USER_AGENT_HEAD_likely,windows_bool,linux_bool,...,REQUEST_URI\';\'REQUEST_ARGS,REQUEST_XML,RESPONSE_BODY,RESPONSE_HEADERS,2,3,4,5,Anomaly TOP,user_expected
9497,186.177.157.252,AVdZr3EHq1Ppo9zF1E1J,True,False,False,False,False,True,True,False,...,False,False,False,False,False,True,False,False,False,True
37540,217.175.140.69,1Fu9HGQB5cBXmMW1wjDp,True,False,False,False,False,True,True,False,...,False,False,False,False,True,False,False,False,False,True
49419,90.151.84.224,XK8XJGQB5cBXmMW1Dof6,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,True,False,False,True
40601,85.234.117.8,x7s9JWQB5cBXmMW1Rmbn,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,True,False,False,True
9767,195.222.10.251,Ccoc_2MBjksgoq1eTn4W,True,False,False,False,False,True,False,True,...,False,False,False,False,True,False,False,False,False,True


In [4]:
df_test.head()

Unnamed: 0,CLIENT_IP,EVENT_ID,HEADER_pattern,BOT_BOOL,CURL_HEAD_BOOL,WGET_HEAD_BOOL,SYMBOL_@,LENGTH_OF_USER_AGENT_HEAD_likely,windows_bool,linux_bool,...,REQUEST_URI\';\'REQUEST_ARGS,REQUEST_XML,RESPONSE_BODY,RESPONSE_HEADERS,2,3,4,5,Anomaly TOP,user_expected
23799,217.175.140.69,a14dHWQB5cBXmMW1Dl2w,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,True,False,True
51955,91.103.66.203,NZo5ImQB5cBXmMW14djt,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,True,False,False,True
9625,185.70.104.11,AVdqkkhHq1Ppo9zF-FYT,True,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
22484,95.188.66.154,wMC-E2QBoRd31uenQbLg,True,False,False,False,False,True,True,False,...,False,False,False,False,True,False,False,False,False,True
191,5.19.178.227,joV2IGQB5cBXmMW1JJXD,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,True,False,False,True


In [5]:
df_event_classes.head()

Unnamed: 0,EVENT_ID,class
0,AVdcJmIIq1Ppo9zF2YIp,0
1,iz7SN2YBrgKk_RFNZW_U,-1
2,AVdjekw4q1Ppo9zF6QT2,0
3,SqQGI2QB5cBXmMW1CDbp,-1
4,nFzwHGQB5cBXmMW1y_TD,39


# Initial preprocess pipline

### preprocess df_test and df_train 

In [6]:
df_train = pd.merge(df_train, df_event_classes, on=['EVENT_ID']).drop(axis=1, columns=["CLIENT_IP", "EVENT_ID"])

In [7]:
df_test = pd.merge(df_test, df_event_classes, on=['EVENT_ID']).drop(axis=1, columns=["CLIENT_IP", "EVENT_ID"])

In [10]:
#reset index 
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

### load MLP autoencdoer

In [24]:
model = torch.jit.load('../models/baseline_mlp_autoencdoer.pt')
@torch.inference_mode()
def encode_data(model, data) -> np.array:
    model.eval()
    return model.encoder(torch.Tensor(data.astype(float)).unsqueeze(0).view(1,1,42)).view(10).numpy()

In [25]:
list_of_train_features = []
for row in df_train.values:
    row = row[:-1]
    list_of_train_features.append(encode_data(model, np.array(row)))
list_of_train_features = np.array(list_of_train_features)

In [30]:
list_of_train_features.shape

(48490, 10)

In [26]:
list_of_test_features = []
for row in df_test.values:
    row = row[:-1]
    list_of_test_features.append(encode_data(model, np.array(row)))
list_of_test_features = np.array(list_of_test_features)

In [28]:
list_of_test_features.shape

(9773, 10)

# Prepaire data for training and test via CatBoost

In [31]:
train_dataset = Pool(list_of_train_features,
                     df_train['class'])

In [33]:
test_dataset = Pool(list_of_test_features,
                    df_test['class'])

# Init CatBoost and Train

In [37]:
model = CatBoostClassifier(learning_rate=1e-1,
                           eval_metric='Accuracy',
                           l2_leaf_reg=10)

In [43]:
model.fit(train_dataset,
          eval_set=test_dataset,
          verbose=False,
          plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f7a2cac07c0>

In [44]:
model.best_score_

{'learn': {'Accuracy': 0.9343782223138791, 'MultiClass': 0.24299713233132667},
 'validation': {'Accuracy': 0.9305228691292337,
  'MultiClass': 0.2666716608415625}}

# Check model

In [45]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn import metrics

In [46]:
preds = model.predict(list_of_test_features)

In [47]:
print(classification_report(preds, df_test['class']))

              precision    recall  f1-score   support

          -1       0.95      0.95      0.95      1845
           0       1.00      1.00      1.00        99
           1       0.98      0.94      0.96       143
           2       0.97      1.00      0.99       113
           3       0.91      0.94      0.92       266
           4       0.98      0.99      0.98       217
           5       0.94      0.93      0.93       312
           6       0.99      0.98      0.99       191
           7       0.96      0.96      0.96       108
           8       0.93      0.94      0.94        88
           9       1.00      1.00      1.00        86
          10       0.77      0.79      0.78       216
          11       0.83      0.85      0.84       214
          12       0.99      1.00      1.00       136
          13       0.96      0.97      0.96       181
          14       1.00      0.99      0.99       175
          15       0.87      0.84      0.85       278
          16       1.00    

# Save model

In [49]:
# model.save_model('../models/catboost_accuracy_093_over50class.pkl')