#### Note:
    The main idea is to show how raw data can be processed and how the class of an event is predicted.

# Import libraries

In [1]:
from catboost import CatBoostClassifier, Pool
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append("..")

from utils.DataTransformers import DataTransformer

# Import data

In [2]:
# Per-event class table; it is only used here to join against the raw events.
df_event_classes = pd.read_parquet("../data/event_classes_by_optics_50")

# Read the raw events, join them to the class table on EVENT_ID (merge's
# default join type, so only events present in both tables remain) and
# then discard the "class" column again.
df = (
    pd.read_csv("../data/part_10.csv")
    .merge(df_event_classes, on=["EVENT_ID"])
    .drop(columns="class")
)

# Import Models

#### load CatBoost Classifier

In [3]:
# Create an empty classifier and restore the trained model from disk.
# NOTE(review): the file carries a .pkl extension but is read through
# load_model without an explicit format — confirm it is a native CatBoost file.
clf = CatBoostClassifier()
clf.load_model(fname="../models/catboost_accuracy_093_over50class.pkl")

<catboost.core.CatBoostClassifier at 0x7f28c001a620>

In [4]:
# Load the TorchScript autoencoder and put it into evaluation mode.
# The network contains BatchNorm1d and Dropout layers, which must not run in
# training mode when the model is used for inference on single events.
mlp_autoencoder = torch.jit.load('../models/baseline_mlp_autoencdoer.pt').eval()

In [5]:
# Show the loaded module's layer structure (its repr becomes the cell output).
mlp_autoencoder

RecursiveScriptModule(
  original_name=MLPEncoder
  (encoder): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Linear)
    (1): RecursiveScriptModule(original_name=BatchNorm1d)
    (2): RecursiveScriptModule(original_name=Sigmoid)
    (3): RecursiveScriptModule(original_name=Dropout)
    (4): RecursiveScriptModule(original_name=Linear)
    (5): RecursiveScriptModule(original_name=BatchNorm1d)
    (6): RecursiveScriptModule(original_name=Sigmoid)
    (7): RecursiveScriptModule(original_name=Dropout)
    (8): RecursiveScriptModule(original_name=Linear)
    (9): RecursiveScriptModule(original_name=BatchNorm1d)
    (10): RecursiveScriptModule(original_name=Sigmoid)
    (11): RecursiveScriptModule(original_name=Dropout)
  )
  (decoder): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Linear)
    (1): RecursiveScriptModule(original_name=BatchNorm1d)
    (2): RecursiveScriptModule(original_name=S

# Inference Pipeline

In [6]:
# Pick one random raw event to push through the inference pipeline.
# DataFrame.sample already returns a new frame, so copying the whole of `df`
# beforehand only costs time and memory.
get_dataframe_raw = df.sample(n=1)
get_dataframe_raw.head()

Unnamed: 0,CLIENT_IP,CLIENT_USERAGENT,REQUEST_SIZE,RESPONSE_CODE,MATCHED_VARIABLE_SRC,MATCHED_VARIABLE_NAME,MATCHED_VARIABLE_VALUE,EVENT_ID
45127,178.65.103.69,Mozilla/5.0 (iPad; CPU OS 11_2 like Mac OS X) ...,878,404,REQUEST_GET_ARGS,REQUEST_GET_ARGS._,932660840,EWtbHmQB5cBXmMW1rGmH


In [7]:
# List the column names of the sampled raw event frame.
get_dataframe_raw.columns

Index(['CLIENT_IP', 'CLIENT_USERAGENT', 'REQUEST_SIZE', 'RESPONSE_CODE',
       'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_NAME',
       'MATCHED_VARIABLE_VALUE', 'EVENT_ID'],
      dtype='object')

In [8]:
# Turn the sampled raw event into the encoded feature vector by running it
# through DataTransformer together with the autoencoder.
transformer = DataTransformer(df=get_dataframe_raw, model=mlp_autoencoder)
x = transformer.transform()
print(x)

# Classify the HTTP request and keep element 0 of the prediction result.
predictions = clf.predict(x)
predict_class = predictions[0]
print(predict_class)

[6.4781648e-01 9.8411798e-01 9.4067402e-02 4.1991088e-10 1.6102569e-01
 2.8881143e-08 1.2021849e-01 8.4925377e-01 9.7079122e-01 9.6793681e-01]
41


  self.df['HEADER_pattern'] = self.df["CLIENT_USERAGENT"].str.contains("([a-zA-Z]/[\d].*_*)").fillna(False)


# Define output function

In [9]:
def prepaire_json_result(event_id: str, predicted_class: int) -> dict:
    """Build the JSON-ready result record for one classified event.

    Args:
        event_id: Identifier of the event (stored under ``"EVENT_ID"``).
        predicted_class: Class label predicted for the event. NumPy integer
            scalars are accepted and converted to a built-in ``int``.

    Returns:
        A dict with the keys ``"EVENT_ID"`` and ``"LABEL_PRED"``.

    Raises:
        TypeError, ValueError: If ``predicted_class`` cannot be converted
            to ``int``.
    """
    # A label taken from a NumPy array may be a NumPy scalar, which
    # json.dumps refuses to serialize; a built-in int keeps the record usable
    # as a JSON payload.
    return {"EVENT_ID": event_id, "LABEL_PRED": int(predicted_class)}

In [10]:
# Take the event id of the sampled row and pair it with the predicted class.
event_id = get_dataframe_raw.get('EVENT_ID').values[0]
prepaire_json_result(event_id=event_id, predicted_class=predict_class)

{'EVENT_ID': 'EWtbHmQB5cBXmMW1rGmH', 'LABEL_PRED': 41}

In [13]:
# Example request payload: a list holding one event whose keys match the
# columns of the raw event frame shown above.
# NOTE(review): `test` is not used in the visible cells — presumably it mirrors
# the body sent to the prediction service; confirm.
test = [{
    "CLIENT_IP": "188.138.92.55",
    "EVENT_ID": "AVdhXFgVq1Ppo9zF5Fxu",
    "CLIENT_USERAGENT": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "REQUEST_SIZE": 166,
    "RESPONSE_CODE": 404,
    "MATCHED_VARIABLE_SRC": "REQUEST_URI",
    "MATCHED_VARIABLE_NAME": "url",
    "MATCHED_VARIABLE_VALUE": "//tmp/20160925122692indo.php.vob"
}]

In [64]:
# Reference only (not executed): example curl command that POSTs a one-event
# JSON list to http://127.0.0.1:8127/predict.
# NOTE(review): presumably this endpoint serves the pipeline above — confirm.
#
# curl -X 'POST'   'http://127.0.0.1:8127/predict'   -H 'accept: application/json'   -H 'Content-Type: application/json'   -d '[
#   {
#     "CLIENT_IP": "188.138.92.55",
#     "EVENT_ID": "AVdhXFgVq1Ppo9zF5Fxu",
#     "CLIENT_USERAGENT": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
#     "REQUEST_SIZE": 166,
#     "RESPONSE_CODE": 404,
#     "MATCHED_VARIABLE_SRC": "REQUEST_URI",
#     "MATCHED_VARIABLE_NAME": "url",
#     "MATCHED_VARIABLE_VALUE": "//tmp/20160925122692indo.php.vob"
#   }
# ]'
