In [1]:
import numpy as np 
import pandas as pd 

In [2]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

from sklearn.neighbors import KernelDensity

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20, 10)

In [3]:
training_df = pd.read_csv('supervised_dataset.csv', index_col=False).drop('Unnamed: 0', axis=1)
evaluation_df = pd.read_csv('remaining_behavior_ext.csv', index_col=False).drop('Unnamed: 0', axis=1)

In [4]:
training_df.columns, evaluation_df.columns, len(training_df)

(Index(['_id', 'inter_api_access_duration(sec)', 'api_access_uniqueness',
        'sequence_length(count)', 'vsession_duration(min)', 'ip_type',
        'num_sessions', 'num_users', 'num_unique_apis', 'source',
        'classification'],
       dtype='object'),
 Index(['_id', 'inter_api_access_duration(sec)', 'api_access_uniqueness',
        'sequence_length(count)', 'vsession_duration(min)', 'ip_type',
        'behavior', 'behavior_type', 'num_sessions', 'num_users',
        'num_unique_apis', 'source'],
       dtype='object'),
 1699)

In [5]:
print("training data class distribution\n", training_df['classification'].value_counts())
print("class distribution based on ALGO-X\n", evaluation_df['behavior_type'].value_counts())

training data class distribution
 normal     1106
outlier     593
Name: classification, dtype: int64
class distribution based on ALGO-X
 outlier    24146
normal      8946
bot         1309
attack        22
Name: behavior_type, dtype: int64


In [6]:
train, test = train_test_split(training_df, test_size=0.4, random_state=1073)

param_grid = {
    "iterations": [10, 100, 1000],
    "learning_rate": [0.1, 0.01],
    "custom_loss": ['CrossEntropy', 'AUC', 'Logloss', ]
}
gclf = GridSearchCV(estimator=CatBoostClassifier(), param_grid=param_grid, cv=4)

In [7]:
gclf.fit(train.drop(['classification', 'source', '_id'], axis=1), train['classification'], cat_features=['ip_type'], verbose=1000)

0:	learn: 0.6158706	total: 89ms	remaining: 801ms
9:	learn: 0.2436058	total: 96.7ms	remaining: 0us
0:	learn: 0.6160841	total: 1.8ms	remaining: 16.2ms
9:	learn: 0.2454037	total: 8.76ms	remaining: 0us
0:	learn: 0.6153025	total: 2.96ms	remaining: 26.6ms
9:	learn: 0.2419216	total: 12.9ms	remaining: 0us
0:	learn: 0.6114441	total: 2.67ms	remaining: 24.1ms
9:	learn: 0.2467262	total: 10.7ms	remaining: 0us
0:	learn: 0.6851113	total: 1ms	remaining: 9.04ms
9:	learn: 0.6155766	total: 8.04ms	remaining: 0us
0:	learn: 0.6851335	total: 1.45ms	remaining: 13.1ms
9:	learn: 0.6157401	total: 9.46ms	remaining: 0us
0:	learn: 0.6850501	total: 973us	remaining: 8.76ms
9:	learn: 0.6151255	total: 8.12ms	remaining: 0us
0:	learn: 0.6846396	total: 1.12ms	remaining: 10.1ms
9:	learn: 0.6167871	total: 20ms	remaining: 0us
0:	learn: 0.6158706	total: 1.35ms	remaining: 134ms
99:	learn: 0.0063854	total: 110ms	remaining: 0us
0:	learn: 0.6160841	total: 2.08ms	remaining: 206ms
99:	learn: 0.0055462	total: 141ms	remaining: 0us
0:

GridSearchCV(cv=4,
             estimator=<catboost.core.CatBoostClassifier object at 0x7fe7f2a105c0>,
             param_grid={'custom_loss': ['CrossEntropy', 'AUC', 'Logloss'],
                         'iterations': [10, 100, 1000],
                         'learning_rate': [0.1, 0.01]})

In [8]:
gclf.best_params_

{'custom_loss': 'CrossEntropy', 'iterations': 100, 'learning_rate': 0.1}

In [9]:
evaluation_cols_removed_df= evaluation_df.drop(['behavior_type', 'behavior', 'source', '_id'], axis=1)
test_cols_removed_df= test.drop(['source', '_id', 'classification'], axis=1)

In [10]:
prediction=gclf.predict(evaluation_cols_removed_df)
after_prediction_pd = evaluation_df[['behavior_type', 'behavior', 'source']].copy()
after_prediction_pd['prediction']=prediction
eval_cols_evaluation_set = after_prediction_pd[['behavior_type', 'prediction']]

prediction=gclf.predict(test_cols_removed_df)
test_after_prediction_pd = test[['classification']].copy()
test_after_prediction_pd['prediction']=prediction
test_cols_evaluation_set = test_after_prediction_pd[['classification', 'prediction']]

In [11]:
def calculate_misclassification(eval_cols, predicted_col, orig_col):
    classes = dict(eval_cols[orig_col].value_counts())

    misclassification_error = 0
    for class_name in classes:
        selection = eval_cols[ (eval_cols[orig_col] == class_name)]
        weight = 1.0/classes[class_name]
        misclassifiction_cases = selection[(selection[predicted_col] != class_name)]
        misclassification_count = len(misclassifiction_cases)
        print({'class': class_name, 'weight': weight, 'count': len(selection), 'misclassification':misclassification_count})

        misclassification_error += weight*misclassification_count
    misclassification_error = misclassification_error / len(classes)
    return misclassification_error

In [12]:
print('Classification of test data')
print("Misclassification error: ",calculate_misclassification(test_cols_evaluation_set, 'prediction', 'classification'))

print("Classsification of evaluation_df")
print("Misclassification error: ",calculate_misclassification(eval_cols_evaluation_set, 'prediction', 'behavior_type'))

Classification of test data
{'class': 'normal', 'weight': 0.0022727272727272726, 'count': 440, 'misclassification': 0}
{'class': 'outlier', 'weight': 0.004166666666666667, 'count': 240, 'misclassification': 0}
Misclassification error:  0.0
Classsification of evaluation_df
{'class': 'outlier', 'weight': 4.141472707694856e-05, 'count': 24146, 'misclassification': 0}
{'class': 'normal', 'weight': 0.000111781801922647, 'count': 8946, 'misclassification': 8733}
{'class': 'bot', 'weight': 0.0007639419404125286, 'count': 1309, 'misclassification': 1309}
{'class': 'attack', 'weight': 0.045454545454545456, 'count': 22, 'misclassification': 22}
Misclassification error:  0.7440476190476191
