In [73]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the supervised dataset from disk and preview its first ten rows.
dataset_path = 'data/supervised_dataset.csv'
df = pd.read_csv(dataset_path)
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification
0,0,1f2c32d8-2d6e-3b68-bc46-789469f2b71e,0.000812,0.004066,85.643243,5405,default,1460.0,1295.0,451.0,E,normal
1,1,4c486414-d4f5-33f6-b485-24a8ed2925e8,6.3e-05,0.002211,16.166805,519,default,9299.0,8447.0,302.0,E,normal
2,2,7e5838fc-bce1-371f-a3ac-d8a0b2a05d9a,0.004481,0.015324,99.573276,6211,default,255.0,232.0,354.0,E,normal
3,3,82661ecd-d87f-3dff-855e-378f7cb6d912,0.017837,0.014974,69.792793,8292,default,195.0,111.0,116.0,E,normal
4,4,d62d56ea-775e-328c-8b08-db7ad7f834e5,0.000797,0.006056,14.952756,182,default,272.0,254.0,23.0,E,normal
5,5,45d84ed6-043b-39ba-9247-0e5c36c48889,0.00166,0.005821,92.222222,7359,default,897.0,801.0,430.0,E,normal
6,6,886654dc-3acf-3d11-9da7-2380dbe5b11d,0.004798,0.012119,100.098361,8790,default,342.0,305.0,370.0,E,normal
7,7,e538e1a9-3666-3e6f-ba3d-3de4ab256b8b,0.00757,0.008834,65.746725,6839,default,240.0,229.0,133.0,E,normal
8,8,0c8bc3da-f6c1-34f0-9afa-4179eaee4ebe,0.000708,0.014712,7.526846,96,default,326.0,298.0,33.0,E,normal
9,9,387a01c7-2223-3524-9e01-a278db41748e,0.000799,0.004379,79.392024,5056,default,1471.0,1329.0,462.0,E,normal


In [74]:
# Prepare data
#
# The cleaned frame is built as a new object (`df_clean`) and `df` is left
# untouched, so this cell can be re-run safely: an in-place drop fails with a
# KeyError the second time, and re-applying the label encoders to values that
# are already 0/1 would map every row to 1.

# Identifier-like columns that must not reach the model. Rows with any missing
# value are discarded before encoding.
df_clean = df.drop(columns=['Unnamed: 0', '_id']).dropna()

# Encode ip_type as the position of each label in order of first appearance.
# A dict lookup replaces a list scan per row and yields the same codes.
ips = list(df_clean['ip_type'].unique())
ip_codes = {label: code for code, label in enumerate(ips)}

df_clean = df_clean.assign(
    ip_type=df_clean['ip_type'].map(ip_codes),
    # 'E' -> 0, any other source -> 1
    source=df_clean['source'].ne('E').astype(int),
    # 'normal' -> 0, any other class -> 1
    classification=df_clean['classification'].ne('normal').astype(int),
)

# Force every remaining column to a numeric dtype; values that cannot be parsed
# become NaN and the affected rows are dropped.
df_clean = df_clean.apply(pd.to_numeric, errors='coerce').dropna()

feature_columns = [
    'inter_api_access_duration(sec)',
    'api_access_uniqueness',
    'sequence_length(count)',
    'vsession_duration(min)',
    'ip_type',
    'num_sessions',
    'num_users',
    'num_unique_apis',
    'source',
]

target_set = df_clean['classification'].copy()
input_set = df_clean[feature_columns].copy()

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(input_set, target_set, random_state=11, test_size=0.2)

In [75]:
# Train
# Fit an ordinary least-squares model on the training split.
model = LinearRegression().fit(X_train, y_train)

# Report the fitted parameters: one coefficient per feature, then the intercept.
for position, feature_name in enumerate(X_train.columns):
    print(f'{feature_name}: {model.coef_[position]}')
print(model.intercept_)

inter_api_access_duration(sec): 0.00022656692621070466
api_access_uniqueness: 1.0911270287514623
sequence_length(count): 0.000615267031933207
vsession_duration(min): 1.0315796170612401e-06
ip_type: 0.3293743901791393
num_sessions: -6.076849811896719e-05
num_users: 5.824273428302416e-05
num_unique_apis: -0.0009078837636795928
source: -0.15834197443760356
0.17219711888120243


In [83]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict first: every metric below, including def_r2, is computed from
# `prediction`. Scoring before this assignment fails on a fresh kernel
# (NameError) or silently scores a stale `prediction` left in the kernel.
prediction = model.predict(X_test)

# Baseline R^2 of the linear model; kept in a variable so it can be compared
# against other models.
def_r2 = r2_score(y_test, prediction)

print(f"MSE = {mean_squared_error(y_test, prediction)}")
print(f"MAE = {mean_absolute_error(y_test, prediction)}")
print(f"r2_score  = {def_r2}")

MSE = 0.06083277325836778
MAE = 0.1809951880400866
r2_score  = 0.733981615843802


In [80]:
# Side-by-side view of the true labels and the model's predictions
# for the test split.
cmp = y_test.to_frame().assign(prediction=prediction)
cmp

Unnamed: 0,classification,prediction
2,0,-0.066785
236,0,0.135421
458,0,0.183518
1008,0,0.013676
996,0,0.020576
...,...,...
700,0,0.103993
1409,1,0.552716
1001,0,0.014643
1057,0,0.013154


In [86]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regressor on the same split, compared against the linear
# baseline via R^2. random_state is pinned (same seed as the train/test split)
# so the fitted trees are reproducible from run to run.
gbmodel = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    random_state=11,
).fit(X_train.fillna(0), y_train)
gb_r2 = gbmodel.score(X_test.fillna(0), y_test)

# feature_importances_ are impurity-based importances, not coefficients, so
# they are labelled as such and printed next to the column they belong to.
# NOTE(review): the saved run showed R^2 ~= 1.0 with all importance on a single
# feature (position 6, i.e. num_users in the training column order) -- this may
# indicate target leakage; confirm before trusting the score.
print('GBR feature importances:')
for col, importance in zip(X_train.columns, gbmodel.feature_importances_):
    print(f'{col}: {importance}')
print(f'GBR r2_score = {gb_r2}')
print(f'r2_score delta = {gb_r2 - def_r2}')

GBR Coefficients: [1.31326357e-15 3.72472240e-15 3.97756498e-15 3.20867403e-15
 0.00000000e+00 1.77844132e-16 1.00000000e+00 1.13936447e-15
 0.00000000e+00]
GBR r2_score = 0.999999999999999
r2_score delta = 0.26601838415619705
