# 03_Modeling

## Imports

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import RobustScaler

## Load data

In [38]:
# Read the windowed dataset from the processed-data folder.
df = pd.read_csv('../data/processed/windowed_dataset_cleaned.csv')

# Feature matrix: every column except the label. Index.difference returns the
# column names sorted, so the feature order is alphabetical, not file order.
X = df.loc[:, df.columns.difference(['is_attack'])]

# Label vector (ground truth used later for evaluation).
y = df.loc[:, 'is_attack']

In [43]:
# Expected share of anomalies, taken from the labelled attack rate.
# NOTE(review): this uses the ground-truth labels to set the decision
# threshold of an otherwise unsupervised model, so the resulting
# `predict` labels are optimistic -- confirm this is acceptable here.
contamination = float(y.mean())
print(f"Contamination (proporción de ataques): {contamination:.4f}")

# IsolationForest only accepts a float contamination in (0, 0.5]; fail early
# with a readable message if the labels are all-benign or mostly attacks.
assert 0.0 < contamination <= 0.5, (
    f"contamination must be in (0, 0.5], got {contamination}"
)

model = IsolationForest(
    n_estimators=300,             # default is 100
    contamination=contamination,  # defines the threshold on the sample scores
    max_samples='auto',           # min(256, n_samples) rows per tree
    random_state=42,              # fixed seed for reproducible results
    n_jobs=-1,                    # use all processors for fit
)

# Fit on the full feature matrix; the fitted estimator is the cell's output.
model.fit(X)

Contamination (proporción de ataques): 0.0144


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",300
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",np.float64(0....3663820037493)
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


In [40]:
# score_samples is "lower = more abnormal"; flip the sign so that a higher
# anomaly_score means a more anomalous row.
df['anomaly_score'] = np.negative(model.score_samples(X))

# predict marks outliers with -1; store that as a boolean flag.
df['anomaly_label'] = np.equal(model.predict(X), -1)

# Average precision of the continuous score against the true labels
# (evaluated on the same rows the model was fitted on).
print(average_precision_score(y_true=y, y_score=df['anomaly_score']))

0.9915074385048404


In [41]:
# Rows the model scores as most anomalous (highest anomaly_score first).
df.sort_values(by='anomaly_score', ascending=False).iloc[:20]

Unnamed: 0,n_connections,id.orig_p_mean,id.orig_p_std,id.orig_p_max,id.resp_p_std,orig_bytes_mean,orig_bytes_std,orig_bytes_max,resp_bytes_mean,resp_bytes_std,...,recent_activity_score_max,recent_docker_event_mean,recent_docker_event_std,recent_docker_event_max,time_since_container_start_mean,time_since_container_start_std,time_since_container_start_max,is_attack,anomaly_score,anomaly_label
335,6.0,0.451839,21.567171,0.403767,0.0,-0.349432,85.022406,-0.116995,463.292929,4356.381717,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.770932,True
336,6.0,0.451839,21.567171,0.403767,0.0,-0.349432,85.022406,-0.116995,463.292929,4356.381717,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.770932,True
334,6.0,0.451839,21.567171,0.403767,0.0,-0.349432,85.022406,-0.116995,463.292929,4356.381717,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.770932,True
303,13.0,-0.259877,1858.960202,-0.172833,0.0,-0.343006,83.285851,-0.095528,168.379509,1992.226571,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.770758,True
302,13.0,-0.259877,1858.960202,-0.172833,0.0,-0.343006,83.285851,-0.095528,168.379509,1992.226571,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.770758,True
121,13.0,-0.592191,3588.107267,-0.341019,0.0,-0.347376,82.278541,-0.102683,316.236652,3615.472431,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.769506,True
120,13.0,-0.592191,3588.107267,-0.341019,0.0,-0.347376,82.278541,-0.102683,316.236652,3615.472431,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.769506,True
301,11.0,-0.241146,1849.474782,-0.172833,0.0,-0.330455,89.799229,-0.095528,182.316498,2104.726254,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.768675,True
326,6.0,-0.611201,17.620335,-0.619289,0.0,-0.349432,85.407706,-0.113417,463.97114,4363.623789,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.765332,True
325,6.0,-0.611201,17.620335,-0.619289,0.0,-0.349432,85.407706,-0.113417,463.97114,4363.623789,...,4.993675,-0.5,0.0,-1.0,-1.069591,0.0,-1.073349,1.0,0.765332,True


In [42]:
# Rows the model scores as most normal (lowest anomaly_score first).
df.sort_values(by='anomaly_score', ascending=True).iloc[:20]

Unnamed: 0,n_connections,id.orig_p_mean,id.orig_p_std,id.orig_p_max,id.resp_p_std,orig_bytes_mean,orig_bytes_std,orig_bytes_max,resp_bytes_mean,resp_bytes_std,...,recent_activity_score_max,recent_docker_event_mean,recent_docker_event_std,recent_docker_event_max,time_since_container_start_mean,time_since_container_start_std,time_since_container_start_max,is_attack,anomaly_score,anomaly_label
8365,0.0,-0.255592,0.0,-0.278943,0.0,0.488125,0.0,0.032916,0.373737,0.0,...,0.032655,0.5,0.0,0.0,0.368284,0.0,0.322398,0.0,0.383299,False
8366,0.0,-0.255592,0.0,-0.278943,0.0,0.488125,0.0,0.032916,0.373737,0.0,...,0.032655,0.5,0.0,0.0,0.368284,0.0,0.322398,0.0,0.383299,False
8367,0.0,-0.255592,0.0,-0.278943,0.0,0.488125,0.0,0.032916,0.373737,0.0,...,0.032655,0.5,0.0,0.0,0.368284,0.0,0.322398,0.0,0.383299,False
2670,0.0,-0.123348,0.0,-0.151673,0.0,0.487765,0.0,0.032558,0.323232,0.0,...,-0.704663,-0.5,0.0,-1.0,-0.172825,0.0,-0.202858,0.0,0.38355,False
2671,0.0,-0.123348,0.0,-0.151673,0.0,0.487765,0.0,0.032558,0.323232,0.0,...,-0.704663,-0.5,0.0,-1.0,-0.172825,0.0,-0.202858,0.0,0.38355,False
2672,0.0,-0.123348,0.0,-0.151673,0.0,0.487765,0.0,0.032558,0.323232,0.0,...,-0.704663,-0.5,0.0,-1.0,-0.172825,0.0,-0.202858,0.0,0.38355,False
20459,0.0,0.440963,0.0,0.391413,0.0,0.482368,0.0,0.027191,0.070707,0.0,...,-0.401829,0.5,0.0,0.0,-0.055757,0.0,-0.08922,0.0,0.384426,False
20461,0.0,0.440963,0.0,0.391413,0.0,0.482368,0.0,0.027191,0.070707,0.0,...,-0.401829,0.5,0.0,0.0,-0.055757,0.0,-0.08922,0.0,0.384426,False
20460,0.0,0.440963,0.0,0.391413,0.0,0.482368,0.0,0.027191,0.070707,0.0,...,-0.401829,0.5,0.0,0.0,-0.055757,0.0,-0.08922,0.0,0.384426,False
3655,0.0,-0.831787,0.0,-0.833466,0.0,-0.47697,0.0,-0.926655,0.333333,0.0,...,-0.510677,-0.5,0.0,-1.0,-0.053022,0.0,-0.086565,0.0,0.385518,False
