# Laboratorio 3 - Machine learning techniques
 Daniel Giraldo - 201328110  
 Iván García - 201614488

In [31]:
#Imports
import pandas as pd
import pandas_profiling
import pickle
import numpy as np

from sklearn.svm import SVC
from collections import Counter
from pandas_profiling import ProfileReport
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import GridSearchCV

## Carga y entendimiento de datos

In [2]:
# Data
try:
    train = pd.read_pickle('./train.pkl')
    test = pd.read_pickle('./test.pkl')
except:
    train = pd.read_csv('https://raw.githubusercontent.com/jcestevezc/Machine-Learning-Techniques/master/Laboratorio%203/Punto%201/UNSW_NB15_training-set.csv')

    test = pd.read_csv('https://raw.githubusercontent.com/jcestevezc/Machine-Learning-Techniques/master/Laboratorio%203/Punto%201/UNSW_NB15_testing-set.csv')

    train.to_pickle('./train.pkl')
    test.to_pickle('./test.pkl')
    
train.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


### Un análisis general de los datos:

In [58]:
#Lets start to hearing what the data have to say.

# First, compare the amount of data in 'Normal' with the not normal one, this could add some bias to the model in
#  terms of amount of data.
print("Size of all data", len(train.index))
print("Size of normal situation data", len(train.loc[train['attack_cat'] == 'Normal'].index))
print("Different kind of attacks in data", train['attack_cat'].unique(),
      "\nSize: ", len(train.loc[train['attack_cat'] != 'Normal'].index))
print("\nSize of test data: ", len(test.index))
print("Number of columns: ", len(train.columns))

Size of all data 82332
Size of normal situation data 37000
Different kind of attacks in data ['Normal' 'Reconnaissance' 'Backdoor' 'DoS' 'Exploits' 'Analysis'
 'Fuzzers' 'Worms' 'Shellcode' 'Generic'] 
Size:  45332

Size of test data:  175341
Number of columns:  45


### Re-muestreo de datos
Lo primero que intentamos hacer luego de hacer varias iteraciones de los modelos fue reducir la cardinalidad del problema, para así reducir también la complejidad. Esta decisión la basamos en el hecho de que el algoritmo que implementa la clase SVC de sklearn tiene una complejidad de O(n x m<sup>3</sup>) donde n son los numero de columnas y m el numero de muestras evaluadas en el entrenamiento.  

Decidimos trabajar con muestras de 6500 datos, al haber 10 clases distinas, nos parecio que tener un set de datos de 65000 muestras era lograble para el entrenamiento de los datos

In [4]:
#Know, for making things easier we will short the amount of data of each kind of attack
# And try to balance it, so the model could be train easyly

# We create a 6'700 sample size by each kind of data
data = {'Normal' : None, 
         'Reconnaissance' : None, 
         'Backdoor' : None, 
         'DoS' : None,
         'Exploits' : None,
         'Analysis' : None,
         'Fuzzers' : None, 
         'Worms': None, 
         'Shellcode': None, 
         'Generic': None
        }
attack_cats =  train['attack_cat'].unique()
for cat in attack_cats:
    size_data = len(train.loc[train['attack_cat'] == cat].index)
    if size_data >= 6500:
        data[cat] = train.loc[train['attack_cat'] == cat][:6500]
    else:
        data[cat] = train.loc[train['attack_cat'] == cat][:size_data]


### Observaciones
Con miras al balanceo de los datos con las técnicas estudiadas en clase de <em> oversampling </em> como son SMOT y ADASYN, miramos cuantos datos habian quedado de cada clase para poder estimar qué tipo de ténica era mejor.

In [5]:
# Testing good spliting
for cat in attack_cats:
    print("Size of ", cat, " categoty: ",len(data[cat].index))

Size of  Normal  categoty:  6500
Size of  Reconnaissance  categoty:  3496
Size of  Backdoor  categoty:  583
Size of  DoS  categoty:  4089
Size of  Exploits  categoty:  6500
Size of  Analysis  categoty:  677
Size of  Fuzzers  categoty:  6062
Size of  Worms  categoty:  44
Size of  Shellcode  categoty:  378
Size of  Generic  categoty:  6500


In [6]:
# making a new data frame
train_sized = pd.DataFrame()
#train_sized = pd.concat([train_sized, data['Normal']], axis=1)
for cat in attack_cats:
    train_sized = pd.concat([train_sized, data[cat]], axis=0)
train_sized.head(10)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0
5,6,3e-06,udp,-,INT,2,0,784,0,333333.3215,...,1,2,0,0,0,2,2,0,Normal,0
6,7,6e-06,udp,-,INT,2,0,1960,0,166666.6608,...,1,2,0,0,0,2,2,0,Normal,0
7,8,2.8e-05,udp,-,INT,2,0,1384,0,35714.28522,...,1,3,0,0,0,1,3,0,Normal,0
8,9,0.0,arp,-,INT,1,0,46,0,0.0,...,2,2,0,0,0,2,2,1,Normal,0
9,10,0.0,arp,-,INT,1,0,46,0,0.0,...,2,2,0,0,0,2,2,1,Normal,0


In [7]:
#Check for safe concat
for col in train_sized.columns:
    print("Column ", col, " has: ", train_sized[col].isnull().sum().sum(), "NaN")
print("Total size of the data: ", len(train_sized.index))

Column  id  has:  0 NaN
Column  dur  has:  0 NaN
Column  proto  has:  0 NaN
Column  service  has:  0 NaN
Column  state  has:  0 NaN
Column  spkts  has:  0 NaN
Column  dpkts  has:  0 NaN
Column  sbytes  has:  0 NaN
Column  dbytes  has:  0 NaN
Column  rate  has:  0 NaN
Column  sttl  has:  0 NaN
Column  dttl  has:  0 NaN
Column  sload  has:  0 NaN
Column  dload  has:  0 NaN
Column  sloss  has:  0 NaN
Column  dloss  has:  0 NaN
Column  sinpkt  has:  0 NaN
Column  dinpkt  has:  0 NaN
Column  sjit  has:  0 NaN
Column  djit  has:  0 NaN
Column  swin  has:  0 NaN
Column  stcpb  has:  0 NaN
Column  dtcpb  has:  0 NaN
Column  dwin  has:  0 NaN
Column  tcprtt  has:  0 NaN
Column  synack  has:  0 NaN
Column  ackdat  has:  0 NaN
Column  smean  has:  0 NaN
Column  dmean  has:  0 NaN
Column  trans_depth  has:  0 NaN
Column  response_body_len  has:  0 NaN
Column  ct_srv_src  has:  0 NaN
Column  ct_state_ttl  has:  0 NaN
Column  ct_dst_ltm  has:  0 NaN
Column  ct_src_dport_ltm  has:  0 NaN
Column  ct_d

## Perfilamiento

En este caso, luego de hacer el muestreo esperado de los datos, usamos el método de perfilamiento de Pandas para inspeccionar los datos y posteriormente se hizo un descarte de variables por la correlación entre estas, se encontraron las siguientes observaciones:

### Observaciones:

- Habían muchos datos que tenían correlaciones entre columnas, a pesar de no tener un diccionario que nos facilitara un entendimiento más específico se podía deducir por el patrón de un sufijo **s** o **d** antes del término compartido en ambas columnas. 

- Basados en este [articulo](https://medium.com/@raj5287/effects-of-multi-collinearity-in-logistic-regression-svm-rf-af6766d91f1b) del blog Medium, decidimos eliminar todas las variables con una correlacion superior a 0.9 dado que estan fuertemente correlacionadas entre ellas. 

- Encontramos valores categoricos con alta cardinalidad lo cual afecta en 2 sentidos: la complejidad del procesamiento de los datos y el rendimiento del modelo en el sentido que va a tener que generalizar entre muchas clases distintas de atributos y eso es más difícil de modelar con un número restringido de datos 
 

In [8]:
train_sized.describe()
#profile = train_sized.profile_report(title='Pandas Profiling Report')
#profile.to_file(outputfile="output_nn.html")

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,...,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0,34829.0
mean,24920.059893,1.234828,19.586092,17.975365,9610.202,14088.01,95142.59,193.746131,87.856327,75448150.0,...,5.810618,4.384105,8.256625,0.010824,0.010882,0.11209,7.497459,9.941198,0.000574,0.813374
std,19591.784365,5.878477,157.562074,153.335028,202915.6,202135.4,154235.7,95.956127,114.981627,183068700.0,...,9.560193,6.624989,12.577412,0.104581,0.105395,0.491877,9.559064,12.194247,0.023957,0.389617
min,1.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8708.0,8e-06,2.0,0.0,168.0,0.0,37.42407,62.0,0.0,14188.52,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
50%,17415.0,0.001787,4.0,2.0,456.0,162.0,4202.72,254.0,29.0,1365619.0,...,1.0,1.0,2.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,45056.0,0.640271,12.0,10.0,1236.0,824.0,125000.0,254.0,252.0,91200000.0,...,4.0,3.0,7.0,0.0,0.0,0.0,10.0,12.0,0.0,1.0
max,65852.0,59.997478,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,253.0,5268000000.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


In [57]:
#Profile report
#train_sized.profile_report()

In [9]:
#Look for correlations in fields
corr = train_sized.corr()
corr.style.background_gradient(cmap='coolwarm')


Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
id,1.0,0.032311,-0.005287,-0.003496,-0.009094,-0.005865,-0.131837,0.049321,0.14472,-0.001176,0.018444,-0.006975,-0.005678,0.024733,0.067087,0.087941,0.043092,0.143813,0.120589,0.113535,0.152426,0.16614,0.173368,0.144325,0.07843,-0.033778,-0.058166,-0.01309,-0.10944,-0.114243,-0.183132,-0.182302,-0.194375,-0.256603,-0.056175,-0.055647,-0.101952,-0.219257,-0.132306,-0.011242,-0.01426
dur,0.032311,1.0,0.281119,0.215361,0.213343,0.173386,-0.128976,0.027752,0.07513,-0.086122,-0.045464,0.232527,0.170603,0.172013,0.125776,0.148812,0.147831,0.060383,0.030143,0.027904,0.041341,0.082374,0.081485,0.076336,0.116935,0.032185,-0.005494,0.06085,-0.111073,0.187255,-0.092109,-0.095095,-0.095393,-0.098842,0.021691,0.02182,-0.009489,-0.105263,-0.114109,0.043893,0.028926
spkts,-0.005287,0.281119,1.0,0.333103,0.971661,0.181287,-0.068254,-0.087634,0.071151,-0.045559,0.053431,0.977936,0.179441,-0.004757,-0.001894,-0.001502,0.008696,0.108062,0.072638,0.079085,0.108527,0.039868,0.031676,0.045239,0.214996,0.106702,0.004842,0.062274,-0.057883,-0.06132,-0.049678,-0.052356,-0.056051,-0.059289,0.006418,0.00648,0.000493,-0.052353,-0.057997,-0.002797,-0.04705
dpkts,-0.003496,0.215361,0.333103,1.0,0.156966,0.983513,-0.07162,-0.143309,0.061129,-0.048071,0.092441,0.164438,0.985381,-0.006225,-0.004984,-0.002189,0.029958,0.123726,0.094531,0.090617,0.124616,0.02468,0.014046,0.033947,0.016482,0.320437,0.015364,0.305271,-0.053924,-0.089025,-0.048287,-0.053679,-0.059081,-0.060986,0.007725,0.007793,0.02689,-0.054979,-0.054548,-0.00281,-0.083815
sbytes,-0.009094,0.213343,0.971661,0.156966,1.0,0.00883,-0.028534,-0.023434,0.056856,-0.018202,-0.007532,0.993597,0.005812,-0.003076,-0.001173,-0.002645,-0.004251,0.046966,0.02495,0.035042,0.047306,0.036445,0.029167,0.041128,0.222899,-0.004865,0.001778,0.000788,-0.032075,-0.008769,-0.024426,-0.022823,-0.023644,-0.025774,-0.004063,-0.004049,-0.005092,-0.023714,-0.031786,-0.001129,0.013588
dbytes,-0.005865,0.173386,0.181287,0.983513,0.00883,1.0,-0.042686,-0.10243,0.035002,-0.028665,0.066974,0.009121,0.998571,-0.004503,-0.005797,-0.003914,0.025238,0.074727,0.057895,0.053788,0.075259,0.00833,-0.000703,0.017274,-0.024514,0.304184,0.015816,0.317321,-0.031938,-0.048243,-0.028982,-0.031692,-0.035194,-0.036323,-0.006107,-0.006088,0.028543,-0.033597,-0.0324,-0.001671,-0.051651
rate,-0.131837,-0.128976,-0.068254,-0.07162,-0.028534,-0.042686,1.0,0.374352,-0.470416,0.594581,-0.14942,-0.042144,-0.053443,-0.046592,-0.0592,-0.067897,-0.104819,-0.567403,-0.444972,-0.441529,-0.563448,-0.363605,-0.333316,-0.365117,-0.113331,-0.279016,-0.082465,-0.022586,0.372419,0.460889,0.329636,0.353599,0.37864,0.389144,-0.063692,-0.063536,-0.140314,0.362403,0.375445,-0.014787,0.278295
sttl,0.049321,0.027752,-0.087634,-0.143309,-0.023434,-0.10243,0.374352,1.0,-0.07387,0.250906,-0.427709,-0.041443,-0.121179,-0.032114,0.006314,0.041413,-0.107901,-0.444573,-0.350342,-0.344707,-0.448559,0.040165,0.051982,0.024134,-0.035031,-0.585796,-0.066622,-0.052603,0.267557,0.717752,0.215424,0.27957,0.315763,0.314215,-0.140754,-0.14059,-0.143864,0.241807,0.261363,-0.048399,0.774808
dttl,0.14472,0.07513,0.071151,0.061129,0.056856,0.035002,-0.470416,-0.07387,1.0,-0.314443,-0.115591,0.071509,0.044414,0.010265,0.103855,0.148789,0.163288,0.792916,0.634024,0.632188,0.798665,0.838932,0.767511,0.84406,0.162598,0.188785,0.154841,0.022841,-0.448529,-0.343479,-0.38416,-0.363681,-0.382397,-0.404102,0.099837,0.098787,0.261591,-0.429735,-0.455447,-0.018316,0.237313
sload,-0.001176,-0.086122,-0.045559,-0.048071,-0.018202,-0.028665,0.594581,0.250906,-0.314443,1.0,-0.103117,-0.02798,-0.035904,-0.025929,-0.039551,-0.045189,-0.070047,-0.380357,-0.298283,-0.295993,-0.377698,-0.242863,-0.222638,-0.243865,0.231093,-0.188107,-0.055111,-0.015115,0.115134,0.311067,0.068369,0.086274,0.0781,0.104851,-0.042629,-0.042524,-0.093801,0.092985,0.114988,-0.009879,0.177069


## Selección de variables

Basados en los resultados de la matriz de autocorrelación, escogimos las variables a descartar y establecimos los set de datos de prueba y de entrenamiento.

In [24]:
discard_vals = ["id", "spkts", "sbytes", "dpkts", "dbytes", "dload", "dloss", "sinpkt", "dinpkt", "djit",
                "sjit", "tcprtt", "synack", "dwin", "ackdat", "trans_depth", "is_ftp_login", "dmean",
                "response_body_len", "ct_flw_http_mthd", "ct_dst_ltm","attack_cat", "ct_src_dport_ltm", "ct_dst_src_ltm",
                "ct_srv_src"]

X_train = train_sized.drop(discard_vals, axis=1)
y_train = train_sized['attack_cat']

X_test = test.drop(discard_vals, axis=1)
y_test = test['attack_cat']

corr = X_train.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,dur,rate,sttl,dttl,sload,sloss,swin,stcpb,dtcpb,smean,ct_state_ttl,ct_dst_sport_ltm,ct_ftp_cmd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
dur,1.0,-0.128976,0.027752,0.07513,-0.086122,0.232527,0.060383,0.030143,0.027904,0.116935,0.187255,-0.095393,0.02182,-0.105263,-0.114109,0.043893,0.028926
rate,-0.128976,1.0,0.374352,-0.470416,0.594581,-0.042144,-0.567403,-0.444972,-0.441529,-0.113331,0.460889,0.37864,-0.063536,0.362403,0.375445,-0.014787,0.278295
sttl,0.027752,0.374352,1.0,-0.07387,0.250906,-0.041443,-0.444573,-0.350342,-0.344707,-0.035031,0.717752,0.315763,-0.14059,0.241807,0.261363,-0.048399,0.774808
dttl,0.07513,-0.470416,-0.07387,1.0,-0.314443,0.071509,0.792916,0.634024,0.632188,0.162598,-0.343479,-0.382397,0.098787,-0.429735,-0.455447,-0.018316,0.237313
sload,-0.086122,0.594581,0.250906,-0.314443,1.0,-0.02798,-0.380357,-0.298283,-0.295993,0.231093,0.311067,0.0781,-0.042524,0.092985,0.114988,-0.009879,0.177069
sloss,0.232527,-0.042144,-0.041443,0.071509,-0.02798,1.0,0.074107,0.044985,0.053425,0.224277,-0.026363,-0.034418,0.003453,-0.034499,-0.04082,-0.001647,-0.002317
swin,0.060383,-0.567403,-0.444573,0.792916,-0.380357,0.074107,1.0,0.784158,0.778098,0.141909,-0.630756,-0.465376,0.111327,-0.452156,-0.463844,-0.022231,-0.269062
stcpb,0.030143,-0.444972,-0.350342,0.634024,-0.298283,0.044985,0.784158,1.0,0.646067,0.107387,-0.525625,-0.364967,0.089673,-0.358761,-0.369247,-0.017432,-0.196476
dtcpb,0.027904,-0.441529,-0.344707,0.632188,-0.295993,0.053425,0.778098,0.646067,1.0,0.108988,-0.519311,-0.362953,0.079511,-0.356054,-0.367716,-0.017298,-0.190911
smean,0.116935,-0.113331,-0.035031,0.162598,0.231093,0.224277,0.141909,0.107387,0.108988,1.0,-0.064449,-0.184699,-0.04093,-0.171443,-0.172388,-0.010662,0.034042


### Limpieza de datos. 
En el proceso de limpieza de datos llevamos acabo los siguientes procesos:

- Disminución de la cardinalidad de los datos categoricos. 
- Codificación de las variables categoricas de los datos usando la técnica OneHotEncoding explicada en el libro referencia de la materia. 
- Balanceo de los datos para un entrenamiento mucho más efectivo

In [26]:
#There is a problem with 'proto' variable: high cardinality, we are going to try to fix it.

protos = X_train['proto'].unique()
n_protos = []
for proto in protos:
    size = len(X_train.loc[X_train['proto'] == proto].index)
    if size < 100:
        n_protos.append(proto)

for not_include in n_protos:
    X_train.loc[X_train['proto'] == not_include, 'proto'] = 'otros'
    X_test.loc[X_test['proto'] == not_include, 'proto'] = 'otros'

In [13]:
"""
One-hot encode categorical data
"""

class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders = []
    
    def fit(self, X, y=None):
        for col in self.columns:
            lb = LabelBinarizer()
            lb = lb.fit(X[col])
            self.encoders.append(lb)
        return self
    def transform(self, X, y=None):
        for i, col in enumerate(self.columns):
            enc = self.encoders[i].transform(X[col])
            enc_df = pd.DataFrame(enc, columns=self.encoders[i].classes_, index=X.index)
            enc_df = enc_df.add_prefix('{}_'.format(col))
            X = X.join(enc_df)
            X = X.drop(col, axis=1)
        return X

CT = CategoricalTransformer(['state',
                             'proto',
                             'service'])
CT_test = CT.fit_transform(X_train)
print(CT_test.columns)

Index(['dur', 'rate', 'sttl', 'dttl', 'sload', 'sloss', 'swin', 'stcpb',
       'dtcpb', 'smean', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_ftp_cmd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label', 'state_ACC',
       'state_CLO', 'state_CON', 'state_FIN', 'state_INT', 'state_REQ',
       'state_RST', 'proto_ospf', 'proto_otros', 'proto_sctp', 'proto_tcp',
       'proto_udp', 'proto_unas', 'service_-', 'service_dhcp', 'service_dns',
       'service_ftp', 'service_ftp-data', 'service_http', 'service_irc',
       'service_pop3', 'service_radius', 'service_smtp', 'service_snmp',
       'service_ssh', 'service_ssl'],
      dtype='object')


## Modelamiento

- Para este problema en particular se decidió usar un balanceador SMOTE, con su variante de Borderline, el cual define una replica sintética en un vector borde entre dos ejemplares cercanos definidos por una función de distancia. Según lo investigado, se puede ver en esta [guía de usuario] que BorderLineSMOTE parece ser el balanceador que menos <em>overlaping</em> genera, por dicha razón lo escogimos. 

- En cuanto al kernel para la máquina de soporte vectorial, escogimos el kernel gaussiano <em>RBF</em> ya que acorde con lo consultado en el libro guía de la materia es uno de los que mejor rendimiento tiene. Además, al no saber si es un problema linearmente separable, nos parecio adecuado descartar el kernel lineal a pesar de que LinearSVC tuviera un rendimiento mucho mejor que SVC

In [14]:
pipe = Pipeline([
    ('encoding', CategoricalTransformer(['state',
                             'proto',
                             'service'])),
    ('standarize', StandardScaler())
])

X_prepared_data_train = pipe.fit_transform(X_train)
sm = BorderlineSMOTE(random_state=42, n_jobs=5)
print('Original dataset shape %s' % Counter(y_train))
X_balanced_train, y_balanced_train = sm.fit_resample(X_prepared_data, y_train)
print('Balanced dataset shape %s' % Counter(y_balanced_train))


Original dataset shape Counter({'Normal': 6500, 'Exploits': 6500, 'Generic': 6500, 'Fuzzers': 6062, 'DoS': 4089, 'Reconnaissance': 3496, 'Analysis': 677, 'Backdoor': 583, 'Shellcode': 378, 'Worms': 44})
Balanced dataset shape Counter({'Normal': 6500, 'Reconnaissance': 6500, 'Backdoor': 6500, 'DoS': 6500, 'Exploits': 6500, 'Analysis': 6500, 'Fuzzers': 6500, 'Worms': 6500, 'Shellcode': 6500, 'Generic': 6500})


In [15]:
param_grid = { 'C': [12.5, 6.0, 3.0, 0.5], 
               'gamma': [0.1, 0.5, 2.5, 5] }

gm = SVC(kernel="rbf")

model = GridSearchCV(gm, param_grid=param_grid, cv=5, n_jobs=5, verbose=20).best_estimator_
model.fit(X_balanced_train, y_balanced_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:  4.3min
[Parallel(n_jobs=5)]: Done   2 tasks      | elapsed:  4.4min
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:  4.4min
[Parallel(n_jobs=5)]: Done   4 tasks      | elapsed:  4.5min
[Parallel(n_jobs=5)]: Done   5 tasks      | elapsed:  4.5min
[Parallel(n_jobs=5)]: Done   6 tasks      | elapsed:  8.4min
[Parallel(n_jobs=5)]: Done   7 tasks      | elapsed:  8.6min
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  8.6min
[Parallel(n_jobs=5)]: Done   9 tasks      | elapsed:  8.7min
[Parallel(n_jobs=5)]: Done  10 tasks      | elapsed:  8.7min
[Parallel(n_jobs=5)]: Done  11 tasks      | elapsed: 13.8min
[Parallel(n_jobs=5)]: Done  12 tasks      | elapsed: 13.9min
[Parallel(n_jobs=5)]: Done  13 tasks      | elapsed: 14.0min
[Parallel(n_jobs=5)]: Done  14 tasks      | elapsed: 14.1min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed: 14.3min
[Parallel(

GridSearchCV(cv=5, estimator=SVC(), n_jobs=5,
             param_grid={'C': [12.5, 6.0, 3.0, 0.5],
                         'gamma': [0.1, 0.5, 2.5, 5]},
             verbose=20)

## Resultados
Después de entrenarse el modelo con una variacion definida de los parámetros C y gamma se obtuvieron los resultados a contiinuación expuestos.  

Cabe aclarar que debido a la falta de tiempo no se pudieron realizar más iteraciones sobre el modelo diseñado y como puede ver la duración de su entrenamiento duró cerca de 80 minutos reducida en gran tamaño de complejidad. Acerca de los resultados se pueden generar las siguientes conclusiones: 

1. El modelo tiene un rendimiento óptimo para los datos de prueba usados. No obstante, su rendimiento para los datos de prueba no supera el 60% en promedio para el tamaño de datos usado (170 mil), de esto se puede inferir que el modelo está cayendo en un patrón de sobreajuste, por lo que la varianza del problema está siendo muy alta. 

2. Al reconocer el parámetro de sobreajuste, se podría buscar implementar algunas técnicas de regularización para disminuir la complejidad del modelo, como cambiar a un kernel polinomico de grado 3 o 2 o buscar mejores insights de los datos con un modelo regresivo logístico previo a la SVM. 

3. Se podría buscar también aumentar la cantidad de datos del set de entrenamiento, subir la cantidad a 8 mil o 10 mil datos por clase ayudaría a mejorar los errores por sesgo que también pueda estar teniendo el modelo. 

In [16]:
"""
Hyperparameters Results
"""
results = model.cv_results_
resultsSorted = sorted(zip(results["mean_test_score"], results["params"]), key=lambda pair: -pair[0])

print('Best hyperparameters: \n')
for mean_score, params in resultsSorted:
    print('mean accuracy: ', mean_score, 'params: ', params)

Best hyperparameters: 

mean accuracy:  0.7182923076923078 params:  {'C': 12.5, 'gamma': 5}
mean accuracy:  0.7161846153846154 params:  {'C': 6.0, 'gamma': 5}
mean accuracy:  0.7154769230769231 params:  {'C': 12.5, 'gamma': 2.5}
mean accuracy:  0.7113230769230768 params:  {'C': 6.0, 'gamma': 2.5}
mean accuracy:  0.7112153846153847 params:  {'C': 3.0, 'gamma': 5}
mean accuracy:  0.7044461538461538 params:  {'C': 3.0, 'gamma': 2.5}
mean accuracy:  0.6910000000000001 params:  {'C': 12.5, 'gamma': 0.5}
mean accuracy:  0.6847538461538462 params:  {'C': 0.5, 'gamma': 5}
mean accuracy:  0.6836769230769232 params:  {'C': 6.0, 'gamma': 0.5}
mean accuracy:  0.6783384615384616 params:  {'C': 12.5, 'gamma': 0.1}
mean accuracy:  0.6768 params:  {'C': 3.0, 'gamma': 0.5}
mean accuracy:  0.675523076923077 params:  {'C': 0.5, 'gamma': 2.5}
mean accuracy:  0.6731076923076923 params:  {'C': 6.0, 'gamma': 0.1}
mean accuracy:  0.665 params:  {'C': 3.0, 'gamma': 0.1}
mean accuracy:  0.6643230769230769 param

In [23]:
"""
Validation
"""
#predictions = model.predict(X_test)
train_predictions = model.predict(X_balanced_train)
print('__ Train Scores __\n')
print(classification_report(y_balanced_train, train_predictions))



__ Train Scores __

                precision    recall  f1-score   support

      Analysis       0.51      0.59      0.54      6500
      Backdoor       0.49      0.58      0.53      6500
           DoS       0.56      0.73      0.63      6500
      Exploits       0.91      0.61      0.73      6500
       Fuzzers       0.97      0.74      0.84      6500
       Generic       1.00      1.00      1.00      6500
        Normal       1.00      1.00      1.00      6500
Reconnaissance       0.88      0.81      0.84      6500
     Shellcode       0.96      0.97      0.97      6500
         Worms       1.00      1.00      1.00      6500

      accuracy                           0.80     65000
     macro avg       0.83      0.80      0.81     65000
  weighted avg       0.83      0.80      0.81     65000

__ Test Scores __



In [33]:
# Testing process
X_prepared_data_test = pipe.fit_transform(X_test)
print(CT_test.columns)
predictions = model.predict(X_prepared_data_test)


Index(['dur', 'rate', 'sttl', 'dttl', 'sload', 'sloss', 'swin', 'stcpb',
       'dtcpb', 'smean', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_ftp_cmd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label', 'state_ACC',
       'state_CLO', 'state_CON', 'state_FIN', 'state_INT', 'state_REQ',
       'state_RST', 'proto_ospf', 'proto_otros', 'proto_sctp', 'proto_tcp',
       'proto_udp', 'proto_unas', 'service_-', 'service_dhcp', 'service_dns',
       'service_ftp', 'service_ftp-data', 'service_http', 'service_irc',
       'service_pop3', 'service_radius', 'service_smtp', 'service_snmp',
       'service_ssh', 'service_ssl'],
      dtype='object')
__ Test Scores __



TypeError: classification_report() got an unexpected keyword argument 'average'

In [39]:
print('__ Test Scores __\n')
print(classification_report(y_test, predictions, labels= np.unique(predictions)))

__ Test Scores __

                precision    recall  f1-score   support

      Analysis       0.05      0.00      0.00      2000
      Backdoor       0.05      0.01      0.02      1746
           DoS       0.29      0.34      0.31     12264
      Exploits       0.22      0.71      0.33     33393
       Fuzzers       0.75      0.63      0.68     18184
       Generic       1.00      0.67      0.80     40000
Reconnaissance       0.73      0.61      0.67     10491
     Shellcode       0.42      0.29      0.35      1133
         Worms       0.00      0.00      0.00       130

     micro avg       0.42      0.61      0.49    119341
     macro avg       0.39      0.36      0.35    119341
  weighted avg       0.61      0.61      0.56    119341

