# Preparación del conjunto de datos


###### En este notebook se muestran algunas de las técnicas más utilizadas para transformar el cojunto de datos

##### Imports

In [19]:
import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

##### Funciones auxiliares

In [20]:
def load_kdd_dataset(data_path):
  """Lectura del cojunto de datos NSL-KDD"""
  with open(data_path, 'r') as train_set:
    dataset = arff.load(train_set)
  attributes = [attr[0] for attr in dataset['attributes']]
  
  return pd.DataFrame(dataset['data'], columns=attributes)

In [21]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
  strat = df[stratify] if stratify else None
  train_set, test_set = train_test_split(
    df, train_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
  strat = test_set[stratify] if stratify else None
  val_set, test_set = train_test_split(
    test_set, train_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)

  return (train_set, val_set, test_set) 

##### 1. Lectura del conjunto de datos

In [22]:
df = load_kdd_dataset('datasets/NSL-KDD/KDDTrain+.arff')

In [23]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,tcp,ftp_data,SF,491.0,0.0,0,0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0.0,udp,other,SF,146.0,0.0,0,0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0.0,tcp,http,SF,232.0,8153.0,0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0.0,tcp,http,SF,199.0,420.0,0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
125969,8.0,udp,private,SF,105.0,145.0,0,0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal
125970,0.0,tcp,smtp,SF,2231.0,384.0,0,0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal
125971,0.0,tcp,klogin,S0,0.0,0.0,0,0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


##### 2. División del conjunto de datos

In [24]:
train_set, val_set, test_set = train_val_test_split(df, stratify='protocol_type')

In [25]:
print('Longitud del Training set: ', len(train_set))
print('Longitud del Validation set: ', len(val_set))
print('Longitud del Test set: ', len(test_set))

Longitud del Training set:  50389
Longitud del Validation set:  37792
Longitud del Test set:  37792


##### 3. Limpiado los datos

###### Antes de comenzar, vamos a recuperar el conjunto de datos limpio y vamos a separar las etiquetas del resto de datos, no necesariamente queremos aplicar las mismas transformaciones en los mismos datos.

In [26]:
# Separamos las características de entrada de las de la salida.
X_train = train_set.drop('class', axis=1)
y_train = train_set['class'].copy()

In [27]:
# Para ilustrar esta sección vamos a añadir algunos valores nulos
# a algunas características del conjunto de datos.

X_train.loc[(X_train['src_bytes']>400) & (X_train['src_bytes']<800), 'src_bytes'] = np.nan
X_train.loc[(X_train['dst_bytes']>500) & (X_train['dst_bytes']<2000), 'dst_bytes'] = np.nan

La mayoría de algoritmos de machine learning no pueden trabajar sobre características que contengan valores nulos. Por ello existen 3 opciones para remplazarlos:
* Eliminar las filas correspondientes
* Eliminar el atributo (columna) correspondiente
* Rellenarlos con un valor determinado (cero, media...)

In [28]:
# Comprobamos si existe algún atributo con valores nulos
X_train.isna().any()

duration                       False
protocol_type                  False
service                        False
flag                           False
src_bytes                       True
dst_bytes                       True
land                           False
wrong_fragment                 False
urgent                         False
hot                            False
num_failed_logins              False
logged_in                      False
num_compromised                False
root_shell                     False
su_attempted                   False
num_root                       False
num_file_creations             False
num_shells                     False
num_access_files               False
num_outbound_cmds              False
is_host_login                  False
is_guest_login                 False
count                          False
srv_count                      False
serror_rate                    False
srv_serror_rate                False
rerror_rate                    False
s

In [29]:
# Seleccionamos las filas que contienen valores nulos
filas_valores_nulos = X_train[X_train.isnull().any(axis=1)]
filas_valores_nulos

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
41109,0.0,tcp,http,SF,328.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
30153,0.0,icmp,ecr_i,SF,,0.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,1.00,0.00,0.00,0.00,0.0,0.0
121561,0.0,tcp,smtp,SF,,334.0,0,0.0,0.0,0.0,...,92.0,165.0,0.74,0.04,0.01,0.01,0.00,0.00,0.0,0.0
56792,0.0,tcp,http,SF,203.0,,0,0.0,0.0,0.0,...,59.0,71.0,1.00,0.00,0.02,0.07,0.02,0.03,0.0,0.0
24744,0.0,tcp,http,SF,246.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37489,0.0,tcp,http,SF,222.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
47120,0.0,tcp,http,SF,367.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
78843,0.0,tcp,http,SF,323.0,,0,0.0,0.0,0.0,...,72.0,255.0,1.00,0.00,0.01,0.03,0.00,0.00,0.0,0.0
36593,0.0,tcp,http,SF,286.0,,0,0.0,0.0,0.0,...,4.0,237.0,1.00,0.00,0.25,0.07,0.00,0.01,0.0,0.0


##### Opción 1: Eliminar las filas con valores nulos

In [30]:
# Copiamos el conjunto de datos para no alterar el original
X_train_copy = X_train.copy()

In [31]:
# Eliminarmos las filas con valores nulos
X_train_copy.dropna(subset=['src_bytes', 'dst_bytes'], inplace=True)
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
86646,0.0,icmp,eco_i,SF,18.0,0.0,0,0.0,0.0,0.0,...,1.0,164.0,1.00,0.00,1.00,1.00,0.00,0.0,0.00,0.0
98426,0.0,tcp,ftp_data,SF,259.0,0.0,0,0.0,0.0,0.0,...,59.0,43.0,0.03,0.05,0.03,0.05,0.00,0.0,0.00,0.0
88488,2.0,tcp,ftp,SF,1244.0,2449.0,0,0.0,0.0,28.0,...,255.0,96.0,0.38,0.02,0.00,0.00,0.00,0.0,0.01,0.0
65203,0.0,tcp,ftp_data,SF,7280.0,0.0,0,0.0,0.0,0.0,...,226.0,67.0,0.23,0.01,0.23,0.03,0.00,0.0,0.00,0.0
74716,0.0,udp,domain_u,SF,76.0,43.0,0,0.0,0.0,0.0,...,255.0,166.0,0.65,0.01,0.16,0.00,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60380,0.0,tcp,http,SF,230.0,5541.0,0,0.0,0.0,0.0,...,15.0,255.0,1.00,0.00,0.07,0.05,0.00,0.0,0.00,0.0
81392,0.0,tcp,http,SF,340.0,283.0,0,0.0,0.0,0.0,...,255.0,227.0,0.89,0.01,0.00,0.00,0.00,0.0,0.00,0.0
74750,0.0,icmp,ecr_i,SF,30.0,0.0,0,0.0,0.0,0.0,...,219.0,219.0,1.00,0.00,1.00,0.00,0.00,0.0,0.00,0.0
87053,25.0,tcp,telnet,SF,151.0,6412.0,0,0.0,0.0,0.0,...,1.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.00,0.0


In [32]:
# Contamos el número de filas eliminadas
print('El número de filas eliminadas es: ', len(X_train) - len(X_train_copy))

El número de filas eliminadas es:  6628


##### Opción 2: Eliminamos los atriutos nulos

In [33]:
# Copiamos el conjunto de datos para no alterar el original
X_train_copy = X_train.copy()

In [34]:
# Eliminamos los atributos con valores nulos
X_train_copy.drop(['src_bytes', 'dst_bytes'], axis=1, inplace=True)
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
41109,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0
86646,0.0,icmp,eco_i,SF,0,0.0,0.0,0.0,0.0,0,...,1.0,164.0,1.00,0.00,1.00,1.00,0.00,0.0,0.00,0.0
98426,0.0,tcp,ftp_data,SF,0,0.0,0.0,0.0,0.0,0,...,59.0,43.0,0.03,0.05,0.03,0.05,0.00,0.0,0.00,0.0
88488,2.0,tcp,ftp,SF,0,0.0,0.0,28.0,0.0,1,...,255.0,96.0,0.38,0.02,0.00,0.00,0.00,0.0,0.01,0.0
65203,0.0,tcp,ftp_data,SF,0,0.0,0.0,0.0,0.0,0,...,226.0,67.0,0.23,0.01,0.23,0.03,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60380,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,15.0,255.0,1.00,0.00,0.07,0.05,0.00,0.0,0.00,0.0
81392,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,255.0,227.0,0.89,0.01,0.00,0.00,0.00,0.0,0.00,0.0
74750,0.0,icmp,ecr_i,SF,0,0.0,0.0,0.0,0.0,0,...,219.0,219.0,1.00,0.00,1.00,0.00,0.00,0.0,0.00,0.0
87053,25.0,tcp,telnet,SF,0,0.0,0.0,0.0,0.0,1,...,1.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.00,0.0


In [35]:
# Contamos el número de atributos eliminados
print('El número de atributos eliminados es: ', len(list(X_train)) - len(list(X_train_copy)))

El número de atributos eliminados es:  2


##### Opción 3: Rellenamos los valores nulos con un valor determinado

In [36]:
# Copiamos el conjunto de datos para no alterar el original
X_train_copy = X_train.copy()

In [37]:
# Rellenamos los valores nulos con la media de los valores del atributo
media_srcbytes = X_train_copy['src_bytes'].mean()
media_dstbytes = X_train_copy['dst_bytes'].mean()

X_train_copy['src_bytes'].fillna(media_srcbytes, inplace=True)
X_train_copy['dst_bytes'].fillna(media_dstbytes, inplace=True)

X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
41109,0.0,tcp,http,SF,328.0,11849.349497,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0
86646,0.0,icmp,eco_i,SF,18.0,0.000000,0,0.0,0.0,0.0,...,1.0,164.0,1.00,0.00,1.00,1.00,0.00,0.0,0.00,0.0
98426,0.0,tcp,ftp_data,SF,259.0,0.000000,0,0.0,0.0,0.0,...,59.0,43.0,0.03,0.05,0.03,0.05,0.00,0.0,0.00,0.0
88488,2.0,tcp,ftp,SF,1244.0,2449.000000,0,0.0,0.0,28.0,...,255.0,96.0,0.38,0.02,0.00,0.00,0.00,0.0,0.01,0.0
65203,0.0,tcp,ftp_data,SF,7280.0,0.000000,0,0.0,0.0,0.0,...,226.0,67.0,0.23,0.01,0.23,0.03,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60380,0.0,tcp,http,SF,230.0,5541.000000,0,0.0,0.0,0.0,...,15.0,255.0,1.00,0.00,0.07,0.05,0.00,0.0,0.00,0.0
81392,0.0,tcp,http,SF,340.0,283.000000,0,0.0,0.0,0.0,...,255.0,227.0,0.89,0.01,0.00,0.00,0.00,0.0,0.00,0.0
74750,0.0,icmp,ecr_i,SF,30.0,0.000000,0,0.0,0.0,0.0,...,219.0,219.0,1.00,0.00,1.00,0.00,0.00,0.0,0.00,0.0
87053,25.0,tcp,telnet,SF,151.0,6412.000000,0,0.0,0.0,0.0,...,1.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.00,0.0


In [38]:
# Copiamos el conjunto de datos para no alterar el original
X_train_copy = X_train.copy()

In [39]:
# Un valor muy alto en el atributo puede disparar la media
# rellenamos los valores con la mediana
mediana_srcbytes = X_train_copy['src_bytes'].median()
mediana_dstbytes = X_train_copy['dst_bytes'].median()

X_train_copy['src_bytes'].fillna(mediana_srcbytes, inplace=True)
X_train_copy['dst_bytes'].fillna(mediana_dstbytes, inplace=True)

X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
41109,0.0,tcp,http,SF,328.0,0.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0
86646,0.0,icmp,eco_i,SF,18.0,0.0,0,0.0,0.0,0.0,...,1.0,164.0,1.00,0.00,1.00,1.00,0.00,0.0,0.00,0.0
98426,0.0,tcp,ftp_data,SF,259.0,0.0,0,0.0,0.0,0.0,...,59.0,43.0,0.03,0.05,0.03,0.05,0.00,0.0,0.00,0.0
88488,2.0,tcp,ftp,SF,1244.0,2449.0,0,0.0,0.0,28.0,...,255.0,96.0,0.38,0.02,0.00,0.00,0.00,0.0,0.01,0.0
65203,0.0,tcp,ftp_data,SF,7280.0,0.0,0,0.0,0.0,0.0,...,226.0,67.0,0.23,0.01,0.23,0.03,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60380,0.0,tcp,http,SF,230.0,5541.0,0,0.0,0.0,0.0,...,15.0,255.0,1.00,0.00,0.07,0.05,0.00,0.0,0.00,0.0
81392,0.0,tcp,http,SF,340.0,283.0,0,0.0,0.0,0.0,...,255.0,227.0,0.89,0.01,0.00,0.00,0.00,0.0,0.00,0.0
74750,0.0,icmp,ecr_i,SF,30.0,0.0,0,0.0,0.0,0.0,...,219.0,219.0,1.00,0.00,1.00,0.00,0.00,0.0,0.00,0.0
87053,25.0,tcp,telnet,SF,151.0,6412.0,0,0.0,0.0,0.0,...,1.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.00,0.0


**Existe otra alternativa para la opción 3 que consiste en usar la clase imputer de sklearn**

In [40]:
# Copiamos el conjunto de datos para no alterar el original
X_train_copy = X_train.copy()

In [41]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

In [42]:
# La clase imputer no admite valores categóricos, eliminamos los atributos categoricos.
X_train_copy_num = X_train_copy.select_dtypes(exclude=['object'])
X_train_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50389 entries, 41109 to 44717
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     50389 non-null  float64
 1   protocol_type                50389 non-null  object 
 2   service                      50389 non-null  object 
 3   flag                         50389 non-null  object 
 4   src_bytes                    49122 non-null  float64
 5   dst_bytes                    45019 non-null  float64
 6   land                         50389 non-null  object 
 7   wrong_fragment               50389 non-null  float64
 8   urgent                       50389 non-null  float64
 9   hot                          50389 non-null  float64
 10  num_failed_logins            50389 non-null  float64
 11  logged_in                    50389 non-null  object 
 12  num_compromised              50389 non-null  float64
 13  root_shell  

In [43]:
# se le proporcionan los atributos numéricos para que calcule los valores
imputer.fit(X_train_copy_num)

In [44]:
# Rellenamos los valores nulos
X_train_copy_num_nonan = imputer.transform(X_train_copy_num)

In [45]:
# Transformamos el resultado a un DataFrame de Pandas
X_train_copy = pd.DataFrame(X_train_copy_num_nonan, columns=X_train_copy_num.columns)

In [46]:
X_train_copy.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,328.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,164.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.0,259.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,59.0,43.0,0.03,0.05,0.03,0.05,0.0,0.0,0.0,0.0
3,2.0,1244.0,2449.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,...,255.0,96.0,0.38,0.02,0.0,0.0,0.0,0.0,0.01,0.0
4,0.0,7280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,226.0,67.0,0.23,0.01,0.23,0.03,0.0,0.0,0.0,0.0
5,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.0,76.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,166.0,0.65,0.01,0.16,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,17.0,0.07,0.16,0.47,0.0,0.0,0.0,0.46,0.0
8,0.0,222.0,338.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,53.0,1.0,0.0,0.33,0.09,0.0,0.02,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.88,0.0,0.0,0.05,0.0,0.77,1.0


#### APIs de sklearn
Antes de continuar hacemos una pequeña reseña sobre cómo funcionan las APIs de sklearn:
* **Estimators:** Cualquier objeto que puede estimar algún parámetro:
  * El propio estimator se transforma mediante el método fit(), que siempre toma un dataset como argumento
  * Cualquier otro parámetro de este método es un hiperparámetro
* **Transformers:** Son estimadores capaces de transformar el conjunto de datos (como inputer)
  * La transformación es realizada por medio del método transform()
  * Reciben un dataset como parámetro de entrada
* **Predictors:** Son estimadores capaces de realizar predicciones
  * La predicción se realiza mediante el método predict()
  * Reciben un dataset como entrada
  * Retornan un dataset con las predicciones
  * Tienen un método score() para evaluar el resultado de la predicción

##### 4. Transformación de atributos categóricos a numéricos

In [47]:
X_train = train_set.drop('class', axis=1)
y_train = train_set['class'].copy()

In [48]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50389 entries, 41109 to 44717
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     50389 non-null  float64
 1   protocol_type                50389 non-null  object 
 2   service                      50389 non-null  object 
 3   flag                         50389 non-null  object 
 4   src_bytes                    50389 non-null  float64
 5   dst_bytes                    50389 non-null  float64
 6   land                         50389 non-null  object 
 7   wrong_fragment               50389 non-null  float64
 8   urgent                       50389 non-null  float64
 9   hot                          50389 non-null  float64
 10  num_failed_logins            50389 non-null  float64
 11  logged_in                    50389 non-null  object 
 12  num_compromised              50389 non-null  float64
 13  root_shell  

###### Existen diferentes formas de convertir atributos categóricos a numéricos. Probablemente, la más sencilla es la que proporciona el método **factorize** de Pandas. Que transforma cada categoría en un número secuancial.

In [49]:
protocol_type = X_train['protocol_type']
protocol_type_encoded, categorias = protocol_type.factorize()

In [50]:
# Mostramos por pantalla como se ha codificado
for index, type in enumerate(protocol_type_encoded):
  print('El ejemplo ' + str(index) + 'es ' + categorias[type])

El ejemplo 0es tcp
El ejemplo 1es icmp
El ejemplo 2es tcp
El ejemplo 3es tcp
El ejemplo 4es tcp
El ejemplo 5es icmp
El ejemplo 6es udp
El ejemplo 7es udp
El ejemplo 8es tcp
El ejemplo 9es tcp
El ejemplo 10es tcp
El ejemplo 11es tcp
El ejemplo 12es tcp
El ejemplo 13es tcp
El ejemplo 14es udp
El ejemplo 15es tcp
El ejemplo 16es tcp
El ejemplo 17es tcp
El ejemplo 18es tcp
El ejemplo 19es tcp
El ejemplo 20es udp
El ejemplo 21es udp
El ejemplo 22es icmp
El ejemplo 23es tcp
El ejemplo 24es tcp
El ejemplo 25es udp
El ejemplo 26es tcp
El ejemplo 27es tcp
El ejemplo 28es tcp
El ejemplo 29es tcp
El ejemplo 30es udp
El ejemplo 31es tcp
El ejemplo 32es icmp
El ejemplo 33es tcp
El ejemplo 34es tcp
El ejemplo 35es tcp
El ejemplo 36es tcp
El ejemplo 37es tcp
El ejemplo 38es tcp
El ejemplo 39es tcp
El ejemplo 40es tcp
El ejemplo 41es tcp
El ejemplo 42es tcp
El ejemplo 43es tcp
El ejemplo 44es tcp
El ejemplo 45es tcp
El ejemplo 46es tcp
El ejemplo 47es tcp
El ejemplo 48es tcp
El ejemplo 49es icmp
El ej

In [51]:
for i in range(10):
  print(protocol_type.iloc[i], ' = ', protocol_type_encoded[i])

tcp  =  0
icmp  =  1
tcp  =  0
tcp  =  0
tcp  =  0
icmp  =  1
udp  =  2
udp  =  2
tcp  =  0
tcp  =  0


In [52]:
print(categorias)

Index(['tcp', 'icmp', 'udp'], dtype='object')


**Transformaciones avanzadas mediante sklearn**

###### **Ordinal Encoding**
###### Realiza la misma codificación que el método **factorize** de Pandas

In [53]:
from sklearn.preprocessing import OrdinalEncoder

protocol_type = X_train[['protocol_type']]

ordinal_encoder = OrdinalEncoder()
protocol_type_encoded = ordinal_encoder.fit_transform(protocol_type)

In [54]:
# Mostramos por pantalla como se ha codificado
for i in range(10): 
  print(protocol_type['protocol_type'].iloc[i], ' = ', protocol_type_encoded[i])

tcp  =  [1.]
icmp  =  [0.]
tcp  =  [1.]
tcp  =  [1.]
tcp  =  [1.]
icmp  =  [0.]
udp  =  [2.]
udp  =  [2.]
tcp  =  [1.]
tcp  =  [1.]


In [55]:
print(ordinal_encoder.categories_)

[array(['icmp', 'tcp', 'udp'], dtype=object)]


###### El problema de esta codifiación radica en que ciertos algoritmos de ML que funcionan por similitud de dos puntos por distancia, van a considrar que el 1 está más cerca del 2 que del 3, y en este caso (para valores categóricos), no tiene sentido, por ello se utilizan otros métodos de categorización, como, por ejemplo, One-Hot Encoding

**One-Hot Encoding**
<br>
Genera para cada categoría del atributo categórico una matriz binaria que representa el valor.

In [56]:
# La Sparse Matrix solo almacena la posición de los valores que no son '0' para ahorrar memoria
from sklearn.preprocessing import OneHotEncoder

protocol_type = X_train[['protocol_type']]
oh_encoder = OneHotEncoder()

protocol_type_oh = oh_encoder.fit_transform(protocol_type)
protocol_type_oh

<50389x3 sparse matrix of type '<class 'numpy.float64'>'
	with 50389 stored elements in Compressed Sparse Row format>

In [57]:
# Convertir la sparse matrix a un array de Numpy
protocol_type_oh.toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [58]:
# Mostramos por pantalla como se ha codificado
for i in range(10): 
  print(protocol_type['protocol_type'].iloc[i], ' = ', protocol_type_oh.toarray()[i])

tcp  =  [0. 1. 0.]
icmp  =  [1. 0. 0.]
tcp  =  [0. 1. 0.]
tcp  =  [0. 1. 0.]
tcp  =  [0. 1. 0.]
icmp  =  [1. 0. 0.]
udp  =  [0. 0. 1.]
udp  =  [0. 0. 1.]
tcp  =  [0. 1. 0.]
tcp  =  [0. 1. 0.]


In [59]:
print(ordinal_encoder.categories_)

[array(['icmp', 'tcp', 'udp'], dtype=object)]


En muchas ocasiones al particionar le conjunto de datos o realizar una predicción con un nuevo ejemplo apareceran nuevos valorea para determinadas categiría que produciran un error en la función **transform()**. La clase OneHotEncoding proporciona el parámetro **handle_uknown** ya sea para generar un error o ignorar si una característica categórica desconocida está presente durante la transformación (El valor predeterminado es lanzar un error)..
<br>
Cuando este parámetro se establece en "ignorar" y se encuentra una categoría desconocida durante la transformación, las columnas codificadas resultates para esta caracteristica serán todos ceros. En la transformación inversa, una categoría desconocida se denotará como None. 

In [60]:
oh_enconder = OneHotEncoder(handle_unknown='ignore')

**Get Dummies**

Get Dummies es un método sencillo de utilizar que permite aplicar One-Hot Encoding a un Data Frame de Pandas

In [61]:
pd.get_dummies(X_train['protocol_type'])

Unnamed: 0,icmp,tcp,udp
41109,0,1,0
86646,1,0,0
98426,0,1,0
88488,0,1,0
65203,0,1,0
...,...,...,...
60380,0,1,0
81392,0,1,0
74750,1,0,0
87053,0,1,0


##### 5. Escalado del conjunto de datos

Antes de comenzar vamos a recuperar el conjunto de datos limpio y vamos a separar las etiquetas del resto de datos, no necesariamente queremos aplicar las mismas transformaciones en ambos conjuntos

In [62]:
X_train = train_set.drop('class', axis=1)
y_train = train_set['class'].copy()

Como norma general los algoritmos de Machine Learning no se comportan correctamente si los valores de las características de entrada que reciben como entrada se encuentran en rangos muy dispersos. por ello se utilizar distintas técnicas de escalado. **Importante tener en cuenta que estos mecanismos de escalado no deben aplicarse sobre las etiquetas**.
<br>
  * **Normalización:** Los valores del atriuto se escalan para adquirir un valor entre 0 y 1
  * **Estandarización:** Los valores del atributo se escalan y reciben un valor similar, pero no se encuentra dentro de un rango
<br>
<br>
**Es importante que para probar estos valores se realicen transformaciones solo sobre el conjunto de datos de entrenamiento. Después se aplicarán sobre el conjunto de datos de prueba para testear.**

In [63]:
from sklearn.preprocessing import RobustScaler

scale_attrs = X_train[['src_bytes', 'dst_bytes']]

robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(scale_attrs)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=['src_bytes', 'dst_bytes'])

In [64]:
X_train_scaled.head(10)

Unnamed: 0,src_bytes,dst_bytes
0,1.036496,2.274102
1,-0.094891,0.0
2,0.784672,0.0
3,4.379562,4.62949
4,26.408759,0.0
5,1.737226,0.0
6,0.116788,0.081285
7,-0.156934,0.0
8,0.649635,0.638941
9,-0.160584,0.0


In [65]:
X_train.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
41109,0.0,tcp,http,SF,328.0,1203.0,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86646,0.0,icmp,eco_i,SF,18.0,0.0,0,0.0,0.0,0.0,...,1.0,164.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
98426,0.0,tcp,ftp_data,SF,259.0,0.0,0,0.0,0.0,0.0,...,59.0,43.0,0.03,0.05,0.03,0.05,0.0,0.0,0.0,0.0
88488,2.0,tcp,ftp,SF,1244.0,2449.0,0,0.0,0.0,28.0,...,255.0,96.0,0.38,0.02,0.0,0.0,0.0,0.0,0.01,0.0
65203,0.0,tcp,ftp_data,SF,7280.0,0.0,0,0.0,0.0,0.0,...,226.0,67.0,0.23,0.01,0.23,0.03,0.0,0.0,0.0,0.0
30153,0.0,icmp,ecr_i,SF,520.0,0.0,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
74716,0.0,udp,domain_u,SF,76.0,43.0,0,0.0,0.0,0.0,...,255.0,166.0,0.65,0.01,0.16,0.0,0.0,0.0,0.0,0.0
91475,0.0,udp,private,SF,1.0,0.0,0,0.0,0.0,0.0,...,255.0,17.0,0.07,0.16,0.47,0.0,0.0,0.0,0.46,0.0
51329,0.0,tcp,http,SF,222.0,338.0,0,0.0,0.0,0.0,...,3.0,53.0,1.0,0.0,0.33,0.09,0.0,0.02,0.0,0.0
38175,0.0,tcp,private,REJ,0.0,0.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.88,0.0,0.0,0.05,0.0,0.77,1.0
