# __Preprocesamiento de datos para el perceptrón multicapa__

### 1️⃣ __Importación de módulos__

In [13]:
import dask.dataframe as dd
import pandas as pd
import os
import pickle  # Usado en normalize_data para guardar el scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
#from sklearn.model_selection import train_test_split  # Usado para dividir datos en train y test

### 2️⃣ __Leer el CSV preprocesado__

In [14]:
# Open the preprocessed CSV as a Dask DataFrame (materialised later via .compute()).
dataset_csv = './preprocessing_data/dataset_procesado.csv'
ctu13_df = dd.read_csv(dataset_csv)

### 3️⃣ __Obtenemos los flujos de red que solamente pertenecen a una botnet__

In [15]:
# Keep only the rows flagged as botnet traffic (is_botnet == 1).
is_botnet_flow = ctu13_df['is_botnet'] == 1
df_botnets = ctu13_df[is_botnet_flow].copy()

# Materialise the filtered frame and show its (rows, columns) shape.
df_botnets.compute().shape

(338370, 17)

### 4️⃣ __Asignamos una etiqueta objetivo identificando cada tipo de botnet a clasificar__

In [16]:
# Class id -> botnet family name.
escenarios = {0: 'Murlo', 1: 'Neris', 2: 'RBot', 3: 'Virut'}

# Inverted lookup (family name -> class id) used to derive the numeric target.
botnet_to_id = dict(zip(escenarios.values(), escenarios.keys()))
df_botnets['target'] = df_botnets['BOTNET_NAME'].map(
    botnet_to_id,
    meta=('BOTNET_NAME', 'int64'),
)

# Number of flows per class id.
df_botnets['target'].value_counts().compute()

target
2    106352
3     40904
1    184987
0      6127
Name: count, dtype: int64

### 5️⃣ __Eliminamos las columnas innecesarias__

In [17]:
# Columns that are dropped before building the feature matrix.
unused_columns = ['StartTime', 'SrcAddr', 'DstAddr', 'State', 'Label', 'is_botnet']

# Drop them and expose the botnet family name under the column name 'label'.
df_only_botnets = (
    df_botnets
    .drop(unused_columns, axis=1)
    .copy()
    .rename(columns={'BOTNET_NAME': 'label'})
)

df_only_botnets.compute()

Unnamed: 0,Dur,Proto,Sport,Dir,Dport,sTos,dTos,TotPkts,TotBytes,SrcBytes,label,target
923,1.000221,udp,1025.0,<->,53.0,0.0,0.0,4.0,1010.0,156.0,Murlo,0
1103,500.002869,tcp,1039.0,->,80.0,0.0,0.0,14.0,5498.0,531.0,Murlo,0
8354,0.000218,udp,1025.0,<->,53.0,0.0,0.0,2.0,203.0,64.0,Murlo,0
8611,0.000513,udp,1025.0,<->,53.0,0.0,0.0,2.0,590.0,87.0,Murlo,0
8612,0.037986,tcp,1027.0,->,80.0,0.0,0.0,7.0,882.0,629.0,Murlo,0
...,...,...,...,...,...,...,...,...,...,...,...,...
417016,1.308413,tcp,23049.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,3
417097,1.207547,tcp,23050.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,3
417191,1.307525,tcp,23051.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,3
417271,1.308276,tcp,23052.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,3


### __Definir columnas numéricas y categóricas__

In [18]:

# Columns that are min-max scaled later on.
numerical_columns = ['Dur', 'Sport', 'Dport', 'TotPkts', 'TotBytes', 'SrcBytes']

# Every remaining column except the label/target pair is treated as categorical.
# Iterating over the frame's own columns keeps their original order; a set
# difference would yield an order that can change between kernel sessions.
non_feature_columns = {'target', 'label'}
categorical_columns = [
    column
    for column in df_only_botnets.columns
    if column not in numerical_columns and column not in non_feature_columns
]
print("Columnas numericas:", numerical_columns)
print("Columnas categoricas:", categorical_columns)

Columnas numericas: ['Dur', 'Sport', 'Dport', 'TotPkts', 'TotBytes', 'SrcBytes']
Columnas categoricas: ['Proto', 'dTos', 'sTos', 'Dir']


### __Normalizar columnas numéricas__

In [19]:
# Materialise the Dask frame as an in-memory pandas DataFrame; the
# scikit-learn transformers below are applied to the pandas frame.
df_only_botnets_pd = df_only_botnets.compute()
print("Tipo de DataFrame:", type(df_only_botnets_pd))

# Min-max scale the numerical columns, overwriting them in the pandas frame.
# NOTE(review): the scaler is fitted on every botnet flow; if a train/test
# split is performed downstream, confirm this does not leak test statistics.
scaler = MinMaxScaler()
df_only_botnets_pd[numerical_columns] = scaler.fit_transform(df_only_botnets_pd[numerical_columns])

# Persist the fitted scaler. The output directory is created when missing so
# the cell works on a fresh checkout, and the context manager closes the file
# handle even if pickling fails.
os.makedirs('pklfiles', exist_ok=True)
with open(os.path.join('pklfiles', 'scaletrainBotnet.pkl'), 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Create one one-hot encoder per categorical column (dense output).
# NOTE(review): unlike the scaler, the fitted encoders are not persisted here;
# confirm whether they are needed at inference time.
ohe_stos = OneHotEncoder(sparse_output=False)
ohe_dtos = OneHotEncoder(sparse_output=False)
ohe_dir = OneHotEncoder(sparse_output=False)
ohe_proto = OneHotEncoder(sparse_output=False)

# Fit each encoder on its own column.
ohe_stos.fit(df_only_botnets_pd[['sTos']])
ohe_dtos.fit(df_only_botnets_pd[['dTos']])
ohe_dir.fit(df_only_botnets_pd[['Dir']])
ohe_proto.fit(df_only_botnets_pd[['Proto']])

# Apply one-hot encoding to a single column and concatenate the result with pandas.
def apply_onehot_encoding(df, col_name, encoder):
    """Replace ``col_name`` in ``df`` with its one-hot encoded columns.

    Args:
        df: pandas DataFrame that contains ``col_name``.
        col_name: name of the column to encode.
        encoder: fitted encoder exposing ``transform`` and ``categories_``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the remaining columns of
        ``df`` followed by one int64 column per category, named
        ``<col_name>_<category>``. ``df`` itself is not modified.
    """
    one_hot_values = encoder.transform(df[[col_name]])
    one_hot_names = [f'{col_name}_{cat}' for cat in encoder.categories_[0]]
    one_hot_frame = pd.DataFrame(one_hot_values, dtype='int64', columns=one_hot_names)

    # The index is reset so the rows line up positionally with the encoded frame.
    remaining = df.drop(columns=[col_name]).reset_index(drop=True)
    return pd.concat([remaining, one_hot_frame], axis=1)

# Work on a copy so the scaled pandas frame stays available unchanged.
encoded_df = df_only_botnets_pd.copy()

# One-hot encode each categorical column with its fitted encoder.
for column_name, fitted_encoder in (
    ('sTos', ohe_stos),
    ('dTos', ohe_dtos),
    ('Dir', ohe_dir),
    ('Proto', ohe_proto),
):
    encoded_df = apply_onehot_encoding(encoded_df, column_name, fitted_encoder)

# Shuffle the rows and reset the index. A fixed seed makes the shuffle, and
# therefore any file written from this frame, reproducible across runs.
SHUFFLE_SEED = 42
encoded_df = encoded_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Split off the numeric target and the textual label; after these pops
# encoded_df no longer contains either column.
target = encoded_df.pop('target')
label = encoded_df.pop('label')

# Final result
encoded_df, target, label

Tipo de DataFrame: <class 'pandas.core.frame.DataFrame'>


(                 Dur     Sport         Dport   TotPkts      TotBytes  \
 0       6.158122e-04  0.055407  8.199873e-07  0.000687  7.242618e-04   
 1       9.166939e-08  0.019685  9.646909e-08  0.000006  2.854290e-06   
 2       2.676736e-01  0.102849  1.391381e-07  0.000006  1.493457e-05   
 3       6.898538e-06  0.028367  9.646909e-08  0.000006  1.470392e-06   
 4       3.742695e-04  0.017884  8.199873e-07  0.000028  2.205588e-06   
 ...              ...       ...           ...       ...           ...   
 338365  3.631588e-04  0.207132  8.199873e-07  0.000028  2.205588e-06   
 338366  0.000000e+00  0.059604  4.452420e-08  0.000000  1.441561e-08   
 338367  1.892500e-02  0.032884  1.465588e-07  0.000970  1.043027e-03   
 338368  5.702947e-06  0.015610  9.646909e-08  0.000006  2.169549e-06   
 338369  4.330374e-03  0.069400  1.215121e-04  0.000170  8.198877e-05   
 
             SrcBytes  sTos_0.0  dTos_0.0  Dir_   ->  Dir_  <->  Proto_icmp  \
 0       6.737135e-05         1         1  

In [23]:
# Preview the first five rows of the encoded feature matrix.
encoded_df.head(n=5)

Unnamed: 0,Dur,Sport,Dport,TotPkts,TotBytes,SrcBytes,sTos_0.0,dTos_0.0,Dir_ ->,Dir_ <->,Proto_icmp,Proto_tcp,Proto_udp
0,0.0006158122,0.055407,8.199873e-07,0.000687,0.000724,6.737135e-05,1,1,1,0,0,1,0
1,9.166939e-08,0.019685,9.646909e-08,6e-06,3e-06,6.487024e-08,1,1,0,1,0,0,1
2,0.2676736,0.102849,1.391381e-07,6e-06,1.5e-05,1.493457e-05,1,1,1,0,1,0,0
3,6.898538e-06,0.028367,9.646909e-08,6e-06,1e-06,9.370145e-08,1,1,0,1,0,0,1
4,0.0003742695,0.017884,8.199873e-07,2.8e-05,2e-06,9.081833e-07,1,1,1,0,0,1,0


In [22]:
# Preview the first five numeric class ids.
target.head(n=5)

0    1
1    1
2    2
3    1
4    3
Name: target, dtype: int64

In [25]:
# Preview the first five botnet family names.
label.head(n=5)

0    Neris
1    Neris
2     RBot
3    Neris
4    Virut
Name: label, dtype: object

### __Generamos la matriz de características y el vector de etiquetas en sus respectivos archivos .csv__

In [26]:
# Write the feature matrix and the target vector as CSV files without the row
# index. The output directory is created when missing so the cell also works
# on a fresh checkout.
os.makedirs('mlp_data', exist_ok=True)
encoded_df.to_csv(os.path.join('mlp_data', 'X_features_matrix.csv'), index=False)
target.to_csv(os.path.join('mlp_data', 'Y_target_vector.csv'), index=False)