# __Preprocesamiento de datos para el perceptrón multicapa__

### 1️⃣ __Importación de módulos__

In [None]:
import dask.dataframe as dd
import pandas as pd
import os
import pickle  # Usado en normalize_data para guardar el scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
#from sklearn.model_selection import train_test_split  # Usado para dividir datos en train y test

### 2️⃣ __Leer el CSV preprocesado__

In [2]:
# Load the preprocessed dataset as a Dask DataFrame; rows are only
# materialized when .compute() is called in later cells.
ctu13_df = dd.read_csv('./preprocessing_data/dataset_procesado.csv')

### 3️⃣ __Obtenemos los flujos de red que solamente pertenecen a una botnet__

In [3]:
# Boolean mask selecting the flows flagged as botnet traffic.
es_flujo_botnet = ctu13_df['is_botnet'] == 1
df_botnets = ctu13_df[es_flujo_botnet].copy()

# Materialize the filtered frame to report its (rows, columns) shape.
df_botnets.compute().shape

(339700, 17)

### 4️⃣ __Asignamos una etiqueta objetivo identificando cada tipo de botnet a clasificar__

In [4]:
# Numeric class id -> botnet family name.
escenarios = {
    0: 'Sogou',
    1: 'Murlo',
    2: 'Neris',
    3: 'RBot',
    4: 'NsisAy',
    5: 'Virut',
}

# Inverted lookup (family name -> class id) used to derive the target column.
id_por_botnet = {nombre: clase for clase, nombre in escenarios.items()}
df_botnets['target'] = df_botnets['BOTNET_NAME'].map(id_por_botnet, meta=('BOTNET_NAME', 'int64'))

# Number of flows per class.
df_botnets['target'].value_counts().compute()

target
3    106352
5     40003
1      6127
0        63
2    184987
4      2168
Name: count, dtype: int64

### 5️⃣ __Eliminamos las columnas innecesarias__

In [5]:
# Columns dropped before building the feature matrix.
columnas_descartadas = ['StartTime', 'SrcAddr', 'DstAddr', 'State', 'Label', 'is_botnet']

# Drop them and expose the botnet family name under the shorter name 'label'.
df_only_botnets = (
    df_botnets
    .drop(columnas_descartadas, axis=1)
    .copy()
    .rename(columns={'BOTNET_NAME': 'label'})
)

df_only_botnets.compute()

Unnamed: 0,Dur,Proto,Sport,Dir,Dport,sTos,dTos,TotPkts,TotBytes,SrcBytes,label,target
14659,0.000274,udp,1025.0,<->,53.0,0.0,0.0,2.0,203.0,64.0,Sogou,0
14888,0.010509,udp,1025.0,<->,53.0,0.0,0.0,2.0,590.0,87.0,Sogou,0
14892,0.054527,tcp,1027.0,->,80.0,0.0,0.0,7.0,882.0,629.0,Sogou,0
15090,0.000502,udp,1025.0,<->,53.0,0.0,0.0,2.0,479.0,76.0,Sogou,0
15092,2.995112,udp,123.0,->,123.0,0.0,0.0,2.0,180.0,180.0,Sogou,0
...,...,...,...,...,...,...,...,...,...,...,...,...
411906,1.308413,tcp,23049.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5
411987,1.207547,tcp,23050.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5
412081,1.307525,tcp,23051.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5
412161,1.308276,tcp,23052.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5


### __Definir columnas numéricas y categóricas__

In [6]:

# Continuous features; these are the columns rescaled by the scaler later on.
numerical_columns = ['Dur', 'Sport', 'Dport', 'TotPkts', 'TotBytes', 'SrcBytes']

# Every other column, except the two output columns ('target' and 'label'),
# is treated as categorical. Iterating over the DataFrame's columns instead of
# subtracting sets keeps the original column order, so the resulting list is
# the same on every kernel start (set ordering of strings depends on hash
# randomization and can differ between runs).
categorical_columns = [
    col for col in df_only_botnets.columns
    if col not in numerical_columns and col not in ('target', 'label')
]
print("Columnas numericas:", numerical_columns)
print("Columnas categoricas:", categorical_columns)

Columnas numericas: ['Dur', 'Sport', 'Dport', 'TotPkts', 'TotBytes', 'SrcBytes']
Columnas categoricas: ['Proto', 'dTos', 'sTos', 'Dir']


### __Normalizar columnas numéricas y codificar columnas categóricas__

In [7]:
# Materialize the lazy Dask DataFrame as an in-memory pandas DataFrame before
# handing it to the scikit-learn transformers.
df_only_botnets_pd = df_only_botnets.compute()
print("Tipo de DataFrame:", type(df_only_botnets_pd))

# Rescale the numeric columns with a min-max scaler.
# NOTE(review): the scaler is fitted on every row; if a train/test split is
# performed downstream, the test rows have already influenced the fitted
# min/max values — confirm this is intended.
scaler = MinMaxScaler()
df_only_botnets_pd[numerical_columns] = scaler.fit_transform(df_only_botnets_pd[numerical_columns])

# Persist the fitted scaler. The output directory is created if missing and
# the file is opened in a context manager so the handle is always closed
# (the previous bare open() call leaked it).
os.makedirs('pklfiles', exist_ok=True)
with open(os.path.join('pklfiles', 'scaletrainBotnet.pkl'), 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# One dense one-hot encoder per categorical column.
ohe_stos = OneHotEncoder(sparse_output=False)
ohe_dtos = OneHotEncoder(sparse_output=False)
ohe_dir = OneHotEncoder(sparse_output=False)
ohe_proto = OneHotEncoder(sparse_output=False)

# Fit each encoder on its own column (double brackets keep a 2-D input).
ohe_stos.fit(df_only_botnets_pd[['sTos']])
ohe_dtos.fit(df_only_botnets_pd[['dTos']])
ohe_dir.fit(df_only_botnets_pd[['Dir']])
ohe_proto.fit(df_only_botnets_pd[['Proto']])

# Aplicar OneHotEncoding y concatenar con Pandas
def apply_onehot_encoding(df, col_name, encoder):
    """Replace one categorical column with its one-hot indicator columns.

    Args:
        df: pandas DataFrame containing ``col_name``. It is not modified.
        col_name: Name of the column to encode.
        encoder: Already-fitted encoder exposing ``transform`` and
            ``categories_`` (as the OneHotEncoder instances in this notebook do).

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which ``col_name`` is
        removed and one int64 column named ``{col_name}_{category}`` per
        encoder category is appended on the right.
    """
    indicator_values = encoder.transform(df[[col_name]])
    indicator_names = [f'{col_name}_{cat}' for cat in encoder.categories_[0]]
    indicators = pd.DataFrame(indicator_values, dtype='int64', columns=indicator_names)

    # Reset the index so the positional index of `indicators` lines up row by
    # row when both frames are concatenated side by side.
    remaining = df.drop(columns=[col_name]).reset_index(drop=True)
    return pd.concat([remaining, indicators], axis=1)

# Work on a copy so the scaled DataFrame is left untouched.
encoded_df = df_only_botnets_pd.copy()

# Replace each categorical column with its one-hot indicator columns.
encoded_df = apply_onehot_encoding(encoded_df, 'sTos', ohe_stos)
encoded_df = apply_onehot_encoding(encoded_df, 'dTos', ohe_dtos)
encoded_df = apply_onehot_encoding(encoded_df, 'Dir', ohe_dir)
encoded_df = apply_onehot_encoding(encoded_df, 'Proto', ohe_proto)

# Shuffle the rows and rebuild a clean 0..n-1 index. The fixed random_state
# makes the shuffle (and therefore the exported files) reproducible across
# Restart & Run All; without it every run produced a different row order.
encoded_df = encoded_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the output columns off the feature matrix; pop() removes them from
# encoded_df, and both series keep the shuffled row order.
target = encoded_df.pop('target')
label = encoded_df.pop('label')

# Display the final feature matrix, numeric target and string label.
encoded_df, target, label

Tipo de DataFrame: <class 'pandas.core.frame.DataFrame'>


(                 Dur     Sport         Dport   TotPkts      TotBytes  \
 0       4.018275e-04  0.017197  1.236660e-05  0.000028  2.205588e-06   
 1       2.474820e-01  0.673355  4.062833e-07  0.000006  1.493457e-05   
 2       9.139160e-08  0.043825  9.646909e-08  0.000006  2.854290e-06   
 3       8.174055e-03  0.056063  1.215121e-04  0.001027  1.209635e-03   
 4       0.000000e+00  0.060763  4.452420e-08  0.000000  1.441561e-08   
 ...              ...       ...           ...       ...           ...   
 339695  2.503552e-03  0.061404  4.452420e-08  0.000011  9.081833e-07   
 339696  0.000000e+00  0.465552  9.646909e-08  0.000000  7.251051e-06   
 339697  2.503762e-03  0.026689  4.452420e-08  0.000011  9.081833e-07   
 339698  8.343373e-04  0.025865  4.452420e-08  0.000006  4.612995e-07   
 339699  6.287937e-05  0.042330  1.465588e-07  0.000057  1.982867e-05   
 
             SrcBytes  sTos_0.0  dTos_0.0  dTos_2.0  Dir_   ->  Dir_  <->  \
 0       9.081833e-07         1         1    

In [8]:
# Preview the first rows of the scaled, one-hot encoded feature matrix.
encoded_df.head()

Unnamed: 0,Dur,Sport,Dport,TotPkts,TotBytes,SrcBytes,sTos_0.0,dTos_0.0,dTos_2.0,Dir_ ->,Dir_ <->,Proto_icmp,Proto_rtp,Proto_tcp,Proto_udp
0,0.0004018275,0.017197,1.23666e-05,2.8e-05,2.205588e-06,9.081833e-07,1,1,0,1,0,0,0,1,0
1,0.247482,0.673355,4.062833e-07,6e-06,1.493457e-05,1.493457e-05,1,1,0,1,0,1,0,0,0
2,9.13916e-08,0.043825,9.646909e-08,6e-06,2.85429e-06,6.487024e-08,1,1,0,0,1,0,0,0,1
3,0.008174055,0.056063,0.0001215121,0.001027,0.001209635,0.001176364,1,1,0,1,0,0,0,1,0
4,0.0,0.060763,4.45242e-08,0.0,1.441561e-08,1.441561e-08,1,1,0,1,0,0,0,1,0


In [9]:
# Preview the numeric class ids; rows are in the same order as encoded_df.
target.head()

0    2
1    3
2    2
3    2
4    2
Name: target, dtype: int64

In [10]:
# Preview the botnet family names; rows are in the same order as encoded_df.
label.head()

0    Neris
1     RBot
2    Neris
3    Neris
4    Neris
Name: label, dtype: object

### __Generamos la matriz de características y el vector de etiquetas en sus respectivos archivos .csv__

In [12]:
# Create the output directory if it does not exist yet, so the export does
# not fail on a fresh checkout.
os.makedirs('mlp_data', exist_ok=True)

# Export the feature matrix and the target vector. index=False keeps the
# positional index out of the files.
encoded_df.to_csv(os.path.join('mlp_data', 'X_features_matrix.csv'), index=False)
target.to_csv(os.path.join('mlp_data', 'Y_target_vector.csv'), index=False)