# __Preprocesamiento de datos para el perceptron multicapa__

### 1️⃣ __Importacion de modulos__

In [41]:
import dask.dataframe as dd
import pandas as pd
import os
import pickle  # Usado en normalize_data para guardar el scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split  # Usado para dividir datos en train y test

### 2️⃣ __Leer el CSV preprocesado__

In [16]:
ctu13_df = dd.read_csv('./preprocessing_data/dataset_procesado.csv')

### 3️⃣ __Obtenemos los flujos de red que solamente pertenecen a una botnet__

In [17]:
df_botnets = ctu13_df[ctu13_df['is_botnet'] == 1].copy()
df_botnets.compute().shape

(339700, 17)

### 4️⃣ __Asignamos una etiqueta objetivo idnetificando cada tipo de botnet a clasificar__

In [18]:
escenarios = {0: 'Sogou', 1: 'Murlo', 2: 'Neris', 3: 'RBot', 4: 'NsisAy', 5: 'Virut'}
df_botnets['target'] = df_botnets['BOTNET_NAME'].map({v: k for k, v in escenarios.items()}, meta=('BOTNET_NAME', 'int64'))
df_botnets['target'].value_counts().compute()

target
3    106352
5     40003
1      6127
0        63
2    184987
4      2168
Name: count, dtype: int64

### 5️⃣ __Eliminamos las columnas innecesarias__

In [61]:
df_only_botnets = df_botnets.drop(['StartTime', 'SrcAddr', 'DstAddr', 'State', 'Label', 'is_botnet'], axis=1).copy()
df_only_botnets = df_only_botnets.rename(columns={'BOTNET_NAME': 'label'})

df_only_botnets.compute()

Unnamed: 0,Dur,Proto,Sport,Dir,Dport,sTos,dTos,TotPkts,TotBytes,SrcBytes,label,target
14659,0.000274,udp,1025.0,<->,53.0,0.0,0.0,2.0,203.0,64.0,Sogou,0
14888,0.010509,udp,1025.0,<->,53.0,0.0,0.0,2.0,590.0,87.0,Sogou,0
14892,0.054527,tcp,1027.0,->,80.0,0.0,0.0,7.0,882.0,629.0,Sogou,0
15090,0.000502,udp,1025.0,<->,53.0,0.0,0.0,2.0,479.0,76.0,Sogou,0
15092,2.995112,udp,123.0,->,123.0,0.0,0.0,2.0,180.0,180.0,Sogou,0
...,...,...,...,...,...,...,...,...,...,...,...,...
411906,1.308413,tcp,23049.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5
411987,1.207547,tcp,23050.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5
412081,1.307525,tcp,23051.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5
412161,1.308276,tcp,23052.0,->,443.0,0.0,0.0,6.0,366.0,186.0,Virut,5


### __Definir columnas numéricas y categóricas__

In [62]:

numerical_columns = ['Dur', 'Sport', 'Dport', 'TotPkts', 'TotBytes', 'SrcBytes']
categorical_columns = list(set(df_only_botnets.columns) - set(numerical_columns))
categorical_columns.remove('target')
categorical_columns.remove('label')
print("Columnas numericas:", numerical_columns)
print("Columnas categoricas:", categorical_columns)

Columnas numericas: ['Dur', 'Sport', 'Dport', 'TotPkts', 'TotBytes', 'SrcBytes']
Columnas categoricas: ['Dir', 'Proto', 'dTos', 'sTos']


### __Normalizar columnas numéricas__

In [63]:
scaler = MinMaxScaler()
scaler.fit(df_only_botnets[numerical_columns].compute())

df_only_botnets[numerical_columns] = df_only_botnets[numerical_columns].map_partitions(lambda df: scaler.transform(df), meta={col: 'float64' for col in numerical_columns})
# Guardar el scaler
pickle.dump(scaler, open(os.path.join('pklfiles', 'scaletrainBotnet.pkl'), 'wb'))

### __Ajustar codificadores one-hot__

In [64]:
# Crear y ajustar codificadores
ohe_stos = OneHotEncoder(sparse_output=True)
ohe_dtos = OneHotEncoder(sparse_output=True)
ohe_dir = OneHotEncoder(sparse_output=True)
ohe_proto = OneHotEncoder(sparse_output=True)

ohe_stos.fit(df_only_botnets['sTos'].compute().values.reshape(-1, 1))
ohe_dtos.fit(df_only_botnets['dTos'].compute().values.reshape(-1, 1))
ohe_dir.fit(df_only_botnets['Dir'].compute().values.reshape(-1, 1))
ohe_proto.fit(df_only_botnets['Proto'].compute().values.reshape(-1, 1))

### __Guardar los codificadores__

In [65]:
# Guardar codificadores
pickle.dump(ohe_stos, open(os.path.join('pklfiles', 'ohe_stosTrainBotnet.pkl'), 'wb'))
pickle.dump(ohe_dtos, open(os.path.join('pklfiles', 'ohe_dtosTrainBotnet.pkl'), 'wb'))
pickle.dump(ohe_dir, open(os.path.join('pklfiles', 'ohe_dirTrainBotnet.pkl'), 'wb'))
pickle.dump(ohe_proto, open(os.path.join('pklfiles', 'ohe_protoTrainBotnet.pkl'), 'wb'))

### __Transformar y concatenar columnas categóricas__

In [69]:
# Aplicar OneHotEncoding y concatenar con Dask
'''
def apply_onehot_encoding(df, col_name, encoder):
    def encode_partition(partition):
        encoded = encoder.transform(partition[col_name].values.reshape(-1, 1)).toarray()
        encoded_df = pd.DataFrame(encoded, dtype='int64', columns=[f'{col_name}_{cat}' for cat in encoder.categories_[0]])
        partition = partition.drop(columns=[col_name]).reset_index(drop=True)
        return pd.concat([partition, encoded_df], axis=1)
    
    return df.map_partitions(encode_partition, meta={**{col: 'float64' for col in df.columns if col != col_name}, **{f'{col_name}_{cat}': 'int64' for cat in encoder.categories_[0]}})

# Aplicar OneHotEncoder en columnas
encoded_df = df_only_botnets.copy()
encoded_df = apply_onehot_encoding(encoded_df, 'sTos', ohe_stos)
encoded_df = apply_onehot_encoding(encoded_df, 'dTos', ohe_dtos)
encoded_df = apply_onehot_encoding(encoded_df, 'Dir', ohe_dir)
encoded_df = apply_onehot_encoding(encoded_df, 'Proto', ohe_proto)

# Mezclar filas y resetear el índice
encoded_df = encoded_df.sample(frac=1).reset_index(drop=True)

# Separar target y label
target = encoded_df['target']
label = encoded_df['label']
encoded_df = encoded_df.drop(['target', 'label'], axis=1)

# Resultado
encoded_df, target, label
'''

# Convertir de Dask a Pandas
df_only_botnets_pd = df_only_botnets.compute()
print("Tipo de DataFrame:", type(df_only_botnets_pd))
# Aplicar OneHotEncoding y concatenar con Pandas
def apply_onehot_encoding(df, col_name, encoder):
    encoded = encoder.transform(df[col_name].values.reshape(-1, 1)).toarray()
    encoded_df = pd.DataFrame(encoded, dtype='int64', columns=[f'{col_name}_{cat}' for cat in encoder.categories_[0]])
    df = df.drop(columns=[col_name]).reset_index(drop=True)
    return pd.concat([df, encoded_df], axis=1)

# Copiar DataFrame
encoded_df = df_only_botnets_pd.copy()

# Aplicar OneHotEncoder en columnas
encoded_df = apply_onehot_encoding(encoded_df, 'sTos', ohe_stos)
encoded_df = apply_onehot_encoding(encoded_df, 'dTos', ohe_dtos)
encoded_df = apply_onehot_encoding(encoded_df, 'Dir', ohe_dir)
encoded_df = apply_onehot_encoding(encoded_df, 'Proto', ohe_proto)

# Mezclar filas y resetear el índice
encoded_df = encoded_df.sample(frac=1).reset_index(drop=True)

# Separar target y label
target = encoded_df['target']
label = encoded_df['label']
encoded_df = encoded_df.drop(['target', 'label'], axis=1)

# Resultado
encoded_df, target, label


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [59]:

encoded_df = encoded_df.to_dask_dataframe()
target = target.to_dask_dataframe()
label = label.to_dask_dataframe()

encoded_df = encoded_df.persist()
target = target.persist()
label = label.persist()

print(encoded_df.head())
print(target.head())
print(label.head())




IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices