## Importación de librerías

In [20]:
import pandas as pd
import json
import numpy as np
import math
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

## Configuración de variables globales

In [21]:
# Load the run configuration once; every setting below is read from it.
with open('setup.json', 'r') as setup_file:
    SETUP_JSON = json.load(setup_file)

# Input Parquet file holding the raw dataset.
RAW_DATASET = SETUP_JSON['raw_dataset_parquet']
# Output file written by the export cell at the end of the notebook.
FINAL_DATASET = SETUP_JSON['final_dataset']
# True keeps the full dataset; False keeps only a per-label sample.
TAKE_FULL_DATASET = SETUP_JSON['take_full_dataset']


## Análisis

In [22]:
# Read the raw dataset from the Parquet file named in the configuration.
df = pd.read_parquet(path=RAW_DATASET)
# Show (rows, columns) as the cell output.
df.shape

(8284254, 80)

In [23]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.3000358938,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.5512985522,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.9345955667,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


Verificar si hay encabezados duplicados

In [24]:
# A row whose 'Label' value is the literal text 'Label' is a header line that
# ended up inside the data; collect those rows and count them.
is_header_row = df['Label'].eq('Label')
df_headers = df.loc[is_header_row]
len(df_headers)

59

Ver la cantidad de filas con valores faltantes

In [25]:
# Flag every row that has at least one missing value, then tally flagged
# versus clean rows.
row_has_missing = df.isna().any(axis=1)
row_has_missing.value_counts()

False    8284254
Name: count, dtype: int64

Ver los tipos de datos de las columnas

In [26]:
# List the dtype pandas assigned to each column of the raw frame.
df.dtypes

Dst Port         object
Protocol         object
Timestamp        object
Flow Duration    object
Tot Fwd Pkts     object
                  ...  
Idle Mean        object
Idle Std         object
Idle Max         object
Idle Min         object
Label            object
Length: 80, dtype: object

## Preparación de los datos

Eliminar encabezados duplicados

In [27]:
# Keep only real records, i.e. rows whose 'Label' is not the header text.
is_data_row = df['Label'].ne('Label')
df = df.loc[is_data_row]
df.shape

(8284195, 80)

Asignar tipo de dato correspondiente a cada columna

In [28]:
# Give every distinct label an integer code, numbered in order of first appearance.
distinct_labels = df['Label'].unique()
label_mapping = {name: code for code, name in enumerate(distinct_labels)}
df['Label Code'] = df['Label'].map(label_mapping)

In [29]:
# Columns that get their own conversion below; every other column is parsed
# as float64.
special_columns = ['Label', 'Timestamp', 'Protocol', 'Dst Port', 'Label Code']

# Index.drop works on the column labels only, so no temporary copy of the
# whole frame is built just to enumerate the remaining names.
for col in df.columns.drop(special_columns):
    df[col] = df[col].astype('float64')

# Columns without a fractional part are stored as integers. The width is
# spelled out because plain `int` can resolve to a 32-bit integer on some
# platforms.
for col in ('Protocol', 'Dst Port', 'Label Code'):
    df[col] = df[col].astype('int64')

# Parse the 'dd/mm/YYYY HH:MM:SS' text. Values that do not match the format
# become NaT (errors='coerce') and therefore NaN in the encoded column.
timestamps = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

# Encode the hour of day as a sine over a 24-hour period.
# NOTE(review): sine alone maps distinct hours to the same value (e.g. 01:00
# and 11:00); consider adding a cosine component if the model must tell them apart.
df['Timestamp'] = np.sin(2*math.pi*timestamps.dt.hour/24)

In [30]:
# Print the frame summary (index range, columns and their dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8284195 entries, 0 to 8284253
Data columns (total 81 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           int64  
 1   Protocol           int64  
 2   Timestamp          float64
 3   Flow Duration      float64
 4   Tot Fwd Pkts       float64
 5   Tot Bwd Pkts       float64
 6   TotLen Fwd Pkts    float64
 7   TotLen Bwd Pkts    float64
 8   Fwd Pkt Len Max    float64
 9   Fwd Pkt Len Min    float64
 10  Fwd Pkt Len Mean   float64
 11  Fwd Pkt Len Std    float64
 12  Bwd Pkt Len Max    float64
 13  Bwd Pkt Len Min    float64
 14  Bwd Pkt Len Mean   float64
 15  Bwd Pkt Len Std    float64
 16  Flow Byts/s        float64
 17  Flow Pkts/s        float64
 18  Flow IAT Mean      float64
 19  Flow IAT Std       float64
 20  Flow IAT Max       float64
 21  Flow IAT Min       float64
 22  Fwd IAT Tot        float64
 23  Fwd IAT Mean       float64
 24  Fwd IAT Std        float64
 25  Fwd IAT Max        floa

Eliminar filas con valores infinitos

In [31]:
# Mark the rows where any column holds +inf or -inf.
row_has_inf = df.isin([np.inf, -np.inf]).any(axis=1)
# Keep the remaining rows and renumber the index from 0.
df = df.loc[~row_has_inf].reset_index(drop=True)
df.shape

(8247888, 81)

## Selección de características

In [32]:
# Maximum number of features kept by the univariate selection step.
N_BEST_FEATURES = 30

# Feature matrix: every column except the encoded timestamp and the two label columns.
X = df.drop(columns=['Timestamp', 'Label', 'Label Code'])
# Numeric label codes as a NumPy array.
y = df['Label Code'].values

# Remove zero-variance (constant) columns.
selector_var = VarianceThreshold(threshold=0.0)
# Cast the result to float32 to reduce the memory footprint of the matrix
# handed to SelectKBest.
X_var = selector_var.fit_transform(X).astype(np.float32)
features_non_constant = X.columns[selector_var.get_support()]

# Score the remaining columns with the ANOVA F-test and keep the best ones.
# k is clamped to the number of columns that survived the variance filter so
# the step also works when fewer than N_BEST_FEATURES columns are left.
k_best = min(N_BEST_FEATURES, X_var.shape[1])
selector_kbest = SelectKBest(score_func=f_classif, k=k_best)
selector_kbest.fit(X_var, y)
# Positions of the kept columns within the non-constant column list.
selected_mask = selector_kbest.get_support(indices=True)
selected_features = features_non_constant[selected_mask]

# Rebuild the frame with the timestamp first, then the selected features,
# then the label columns.
cols_final = ['Timestamp'] + selected_features.tolist() + ['Label', 'Label Code']
df = df.loc[:, cols_final]


Agregar columna de ID de secuencia

In [33]:
# A new sequence starts wherever the label differs from the previous row's
# label; the running count of those change points is the sequence ID.
label_changed = df["Label"].ne(df["Label"].shift())
df["Sequence ID"] = label_changed.cumsum()

In [34]:
if not TAKE_FULL_DATASET:
    # Per-label row caps from setup.json; labels that are absent from the
    # mapping or capped at 0 (or below) are left out entirely.
    balances = SETUP_JSON["balances"]
    sampled_groups = []
    for label, group in df.groupby("Label"):
        max_rows = balances.get(label, 0)
        if max_rows > 0:
            # Keep the first `max_rows` rows of this label, in their current order.
            sampled_groups.append(group.head(max_rows))

    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with a message that points at the configuration instead.
    if not sampled_groups:
        raise ValueError(
            "setup.json 'balances' does not select any label present in the dataset"
        )
    df = pd.concat(sampled_groups, ignore_index=True)

In [35]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,Timestamp,Dst Port,Protocol,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Mean,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,...,Fwd Seg Size Avg,Bwd Seg Size Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Init Fwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Label,Label Code,Sequence ID
0,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,2
1,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,2
2,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,2
3,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,2
4,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,2


In [36]:
# Preview the last five rows of the prepared frame.
df.tail(n=5)

Unnamed: 0,Timestamp,Dst Port,Protocol,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Mean,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,...,Fwd Seg Size Avg,Bwd Seg Size Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Init Fwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Label,Label Code,Sequence ID
145,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,4
146,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,4
147,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,4
148,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,4
149,0.5,21,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,26883.0,0.0,40.0,FTP-BruteForce,1,4


## Exportación del dataframe final en formato Parquet

In [37]:
# Pick the writer from the output file's extension so the file contents and
# the confirmation message always agree: a '.csv' target is written as CSV,
# anything else as Parquet.
if str(FINAL_DATASET).lower().endswith('.csv'):
    output_format = 'CSV'
    df.to_csv(FINAL_DATASET, index=False)
else:
    output_format = 'Parquet'
    df.to_parquet(FINAL_DATASET, index=False)
print(f"Archivo {output_format} guardado como {FINAL_DATASET}")

Archivo Parquet guardado como prueba.csv
