## Importación de librerías

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
import math

## Configuración de variables globales

In [2]:
# Load the notebook settings from setup.json. The encoding is pinned to UTF-8
# (the standard JSON encoding) so decoding does not depend on the platform locale.
with open('setup.json', 'r', encoding='utf-8') as f:
    SETUP_JSON = json.load(f)
RAW_DATASET = SETUP_JSON['raw_dataset_parquet']  # Input Parquet file
FINAL_DATASET = SETUP_JSON['final_dataset']  # Output Parquet file
BALANCES = SETUP_JSON["balances"]  # Per-label row counts used when carving out the test sample

## Cargar el dataset bruto

In [3]:
# Read the raw dataset from the Parquet file named in the configuration.
df = pd.read_parquet(RAW_DATASET)
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(8284254, 80)

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.3000358938,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.5512985522,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.9345955667,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


## Limpieza

Verificar si hay encabezados duplicados

In [5]:
# Rows whose 'Label' cell holds the literal text 'Label' are header lines
# that ended up inside the data; count how many there are.
is_header_row = df['Label'] == 'Label'
df_headers = df.loc[is_header_row]
len(df_headers)

59

Eliminar encabezados duplicados

In [6]:
# Keep only real records, i.e. rows whose 'Label' is not the header text.
is_data_row = df['Label'] != 'Label'
df = df.loc[is_data_row]
df.shape

(8284195, 80)

Ver la cantidad de filas con valores faltantes

In [7]:
# Flag every row holding at least one missing value, then count flagged vs. clean rows.
row_has_missing = df.isna().any(axis=1)
row_has_missing.value_counts()

False    8284195
Name: count, dtype: int64

## Conversión de tipos

Convertimos Timestamp a formato de fecha

In [8]:
# Parse the day/month/year strings into datetimes; values that do not match
# the format become NaT instead of raising.
timestamp_format = '%d/%m/%Y %H:%M:%S'
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce', format=timestamp_format)
df.dtypes

Dst Port                 object
Protocol                 object
Timestamp        datetime64[ns]
Flow Duration            object
Tot Fwd Pkts             object
                      ...      
Idle Mean                object
Idle Std                 object
Idle Max                 object
Idle Min                 object
Label                    object
Length: 80, dtype: object

Ordenamos por fecha de forma ascendente

In [9]:
# Order the rows chronologically and renumber the index from 0.
df = (
    df
    .sort_values(by='Timestamp')
    .reset_index(drop=True)
)
df.loc[:, 'Timestamp'].head()

0   1970-01-05 03:01:17
1   1970-01-08 07:32:33
2   1970-01-10 03:04:26
3   1970-01-11 03:51:32
4   1970-01-11 05:12:30
Name: Timestamp, dtype: datetime64[ns]

Eliminar las filas con fechas fuera de rango (anteriores o iguales al 14/02/2018 00:00:00)

In [10]:
# Keep only rows stamped strictly after the start of 2018-02-14; earlier
# timestamps (and NaT left by failed parses, which never compare greater)
# are discarded.
range_start = pd.Timestamp('2018-02-14 00:00:00')
rows_before = len(df)
df = df.loc[df['Timestamp'] > range_start].reset_index(drop=True)
print("filas dropeadas: ", rows_before - len(df))
df.head()

filas dropeadas:  14


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,2018-02-14 01:00:00,111873906,7,3,169,92,46,0,...,20,155665.5,161.9274528917,155780,155551,55780941.0,3933152.77691599,58562100,52999782,Benign
1,3389,6,2018-02-14 01:00:00,4363661,8,11,1148,1581,677,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,3389,6,2018-02-14 01:00:00,1671932,8,7,1144,1581,677,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,3389,6,2018-02-14 01:00:00,3641507,8,10,1148,1581,677,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,80,6,2018-02-14 01:00:00,89,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


Crear nueva columna con el código por cada etiqueta

In [11]:
# Give each distinct label an integer code, numbered in order of first appearance.
distinct_labels = df['Label'].unique()
label_mapping = dict(zip(distinct_labels, range(len(distinct_labels))))
df['Label Code'] = df['Label'].map(label_mapping)

Codificación de la hora del Timestamp como característica cíclica (seno de la hora del día) para que el modelo pueda usarla

In [None]:
# 'Timestamp' is already datetime64 at this point (it was parsed earlier in the
# notebook), so it is not parsed again here. The previous re-parse with
# errors='coerce' was redundant and, if this cell was run twice, silently turned
# the already-encoded float column into NaN; without it a second run fails
# loudly on `.dt` instead.
hour_of_day = df['Timestamp'].dt.hour

# Encode the hour of day as a cyclical feature via its sine.
# NOTE(review): a sine alone maps distinct hours to the same value (e.g. 0h and
# 12h both give 0); adding a matching cosine column would disambiguate them,
# but that changes the dataset schema, so it is left as a follow-up.
df['Timestamp'] = np.sin(2 * math.pi * hour_of_day / 24)

Asignar formato numérico a las columnas numéricas

In [13]:
# Every column except the labels, the encoded timestamp and the integer-valued
# columns is stored as float64.
non_float_cols = ['Label', 'Timestamp', 'Protocol', 'Dst Port', 'Label Code']
float_cols = [name for name in df.columns if name not in non_float_cols]
for name in float_cols:
    df[name] = df[name].astype('float64')

# Columns that never carry decimals are stored as int64.
for name in ("Protocol", "Dst Port", "Label Code"):
    df[name] = df[name].astype('int64')

In [14]:
# Check the column dtypes after the numeric casts.
df.dtypes

Dst Port           int64
Protocol           int64
Timestamp        float64
Flow Duration    float64
Tot Fwd Pkts     float64
                  ...   
Idle Std         float64
Idle Max         float64
Idle Min         float64
Label             object
Label Code         int64
Length: 81, dtype: object

Eliminar filas con valores infinitos

In [15]:
# Mark rows containing +inf or -inf in any column, drop them, and renumber the index.
row_has_infinite = df.isin([np.inf, -np.inf]).any(axis=1)
df = df.loc[~row_has_infinite].reset_index(drop=True)
df.shape

(8247874, 81)

## Selección de características

Seleccionar las 60 mejores características

In [16]:
# Columns left out of feature scoring: the two label columns and the cyclical
# 'Timestamp' feature. They are appended back, untouched, after the selection.
drop_cols = ['Label', 'Label Code', 'Timestamp']

# Upper bound on the number of scored features to keep.
max_features = 60

X = df.drop(columns=drop_cols)
y = df['Label Code']

# Remove constant (zero-variance) columns before running SelectKBest.
selector_var = VarianceThreshold(threshold=0.0)
X_var = selector_var.fit_transform(X)
features_non_constant = X.columns[selector_var.get_support()]

# Score the remaining columns with the ANOVA F-test and keep the best ones.
# k is capped at the number of available columns so the cell still works when
# fewer than `max_features` non-constant columns remain.
k_best = min(max_features, X_var.shape[1])
selector_kbest = SelectKBest(score_func=f_classif, k=k_best)
selector_kbest.fit(X_var, y)
selected_mask = selector_kbest.get_support(indices=True)  # integer positions, not a boolean mask
selected_features = features_non_constant[selected_mask]

# Keep only the selected features plus the excluded columns in the working frame.
cols_final = selected_features.tolist() + drop_cols
df = df.loc[:, cols_final]

In [17]:
# Check which columns remain after feature selection, and their dtypes.
df.dtypes

Dst Port             int64
Protocol             int64
Flow Duration      float64
Tot Fwd Pkts       float64
TotLen Fwd Pkts    float64
                    ...   
Idle Max           float64
Idle Min           float64
Label               object
Label Code           int64
Timestamp          float64
Length: 63, dtype: object

## Identificar secuencias de filas consecutivas con la misma etiqueta

In [18]:
# A new sequence starts wherever a row's label differs from the previous row's
# label; the running count of those change points is the sequence id.
label_changed = df["Label"] != df["Label"].shift()
df["Sequence ID"] = label_changed.cumsum()

## Exportación del dataframe final en formato Parquet

In [19]:
def _head_for_label(group):
    """Return the first rows of one label group, as many as BALANCES allots to that label (0 if unlisted)."""
    quota = BALANCES.get(group.name, 0)
    return group.head(quota)


# Row labels (from df's index) of the sampled rows: the innermost index level
# of the per-label result.
sample_idx = df.groupby("Label").apply(_head_for_label).index.get_level_values(-1)

# Sample frame with the label and sequence columns removed.
sample_df = df.loc[sample_idx].drop(columns=['Label Code', 'Label', 'Sequence ID'])

# Main frame with the sampled rows removed, index renumbered from 0.
df_final = df.drop(index=sample_idx).reset_index(drop=True)

  .apply(lambda group: group.head(BALANCES.get(group.name, 0)))


In [20]:
# Export df_final (the frame with the sampled rows removed), not the full df:
# writing df would leave the rows that go into the test sample inside the
# exported dataset as well.
df_final.to_parquet(FINAL_DATASET, index=False)
print(f"Archivo Parquet guardado como {FINAL_DATASET}")

Archivo Parquet guardado como final_dataset.parquet


Exportar una muestra para probar el modelo

In [21]:
# Write the sample to CSV without the index column, confirm, and preview it.
sample_df.to_csv('test.csv', index=False)
print("Archivo CSV guardado como test.csv")
sample_df.head()

Archivo CSV guardado como test.csv


Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Timestamp
3133937,80,6,15246.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3133938,80,6,15469.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3133939,80,6,15650.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3133940,80,6,15876.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3133941,80,6,16121.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
