## Importación de librerías

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Configuración de variables globales

In [2]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
OUTPUT_CSV = SETUP_JSON['dataset_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['dataset_parquet'] # Output CSV file

## Cargar el dataset

In [3]:
df = pd.read_csv(OUTPUT_CSV) # Leer csv

## Extracción de secuencias de paquetes

Obtener todas las secuencias de paquetes presentes en el dataset.

In [4]:
# Crear una copia del DataFrame original para no modificarlo directamente.
df_temp = df.copy()

# Cada vez que el valor de 'label' cambia, se incrementa el contador acumulativo.
df_temp['sequence'] = (df_temp['Label'] != df_temp['Label'].shift()).cumsum()

# Agrupar el DataFrame por la columna 'sequence' y crear una lista de DataFrames.
sequences = [sequence for _, sequence in df_temp.groupby('sequence')]  

for sequence_df in sequences:  # Iterar sobre cada DataFrame en la lista de secuencias.
    # Eliminar la columna 'sequence' de cada DataFrame, ya que solo se usó para agrupar.
    sequence_df.drop(columns='sequence', inplace=True)  
    # Reiniciar los índices de cada DataFrame para que comiencen desde 0.
    sequence_df.reset_index(drop=True, inplace=True)

## Análisis de las secuencias

In [56]:
print(f"Total de secuencias de paquetes encontradas: {len(sequences)}")

Total de secuencias de paquetes encontradas: 15947


In [57]:
print(f"Primera secuencia: {sequences[0]['Label'].unique()}")
print(f"Número de paquetes de la primera secuencia: {sequences[0].shape[0]}")

Primera secuencia: [0]
Número de paquetes de la primera secuencia: 94


In [58]:
print(f"Última secuencia: {sequences[-1]['Label'].unique()}")
print(f"Número de paquetes de la última secuencia: {sequences[-1].shape[0]}")

Última secuencia: [1]
Número de paquetes de la última secuencia: 17


In [5]:
# Almacenar los tamaños de las secuencias y las secuencias en un DataFrame.
sequences_df = pd.DataFrame({
    'Size': [sequence.shape[0] for sequence in sequences],
    'Sequence': sequences,
    'ID': range(1, len(sequences) + 1),  # Asigna un número único a cada secuencia
    'Label': [sequence['Label'].unique()[0] for sequence in sequences],
})

In [60]:
sequences_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15947 entries, 0 to 15946
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Size      15947 non-null  int64 
 1   Sequence  15947 non-null  object
 2   ID        15947 non-null  int64 
 3   Label     15947 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 498.5+ KB


In [61]:
sequences_df.head()

Unnamed: 0,Size,Sequence,ID,Label
0,94,Dst Port Protocol Timestamp Flow Durati...,1,0
1,64,Dst Port Protocol Timestamp Flow Durati...,2,9
2,1,Dst Port Protocol Timestamp Flow Duratio...,3,0
3,5610,Dst Port Protocol Timestamp Flow Dura...,4,9
4,1,Dst Port Protocol Timestamp Flow Duratio...,5,0


In [62]:
print("Análisis estadístico de las secuencias")
print(f"{sequences_df['Size'].describe()}")

Análisis estadístico de las secuencias
count     15947.000000
mean         45.026212
std        2050.380944
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max      237376.000000
Name: Size, dtype: float64


Obtener media y desviación estándar del número de paquetes.

In [23]:
# Obtener la media de paquetes por secuencia
mean = sequences_df['Size'].mean()

# Obtener la desviación estándar muestral (por defecto ddof=1)
deviation = sequences_df['Size'].std()

print(f"Número promedio de paquetes por secuencia: {mean}")
print(f"Desviación estándar (muestral): {deviation}")

Número promedio de paquetes por secuencia: 45.02621182667586
Desviación estándar (muestral): 2050.3809442995553


Obtener número de secuencias asociadas a cada etiqueta

In [64]:
sequences_df["Label"].value_counts()  # Contar la cantidad de secuencias por etiqueta

Label
6     5026
8     4668
0     3805
1     1548
5      635
3      109
2       73
10      32
7       30
9       18
4        3
Name: count, dtype: int64

## Normalizar las secuencias

In [None]:
# Número de características de cada secuencia
num_features = sequences[0].shape[1] - 1
# Número de secuencias
num_sequences = len(sequences)

Establecer tamaño fijo para todas las secuencias y separar en X e y

In [28]:
# Media de paquetes por secuencia como longitud fija
sequence_length = int(mean)

X_fixed = []
y_fixed = []

for seq in sequences:
    data = seq.drop(columns='Label')
    label = seq['Label'].iloc[0]

    if len(data) < sequence_length:
        # Rellenar con ceros
        pad_width = sequence_length - len(data)
        padded = np.pad(data, ((0, pad_width), (0, 0)), mode='constant')
    else:
        # Recortar
        padded = data.iloc[:sequence_length]

    X_fixed.append(padded)
    y_fixed.append(label)

X = np.array(X_fixed)
y = np.array(y_fixed)

In [42]:
print(X.shape)
print(y.shape)

(15947, 45, 79)
(15947,)


Escalamiento de características

In [41]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.reshape(-1, num_features)).reshape(num_sequences, sequence_length, num_features)

In [55]:
len(X_scaled[0][0])

79

In [54]:
X_scaled[0][0]

array([-1.97766439e-01, -3.05405244e-01,  5.68190209e+00,  8.12004927e+00,
       -4.73690641e-02, -2.50313939e-02, -4.80153164e-02, -2.14519146e-02,
       -8.64026885e-02, -4.50510071e-02, -1.21849657e-01, -1.49213584e-01,
       -1.26366686e-01, -2.17263470e-02, -1.05943549e-01, -1.24705272e-01,
       -1.58717163e-02, -5.66842088e-02,  1.65410922e+01, -8.51779032e-02,
        8.04836710e+00,  2.01800762e+01,  8.12388850e+00,  1.08897615e+01,
       -1.06614661e-01,  8.05041136e+00,  1.11119149e+01, -9.62176715e-02,
       -8.66951591e-02, -8.82799320e-02, -9.11858688e-02, -7.39201631e-02,
       -4.84270760e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -4.80447927e-02, -2.54582743e-02, -5.91369030e-02, -5.31152136e-02,
       -4.36678477e-02, -1.02340459e-01, -1.58660621e-01, -1.58068186e-01,
       -1.15428856e-02, -1.41177517e-02, -4.84270760e-02, -1.69606715e-01,
       -2.33054881e-01, -2.07100081e-01, -2.59252925e-02,  0.00000000e+00,
       -1.69606715e-01, -

In [58]:
X_scaled[-1][-1]

array([-0.19776644, -0.30540524, -0.28998415, -0.14015261, -0.04787582,
       -0.02503139, -0.04801532, -0.02145191, -0.08640269, -0.04505101,
       -0.12184966, -0.14921358, -0.12636669, -0.02172635, -0.10594355,
       -0.12470527, -0.01587172, -0.05668464, -0.10808506, -0.08522507,
       -0.12741051, -0.07621184, -0.13981388, -0.09463694, -0.1067136 ,
       -0.1270681 , -0.07738228, -0.09621767, -0.08669516, -0.08827993,
       -0.09118587, -0.07392016, -0.04842708,  0.        ,  0.        ,
        0.        , -0.04804479, -0.02545827, -0.05913774, -0.05311521,
       -0.04366785, -0.10234046, -0.15866062, -0.15806819, -0.01154289,
       -0.01411775, -0.04842708, -0.16960672, -0.23305488, -0.20710008,
       -0.02592529,  0.        , -0.16960672, -0.17087755, -0.15886076,
       -0.12184966, -0.10594355,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.04787582, -0.04801532,
       -0.02503139, -0.02145191, -0.20488827, -0.04115293, -0.04

## Separar secuencias en entrenamiento y prueba

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [45]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(12757, 45, 79) (12757,)
(3190, 45, 79) (3190,)


Convertir los arrays a tensores

In [51]:
# Convertir a tensores
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [49]:
num_classes = len(sequences_df['Label'].unique())
num_classes

11

## Inicialización del modelo

In [None]:
# Modelo LSTM
class NetFlowLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)         # h_n shape: (1, batch, hidden)
        return self.classifier(h_n[-1])    # usar último estado oculto

model = NetFlowLSTM(input_dim=num_features, hidden_dim=32, output_dim=num_classes)

## Entrenamiento

In [None]:
# Configurar entrenamiento
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
epochs = 20

# Entrenar
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Evaluar
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        preds = torch.argmax(test_outputs, dim=1)
        acc = (preds == y_test_tensor).float().mean().item()

    print(f"Epoch {epoch+1:02d} | Loss: {loss.item():.4f} | Test Accuracy: {acc:.2f}")


Epoch 01 | Loss: 2.3254 | Test Accuracy: 0.30
Epoch 02 | Loss: 2.1652 | Test Accuracy: 0.34
Epoch 03 | Loss: 1.9810 | Test Accuracy: 0.34
Epoch 04 | Loss: 1.7884 | Test Accuracy: 0.34
Epoch 05 | Loss: 1.6593 | Test Accuracy: 0.34
Epoch 06 | Loss: 1.5968 | Test Accuracy: 0.34
Epoch 07 | Loss: 1.5615 | Test Accuracy: 0.34
Epoch 08 | Loss: 1.5334 | Test Accuracy: 0.34
Epoch 09 | Loss: 1.5122 | Test Accuracy: 0.34
Epoch 10 | Loss: 1.5014 | Test Accuracy: 0.25
Epoch 11 | Loss: 1.4967 | Test Accuracy: 0.25
Epoch 12 | Loss: 1.4893 | Test Accuracy: 0.25
Epoch 13 | Loss: 1.4778 | Test Accuracy: 0.34
Epoch 14 | Loss: 1.4659 | Test Accuracy: 0.34
Epoch 15 | Loss: 1.4571 | Test Accuracy: 0.34
Epoch 16 | Loss: 1.4522 | Test Accuracy: 0.34
Epoch 17 | Loss: 1.4499 | Test Accuracy: 0.35
Epoch 18 | Loss: 1.4480 | Test Accuracy: 0.35
Epoch 19 | Loss: 1.4452 | Test Accuracy: 0.35
Epoch 20 | Loss: 1.4417 | Test Accuracy: 0.35


In [None]:
"""# Asignar la columna 'ID' dentro de cada DataFrame en train
for i, sequence in enumerate(train["Sequence"]):
    sequence["ID"] = train.iloc[i]["ID"]
# Asignar la columna 'ID' dentro de cada DataFrame en train
for i, sequence in enumerate(test["Sequence"]):
    sequence["ID"] = test.iloc[i]["ID"]


# Concatenar manteniendo la columna 'ID'
train_concatenado = pd.concat(train["Sequence"].tolist(), ignore_index=True)
# Concatenar manteniendo la columna 'ID'
test_concatenado = pd.concat(test["Sequence"].tolist(), ignore_index=True)"""

In [None]:
"""# Dividir los DataFrames en características (X) y etiquetas (y)
x_train = train_concatenado.drop(columns=['Label', 'ID'])
y_train = train_concatenado[['Label', 'ID']]
# Crear la nueva columna basada en el mapeo
y_train['Label_Mapped'] = y_train['Label'].map(LABELS)

x_test = test_concatenado.drop(columns=['Label', 'ID'])
y_test = test_concatenado[['Label', 'ID']]
# Crear la nueva columna basada en el mapeo
y_test['Label_Mapped'] = y_test['Label'].map(LABELS)

print(f"Dimensiones de x_train: {x_train.shape}")
print(y_train[['Label']].value_counts()) 
y_train.head() # Muestra los primeros 5 paquetes del dataframe de entrenamiento"""

In [None]:
# Cambiar etiquetas a su estado original
"""original_labels = le.inverse_transform(sequences[-1]['Label'])
print(original_labels)
print(len(original_labels))"""