## Importación de librerías

In [4]:
import pandas as pd
import json
import numpy as np
from skimpy import skim
from sklearn.preprocessing import LabelEncoder

## Configuración de variables globales

In [5]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
HEADER = SETUP_JSON['header']
NA_VALUES = SETUP_JSON['navalues'] # Consistent NA values,
OUTPUT_CSV = SETUP_JSON['dataset_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['dataset_parquet'] # Output CSV file

## Cargar el dataset

In [6]:
df = pd.read_csv(OUTPUT_CSV, na_values=NA_VALUES)         # Leer csv
# df = pd.read_parquet(OUTPUT_PARQUET)                        # Leer parquet

## Análisis del dataframe

In [7]:
df.shape

(719991, 81)

In [8]:
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Protocol.1
0,0,0,0.866025,112641719,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,56320859.5,139.300036,56320958.0,56320761.0,0,0
1,0,0,0.866025,112641466,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,56320733.0,114.551299,56320814.0,56320652.0,0,0
2,0,0,0.866025,112638623,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,56319311.5,301.934596,56319525.0,56319098.0,0,0
3,22,6,0.866025,6453966,15,10,1239,2273.0,744,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6
4,22,6,0.866025,8804066,14,11,1143,2209.0,744,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6


In [9]:
df.tail()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Protocol.1
719986,8080,6,0.5,10070,3,4,326,129.0,326,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6
719987,8080,6,0.5,575,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6
719988,8080,6,0.5,13760,3,4,326,129.0,326,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6
719989,8080,6,0.5,605,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6
719990,8080,6,0.5,10255,3,4,326,129.0,326,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6


In [10]:
df.dtypes

Dst Port           int64
Protocol           int64
Timestamp        float64
Flow Duration      int64
Tot Fwd Pkts       int64
                  ...   
Idle Std         float64
Idle Max         float64
Idle Min         float64
Label              int64
Protocol           int64
Length: 81, dtype: object

## Normalización del dataframe

Eliminar duplicados

In [11]:
df.shape

(719991, 81)

Asignar tipo correspondiente a cada columna

In [12]:
with open(HEADER, 'r') as f:
    header = json.load(f)
for col, dtype in header.items():
    print(col, dtype)

Dst Port int64
Protocol int64
Timestamp object
Flow Duration int64
Tot Fwd Pkts int64
Tot Bwd Pkts int64
TotLen Fwd Pkts int64
TotLen Bwd Pkts int64
Fwd Pkt Len Max int64
Fwd Pkt Len Min int64
Fwd Pkt Len Mean int64
Fwd Pkt Len Std int64
Bwd Pkt Len Max int64
Bwd Pkt Len Min int64
Bwd Pkt Len Mean int64
Bwd Pkt Len Std int64
Flow Byts/s int64
Flow Pkts/s float64
Flow IAT Mean float64
Flow IAT Std float64
Flow IAT Max int64
Flow IAT Min int64
Fwd IAT Tot int64
Fwd IAT Mean float64
Fwd IAT Std float64
Fwd IAT Max int64
Fwd IAT Min int64
Bwd IAT Tot int64
Bwd IAT Mean int64
Bwd IAT Std int64
Bwd IAT Max int64
Bwd IAT Min int64
Fwd PSH Flags int64
Bwd PSH Flags int64
Fwd URG Flags int64
Bwd URG Flags int64
Fwd Header Len int64
Bwd Header Len int64
Fwd Pkts/s float64
Bwd Pkts/s int64
Pkt Len Min int64
Pkt Len Max int64
Pkt Len Mean int64
Pkt Len Std int64
Pkt Len Var int64
FIN Flag Cnt int64
SYN Flag Cnt int64
RST Flag Cnt int64
PSH Flag Cnt int64
ACK Flag Cnt int64
URG Flag Cnt int64
CWE F

In [13]:
df.dtypes

Dst Port           int64
Protocol           int64
Timestamp        float64
Flow Duration      int64
Tot Fwd Pkts       int64
                  ...   
Idle Std         float64
Idle Max         float64
Idle Min         float64
Label              int64
Protocol           int64
Length: 81, dtype: object

Convertir la columna de etiqueta a numérica

In [14]:
df['Label'].unique()

array([ 0,  9,  6,  8,  7,  5,  4,  2,  3, 10,  1])

In [15]:
df['Label'].dtype

dtype('int64')

In [16]:
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Protocol.1
0,0,0,0.866025,112641719,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,56320859.5,139.300036,56320958.0,56320761.0,0,0
1,0,0,0.866025,112641466,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,56320733.0,114.551299,56320814.0,56320652.0,0,0
2,0,0,0.866025,112638623,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,56319311.5,301.934596,56319525.0,56319098.0,0,0
3,22,6,0.866025,6453966,15,10,1239,2273.0,744,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6
4,22,6,0.866025,8804066,14,11,1143,2209.0,744,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6


## Extracción de secuencias de paquetes

Obtener todas las secuencias de paquetes presentes en el dataset.

In [17]:
# Crear una copia del DataFrame original para no modificarlo directamente.
df_temp = df.copy()

# Cada vez que el valor de 'label' cambia, se incrementa el contador acumulativo.
df_temp['sequence'] = (df_temp['Label'] != df_temp['Label'].shift()).cumsum()

# Agrupar el DataFrame por la columna 'sequence' y crear una lista de DataFrames.
sequences = [sequence for _, sequence in df_temp.groupby('sequence')]  

for sequence_df in sequences:  # Iterar sobre cada DataFrame en la lista de secuencias.
    # Eliminar la columna 'sequence' de cada DataFrame, ya que solo se usó para agrupar.
    sequence_df.drop(columns='sequence', inplace=True)  
    # Reiniciar los índices de cada DataFrame para que comiencen desde 0.
    sequence_df.reset_index(drop=True, inplace=True)

## Análisis de las secuencias

In [18]:
print(f"Total de secuencias de paquetes encontradas: {len(sequences)}")

Total de secuencias de paquetes encontradas: 15947


In [19]:
print(f"Primera secuencia: {sequences[0]['Label'].unique()}")
print(f"Número de paquetes de la primera secuencia: {sequences[0].shape[0]}")

Primera secuencia: [0]
Número de paquetes de la primera secuencia: 94


In [20]:
print(f"Última secuencia: {sequences[-1]['Label'].unique()}")
print(f"Número de paquetes de la última secuencia: {sequences[-1].shape[0]}")

Última secuencia: [1]
Número de paquetes de la última secuencia: 17


In [21]:
# Almacenar los tamaños de las secuencias y las secuencias en un DataFrame.
sequences_df = pd.DataFrame({
    'Size': [sequence.shape[0] for sequence in sequences],
    'Sequence': sequences,
    'ID': range(1, len(sequences) + 1),  # Asigna un número único a cada secuencia
    'Label': [sequence['Label'].unique()[0] for sequence in sequences],
})

In [22]:
sequences_df.head()

Unnamed: 0,Size,Sequence,ID,Label
0,94,Dst Port Protocol Timestamp Flow Durati...,1,0
1,64,Dst Port Protocol Timestamp Flow Durati...,2,9
2,1,Dst Port Protocol Timestamp Flow Duratio...,3,0
3,5610,Dst Port Protocol Timestamp Flow Dura...,4,9
4,1,Dst Port Protocol Timestamp Flow Duratio...,5,0


In [23]:
print("Análisis estadístico de los paquetes")
print(f"{sequences_df['Size'].describe()}")

Análisis estadístico de los paquetes
count     15947.000000
mean         45.148994
std        2061.803273
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max      238823.000000
Name: Size, dtype: float64


Obtener media y desviación estándar del número de paquetes.

In [24]:
# Obtener la media de paquetes por secuencia
mean = sequences_df['Size'].mean()

# Obtener la desviación estándar muestral (por defecto ddof=1)
deviation = sequences_df['Size'].std()

print(f"Número promedio de paquetes por secuencia: {mean}")
print(f"Desviación estándar (muestral): {deviation}")

Número promedio de paquetes por secuencia: 45.14899354110491
Desviación estándar (muestral): 2061.8032725880016


Obtener número de secuencias asociadas a cada etiqueta

In [25]:
sequences_df["Label"].value_counts()  # Contar la cantidad de secuencias por etiqueta

Label
6     5026
8     4668
0     3805
1     1548
5      635
3      109
2       73
10      32
7       30
9       18
4        3
Name: count, dtype: int64

In [26]:
import pandas as pd

# Filtrar las filas cuyo valor en la columna 'Label' no esté duplicado
df_unico = sequences_df[~sequences_df.duplicated(subset=['Label'], keep=False)]

# Imprimir el resultado
print(df_unico)


Empty DataFrame
Columns: [Size, Sequence, ID, Label]
Index: []


## Normalización de las secuencias

In [27]:
from sklearn.model_selection import train_test_split

# Eliminar filas con valores NaN en la columna
sequences_df = sequences_df.dropna(subset=["Label"])

# train y test serán los dataframes de secuencias que usaremos para entrenar al modelo
train, test = train_test_split(sequences_df, test_size=0.2, stratify=sequences_df["Label"])

# test.head()
train.head() # Es un dataframe barajado que contiene otros dataframes (secuencias) en la columna "Sequence"

Unnamed: 0,Size,Sequence,ID,Label
5128,1,Dst Port Protocol Timestamp Flow Duratio...,5129,8
12034,6,Dst Port Protocol Timestamp Flow Duratio...,12035,5
4554,1,Dst Port Protocol Timestamp Flow Duratio...,4555,6
15277,1,Dst Port Protocol Timestamp Flow Duratio...,15278,0
586,3,Dst Port Protocol Timestamp Flow Duratio...,587,8


In [28]:
# Asignar la columna 'ID' dentro de cada DataFrame en train
for i, sequence in enumerate(train["Sequence"]):
    sequence["ID"] = train.iloc[i]["ID"]
# Asignar la columna 'ID' dentro de cada DataFrame en train
for i, sequence in enumerate(test["Sequence"]):
    sequence["ID"] = test.iloc[i]["ID"]


# Concatenar manteniendo la columna 'ID'
train_concatenado = pd.concat(train["Sequence"].tolist(), ignore_index=True)
# Concatenar manteniendo la columna 'ID'
test_concatenado = pd.concat(test["Sequence"].tolist(), ignore_index=True)

## Separación de secuencias en train, test y validation

In [29]:
# Dividir los DataFrames en características (X) y etiquetas (y)
x_train = train_concatenado.drop(columns=['Label', 'ID'])
y_train = train_concatenado[['Label', 'ID']]
# Crear la nueva columna basada en el mapeo
y_train['Label_Mapped'] = y_train['Label'].map(LABELS)

x_test = test_concatenado.drop(columns=['Label', 'ID'])
y_test = test_concatenado[['Label', 'ID']]
# Crear la nueva columna basada en el mapeo
y_test['Label_Mapped'] = y_test['Label'].map(LABELS)

print(f"Dimensiones de x_train: {x_train.shape}")
print(y_train[['Label']].value_counts()) 
y_train.head() # Muestra los primeros 5 paquetes del dataframe de entrenamiento

NameError: name 'LABELS' is not defined

In [None]:
# Cambiar etiquetas a su estado original
"""original_labels = le.inverse_transform(sequences[-1]['Label'])
print(original_labels)
print(len(original_labels))"""

['Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot' 'Bot'
 'Bot' 'Bot' 'Bot' 'Bot' 'Bot']
17
