## Importación de librerías

In [1]:
import pandas as pd
import os
import glob
import json

## Configuración de variables globales

In [9]:
# Load the notebook configuration from setup.json (looked up relative to the
# current working directory). The encoding is explicit so the file is decoded
# the same way on every platform instead of using the locale default.
with open('setup.json', 'r', encoding='utf-8') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path of the datasets folder
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Datasets folder, resolved against the working directory
# glob returns matches in arbitrary order; sorting keeps every later step
# (grouping, tie-breaking, concatenation order) reproducible between runs.
DATASETS = sorted(glob.glob(os.path.join(DATASETS_FOLDER, '*.csv'))) # List of dataset CSV files
RAW_DATASET_CSV = SETUP_JSON['raw_dataset_csv'] # Output CSV file
RAW_DATASET_PARQUET = SETUP_JSON['raw_dataset_parquet'] # Output Parquet file
TAKE_FULL_DATASET = SETUP_JSON['take_full_dataset'] # Take the full dataset or only a sample
NA_VAL = SETUP_JSON['navalues'] # Values to be treated as NaN when reading the CSVs
balances = SETUP_JSON['balances'] # Balances to consider (presumably per-label row limits; verify against setup.json)

## Selección de los datasets a combinar

Obtener todos los encabezados diferentes

In [3]:
# Collect every distinct header found across the dataset files.
# Only the first data row is parsed: the column names are all that is needed,
# and each header is stored as a tuple so it can live in a set.
unique_headers = {
    tuple(pd.read_csv(csv_path, nrows=1).columns)
    for csv_path in DATASETS
}
for column_names in unique_headers:
    print(column_names)

('Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd By

Separar los grupos de datasets por encabezados diferentes

In [4]:
# Group the dataset files by their header (tuple of column names).
# Each CSV is opened exactly once and appended to the bucket of its own header,
# instead of re-reading every file once per distinct header. Groups appear in
# the order their header is first seen in DATASETS.
datasets_by_header = {}
for dataset in DATASETS:
    header = tuple(pd.read_csv(dataset, nrows=1).columns)
    datasets_by_header.setdefault(header, []).append(dataset)

# Report each group with the files it contains and its number of columns.
for header, datasets_group in datasets_by_header.items():
    print("Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Número de columnas del grupo de datasets: {len(header)}")
    print()

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Número de columnas del grupo de datasets: 84



Obtener etiquetas diferentes encontradas en cada grupo de datasets

In [5]:
# Map each group of datasets (as a tuple of file paths) to the number of
# distinct values found in its "Label" column.
datasets_by_num_labels = {}

for datasets_group in datasets_by_header.values():
    unique_labels = set()
    for dataset in datasets_group:
        # Read only the "Label" column and let pandas deduplicate it, rather
        # than adding every row to the set one by one in Python.
        label_column = pd.read_csv(dataset, usecols=["Label"])["Label"]
        unique_labels.update(label_column.unique())
    # NOTE(review): the recorded output lists the literal value 'Label' among
    # the labels, presumably from header rows repeated inside some CSVs; it is
    # counted here like any other value - confirm whether that is intended.
    datasets_by_num_labels[tuple(datasets_group)] = len(unique_labels)
    print("Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print("Etiquetas diferentes encontradas: ", unique_labels)
    print()


Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Etiquetas diferentes encontradas:  {'Bot', 'DoS attacks-Hulk', 'DoS attacks-GoldenEye', 'Brute Force -Web', 'DDOS attack-HOIC', 'Infilteration', 'Label', 'SQL Injection', 'DoS attacks-Slowloris', 'Brute Force -XSS', 'DoS attacks-SlowHTTPTest', 'SSH-Bruteforce', 'Benign'

Seleccionar el grupo de datasets que abarca más etiquetas diferentes

In [6]:
# Pick the group of datasets that covers the largest number of distinct labels.
# Fail loudly when there is nothing to choose from; otherwise datasets_selected
# would stay undefined (or keep a stale value from a previous run).
if not datasets_by_num_labels:
    raise ValueError("No hay grupos de datasets entre los que seleccionar.")

# The maximum is computed once instead of on every loop iteration.
max_num_labels = max(datasets_by_num_labels.values())
for datasets_group, num_labels in datasets_by_num_labels.items():
    if num_labels == max_num_labels:
        # On ties the last matching group wins, as in the original loop.
        datasets_selected = datasets_group
print("Selección de datasets a procesar:")
for dataset in datasets_selected:
    print(dataset)

Selección de datasets a procesar:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv


## Combinar datasets

Unimos todos los datasets seleccionados en un solo dataset

In [12]:
# Read every selected dataset and combine them into a single DataFrame.
# The frames are collected first and concatenated once at the end: calling
# pd.concat inside the loop copies the accumulated data on every iteration.
loaded_frames = []

for dataset in datasets_selected:
    current_df = pd.read_csv(dataset, sep=',', low_memory=False, na_values=NA_VAL)
    loaded_frames.append(current_df)
    print(f"Dataset concatenado: {dataset}, Dimensiones: {current_df.shape}")

# pd.concat rejects an empty list, so fall back to an empty DataFrame, which
# is what the original loop produced when nothing was selected.
df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Convert every column to string.
# NOTE(review): this presumably also turns missing values into text, so the
# NaNs produced by na_values may no longer be nulls downstream - confirm.
df = df.astype(str)

Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv, Dimensiones: (1048575, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv, Dimensiones: (1048575, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv, Dimensiones: (1048575, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv, Dimensiones: (1048575, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv, Dimensiones: (1048575, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv, Dimensiones: (1048575, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv, Dimensiones: (613104, 80)
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv, Dime

## Exportación del dataset bruto en formato Parquet y CSV

Exportar el dataset en Parquet para preprocesamiento, entrenamiento y evaluación

In [None]:
# Write the combined DataFrame to the configured Parquet file, leaving the
# row index out, then report where it was saved.
df.to_parquet(path=RAW_DATASET_PARQUET, index=False)
print(f"Archivo Parquet guardado como {RAW_DATASET_PARQUET}")

Archivo parquet guardado como raw_dataset.parquet


Exportar el dataset en CSV para análisis exploratorio con Tableau

In [14]:
# Write the combined DataFrame to the configured CSV file, leaving the row
# index out, then report where it was saved.
df.to_csv(path_or_buf=RAW_DATASET_CSV, index=False)
print(f"Archivo CSV guardado como {RAW_DATASET_CSV}")

Archivo CSV guardado como raw_dataset.csv


In [None]:
# NOTE(review): everything below is a single triple-quoted string, so none of
# it runs; the cell only evaluates a string literal. It holds a disabled
# variant of the combine step that would honour TAKE_FULL_DATASET and
# `balances` and add Original_Sequence_ID / Sequence_ID columns. Left as-is;
# consider deleting it or restoring it as real code - confirm with the author.
"""import pandas as pd

# Inicializar DataFrame vacío
df = pd.DataFrame()

if TAKE_FULL_DATASET:
    for dataset in datasets_selected:
        current_df = pd.read_csv(dataset, sep=',', low_memory=False, na_values=NA_VAL) 

        # Identificar secuencias originales dentro de cada dataset
        current_df["Original_Sequence_ID"] = (current_df["Label"] != current_df["Label"].shift()).cumsum()

        df = pd.concat([df, current_df], ignore_index=True)
        print(f"Dataset concatenado: {dataset}, Dimensiones: {current_df.shape}")

else:
    for dataset in datasets_selected:
        current_df = pd.read_csv(dataset, sep=',', low_memory=False, na_values=NA_VAL)
        
        # Identificar secuencias originales dentro de cada dataset
        current_df["Original_Sequence_ID"] = (current_df["Label"] != current_df["Label"].shift()).cumsum()

        # Agregar una columna auxiliar que cuenta la aparición de cada Label
        current_df["Label_Count"] = current_df.groupby(["Label", "Original_Sequence_ID"]).cumcount() + 1

        # Filtrar solo las primeras N apariciones de cada Label
        copy_df = current_df[current_df.apply(lambda row: row["Label_Count"] <= balances.get(row["Label"], 0), axis=1)]

        # Eliminar columna auxiliar antes de concatenar
        copy_df.drop(columns=["Label_Count"], inplace=True)

        df = pd.concat([df, copy_df], ignore_index=True)

        print(f"Dataset concatenado: {dataset}, Dimensiones: {copy_df.shape}")

# Crear columna Sequence_ID considerando cambios en Label y Original_Sequence_ID
df["Sequence_ID"] = (df["Label"] != df["Label"].shift()) | (df["Original_Sequence_ID"] != df["Original_Sequence_ID"].shift())
df["Sequence_ID"] = df["Sequence_ID"].cumsum()"""