## Importación de librerías

In [55]:
import pandas as pd
import os
import glob
import json
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

## Configuración de variables globales

In [56]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path to the datasets,
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Folder containing the datasets,
DATASETS = glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')) # List of datasets
OUTPUT_CSV = SETUP_JSON['dataset_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['dataset_parquet'] # Output CSV file
N_ROWS = SETUP_JSON['n_rows']
HEADER = SETUP_JSON['header']
with open(HEADER, 'r') as f:
    HEADER_JSON = json.load(f)
NA_VAL = SETUP_JSON['navalues']

## Selección de los datasets a combinar

Obtener todos los encabezados diferentes

In [57]:
unique_headers = set()
for dataset in DATASETS:
    df = pd.read_csv(dataset, nrows=1)
    df_dtypes = df.dtypes.keys()
    unique_headers.add(tuple(df_dtypes))
for header in unique_headers:
    print(header)

('Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd By

Separar los grupos de datasets por encabezados diferentes

In [58]:
datasets_by_header = {}
for header in unique_headers:
    datasets_group = []
    for dataset in DATASETS:
        df = pd.read_csv(dataset, nrows=1)
        df_dtypes = df.dtypes.keys()
        if tuple(df_dtypes) == header:
            datasets_group.append(dataset)
    datasets_by_header[header] = datasets_group
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Número de columnas del grupo de datasets: {len(header)}")
    print()

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Número de columnas del grupo de datasets: 84



Separar los grupos de datasets por etiquetas diferentes encontradas

In [59]:
datasets_by_num_labels = {}

for datasets_group in datasets_by_header.values():
    unique_labels = set()
    for dataset in datasets_group:
        df = pd.read_csv(dataset, dtype=str)
        last_column = df.columns[-1]  # Obtiene el nombre de la última columna
        for label in list(df[last_column]):  # Usa la última columna en lugar de "Label"
            unique_labels.add(label)
    datasets_by_num_labels[tuple(datasets_group)] = len(unique_labels)
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Etiquetas diferentes encontradas: ", unique_labels)
    print()


Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Etiquetas diferentes encontradas:  {'Bot', 'FTP-BruteForce', 'Label', 'SQL Injection', 'DoS attacks-GoldenEye', 'DoS attacks-Slowloris', 'Brute Force -XSS', 'DoS attacks-SlowHTTPTest', 'DoS attacks-Hulk', 'DDOS attack-HOIC', 'Brute Force -Web', 'Infilteration', 'DDOS at

Seleccionar el grupo de datasets que abarca más etiquetas diferentes

In [60]:
for datasets_group, num_labels in datasets_by_num_labels.items():
    if (num_labels) == max(datasets_by_num_labels.values()):
        datasets_selected = datasets_group
print(f"Selección de datasets a procesar:")
for dataset in datasets_selected:
    print(dataset)

Selección de datasets a procesar:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv


Seleccionar encabezado de los datasets a combinar

In [61]:
# Seleccionar encabezado de los datasets a combinar
x_columns = pd.read_csv(datasets_selected[0], nrows=1).dtypes.to_dict()
x_columns.pop('Label')

dtype('O')

In [62]:
for col, dtype in x_columns.items():
    print(col, dtype)

Dst Port int64
Protocol int64
Timestamp object
Flow Duration int64
Tot Fwd Pkts int64
Tot Bwd Pkts int64
TotLen Fwd Pkts int64
TotLen Bwd Pkts int64
Fwd Pkt Len Max int64
Fwd Pkt Len Min int64
Fwd Pkt Len Mean int64
Fwd Pkt Len Std int64
Bwd Pkt Len Max int64
Bwd Pkt Len Min int64
Bwd Pkt Len Mean int64
Bwd Pkt Len Std int64
Flow Byts/s int64
Flow Pkts/s float64
Flow IAT Mean float64
Flow IAT Std float64
Flow IAT Max int64
Flow IAT Min int64
Fwd IAT Tot int64
Fwd IAT Mean float64
Fwd IAT Std float64
Fwd IAT Max int64
Fwd IAT Min int64
Bwd IAT Tot int64
Bwd IAT Mean int64
Bwd IAT Std int64
Bwd IAT Max int64
Bwd IAT Min int64
Fwd PSH Flags int64
Bwd PSH Flags int64
Fwd URG Flags int64
Bwd URG Flags int64
Fwd Header Len int64
Bwd Header Len int64
Fwd Pkts/s float64
Bwd Pkts/s int64
Pkt Len Min int64
Pkt Len Max int64
Pkt Len Mean int64
Pkt Len Std int64
Pkt Len Var int64
FIN Flag Cnt int64
SYN Flag Cnt int64
RST Flag Cnt int64
PSH Flag Cnt int64
ACK Flag Cnt int64
URG Flag Cnt int64
CWE F

## Combinar datasets

Unimos todos los datasets seleccionados en un sólo dataframe.

In [63]:
df = pd.DataFrame()
for dataset in datasets_selected:
    current_df = pd.read_csv(dataset, sep=',', low_memory=False, na_values=NA_VAL, nrows=N_ROWS)
    df = pd.concat([df, current_df], ignore_index=True)
    print(f"Dataset concatenado: {dataset}")

Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
Dataset concatenado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv


In [64]:
df.shape

(720000, 80)

Eliminar encabezados repetidos

In [65]:
# Eliminar encabezados repetidos
df = df[df['Label'] != 'Label']

In [66]:
df.shape

(719991, 80)

Eliminar valores duplicados

In [67]:
# df = df.drop_duplicates()

In [68]:
df.shape

(719991, 80)

Convertir etiquetas a números

In [69]:
# Crear instancia del codificador
le = LabelEncoder()
# Aplicar LabelEncoder sobre la columna 'Label'
df['Label'] = le.fit_transform(df['Label'])

In [70]:
df['Label'].unique()

array([ 0,  9,  6,  8,  7,  5,  4,  2,  3, 10,  1])

Asignar tipo correspondiente a cada una de las columnas

In [71]:
df_copy = df.copy()

In [72]:
for col1, dtype1 in df_copy.dtypes.to_dict().items():
    for col2, dtype2 in x_columns.items():
        if col2 == col1:
            # Si la columna es numérica, convertir a float64
            if np.issubdtype(dtype2, np.number):
                df_copy[col1] = pd.to_numeric(df_copy[col1], errors='coerce').astype('float64')
            else:
                df_copy[col1] = df_copy[col1].astype(dtype2)
            print(f"Columna {col1} actualizada: {df_copy[col1].dtype}")

Columna Dst Port actualizada: float64
Columna Protocol actualizada: float64
Columna Timestamp actualizada: object
Columna Flow Duration actualizada: float64
Columna Tot Fwd Pkts actualizada: float64
Columna Tot Bwd Pkts actualizada: float64
Columna TotLen Fwd Pkts actualizada: float64
Columna TotLen Bwd Pkts actualizada: float64
Columna Fwd Pkt Len Max actualizada: float64
Columna Fwd Pkt Len Min actualizada: float64
Columna Fwd Pkt Len Mean actualizada: float64
Columna Fwd Pkt Len Std actualizada: float64
Columna Bwd Pkt Len Max actualizada: float64
Columna Bwd Pkt Len Min actualizada: float64
Columna Bwd Pkt Len Mean actualizada: float64
Columna Bwd Pkt Len Std actualizada: float64
Columna Flow Byts/s actualizada: float64
Columna Flow Pkts/s actualizada: float64
Columna Flow IAT Mean actualizada: float64
Columna Flow IAT Std actualizada: float64
Columna Flow IAT Max actualizada: float64
Columna Flow IAT Min actualizada: float64
Columna Fwd IAT Tot actualizada: float64
Columna Fwd IAT

# Conversión de timestamp a formato legible

In [73]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce') #convertimos la timestamp a formato fecha de pandas
df['Timestamp'] = np.sin(2*math.pi*df['Timestamp'].dt.hour/24)  # convertimos la fecha en un ciclo legible para el modelo

In [74]:
df.shape

(719991, 80)

In [75]:
df.dtypes

Dst Port          object
Protocol          object
Timestamp        float64
Flow Duration     object
Tot Fwd Pkts      object
                  ...   
Idle Mean         object
Idle Std          object
Idle Max          object
Idle Min          object
Label              int64
Length: 80, dtype: object

## Exportación del dataframe en formato CSV y parquet

In [None]:
df["Protocol "] = df["Protocol"].astype(int)
df["Dst Port"] = df["Dst Port"].astype(int)
# Guardar en csv
df.to_csv(OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")

# Guardar en Parquet
# df.to_parquet(OUTPUT_PARQUET, index=False)  # No está funcionando
# print(f"Archivo CSV guardado como {OUTPUT_PARQUET}")

df.head()

Archivo CSV guardado como clean_dataset.csv
Archivo CSV guardado como clean_dataset.parquet


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Protocol.1
0,0,0,0.866025,112641719,3,0,0,0,0,0,...,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,0,0
1,0,0,0.866025,112641466,3,0,0,0,0,0,...,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,0,0
2,0,0,0.866025,112638623,3,0,0,0,0,0,...,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,0,0
3,22,6,0.866025,6453966,15,10,1239,2273,744,0,...,0.0,0.0,0,0,0.0,0.0,0,0,0,6
4,22,6,0.866025,8804066,14,11,1143,2209,744,0,...,0.0,0.0,0,0,0.0,0.0,0,0,0,6


In [None]:
"""def find_datasets_with_same_header():
    headers = {}
    same_files = {}

    for file in DATASETS:
        with open(file, newline='', encoding='utf-8') as f:
            header = tuple(next(csv.reader(f), None))
            if header:
                if header in headers:
                    headers[header].append(file)
                else:
                    headers[header] = [file]
    print("Archivos con headers distintos:", tuple(headers.values())[1])
    return tuple(headers.values())[0]

DATASETS = find_datasets_with_same_header()"""

'def find_datasets_with_same_header():\n    headers = {}\n    same_files = {}\n\n    for file in DATASETS:\n        with open(file, newline=\'\', encoding=\'utf-8\') as f:\n            header = tuple(next(csv.reader(f), None))\n            if header:\n                if header in headers:\n                    headers[header].append(file)\n                else:\n                    headers[header] = [file]\n    print("Archivos con headers distintos:", tuple(headers.values())[1])\n    return tuple(headers.values())[0]\n\nDATASETS = find_datasets_with_same_header()'

In [None]:
"""def mixed_types_columns(df):
    """
    # Recorre todas las columnas del DataFrame y devuelve un diccionario
    # con las columnas que contienen más de un tipo de dato.

    # Args:
    #     df (pd.DataFrame): El DataFrame a analizar.

    # Returns:
    #     dict: Un diccionario con nombres de columnas como claves,
    #           y otro diccionario con los tipos de datos y sus cantidades como valores.
    """
    mixed_types_columns = {}

    for col in df.columns:
        types = {}
        for val in df[col]:
            typeVal = type(val)
            types[typeVal] = types.get(typeVal, 0) + 1

        if len(types) > 1:
            mixed_types_columns[col] = types

    return mixed_types_columns

df = pd.read_csv('merged_output.csv')
mixed_types_columns = mixed_types_columns(df)
for col, types in mixed_types_columns.items():
    print(f"Columna: {col}")
    for typeVal, quantity in types.items():
        print(f"  type: {typeVal.__name__}, Cantidad: {quantity}")
    print()"""

IndentationError: unexpected indent (4188301155.py, line 12)

In [None]:
"""columns_lengths = []

# Obtener número de columnas por dataset
for dataset in DATASETS:
    columns_length = len(pd.read_csv(dataset, nrows=1).dtypes)
    columns_lengths.append(columns_length)

# Obtener grupo de características más largo
for dataset in DATASETS:
    df = pd.read_csv(dataset, nrows=1)
    columns_length = len(df.dtypes)
    if columns_length == max(columns_lengths):
        X_columns = df.dtypes.to_dict()
        if 'Label' in X_columns:
            X_columns.pop('Label')
print(len(X_columns))
for column, type in X_columns.items():
    print(column, type)"""