## Importación de librerías

In [56]:
import pandas as pd
import os
import glob
import json
import numpy as np

## Configuración de variables globales

In [58]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path to the datasets,
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Folder containing the datasets,
DATASETS = glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')) # List of datasets
OUTPUT_CSV = SETUP_JSON['output_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['output_parquet'] # Output CSV file
N_ROWS = SETUP_JSON['n_rows']
HEADER = SETUP_JSON['header']
NA_VAL = ['N/a', 'na', 'Na', 'NA', 'NAN', 'Nan', 'NaN', np.nan]

## Selección de los datasets a combinar

Obtener todos los encabezados diferentes

In [59]:
unique_headers = set()
for dataset in DATASETS:
    df = pd.read_csv(dataset, nrows=1)
    df_dtypes = df.dtypes.keys()
    unique_headers.add(tuple(df_dtypes))
for header in unique_headers:
    print(header)

('Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'F

Separar los grupos de datasets por encabezados diferentes

In [60]:
datasets_by_header = {}
for header in unique_headers:
    datasets_group = []
    for dataset in DATASETS:
        df = pd.read_csv(dataset, nrows=1)
        df_dtypes = df.dtypes.keys()
        if tuple(df_dtypes) == header:
            datasets_group.append(dataset)
    datasets_by_header[header] = datasets_group
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Número de columnas del grupo de datasets: {len(header)}")
    print()

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Número de columnas del grupo de datasets: 84

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Número de columnas del grupo de datasets: 80



Separar los grupos de datasets por etiquetas diferentes encontradas

In [61]:
datasets_by_num_labels = {}

for datasets_group in datasets_by_header.values():
    unique_labels = set()
    for dataset in datasets_group:
        df = pd.read_csv(dataset, dtype=str)
        last_column = df.columns[-1]  # Obtiene el nombre de la última columna
        for label in list(df[last_column]):  # Usa la última columna en lugar de "Label"
            unique_labels.add(label)
    datasets_by_num_labels[tuple(datasets_group)] = len(unique_labels)
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Etiquetas diferentes encontradas: ", unique_labels)
    print()


Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Etiquetas diferentes encontradas:  {'DDoS attacks-LOIC-HTTP', 'Benign'}

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Etiquetas diferentes encontradas:  {'Brute Force -XSS', 'Label', 'Bot', 'FTP-BruteForce', 'DoS atta

Seleccionar el grupo de datasets que abarca más etiquetas diferentes

In [62]:
for datasets_group, num_labels in datasets_by_num_labels.items():
    if (num_labels) == max(datasets_by_num_labels.values()):
        datasets_selected = datasets_group
print(f"Selección de datasets a procesar:")
for dataset in datasets_selected:
    print(dataset)

Selección de datasets a procesar:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv


Exportar encabezado del grupo de datasets en formato JSON

In [63]:
# Seleccionar encabezado de los datasets a combinar
header = pd.read_csv(datasets_selected[0], nrows=1).dtypes.to_dict()
header = {col: str(dtype) for col, dtype in header.items()}
header.pop('Label')

# Exportar el encabezado
with open(HEADER, 'w') as f:
    json.dump(header, f, indent=4)

## Combinar datasets

Unimos todos los datasets seleccionados en un sólo dataframe.

In [64]:
#
#   NO BORRAR
#

def concatenar_csv(lista_archivos, num_lineas=N_ROWS):  # Concatena los datasets sin duplicar el header ¡¡¡FUNCIONA!!!
    """
    Toma una lista de archivos CSV y los concatena en un solo DataFrame,
    utilizando solo la cabecera del primer archivo y permitiendo especificar el número de líneas a leer.

    :param lista_archivos: Lista de rutas de archivos CSV.
    :param num_lineas: Número de líneas a leer de cada archivo (None para leer todo).
    :return: DataFrame concatenado.
    """
    df_from_each_file = []

    for file in lista_archivos:
        print(f'Appending {file}')
        read_df = pd.read_csv(file, sep=',', low_memory=False, na_values=NA_VAL, nrows=num_lineas)

        # Eliminar columna 'Unnamed: 0' si existe
        if 'Unnamed: 0' in list(read_df.columns):
            read_df.drop(['Unnamed: 0'], axis=1, inplace=True)

        df_from_each_file.append(read_df)

    # Concatenación de todos los DataFrames
    df = pd.concat(df_from_each_file, ignore_index=True)
    return df


df = concatenar_csv(datasets_selected)
df.head()

Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
Appending c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0,0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0,0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0,0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0,0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0,0,0,0,0.0,0.0,0,0,Benign


In [65]:
df.shape

(720000, 80)

In [66]:
df.dtypes

Dst Port         object
Protocol         object
Timestamp        object
Flow Duration    object
Tot Fwd Pkts     object
                  ...  
Idle Mean        object
Idle Std         object
Idle Max         object
Idle Min         object
Label            object
Length: 80, dtype: object

## Exportación del dataframe en formato CSV

In [67]:
# Guardar en csv
df.to_csv(OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")

# Guardar en Parquet
df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"Archivo CSV guardado como {OUTPUT_PARQUET}")
df.head()

Archivo CSV guardado como merged_dataset.csv


ArrowInvalid: ("Could not convert '443' with type str: tried to convert to int64", 'Conversion failed for column Dst Port with type object')

In [None]:
"""def find_datasets_with_same_header():
    headers = {}
    same_files = {}

    for file in DATASETS:
        with open(file, newline='', encoding='utf-8') as f:
            header = tuple(next(csv.reader(f), None))
            if header:
                if header in headers:
                    headers[header].append(file)
                else:
                    headers[header] = [file]
    print("Archivos con headers distintos:", tuple(headers.values())[1])
    return tuple(headers.values())[0]

DATASETS = find_datasets_with_same_header()"""

'def find_datasets_with_same_header():\n    headers = {}\n    same_files = {}\n\n    for file in DATASETS:\n        with open(file, newline=\'\', encoding=\'utf-8\') as f:\n            header = tuple(next(csv.reader(f), None))\n            if header:\n                if header in headers:\n                    headers[header].append(file)\n                else:\n                    headers[header] = [file]\n    print("Archivos con headers distintos:", tuple(headers.values())[1])\n    return tuple(headers.values())[0]\n\nDATASETS = find_datasets_with_same_header()'

In [None]:
"""def mixed_types_columns(df):
    """
    # Recorre todas las columnas del DataFrame y devuelve un diccionario
    # con las columnas que contienen más de un tipo de dato.

    # Args:
    #     df (pd.DataFrame): El DataFrame a analizar.

    # Returns:
    #     dict: Un diccionario con nombres de columnas como claves,
    #           y otro diccionario con los tipos de datos y sus cantidades como valores.
    """
    mixed_types_columns = {}

    for col in df.columns:
        types = {}
        for val in df[col]:
            typeVal = type(val)
            types[typeVal] = types.get(typeVal, 0) + 1

        if len(types) > 1:
            mixed_types_columns[col] = types

    return mixed_types_columns

df = pd.read_csv('merged_output.csv')
mixed_types_columns = mixed_types_columns(df)
for col, types in mixed_types_columns.items():
    print(f"Columna: {col}")
    for typeVal, quantity in types.items():
        print(f"  type: {typeVal.__name__}, Cantidad: {quantity}")
    print()"""

IndentationError: unexpected indent (4188301155.py, line 12)

In [None]:
"""columns_lengths = []

# Obtener número de columnas por dataset
for dataset in DATASETS:
    columns_length = len(pd.read_csv(dataset, nrows=1).dtypes)
    columns_lengths.append(columns_length)

# Obtener grupo de características más largo
for dataset in DATASETS:
    df = pd.read_csv(dataset, nrows=1)
    columns_length = len(df.dtypes)
    if columns_length == max(columns_lengths):
        X_columns = df.dtypes.to_dict()
        if 'Label' in X_columns:
            X_columns.pop('Label')
print(len(X_columns))
for column, type in X_columns.items():
    print(column, type)"""