## Importación de librerías

In [1]:
import pandas as pd
import os
import glob
import csv
import json

## Configuración de variables globales

In [2]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path to the datasets,
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Folder containing the datasets,
DATASETS = glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')) # List of datasets
OUTPUT_CSV = SETUP_JSON['output_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['output_parquet'] # Output CSV file

## Combinar datasets

Unimos todos los archivos CSV y guardamos el dataset en formato CSV (para analizarlo con Tableau) y en formato Parquet (para su tratamiento con Python).

In [10]:
datasets = [f"{DATASETS_PATH}\\02-14-2018.csv", f"{DATASETS_PATH}\\02-15-2018.csv", f"{DATASETS_PATH}\\02-16-2018.csv", f"{DATASETS_PATH}\\02-21-2018.csv", f"{DATASETS_PATH}\\02-22-2018.csv"]
for dataset in datasets:
    print(dataset)

datasets\02-14-2018.csv
datasets\02-15-2018.csv
datasets\02-16-2018.csv
datasets\02-21-2018.csv
datasets\02-22-2018.csv


In [24]:
# Detectar todas las columnas únicas
all_columns = set()
for file in DATASETS:
    df_sample = pd.read_csv(file, nrows=1000)
    all_columns.update(df_sample.columns)
print(f"Todas las columnas únicas: {len(all_columns)}")

Todas las columnas únicas: 84


In [37]:
# Añadir columnas faltantes a los datasets
def add_missing_columns(dataset):
    df = pd.read_csv(dataset, dtype=str)
    for col in all_columns:
        if col not in df.columns:
            df.insert(loc=df.shape[1] - 1, column=col, value=pd.NA)
    return df

In [38]:
# Cargar los datasets
df = pd.DataFrame()
for dataset in DATASETS:
    df_temp = add_missing_columns(dataset)
    df = pd.concat([df, df_temp], ignore_index=True, sort=False)

In [39]:
df.shape

(16233002, 84)

In [40]:
df.dtypes.unique()

array([dtype('O')], dtype=object)

In [41]:
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Dst IP,Src IP,Flow ID,Src Port,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,56320859.5,139.3000358938,56320958,56320761,,,,,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,56320733.0,114.5512985522,56320814,56320652,,,,,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,56319311.5,301.9345955667,56319525,56319098,,,,,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,0,0.0,0.0,0,0,,,,,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,0,0.0,0.0,0,0,,,,,Benign


In [42]:
df.tail()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Dst IP,Src IP,Flow ID,Src Port,Label
16232997,3389,6,02/03/2018 02:08:18,3982183,14,8,1442,1731,725,0,...,0,0,0,0,0,,,,,Benign
16232998,3389,6,02/03/2018 02:08:22,3802316,14,8,1440,1731,725,0,...,0,0,0,0,0,,,,,Benign
16232999,3389,6,02/03/2018 02:08:25,4004239,14,8,1459,1731,741,0,...,0,0,0,0,0,,,,,Benign
16233000,3389,6,02/03/2018 02:08:29,3998435,14,8,1459,1731,741,0,...,0,0,0,0,0,,,,,Benign
16233001,3389,6,02/03/2018 02:08:33,3972651,14,8,1439,1731,725,0,...,0,0,0,0,0,,,,,Benign


In [43]:
label = df.columns[-1]  # La última columna es la etiqueta de la secuencia
labels = df[label].unique() # Obtener etiquetas únicas del dataframe

# Crear un diccionario para almacenar las etiquetas y sus índices
labels_dict = {label: i for i, label in enumerate(labels)}

In [45]:
df[label] = df[label].map(labels_dict) # Mapear etiquetas a índices

In [None]:
df.head()

In [47]:
# Guardar el diccionario de etiquetas en un archivo JSON
with open('labels.json', 'w') as f:
    json.dump(labels_dict, f, indent=4)

In [66]:
# Guardar en CSV
df.to_csv(OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")

# Guardar en Parquet
df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"Archivo Parquet guardado como {OUTPUT_PARQUET}")

Archivo CSV guardado como merged_output.csv
Archivo Parquet guardado como merged_output.parquet


In [87]:

def find_datasets_with_same_header():
    headers = {}
    same_files = {}

    for file in DATASETS:
        with open(file, newline='', encoding='utf-8') as f:
            header = tuple(next(csv.reader(f), None))
            if header:
                if header in headers:
                    headers[header].append("datasets\\"+file)
                else:
                    headers[header] = ["datasets\\"+file]
    print("Archivos con headers distintos:", tuple(headers.values())[1])
    return tuple(headers.values())[0]

In [88]:
csv_files = find_datasets_with_same_header()
print("Archivos con el mismo encabezado:", csv_files)

Archivos con headers distintos: ['datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-20-2018.csv']
Archivos con el mismo encabezado: ['datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-14-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-15-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-16-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-21-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-22-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-23-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-28-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\03-01-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traff