## Importación de librerías

In [4]:
import pandas as pd
import os
import glob
import csv
import json

## Configuración de variables globales

In [25]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path to the datasets,
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Folder containing the datasets,
DATASETS = glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')) # List of datasets
OUTPUT_CSV = SETUP_JSON['output_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['output_parquet'] # Output CSV file

## Combinar datasets

Unimos todos los archivos CSV y guardamos el dataset en formato CSV (para analizarlo con Tableau) y en formato Parquet (para su tratamiento con Python).

In [58]:
datasets = [f"{DATASETS_PATH}\\02-14-2018.csv", f"{DATASETS_PATH}\\02-15-2018.csv", f"{DATASETS_PATH}\\02-16-2018.csv", f"{DATASETS_PATH}\\02-21-2018.csv", f"{DATASETS_PATH}\\02-22-2018.csv"]
for dataset in datasets:
    print(dataset)

datasets\02-14-2018.csv
datasets\02-15-2018.csv
datasets\02-16-2018.csv
datasets\02-21-2018.csv
datasets\02-22-2018.csv


In [59]:
df = pd.DataFrame()

# Leer los datasets y concatenar
for dataset in datasets:
    df = pd.read_csv(dataset)
    df = pd.concat([df, df.iloc[1:]])  # Excluir el encabezado después del primer archivo

  df = pd.read_csv(dataset)


In [60]:
df.shape

(2097149, 80)

In [61]:
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,22/02/2018 08:26:03,20553406,10,7,1063,1297,744,0,...,20,1027304.0,0.0,1027304,1027304,19526080.0,0.0,19526080,19526080,Benign
1,34989,6,22/02/2018 08:26:24,790,2,0,848,0,848,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000203.0,0.0,4000203,4000203,31915240.0,37927870.0,75584115,7200679,Benign
3,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000189.0,0.0,4000189,4000189,31915240.0,37927880.0,75584130,7200693,Benign
4,500,17,22/02/2018 08:24:59,89481361,6,0,3000,0,500,500,...,8,4000554.0,0.0,4000554,4000554,21370200.0,15281090.0,41990741,7200848,Benign


In [62]:
df.tail()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1048570,53,17,22/02/2018 09:57:34,61898,2,2,78,254,39,39,...,8,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1048571,1500,6,22/02/2018 04:51:29,86213373,2,0,0,0,0,0,...,20,0.0,0.0,0,0,86213373.0,0.0,86213373,86213373,Benign
1048572,53,17,22/02/2018 03:52:37,642,1,1,39,67,39,39,...,8,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1048573,53,17,22/02/2018 09:40:42,78472,1,1,32,121,32,32,...,8,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1048574,3389,6,22/02/2018 04:34:32,2013403,8,7,1144,1581,677,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [63]:
label = df.columns[-1]  # La última columna es la etiqueta de la secuencia
labels = df[label].unique() # Obtener etiquetas únicas del dataframe
print(f"Etiquetas diferentes: {labels}")

Etiquetas diferentes: ['Benign' 'Brute Force -Web' 'Brute Force -XSS' 'SQL Injection']


In [64]:
# Crear una nueva columna con índices de etiquetas
labels_indexes = {label: index for index, label in enumerate(labels)} # Crear un diccionario de etiquetas únicas y sus índices
df['Label_index'] = df[label].map(labels_indexes) # Mapear etiquetas a índices

In [65]:
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label_index
0,22,6,22/02/2018 08:26:03,20553406,10,7,1063,1297,744,0,...,1027304.0,0.0,1027304,1027304,19526080.0,0.0,19526080,19526080,Benign,0
1,34989,6,22/02/2018 08:26:24,790,2,0,848,0,848,0,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,0
2,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,4000203.0,0.0,4000203,4000203,31915240.0,37927870.0,75584115,7200679,Benign,0
3,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,4000189.0,0.0,4000189,4000189,31915240.0,37927880.0,75584130,7200693,Benign,0
4,500,17,22/02/2018 08:24:59,89481361,6,0,3000,0,500,500,...,4000554.0,0.0,4000554,4000554,21370200.0,15281090.0,41990741,7200848,Benign,0


In [67]:
# Revisar cada columna y contar los tipos de valores
for col in df.columns:
    tipos = df[col].apply(type).value_counts()
    if len(tipos) > 1:  # Si hay más de un tipo, lo imprimimos
        print(f"Columna: {col}")
        print(tipos)
        print("-" * 30)
    else:
        print(f"Columna: {col} tiene un solo tipo de dato: {tipos.index[0]}")
        print("-" * 30)

Columna: Dst Port tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Protocol tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Timestamp tiene un solo tipo de dato: <class 'str'>
------------------------------
Columna: Flow Duration tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Tot Fwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Tot Bwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: TotLen Fwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: TotLen Bwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Fwd Pkt Len Max tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Fwd Pkt Len Min tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Fwd Pkt Len Mean tiene un solo tipo de 

In [66]:
# Guardar en CSV
df.to_csv(OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")

# Guardar en Parquet
df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"Archivo Parquet guardado como {OUTPUT_PARQUET}")

Archivo CSV guardado como merged_output.csv
Archivo Parquet guardado como merged_output.parquet


In [87]:

def find_datasets_with_same_header():
    headers = {}
    same_files = {}

    for file in DATASETS:
        with open(file, newline='', encoding='utf-8') as f:
            header = tuple(next(csv.reader(f), None))
            if header:
                if header in headers:
                    headers[header].append("datasets\\"+file)
                else:
                    headers[header] = ["datasets\\"+file]
    print("Archivos con headers distintos:", tuple(headers.values())[1])
    return tuple(headers.values())[0]

In [88]:
csv_files = find_datasets_with_same_header()
print("Archivos con el mismo encabezado:", csv_files)

Archivos con headers distintos: ['datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-20-2018.csv']
Archivos con el mismo encabezado: ['datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-14-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-15-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-16-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-21-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-22-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-23-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-28-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\03-01-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traff