## Importación de librerías

In [3]:
import pandas as pd
import os
import glob
import csv
import json

## Configuración de variables globales

In [4]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path to the datasets,
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Folder containing the datasets,
DATASETS = glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')) # List of datasets
OUTPUT_CSV = SETUP_JSON['output_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['output_parquet'] # Output CSV file

## Análisis de los datasets

In [10]:
datasets = [f"{DATASETS_PATH}\\02-14-2018.csv", f"{DATASETS_PATH}\\02-15-2018.csv", f"{DATASETS_PATH}\\02-16-2018.csv", f"{DATASETS_PATH}\\02-21-2018.csv"]

In [11]:
# Obtener todas las columnas únicas y sus tipos
all_columns_dtypes = {}
for dataset in datasets:
    df = pd.read_csv(dataset, nrows=1)
    df_types = df.dtypes
    df_types.pop(df.columns[-1]) # Eliminar la columna de la etiqueta
    all_columns_dtypes.update(df_types.to_dict())

print(f"Todas las columnas únicas: {len(all_columns_dtypes)}")
for column in all_columns_dtypes:
    print(f"Columna: {column}, Tipo: {all_columns_dtypes[column]}")

Todas las columnas únicas: 79
Columna: Dst Port, Tipo: int64
Columna: Protocol, Tipo: int64
Columna: Timestamp, Tipo: object
Columna: Flow Duration, Tipo: int64
Columna: Tot Fwd Pkts, Tipo: int64
Columna: Tot Bwd Pkts, Tipo: int64
Columna: TotLen Fwd Pkts, Tipo: int64
Columna: TotLen Bwd Pkts, Tipo: int64
Columna: Fwd Pkt Len Max, Tipo: int64
Columna: Fwd Pkt Len Min, Tipo: int64
Columna: Fwd Pkt Len Mean, Tipo: int64
Columna: Fwd Pkt Len Std, Tipo: float64
Columna: Bwd Pkt Len Max, Tipo: int64
Columna: Bwd Pkt Len Min, Tipo: int64
Columna: Bwd Pkt Len Mean, Tipo: float64
Columna: Bwd Pkt Len Std, Tipo: float64
Columna: Flow Byts/s, Tipo: float64
Columna: Flow Pkts/s, Tipo: float64
Columna: Flow IAT Mean, Tipo: float64
Columna: Flow IAT Std, Tipo: float64
Columna: Flow IAT Max, Tipo: int64
Columna: Flow IAT Min, Tipo: int64
Columna: Fwd IAT Tot, Tipo: int64
Columna: Fwd IAT Mean, Tipo: float64
Columna: Fwd IAT Std, Tipo: float64
Columna: Fwd IAT Max, Tipo: int64
Columna: Fwd IAT Min, T

## Combinar datasets

Unimos todos los archivos CSV y guardamos el dataset en formato CSV (para analizarlo con Tableau) y en formato Parquet (para su tratamiento con Python).

In [8]:
# Añadir columnas faltantes
def add_missing_columns(filename, df):
    print(f"Procesando {filename}: {df.shape}")
    for col in all_columns_dtypes.keys():
        if col not in df.columns:
            df.insert(loc=df.shape[1] - 1, column=col, value=pd.NA)
    return df

In [13]:
# Combinar los datasets
df = pd.DataFrame()
for dataset in datasets:
    next_df = add_missing_columns(dataset, pd.read_csv(dataset, dtype=str))
    df = pd.concat([df, next_df], ignore_index=True, sort=False)

Procesando datasets\02-14-2018.csv: (1048575, 80)
Procesando datasets\02-15-2018.csv: (1048575, 80)
Procesando datasets\02-16-2018.csv: (1048575, 80)
Procesando datasets\02-21-2018.csv: (1048575, 80)


In [14]:
df.shape

(4194300, 80)

In [15]:
df.dtypes.unique()

array([dtype('O')], dtype=object)

In [16]:
df.info()  # Display DataFrame information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4194300 entries, 0 to 4194299
Data columns (total 80 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Dst Port           object
 1   Protocol           object
 2   Timestamp          object
 3   Flow Duration      object
 4   Tot Fwd Pkts       object
 5   Tot Bwd Pkts       object
 6   TotLen Fwd Pkts    object
 7   TotLen Bwd Pkts    object
 8   Fwd Pkt Len Max    object
 9   Fwd Pkt Len Min    object
 10  Fwd Pkt Len Mean   object
 11  Fwd Pkt Len Std    object
 12  Bwd Pkt Len Max    object
 13  Bwd Pkt Len Min    object
 14  Bwd Pkt Len Mean   object
 15  Bwd Pkt Len Std    object
 16  Flow Byts/s        object
 17  Flow Pkts/s        object
 18  Flow IAT Mean      object
 19  Flow IAT Std       object
 20  Flow IAT Max       object
 21  Flow IAT Min       object
 22  Fwd IAT Tot        object
 23  Fwd IAT Mean       object
 24  Fwd IAT Std        object
 25  Fwd IAT Max        object
 26  Fwd IAT Min   

In [17]:
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0,0,0,0,56320859.5,139.3000358938,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0,0,0,0,56320733.0,114.5512985522,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0,0,0,0,56319311.5,301.9345955667,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0,0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0,0,0,0,0.0,0.0,0,0,Benign


## Exportación de archivos JSON

In [18]:
# Convertir los tipos a strings
serializable_dtypes = {col: str(dtype) for col, dtype in all_columns_dtypes.items()}

# Guardar en JSON
with open('dtypes.json', 'w') as f:
    json.dump(serializable_dtypes, f, indent=4)

In [24]:
# Obtener las etiquetas únicas
labels = {}
for i, label in enumerate(df[df.columns[-1]].unique()):
    labels[label] = i
labels.pop(df.columns[-1])

for label in labels:
    print(f"{label}: {labels[label]}")

# Guardar etiquetas en JSON
with open('labels.json', 'w') as f:
    json.dump(labels, f, indent=4)

Benign: 0
FTP-BruteForce: 1
SSH-Bruteforce: 2
DoS attacks-GoldenEye: 3
DoS attacks-Slowloris: 4
DoS attacks-SlowHTTPTest: 5
DoS attacks-Hulk: 6
DDOS attack-LOIC-UDP: 8
DDOS attack-HOIC: 9


## Exportación del dataframe en CSV y Parquet

In [25]:
# Guardar en CSV
df.to_csv(OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")

# Guardar en Parquet
df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"Archivo Parquet guardado como {OUTPUT_PARQUET}")

Archivo CSV guardado como merged_output.csv
Archivo Parquet guardado como merged_output.parquet


In [87]:

def find_datasets_with_same_header():
    headers = {}
    same_files = {}

    for file in DATASETS:
        with open(file, newline='', encoding='utf-8') as f:
            header = tuple(next(csv.reader(f), None))
            if header:
                if header in headers:
                    headers[header].append("datasets\\"+file)
                else:
                    headers[header] = ["datasets\\"+file]
    print("Archivos con headers distintos:", tuple(headers.values())[1])
    return tuple(headers.values())[0]

In [88]:
csv_files = find_datasets_with_same_header()
print("Archivos con el mismo encabezado:", csv_files)

Archivos con headers distintos: ['datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-20-2018.csv']
Archivos con el mismo encabezado: ['datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-14-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-15-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-16-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-21-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-22-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-23-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\02-28-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traffic-network-classify\\datasets\\03-01-2018.csv', 'datasets\\c:\\Users\\isard\\Desktop\\AI-for-Traff