## Importación de librerías

In [32]:
import pandas as pd
import os
import glob
import json
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

## Configuración de variables globales

In [33]:
with open('setup.json', 'r') as f:
    SETUP_JSON = json.load(f)
DATASETS_PATH = SETUP_JSON['datasets_path'] # Path to the datasets,
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH) # Folder containing the datasets,
DATASETS = glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')) # List of datasets
OUTPUT_CSV = SETUP_JSON['dataset_csv'] # Output CSV file
OUTPUT_PARQUET = SETUP_JSON['dataset_parquet'] # Output CSV file
TAKE_FULL_DATASET = SETUP_JSON['take_full_dataset']
N_ROWS = SETUP_JSON['n_rows']
NA_VAL = SETUP_JSON['navalues']

## Análisis de los datasets

Obtener todos los encabezados diferentes

In [34]:
unique_headers = set()
for dataset in DATASETS:
    df = pd.read_csv(dataset, nrows=1)
    df_dtypes = df.dtypes.keys()
    unique_headers.add(tuple(df_dtypes))
for header in unique_headers:
    print(header)

('Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd By

Separar los grupos de datasets por encabezados diferentes

In [35]:
datasets_by_header = {}
for header in unique_headers:
    datasets_group = []
    for dataset in DATASETS:
        df = pd.read_csv(dataset, nrows=1)
        df_dtypes = df.dtypes.keys()
        if tuple(df_dtypes) == header:
            datasets_group.append(dataset)
    datasets_by_header[header] = datasets_group
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Número de columnas del grupo de datasets: {len(header)}")
    print()

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Número de columnas del grupo de datasets: 84



Separar los grupos de datasets por etiquetas diferentes encontradas

In [36]:
datasets_by_num_labels = {}

for datasets_group in datasets_by_header.values():
    unique_labels = set()
    for dataset in datasets_group:
        df = pd.read_csv(dataset, dtype=str)
        last_column = df.columns[-1]  # Obtiene el nombre de la última columna
        for label in list(df[last_column]):  # Usa la última columna en lugar de "Label"
            unique_labels.add(label)
    datasets_by_num_labels[tuple(datasets_group)] = len(unique_labels)
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Etiquetas diferentes encontradas: ", unique_labels)
    print()


Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Etiquetas diferentes encontradas:  {'DDOS attack-HOIC', 'Infilteration', 'DoS attacks-SlowHTTPTest', 'Label', 'Brute Force -XSS', 'DoS attacks-Slowloris', 'SSH-Bruteforce', 'FTP-BruteForce', 'DoS attacks-Hulk', 'DoS attacks-GoldenEye', 'DDOS attack-LOIC-UDP', 'Benign', 

Seleccionar el grupo de datasets que abarca más etiquetas diferentes

In [37]:
for datasets_group, num_labels in datasets_by_num_labels.items():
    if (num_labels) == max(datasets_by_num_labels.values()):
        datasets_selected = datasets_group
print(f"Selección de datasets a procesar:")
for dataset in datasets_selected:
    print(dataset)

Selección de datasets a procesar:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv


## Combinar datasets

Unimos todos los datasets seleccionados en un sólo dataframe

In [None]:
df = pd.DataFrame()

if TAKE_FULL_DATASET:
    for dataset in datasets_selected:
        current_df = pd.read_csv(dataset, sep=',', low_memory=False, na_values=NA_VAL)
        df = pd.concat([df, current_df], ignore_index=True)
        print(f"Dataset concatenado: {dataset}")
else:
    for dataset in datasets_selected:
        current_df = pd.read_csv(dataset, sep=',', low_memory=False, na_values=NA_VAL, nrows=N_ROWS)
        df = pd.concat([df, current_df], ignore_index=True)
        print(f"Dataset concatenado: {dataset}")

Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-14-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-01-2018.csv
Dataset procesado: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Dataset balanceado guardado como 'balanced_dataset.csv'


In [39]:
df.shape

(7948748, 84)

## Limpieza y normalización del dataframe

Eliminar encabezados repetidos

In [40]:
# Eliminar encabezados repetidos
df = df[df['Label'] != 'Label']

Eliminar valores faltantes

In [41]:
# Eliminar los valores faltantes
df = df.dropna()

In [42]:
df.shape

(7911981, 84)

In [43]:
df.reset_index(drop=True, inplace=True)

Crear un json con todos los valores de etiquetas mapeados a números

In [44]:
# Obtener los valores únicos de la columna 'Label'
unique_labels = df['Label'].unique()

# Crear un diccionario mapeando los valores únicos a números
label_mapping = {label: int(idx) for idx, label in enumerate(unique_labels)}
print()
# Invertir el diccionario (intercambiar claves y valores)
inverted_mapping = {v: k for k, v in label_mapping.items()}

# Guardar el diccionario invertido como un archivo JSON
with open('labels.json', 'w') as json_file:
    json.dump(inverted_mapping, json_file)


print("Archivo JSON creado con el mapeo de etiquetas")


Archivo JSON creado con el mapeo de etiquetas


Convertir etiquetas a números

In [45]:
df['Label Name'] = df['Label']
# Transformar cada valor de la columna 'Label' a su equivalente según el mapeo de labels
df['Label'] = df['Label'].map(label_mapping)

Asignar tipo correspondiente a las columnas

In [46]:
# Asignar formato decimal a columnas numéricas
for col, dtype in df.dtypes.to_dict().items():
    temp_col = df[col].dropna()
    numeric_col = pd.to_numeric(temp_col, errors='coerce').notna().all()
    if numeric_col:
        df[col] = pd.to_numeric(temp_col, errors='coerce').astype('float64')
        
# Asignar formato entero a columnas que no contienen decimales
df["Protocol"] = df["Protocol"].astype(int)
df["Dst Port"] = df["Dst Port"].astype(int)
df["Label"] = df["Label"].astype(int)

Conversión de timestamp a formato legible para el modelo

In [47]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce') # Convertimos la timestamp a formato fecha de pandas
df['Timestamp'] = np.sin(2*math.pi*df['Timestamp'].dt.hour/24)  # Convertimos la fecha en un ciclo legible para el modelo

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7911981 entries, 0 to 7911980
Data columns (total 85 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Flow ID            object 
 1   Src IP             object 
 2   Src Port           float64
 3   Dst IP             object 
 4   Dst Port           int64  
 5   Protocol           int64  
 6   Timestamp          float64
 7   Flow Duration      float64
 8   Tot Fwd Pkts       float64
 9   Tot Bwd Pkts       float64
 10  TotLen Fwd Pkts    float64
 11  TotLen Bwd Pkts    float64
 12  Fwd Pkt Len Max    float64
 13  Fwd Pkt Len Min    float64
 14  Fwd Pkt Len Mean   float64
 15  Fwd Pkt Len Std    float64
 16  Bwd Pkt Len Max    float64
 17  Bwd Pkt Len Min    float64
 18  Bwd Pkt Len Mean   float64
 19  Bwd Pkt Len Std    float64
 20  Flow Byts/s        float64
 21  Flow Pkts/s        float64
 22  Flow IAT Mean      float64
 23  Flow IAT Std       float64
 24  Flow IAT Max       float64
 25  Flow IAT Min      

In [49]:
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label Name
0,172.31.69.25-94.231.103.172-22-45498-6,94.231.103.172,45498.0,172.31.69.25,22,6,0.866025,888751.0,11.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Benign
1,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,0.866025,112642816.0,3.0,0.0,...,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,0,Benign
2,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,0.866025,112642712.0,3.0,0.0,...,0.0,0.0,0.0,0.0,56300000.0,18.384776,56300000.0,56300000.0,0,Benign
3,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,0.866025,112642648.0,3.0,0.0,...,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,0,Benign
4,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,0.866025,112642702.0,3.0,0.0,...,0.0,0.0,0.0,0.0,56300000.0,65.053824,56300000.0,56300000.0,0,Benign


In [50]:
df.tail()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label Name
7911976,172.31.69.7-185.2.197.19-623-42801-6,185.2.197.19,42801.0,172.31.69.7,623,6,0.258819,94042.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Benign
7911977,172.31.69.7-195.22.125.42-22-2178-6,195.22.125.42,2178.0,172.31.69.7,22,6,0.258819,251281.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Benign
7911978,172.31.69.7-107.3.188.61-23-14259-6,107.3.188.61,14259.0,172.31.69.7,23,6,0.866025,21.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Benign
7911979,172.31.69.7-5.188.11.188-3039-54193-6,5.188.11.188,54193.0,172.31.69.7,3039,6,0.5,181954.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Benign
7911980,172.31.69.7-188.19.76.194-445-57958-6,188.19.76.194,57958.0,172.31.69.7,445,6,0.866025,687378.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Benign


## Exportación del dataframe final en formato CSV

In [51]:
# Guardar en CSV
df.to_csv(OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")

df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"Archivo parquet guardado como {OUTPUT_PARQUET}")

Archivo CSV guardado como clean_dataset.csv
Archivo parquet guardado como clean_dataset.parquet


In [52]:
df['Label'].value_counts()

Label
0    7335790
1     576191
Name: count, dtype: int64