## Importación de librerías

In [13]:
import pandas as pd
import os
import glob
import json

## Configuración de variables globales

In [14]:
# Load the notebook settings from setup.json. The keys read below are:
# datasets_path, output_csv, output_parquet, n_rows and header.
with open('setup.json', 'r', encoding='utf-8') as f:
    SETUP_JSON = json.load(f)

DATASETS_PATH = SETUP_JSON['datasets_path']  # Datasets location as written in setup.json
DATASETS_FOLDER = os.path.join(os.getcwd(), DATASETS_PATH)  # Same location joined onto the current working directory
# glob yields files in arbitrary, OS-dependent order; sorting keeps the grouping,
# tie-breaking and concatenation steps reproducible between runs.
DATASETS = sorted(glob.glob(os.path.join(DATASETS_FOLDER, '*.csv')))  # Every CSV file in the datasets folder
OUTPUT_CSV = SETUP_JSON['output_csv']  # Destination of the combined CSV file
OUTPUT_PARQUET = SETUP_JSON['output_parquet']  # Destination of the combined Parquet file
N_ROWS = SETUP_JSON['n_rows']  # Passed as `nrows` when each selected dataset is read
HEADER = SETUP_JSON['header']  # Path of the JSON file that receives the exported header

## Selección de los datasets a combinar

Obtener todos los encabezados diferentes

In [15]:
# Collect the distinct column-name tuples found across all dataset files.
# Only one data row per file is parsed, since just the column labels are needed.
unique_headers = {
    tuple(pd.read_csv(csv_path, nrows=1).columns)
    for csv_path in DATASETS
}
for column_names in unique_headers:
    print(column_names)

('443', '6', '28/02/2018 08:22:13', '94658', '6.1', '7', '708', '3718', '387', '0', '118', '159.2846508613', '1460', '0.1', '531.1428571429', '673.1182235367', '46757.8017705846', '137.3365167234', '7888.1666666667', '11130.0425943262', '24325', '0.2', '72880', '14576', '12590.3839695221', '24385', '363', '72178', '12029.6666666667', '13189.2575176416', '24718', '0.3', '0.4', '0.5', '0.6', '0.7', '132', '152', '63.3860846416', '73.9504320818', '0.8', '1460.1', '316.1428571429', '519.2058813734', '269574.747252747', '0.9', '0.10', '1', '1.1', '0.11', '0.12', '0.13', '1.2', '1.3', '340.4615384615', '118.1', '531.1428571429.1', '0.14', '0.15', '0.16', '0.17', '0.18', '0.19', '6.2', '708.1', '7.1', '3718.1', '8192', '7484', '3', '20', '0.20', '0.21', '0.22', '0.23', '0.24', '0.25', '0.26', '0.27', 'Benign')
('0', '0.1', '16/02/2018 08:27:23', '112640768', '3', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '0.10', '0.11', '0.12', '0.13', '0.026633341', '56300000', '138.5929291', '

Separar los grupos de datasets por encabezados diferentes

In [16]:
# Group the dataset files by their column-name tuple.
# Each file is parsed exactly once and bucketed by its header, instead of
# re-reading every file for every distinct header.
files_by_columns = {}
for dataset in DATASETS:
    columns = tuple(pd.read_csv(dataset, nrows=1).columns)
    files_by_columns.setdefault(columns, []).append(dataset)

# One entry per known header, following the iteration order of `unique_headers`.
datasets_by_header = {
    header: files_by_columns.get(header, []) for header in unique_headers
}

# Report each group together with the number of columns in its header.
for header, datasets_group in datasets_by_header.items():
    print("Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Número de columnas del grupo de datasets: {len(header)}")
    print()

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Número de columnas del grupo de datasets: 84

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\03-02-2018.csv
Número de columnas del grupo de datasets: 80

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-15-2018.csv
Número de columnas del grupo de datase

Separar los grupos de datasets por etiquetas diferentes encontradas

In [17]:
# For every header group, gather the distinct values found in the last column
# of its files and record how many there are, keyed by the tuple of file paths.
# NOTE(review): the recorded output lists a literal 'Label' value for some files,
# which may be a repeated header row inside the data — confirm before relying on the counts.
datasets_by_num_labels = {}

for datasets_group in datasets_by_header.values():
    unique_labels = set()
    for dataset in datasets_group:
        df = pd.read_csv(dataset, dtype=str)  # every column is loaded as text
        # The label is taken positionally from the last column, not from a column named "Label".
        labels_column = df.columns[-1]
        unique_labels.update(df[labels_column])
    group_key = tuple(datasets_group)
    datasets_by_num_labels[group_key] = len(unique_labels)
    print(f"Grupo de datasets:")
    for dataset in datasets_group:
        print(dataset)
    print(f"Etiquetas diferentes encontradas: ", unique_labels)
    print()


Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-28-2018.csv
Etiquetas diferentes encontradas:  {'Benign', 'Infilteration', 'Label'}

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-16-2018.csv
Etiquetas diferentes encontradas:  {'Benign', 'DoS attacks-SlowHTTPTest', 'DoS attacks-Hulk', 'Label'}

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-20-2018.csv
Etiquetas diferentes encontradas:  {'Benign', 'DDoS attacks-LOIC-HTTP'}

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-23-2018.csv
Etiquetas diferentes encontradas:  {'Benign', 'Brute Force -Web', 'SQL Injection', 'Brute Force -XSS'}

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-21-2018.csv
Etiquetas diferentes encontradas:  {'Benign', 'DDOS attack-LOIC-UDP', 'DDOS attack-HOIC'}

Grupo de datasets:
c:\Users\isard\Desktop\AI-for-Traffic-network-

Seleccionar el grupo de datasets que abarca más etiquetas diferentes

In [18]:
# Select the group of datasets that covers the largest number of distinct labels.
if not datasets_by_num_labels:
    # Without this guard `datasets_selected` would stay undefined (or keep a stale
    # value from an earlier run of the kernel) when no group was found.
    raise ValueError(
        "No hay grupos de datasets: comprueba que la carpeta de datasets contiene archivos CSV"
    )

# The maximum is computed once rather than on every loop iteration.
max_num_labels = max(datasets_by_num_labels.values())
# When several groups tie for the maximum, the last one in dict order is chosen.
datasets_selected = [
    datasets_group
    for datasets_group, num_labels in datasets_by_num_labels.items()
    if num_labels == max_num_labels
][-1]

print(f"Selección de datasets a procesar:")
for dataset in datasets_selected:
    print(dataset)

Selección de datasets a procesar:
c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv


Exportar encabezado del grupo de datasets en formato JSON

In [20]:
# Describe the columns of the selected group as a {column name: dtype name} mapping,
# using the dtypes pandas infers from one data row of the first selected file.
first_row = pd.read_csv(datasets_selected[0], nrows=1)
header = {}
for column_name, column_dtype in first_row.dtypes.items():
    header[column_name] = str(column_dtype)

# Write the mapping to the configured JSON file.
with open(HEADER, 'w') as f:
    f.write(json.dumps(header, indent=4))

## Combinar datasets

Unimos todos los datasets seleccionados en un solo dataframe.

In [21]:
# Read every selected dataset as text (at most N_ROWS rows each) and stack them
# into a single dataframe with a fresh 0..n-1 index.
# The frames are collected first and concatenated once: calling pd.concat inside
# the loop would re-copy all previously accumulated rows on every iteration.
frames = []
for dataset in datasets_selected:
    frames.append(pd.read_csv(dataset, nrows=N_ROWS, dtype=str))
    print(f"Dataset {dataset} concatenado")

# pd.concat raises on an empty list, so fall back to an empty dataframe.
df = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()

Dataset c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets\02-22-2018.csv concatenado


In [22]:
# Show the (rows, columns) shape of the combined dataframe.
df.shape

(80000, 80)

In [23]:
# List the dtype of every column of the combined dataframe.
df.dtypes

22                     object
6                      object
22/02/2018 08:26:03    object
20553406               object
10                     object
                        ...  
19526080.2             object
0.22                   object
19526080.3             object
19526080.4             object
Benign                 object
Length: 80, dtype: object

## Exportación del dataframe en formatos CSV y Parquet

In [None]:
# Persist the combined dataframe to the configured CSV file (without the row
# index) and to the configured Parquet file, then preview its first rows.
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print(f"Archivo CSV guardado como {OUTPUT_CSV}")
df.to_parquet(path=OUTPUT_PARQUET)
df.head(n=5)

       Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
0           0.0       0.0  14/02/2018 08:31:01    112641719.0           3.0   
1           NaN       NaN                  NaN            NaN           NaN   
2           NaN       NaN                  NaN            NaN           NaN   
3           NaN       NaN                  NaN            NaN           NaN   
4           NaN       NaN                  NaN            NaN           NaN   
...         ...       ...                  ...            ...           ...   
79996       NaN       NaN                  NaN            NaN           NaN   
79997       NaN       NaN                  NaN            NaN           NaN   
79998       NaN       NaN                  NaN            NaN           NaN   
79999       NaN       NaN                  NaN            NaN           NaN   
80000       NaN       NaN                  NaN            NaN           NaN   

       Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pk

Unnamed: 0,22,6,22/02/2018 08:26:03,20553406,10,7,1063,1297,744,0,...,20,1027304,0.21,1027304.1,1027304.2,19526080.2,0.22,19526080.3,19526080.4,Benign
0,34989,6,22/02/2018 08:26:24,790,2,0,848,0,848,0,...,20,0,0,0,0,0.0,0.0,0,0,Benign
1,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000203,0,4000203,4000203,31915236.6666667,37927869.4859419,75584115,7200679,Benign
2,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000189,0,4000189,4000189,31915241.3333333,37927877.3079527,75584130,7200693,Benign
3,500,17,22/02/2018 08:24:59,89481361,6,0,3000,0,500,500,...,8,4000554,0,4000554,4000554,21370201.75,15281092.9995064,41990741,7200848,Benign
4,500,17,22/02/2018 08:24:59,89481358,6,0,3000,0,500,500,...,8,4000553,0,4000553,4000553,21370201.25,15281092.1551763,41990740,7200849,Benign


In [None]:
# Disabled code: the whole cell is a single string literal, so nothing in it runs.
# It sketches an alternative grouping that reads the first CSV row of each file
# and groups the files by that header.
# NOTE(review): the sketch uses `csv`, which the import cell of this notebook does
# not import, and it would rebind DATASETS — check both before re-enabling it.
"""def find_datasets_with_same_header():
    headers = {}
    same_files = {}

    for file in DATASETS:
        with open(file, newline='', encoding='utf-8') as f:
            header = tuple(next(csv.reader(f), None))
            if header:
                if header in headers:
                    headers[header].append(file)
                else:
                    headers[header] = [file]
    print("Archivos con headers distintos:", tuple(headers.values())[1])
    return tuple(headers.values())[0]

DATASETS = find_datasets_with_same_header()"""

In [None]:
def mixed_types_columns(df):
    """Find the columns of a dataframe that hold values of more than one Python type.

    Args:
        df (pd.DataFrame): Dataframe to inspect.

    Returns:
        dict: Maps each mixed-type column name to a dict of
            ``{type: number of values of that type}``. Columns whose values
            all share a single type are omitted.
    """
    mixed_columns = {}

    for col in df.columns:
        # Tally how many values of each Python type the column contains.
        type_counts = {}
        for val in df[col]:
            value_type = type(val)
            type_counts[value_type] = type_counts.get(value_type, 0) + 1

        if len(type_counts) > 1:
            mixed_columns[col] = type_counts

    return mixed_columns


# Example usage, kept disabled so that running the notebook does not read any
# extra file. The result is stored under its own name so the function itself is
# not overwritten.
# merged_df = pd.read_csv('merged_output.csv')
# mixed_columns_report = mixed_types_columns(merged_df)
# for col, types in mixed_columns_report.items():
#     print(f"Columna: {col}")
#     for typeVal, quantity in types.items():
#         print(f"  type: {typeVal.__name__}, Cantidad: {quantity}")
#     print()

In [None]:
# Disabled code: the whole cell is a single string literal, so nothing in it runs.
# It sketches how to take the column/dtype mapping of the dataset with the most
# columns, drop 'Label' when present, and print each remaining column with its dtype.
"""columns_lengths = []

# Obtener número de columnas por dataset
for dataset in DATASETS:
    columns_length = len(pd.read_csv(dataset, nrows=1).dtypes)
    columns_lengths.append(columns_length)

# Obtener grupo de características más largo
for dataset in DATASETS:
    df = pd.read_csv(dataset, nrows=1)
    columns_length = len(df.dtypes)
    if columns_length == max(columns_lengths):
        X_columns = df.dtypes.to_dict()
        if 'Label' in X_columns:
            X_columns.pop('Label')
print(len(X_columns))
for column, type in X_columns.items():
    print(column, type)"""