## Importación de librerías

In [16]:
%pip install skimpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import pandas as pd
import os
import glob
import numpy as np
from skimpy import skim
import json
import random

## Configuración de variables globales

In [18]:
# Load the project settings. JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform default (cp1252 on Windows).
with open('setup.json', 'r', encoding='utf-8') as f:
    SETUP_JSON = json.load(f)

BASE_DIR = os.getcwd()  # Base directory for the project
DATASET_DIR = os.path.join(BASE_DIR, "datasets")  # Input directory
NA_VALUES = SETUP_JSON['navalues']  # Consistent NA values
CHUNK_SIZE = SETUP_JSON['chunk_size']  # Chunk size for reading CSV files
SAMPLE_FRACTION = SETUP_JSON['sample_fraction']  # Fraction of data to sample
RESULT_CSV = SETUP_JSON['result_csv']  # Result CSV file name
RESULT_PARQUET = SETUP_JSON['result_parquet']  # Result Parquet file name
MAX_COLUMNS = SETUP_JSON['max_columns']  # Maximum number of columns to display

# Same location as DATASET_DIR; the second name is kept because other cells may use it.
DATASETS_FOLDER = DATASET_DIR
# glob returns matches in arbitrary order; sorting keeps re-runs reproducible.
csv_files = sorted(glob.glob(os.path.join(DATASETS_FOLDER, "*.csv")))

In [19]:
# Cap how many columns pandas renders when a DataFrame is displayed.
pd.options.display.max_columns = MAX_COLUMNS

## Carga del dataset

Unimos una selección de archivos CSV que comparten el mismo encabezado (una muestra de 5000 filas por archivo) y guardamos el dataset resultante en formato CSV (para analizarlo con Tableau) y en formato Parquet (para su tratamiento con Python).

In [20]:
import os
import csv

def encontrar_csv_con_mismo_encabezado(directorio):
    """Group the CSV files in a directory by header row and return the largest group.

    Each ``*.csv`` file directly inside ``directorio`` is opened as UTF-8 and
    only its first row is read. Files whose first row is identical end up in
    the same group. Files that are empty (or whose first row is blank) are
    skipped.

    The paths of every file that does not belong to the returned group are
    printed after the label "archivos con encabezados distintos:".

    Args:
        directorio (str): Directory to scan (not recursive).

    Returns:
        list[str]: Paths (``directorio`` joined with the file name) of the
        files in the largest header group. Ties are resolved in favour of the
        group whose first file sorts first by name. An empty list is returned
        when no CSV file with a header is found.
    """
    encabezados = {}

    # os.listdir() gives no ordering guarantee; sorting makes the grouping
    # (and the tie-break below) reproducible between runs.
    for archivo in sorted(os.listdir(directorio)):
        if not archivo.endswith(".csv"):
            continue

        ruta_archivo = os.path.join(directorio, archivo)
        with open(ruta_archivo, newline='', encoding='utf-8') as f:
            # next(..., None) yields None for an empty file instead of raising.
            primera_fila = next(csv.reader(f), None)

        if not primera_fila:
            continue

        encabezados.setdefault(tuple(primera_fila), []).append(ruta_archivo)

    if not encabezados:
        print("archivos con encabezados distintos:", [])
        return []

    grupos = list(encabezados.values())
    # max() keeps the first of several equally large groups (dict insertion order).
    principal = max(grupos, key=len)
    distintos = [ruta for grupo in grupos if grupo is not principal for ruta in grupo]
    print("archivos con encabezados distintos:", distintos)
    return principal

# Usage: scan the relative "datasets" directory and keep the list of files
# returned by the header grouping. This rebinds the module-level `csv_files`.
directorio = "datasets"
csv_files = encontrar_csv_con_mismo_encabezado(directorio)
print("Archivos con el mismo encabezado:", csv_files)

archivos con encabezados distintos: ['datasets\\02-20-2018.csv']
Archivos con el mismo encabezado: ['datasets\\02-14-2018.csv', 'datasets\\02-14-2018_10000.csv', 'datasets\\02-15-2018.csv', 'datasets\\02-16-2018.csv', 'datasets\\02-21-2018.csv', 'datasets\\02-22-2018.csv', 'datasets\\02-23-2018.csv', 'datasets\\02-28-2018.csv', 'datasets\\03-01-2018.csv', 'datasets\\03-02-2018.csv']


In [28]:
import pandas as pd

def merge_csv_files(file_list, output_csv, output_parquet, nrows=5000):
    """Concatenate the first ``nrows`` data rows of several CSV files and save the result.

    ``pd.read_csv`` consumes the header line itself and does not count it in
    ``nrows``, so no row has to be dropped afterwards: every row that is read
    is a data row and is kept.

    Args:
        file_list (list[str]): Paths of the CSV files to merge, in order.
        output_csv (str | None): Where to write the merged data as CSV.
            Skipped when falsy.
        output_parquet (str | None): Where to write the merged data as
            Parquet. Skipped when falsy.
        nrows (int | None): Maximum number of data rows read from each file;
            ``None`` reads whole files. Defaults to 5000.

    Returns:
        pd.DataFrame: The merged rows with a fresh 0..n-1 index. Empty when
        ``file_list`` is empty.
    """
    # Read everything first and concatenate once; growing a DataFrame with
    # concat inside the loop copies the accumulated data on every iteration.
    frames = [pd.read_csv(file, nrows=nrows) for file in file_list]

    # pd.concat raises on an empty list, so fall back to an empty frame.
    merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if output_csv:
        merged_df.to_csv(output_csv, index=False)
    if output_parquet:
        merged_df.to_parquet(output_parquet, index=False)

    return merged_df


# Subset of the daily capture files to merge. The paths are built with
# os.path.join so the cell also works where "\\" is not the path separator;
# on Windows the resulting strings are the same as before.
file_list = [
    os.path.join("datasets", nombre)
    for nombre in (
        "02-14-2018.csv",
        "02-15-2018.csv",
        "02-16-2018.csv",
        "02-21-2018.csv",
        "02-22-2018.csv",
    )
]
output_csv = "merged_output.csv"
output_parquet = "merged_output.parquet"
merge_csv_files(file_list, output_csv, output_parquet)


In [29]:
# Load the CSV named by RESULT_CSV, treating every marker in NA_VALUES as missing,
# then print the column/dtype summary.
# NOTE(review): RESULT_CSV comes from setup.json while the merge cell above uses
# "merged_output.csv" — confirm both refer to the same file.
df = pd.read_csv(RESULT_CSV, na_values=NA_VALUES, low_memory=False)
df.info()
def contar_y_listar_valores_unicos(df, columna_labels):
    """
    Report how many distinct values a DataFrame column holds and return them.

    The count is printed to standard output; only the values are returned.

    Args:
        df (pd.DataFrame): Frame that contains the column.
        columna_labels (str): Name of the column to inspect.

    Returns:
        The result of ``Series.unique()`` for that column, i.e. the distinct
        values in order of first appearance.
    """
    distintos = df[columna_labels].unique()
    total = len(distintos)
    print(f"Número de valores únicos en la columna '{columna_labels}': {total}")
    return distintos

# Collect the distinct values of the 'Label' column as a plain Python list and show them.
valores_unicos = list(contar_y_listar_valores_unicos(df, 'Label'))
print("Valores únicos en la columna 'Label':", valores_unicos)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 81 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dst Port           25000 non-null  int64  
 1   Protocol           25000 non-null  int64  
 2   Timestamp          25000 non-null  object 
 3   Flow Duration      25000 non-null  int64  
 4   Tot Fwd Pkts       25000 non-null  int64  
 5   Tot Bwd Pkts       25000 non-null  int64  
 6   TotLen Fwd Pkts    25000 non-null  int64  
 7   TotLen Bwd Pkts    25000 non-null  int64  
 8   Fwd Pkt Len Max    25000 non-null  int64  
 9   Fwd Pkt Len Min    25000 non-null  int64  
 10  Fwd Pkt Len Mean   25000 non-null  float64
 11  Fwd Pkt Len Std    25000 non-null  float64
 12  Bwd Pkt Len Max    25000 non-null  int64  
 13  Bwd Pkt Len Min    25000 non-null  int64  
 14  Bwd Pkt Len Mean   25000 non-null  float64
 15  Bwd Pkt Len Std    25000 non-null  float64
 16  Flow Byts/s        249

In [30]:
# Turn the collection of labels into a label -> position lookup.
# Note that this rebinds `valores_unicos` from a sequence to a dict.
valores_unicos = dict(zip(valores_unicos, range(len(valores_unicos))))
# Encode every row's label as its integer position.
df['Label_index'] = df['Label'].map(valores_unicos)
print(df['Label_index'])


0        0
1        0
2        0
3        0
4        0
        ..
24995    0
24996    0
24997    0
24998    0
24999    0
Name: Label_index, Length: 25000, dtype: int64


In [33]:
# Save as CSV (without the index column)
df.to_csv(output_csv, index=False)
print(f"Archivo CSV guardado como {output_csv}")

# Save as Parquet (without the index)
df.to_parquet(output_parquet, index=False)
print(f"Archivo Parquet guardado como {output_parquet}")


Archivo CSV guardado como merged_output.csv
Archivo Parquet guardado como merged_output.parquet


In [34]:
# Sanity check: list the distinct integer codes stored in 'Label_index'.
contar_y_listar_valores_unicos(df, 'Label_index')

Número de valores únicos en la columna 'Label_index': 9


array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [26]:
import pandas as pd

# Walk over every column and tally the Python types of its values.
# Columns mixing several types get the full tally printed; the others get a
# one-line summary. A separator line follows each column either way.
for col in df.columns:
    tipos = df[col].apply(type).value_counts()
    if len(tipos) <= 1:
        print(f"Columna: {col} tiene un solo tipo de dato: {tipos.index[0]}")
    else:
        print(f"Columna: {col}")
        print(tipos)
    print("-" * 30)


Columna: Dst Port tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Protocol tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Timestamp tiene un solo tipo de dato: <class 'str'>
------------------------------
Columna: Flow Duration tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Tot Fwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Tot Bwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: TotLen Fwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: TotLen Bwd Pkts tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Fwd Pkt Len Max tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Fwd Pkt Len Min tiene un solo tipo de dato: <class 'int'>
------------------------------
Columna: Fwd Pkt Len Mean tiene un solo tipo de 

In [27]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label_index
0,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320730.0,114.551299,56320814,56320652,112641466,56320730.0,114.551299,56320814,56320652,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign,0
1,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026634,56319310.0,301.934596,56319525,56319098,112638623,56319310.0,301.934596,56319525,56319098,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026634,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign,0
2,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,82.6,196.741237,976,0,227.3,371.677892,544.161528,3.873587,268915.2,247443.778966,673900,22,6453966,460997.6,123109.423588,673900,229740,5637902,626433.555556,455082.214224,1167293,554,0,0,0,0,488,328,2.324152,1.549435,0,976,135.076923,277.83476,77192.153846,0,0,0,1,0,0,0,0,0,140.48,82.6,227.3,0,0,0,0,0,0,15,1239,10,2273,65535,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign,0
3,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,81.642857,203.745545,976,0,200.818182,362.249864,380.733175,2.839597,366836.1,511356.609733,1928102,21,8804066,677235.8,532416.970959,1928102,246924,7715481,771548.1,755543.082717,2174893,90,0,0,0,0,456,360,1.590174,1.249423,0,976,128.923077,279.763032,78267.353846,0,0,0,1,0,0,0,0,0,134.08,81.642857,200.818182,0,0,0,0,0,0,14,1143,11,2209,5808,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign,0
4,22,6,14/02/2018 08:40:31,6989341,16,12,1239,2273,744,0,77.4375,190.831154,976,0,189.416667,347.642569,502.479418,4.0061,258864.5,291724.147911,951098,20,6989341,465956.1,244363.896416,951098,265831,5980598,543690.727273,460713.519752,1254338,78,0,0,0,0,332,252,2.2892,1.7169,0,976,121.103448,265.708668,70601.096059,0,0,0,1,0,0,0,0,0,125.428571,77.4375,189.416667,0,0,0,0,0,0,16,1239,12,2273,5808,234,7,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign,0
