## Importación de librerías

In [2]:
%pip install skimpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import gc
import pandas as pd
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from skimpy import skim
from sklearn.model_selection import train_test_split

## Configuración de

In [4]:
BASE_DIR = os.getcwd() # Or specify a fixed base path if needed
DATASET_DIR = os.path.join(BASE_DIR, "datasets") # Input directory
CHUNK_SIZE = 100000 # Adjust chunk size based on available memory and file characteristics
NA_VALUES = ['N/a', 'na', 'Na', 'NA', 'NAN', 'Nan', 'NaN', np.nan] # Consistent NA values
EXPECTED_DTYPES = None
final_dataframe = pd.DataFrame() # Initialize an empty DataFrame for final output

In [5]:
pd.set_option("display.max_columns", 80)

## Cargar los datasets

In [6]:
first_chunk_overall = True # Flag to control header writing in the output file

print(f"Searching for CSV files in: {DATASET_DIR}")
csv_files = glob(os.path.join(DATASET_DIR, "*.csv")) # Find all CSV files
file_count = len(csv_files)
print(f"Found {file_count} files to process.")

if not csv_files:
    print("No CSV files found. Exiting.")
else:
    # Iterate through each found CSV file
    for i, file_path_str in enumerate(csv_files):
        file_path = Path(file_path_str)
        print(f"\nProcessing file {i+1}/{file_count}: {file_path.name}...")

        file_df = pd.DataFrame() # Initialize an empty DataFrame for the current file

        try:
            # Read the CSV file chunk by chunk using an iterator
            chunk_iterator = pd.read_csv(file_path,sep=',',low_memory=False, na_values=NA_VALUES,chunksize=CHUNK_SIZE,dtype=EXPECTED_DTYPES,iterator=True)
            processed_chunk_count = 0
            
            # Process each chunk from the iterator
            for chunk in chunk_iterator:
                processed_chunk_count += 1
                print(f"  Processing chunk {processed_chunk_count} from {file_path.name} (shape: {chunk.shape})...")

                # Drop the 'Unnamed: 0' column if it exists in this chunk
                if 'Unnamed: 0' in chunk.columns:
                    chunk.drop(['Unnamed: 0'], axis=1, inplace=True)

                # Determine write mode ('w' for first chunk ever, 'a' for subsequent)
                write_mode = 'w' if first_chunk_overall else 'a'
                # Include header only for the very first chunk written
                include_header = first_chunk_overall

                # Append the processed chunk to the df
                file_df = pd.concat([file_df, chunk], ignore_index=True)

                # Explicitly delete the chunk variable to potentially free memory faster
                del chunk

            if processed_chunk_count == 0:
                 print(f"  Warning: No data chunks were processed for {file_path.name}. The file might be empty or only contain headers.")

            final_dataframe = pd.concat([final_dataframe, file_df], ignore_index=True) # Concatenate the current file's data to the final DataFrame 
        except pd.errors.EmptyDataError:
            print(f"  Warning: File {file_path.name} is empty. Skipping.")
        except ValueError as ve:
             print(f"  Error processing {file_path.name}: {ve}")
             print(f"  This might be due to incorrect data matching EXPECTED_DTYPES if specified.")
        except Exception as e:
            print(f"  An unexpected error occurred while processing {file_path.name}: {e}")

    print(f"\nFinished processing all files.")

Searching for CSV files in: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets
Found 10 files to process.

Processing file 1/10: 02-14-2018.csv...
  Processing chunk 1 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 2 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 3 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 4 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 5 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 6 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 7 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 8 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 9 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 10 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 11 from 02-14-2018.csv (shape: (46298, 80))...

Processing file 2/10: 02-15-2018.csv...
  Processing chunk 1 from 02-15-2018.csv (shape: (100000, 80))...
  Proc

In [7]:
print(f"Final DataFrame shape: {final_dataframe.shape}")
print(final_dataframe['Label'].unique()) # Display unique labels in the final DataFrame

Final DataFrame shape: (3138526, 80)
['Benign' 'FTP-BruteForce' 'SSH-Bruteforce' 'DoS attacks-GoldenEye'
 'DoS attacks-Slowloris' 'DoS attacks-SlowHTTPTest' 'DoS attacks-Hulk']


## Análisis del dataframe

In [12]:
import pandas as pd

def summarizeDataset(df: pd.DataFrame, chunk_size=10000):
    """
    Muestra un resumen del DataFrame procesándolo en chunks para evitar problemas de memoria.

    Args:
        df (pd.DataFrame): El DataFrame a analizar.
        chunk_size (int): Número de filas por chunk a procesar.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: La entrada debe ser un DataFrame de pandas.")
        return

    print("--- Resumen del Dataset ---\n")

    # Procesar por chunks para evitar MemoryError
    num_chunks = len(df) // chunk_size + 1
    
    for i in range(num_chunks):
        chunk = df.iloc[i*chunk_size : (i+1)*chunk_size]

        print(f"Chunk {i+1}/{num_chunks} (Filas {i*chunk_size} a {(i+1)*chunk_size}):\n")
        
        for column in chunk.columns:
            print(f"'{column}': {chunk[column].map(type).value_counts(normalize=True) * 100}")
        
        print("\n" + "-"*40 + "\n")

# Ejemplo de uso
# df = pd.read_csv("archivo_grande.csv")  # Cargar un dataset grande
# summarizeDataset(df, chunk_size=50000)  # Procesarlo por partes




In [13]:
summarizeDataset(final_dataframe)

--- Resumen del Dataset ---

Chunk 1/314 (Filas 0 a 10000):

'Dst Port': Dst Port
<class 'float'>    100.0
Name: proportion, dtype: float64
'Protocol': Protocol
<class 'float'>    100.0
Name: proportion, dtype: float64
'Timestamp': Timestamp
<class 'str'>    100.0
Name: proportion, dtype: float64
'Flow Duration': Flow Duration
<class 'float'>    100.0
Name: proportion, dtype: float64
'Tot Fwd Pkts': Tot Fwd Pkts
<class 'float'>    100.0
Name: proportion, dtype: float64
'Tot Bwd Pkts': Tot Bwd Pkts
<class 'float'>    100.0
Name: proportion, dtype: float64
'TotLen Fwd Pkts': TotLen Fwd Pkts
<class 'float'>    100.0
Name: proportion, dtype: float64
'TotLen Bwd Pkts': TotLen Bwd Pkts
<class 'float'>    100.0
Name: proportion, dtype: float64
'Fwd Pkt Len Max': Fwd Pkt Len Max
<class 'float'>    100.0
Name: proportion, dtype: float64
'Fwd Pkt Len Min': Fwd Pkt Len Min
<class 'float'>    100.0
Name: proportion, dtype: float64
'Fwd Pkt Len Mean': Fwd Pkt Len Mean
<class 'float'>    100.0
Name:

In [None]:
skim(final_dataframe)

MemoryError: Unable to allocate 5.26 GiB for an array with shape (80, 8829012) and data type object