## Importación de librerías

In [4]:
import gc
import pandas as pd
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

## Configuración

In [5]:
BASE_DIR = os.getcwd() # Or specify a fixed base path if needed
DATASET_DIR = os.path.join(BASE_DIR, "datasets") # Input directory
CHUNK_SIZE = 100000 # Adjust chunk size based on available memory and file characteristics
NA_VALUES = ['N/a', 'na', 'Na', 'NA', 'NAN', 'Nan', 'NaN', np.nan] # Consistent NA values
EXPECTED_DTYPES = None
final_dataframe = pd.DataFrame() # Initialize an empty DataFrame for final output

In [6]:
pd.set_option("display.max_columns", 80)

## Cargar los datasets

In [None]:
first_chunk_overall = True # Flag to control header writing in the output file

print(f"Searching for CSV files in: {DATASET_DIR}")
csv_files = glob(os.path.join(DATASET_DIR, "*.csv")) # Find all CSV files
file_count = len(csv_files)
print(f"Found {file_count} files to process.")

if not csv_files:
    print("No CSV files found. Exiting.")
else:
    # Iterate through each found CSV file
    for i, file_path_str in enumerate(csv_files):
        file_path = Path(file_path_str)
        print(f"\nProcessing file {i+1}/{file_count}: {file_path.name}...")

        file_df = pd.DataFrame() # Initialize an empty DataFrame for the current file

        try:
            # Read the CSV file chunk by chunk using an iterator
            chunk_iterator = pd.read_csv(file_path,sep=',',low_memory=False, na_values=NA_VALUES,chunksize=CHUNK_SIZE,dtype=EXPECTED_DTYPES,iterator=True)
            processed_chunk_count = 0
            
            # Process each chunk from the iterator
            for chunk in chunk_iterator:
                processed_chunk_count += 1
                print(f"  Processing chunk {processed_chunk_count} from {file_path.name} (shape: {chunk.shape})...")

                # Drop the 'Unnamed: 0' column if it exists in this chunk
                if 'Unnamed: 0' in chunk.columns:
                    chunk.drop(['Unnamed: 0'], axis=1, inplace=True)

                # Determine write mode ('w' for first chunk ever, 'a' for subsequent)
                write_mode = 'w' if first_chunk_overall else 'a'
                # Include header only for the very first chunk written
                include_header = first_chunk_overall

                # Append the processed chunk to the df
                file_df = pd.concat([file_df, chunk], ignore_index=True)

                # Explicitly delete the chunk variable to potentially free memory faster
                del chunk

            if processed_chunk_count == 0:
                 print(f"  Warning: No data chunks were processed for {file_path.name}. The file might be empty or only contain headers.")

            final_dataframe = pd.concat([final_dataframe, file_df], ignore_index=True) # Concatenate the current file's data to the final DataFrame 
        except pd.errors.EmptyDataError:
            print(f"  Warning: File {file_path.name} is empty. Skipping.")
        except ValueError as ve:
             print(f"  Error processing {file_path.name}: {ve}")
             print(f"  This might be due to incorrect data matching EXPECTED_DTYPES if specified.")
        except Exception as e:
            print(f"  An unexpected error occurred while processing {file_path.name}: {e}")

    print(f"\nFinished processing all files.")

Searching for CSV files in: c:\Users\isard\Desktop\AI-for-Traffic-network-classify\datasets
Found 10 files to process.

Processing file 1/10: 02-14-2018.csv...
  Processing chunk 1 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 2 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 3 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 4 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 5 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 6 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 7 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 8 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 9 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 10 from 02-14-2018.csv (shape: (100000, 80))...
  Processing chunk 11 from 02-14-2018.csv (shape: (48575, 80))...

Processing file 2/10: 02-15-2018.csv...
  Processing chunk 1 from 02-15-2018.csv (shape: (0, 80))...


  final_dataframe = pd.concat([final_dataframe, file_df], ignore_index=True) # Concatenate the current file's data to the final DataFrame



Processing file 3/10: 02-16-2018.csv...
  Processing chunk 1 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 2 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 3 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 4 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 5 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 6 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 7 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 8 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 9 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 10 from 02-16-2018.csv (shape: (100000, 80))...
  Processing chunk 11 from 02-16-2018.csv (shape: (48575, 80))...

Processing file 4/10: 02-20-2018.csv...
  Processing chunk 1 from 02-20-2018.csv (shape: (100000, 84))...
  Processing chunk 2 from 02-20-2018.csv (shape: (100000, 84))...
  Processing chunk 3 from 02-20-2018.csv (shape: (100000, 8

In [None]:
print(f"Final DataFrame shape: {final_dataframe.shape}")
print(final_dataframe['Label'].unique()) # Display unique labels in the final DataFrame

In [10]:
def summarizeDataset(df: pd.DataFrame):
    """
    Muestra las características principales y un resumen de un DataFrame de pandas.

    Args:
        df (pd.DataFrame): El DataFrame a analizar.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: La entrada debe ser un DataFrame de pandas.")
        return

    # Iteramos sobre todas las columnas
    for column in df.columns:
        print(f"´{column}: {df[column].value_counts(normalize= True, dropna = False)}")

    print("--- Resumen del Dataset ---")
    print("\n")

summarizeDataset(final_dataframe)

´Dst Port: Dst Port
80       2.269409e-01
21       1.372798e-01
53       1.086133e-01
22       7.783097e-02
443      5.115483e-02
             ...     
45671    4.118150e-07
39545    4.118150e-07
3129     4.118150e-07
58795    4.118150e-07
59495    4.118150e-07
Name: proportion, Length: 48812, dtype: float64
´Protocol: Protocol
6           0.785573
17          0.112938
6           0.075369
17          0.018205
0           0.006308
0           0.001596
Protocol    0.000011
Name: proportion, dtype: float64
´Timestamp: Timestamp
16/02/2018 01:45:28    3.460481e-03
16/02/2018 01:45:29    3.378942e-03
16/02/2018 01:45:30    3.317581e-03
16/02/2018 01:45:32    3.316758e-03
16/02/2018 01:45:31    3.300285e-03
                           ...     
01/03/2018 12:53:41    4.118150e-07
01/03/2018 12:53:54    4.118150e-07
01/03/2018 09:30:33    4.118150e-07
01/03/2018 09:31:54    4.118150e-07
01/03/2018 09:32:00    4.118150e-07
Name: proportion, Length: 67289, dtype: float64
´Flow Duration: Flow Dur

In [11]:
skim(final_dataframe)

NameError: name 'skim' is not defined