In [1]:
import pandas as pd

In [37]:
#Standard Deviation Per Column

import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = os.path.join(os.getcwd(), 'CIC')

# Number of rows per chunk (constant, so it is defined once instead of once per file)
chunksize = 10000


def accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares):
    """Fold one DataFrame chunk into the per-column running totals.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One chunk as yielded by ``pd.read_csv(..., chunksize=...)``.
    num_chunks : dict
        Column name -> number of non-null values seen so far (despite the
        name, this counts values, not chunks). Updated in place.
    partial_sum : dict
        Column name -> running sum of the values. Updated in place.
    partial_sum_squares : dict
        Column name -> running sum of the squared values. Updated in place.

    Non-numeric columns are skipped.
    """
    for col in chunk.columns:
        values = chunk[col]

        # Skip non-numeric columns
        if not pd.api.types.is_numeric_dtype(values):
            continue

        # Initialize counters for new columns
        if col not in partial_sum:
            partial_sum[col] = 0
            partial_sum_squares[col] = 0
            num_chunks[col] = 0

        partial_sum[col] += values.sum()
        # Square in float64: squaring an int64 column wraps around silently
        # once a value exceeds roughly 3e9.
        partial_sum_squares[col] += (values.astype('float64') ** 2).sum()
        # count() is the number of non-null entries, matching the NaN-skipping sums above.
        num_chunks[col] += values.count()


def population_stdev(count, total, total_squares):
    """Return the population standard deviation computed from running totals.

    Uses ``sqrt(E[x^2] - E[x]^2)`` (divides by ``count``, not ``count - 1``).
    Returns ``None`` when fewer than two values were seen.
    """
    if count < 2:
        return None
    mean = total / count
    variance = total_squares / count - mean ** 2
    # Rounding can push the difference slightly below zero for (near-)constant
    # columns; without the clamp the square root would be NaN or complex.
    return max(float(variance), 0.0) ** 0.5


# List of all CSV files in the folder, sorted so the processing order is reproducible
if os.path.isdir(folder_path):
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
else:
    print(f'Folder not found: {folder_path}')
    csv_files = []

num_chunks = {}  # Number of non-null values processed per column
partial_sum = {}  # Sum of values for each column
partial_sum_squares = {}  # Sum of squared values for each column

for csv_file in csv_files:
    csv_path = os.path.join(folder_path, csv_file)
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares)

# Calculate the standard deviation for each column from the accumulated totals
stdevs = {}
for col in partial_sum:
    stdev = population_stdev(num_chunks[col], partial_sum[col], partial_sum_squares[col])
    if stdev is not None:  # columns with fewer than two values are left out
        stdevs[col] = stdev

# Print the standard deviation for each numeric column
print("Standard Deviation Per Column")
for col, stdev in stdevs.items():
    print(f'{col}: {stdev}')

Standard Deviation Per Column
flow_duration: 285.03416828033625
Header_Length: 461331.74232421746
Protocol Type: 8.945532822145422
Duration: 14.019187951426009
Rate: 99562.48957037824
Srate: 99562.48957037824
Drate: 0.007250765859210306
fin_flag_number: 0.28120695314897837
syn_flag_number: 0.4053977831154013
rst_flag_number: 0.2869035027063886
psh_flag_number: 0.28293106060457146
ack_flag_number: 0.3289320633770767
ece_flag_number: 0.0012157049878482081
cwr_flag_number: 0.0008533815996231708
ack_count: 0.2864314373830108
syn_count: 0.6635353923256602
fin_count: 0.32711641412377823
urg_count: 71.8524528598533
rst_count: 325.3846543835321
HTTP: 0.21426078827176917
HTTPS: 0.22817382472631212
DNS: 0.01143078636106617
Telnet: 0.00014635378864648267
SMTP: 0.00025349219238624606
SSH: 0.006397722947079261
IRC: 0.00038721570330887744
TCP: 0.49451845315212895
UDP: 0.4086667606492646
DHCP: 0.0013090269731860019
ARP: 0.008135211368447137
ICMP: 0.3700227254461768
IPv: 0.010614844924732221
LLC: 0.01

In [5]:
#Logic good but no label mappings

import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = os.path.join(os.getcwd(), 'CIC')

# Number of rows per chunk. The previous value of 1,000,000,000 was larger than
# any file, so every file was loaded in a single piece and chunking had no effect.
chunksize = 500000


def accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                     max_values, min_values, value_counts):
    """Fold one DataFrame chunk into the per-column running summaries.

    Every dict is keyed by column name and updated in place:

    - ``num_chunks``: number of non-null values seen (values, not chunks)
    - ``partial_sum`` / ``partial_sum_squares``: running sum and sum of squares
    - ``max_values`` / ``min_values``: running extremes
    - ``value_counts``: Series of value -> frequency, used for the mode

    Non-numeric columns are skipped.
    """
    for col in chunk.columns:
        values = chunk[col]

        # Check for numeric columns
        if not pd.api.types.is_numeric_dtype(values):
            continue

        # Initialize trackers for new columns
        if col not in partial_sum:
            partial_sum[col] = 0
            partial_sum_squares[col] = 0
            num_chunks[col] = 0
            max_values[col] = float('-inf')
            min_values[col] = float('inf')
            value_counts[col] = pd.Series(dtype='float64')

        partial_sum[col] += values.sum()
        # Square in float64: squaring an int64 column wraps around silently
        # once a value exceeds roughly 3e9.
        partial_sum_squares[col] += (values.astype('float64') ** 2).sum()
        num_chunks[col] += values.count()
        # The running value is passed first on purpose: if the chunk's max/min is
        # NaN (all-null column), Python's max/min keeps the running value.
        max_values[col] = max(max_values[col], values.max())
        min_values[col] = min(min_values[col], values.min())
        # NOTE: one entry per distinct value is kept, so this can grow large
        # for continuous columns.
        value_counts[col] = value_counts[col].add(values.value_counts(), fill_value=0)


def build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                        max_values, min_values, value_counts):
    """Turn the running summaries into ``{column: {statistic: value}}``.

    Columns with no non-null values are omitted. The standard deviation is the
    population form ``sqrt(E[x^2] - E[x]^2)`` and is ``None`` when only one
    value was seen.
    """
    summary = {}
    for col in partial_sum:
        count = num_chunks[col]
        if count <= 0:
            continue

        mean = partial_sum[col] / count
        if count > 1:
            variance = partial_sum_squares[col] / count - mean ** 2
            # Rounding can make the difference slightly negative for
            # (near-)constant columns; clamp so the root is not NaN/complex.
            stdev = max(float(variance), 0.0) ** 0.5
        else:
            stdev = None

        counts = value_counts[col]
        # idxmax returns the first index label holding the highest frequency.
        mode = counts.idxmax() if len(counts) else None

        summary[col] = {
            'Mean': mean,
            'Mode': mode,
            'Max': max_values[col],
            'Min': min_values[col],
            'Standard Deviation': stdev,
        }
    return summary


# List of all CSV files in the folder, sorted so the processing order is reproducible
if os.path.isdir(folder_path):
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
else:
    print(f'Folder not found: {folder_path}')
    csv_files = []

# Dictionaries to store intermediate summaries
num_chunks = {}  # Number of non-null values processed per column
partial_sum = {}  # Sum of values for each column
partial_sum_squares = {}  # Sum of squared values for each column
max_values = {}  # Maximum values for each column
min_values = {}  # Minimum values for each column
value_counts = {}  # Value counts for mode calculation

# Process each CSV file in the folder
for csv_file in csv_files:
    csv_path = os.path.join(folder_path, csv_file)
    print(f'Processing file: {csv_file}')

    # Process the file in chunks
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                         max_values, min_values, value_counts)

# Calculate the statistics for each column from the accumulated totals
stats_summary = build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                                    max_values, min_values, value_counts)

# Print the summary statistics for numeric columns
print("\nSummary Statistics Per Column")
for col, stats in stats_summary.items():
    print(f'\n{col}:')
    for stat_name, value in stats.items():
        print(f'  {stat_name}: {value}')


Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

In [12]:
#This one converts the labels from categorical to numerical and also performs the summary statistics on it.

import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = os.path.join(os.getcwd(), 'CIC')

# How many rows are loaded at one time -> 500k seems to run okay with 16 GB of RAM without maxing out.
# A lower number will be slower; this seems like a good balance, at least on my machine.
chunksize = 500000


def encode_labels(labels, label_mapping):
    """Map categorical labels to integer codes, extending the mapping as needed.

    Parameters
    ----------
    labels : pandas.Series
        The label column of one chunk.
    label_mapping : dict
        Label -> integer code. Updated in place: every label not yet present
        receives the next free code, in order of first appearance.

    Returns
    -------
    pandas.Series
        ``labels`` mapped through ``label_mapping``; null labels stay null.
    """
    # The mapping must grow with every chunk. Building it from the first chunk
    # only would map labels that first show up later to NaN.
    for label in labels.dropna().unique():
        if label not in label_mapping:
            label_mapping[label] = len(label_mapping)
    return labels.map(label_mapping)


def accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                     max_values, min_values, value_counts):
    """Fold one DataFrame chunk into the per-column running summaries.

    Every dict is keyed by column name and updated in place:

    - ``num_chunks``: number of non-null values seen (values, not chunks)
    - ``partial_sum`` / ``partial_sum_squares``: running sum and sum of squares
    - ``max_values`` / ``min_values``: running extremes
    - ``value_counts``: Series of value -> frequency, used for the mode

    Non-numeric columns are skipped.
    """
    for col in chunk.columns:
        values = chunk[col]

        # Skip non-numeric columns
        if not pd.api.types.is_numeric_dtype(values):
            continue

        # Initialize trackers for new columns
        if col not in partial_sum:
            partial_sum[col] = 0
            partial_sum_squares[col] = 0
            num_chunks[col] = 0
            max_values[col] = float('-inf')
            min_values[col] = float('inf')
            value_counts[col] = pd.Series(dtype='float64')

        # Accumulate sum and sum of squares for mean and variance calculations
        partial_sum[col] += values.sum()
        # Square in float64: squaring an int64 column wraps around silently
        # once a value exceeds roughly 3e9.
        partial_sum_squares[col] += (values.astype('float64') ** 2).sum()
        # Update the number of non-null values processed
        num_chunks[col] += values.count()
        # The running value is passed first on purpose: if the chunk's max/min is
        # NaN (all-null column), Python's max/min keeps the running value.
        max_values[col] = max(max_values[col], values.max())
        min_values[col] = min(min_values[col], values.min())
        # Update for mode calculation (one entry per distinct value is kept)
        value_counts[col] = value_counts[col].add(values.value_counts(), fill_value=0)


def build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                        max_values, min_values, value_counts):
    """Turn the running summaries into ``{column: {statistic: value}}``.

    Columns with no non-null values are omitted. The standard deviation is the
    population form ``sqrt(E[x^2] - E[x]^2)`` and is 0 when only one value was
    seen.
    """
    summary = {}
    for col in partial_sum:
        count = num_chunks[col]
        if count <= 0:
            continue

        mean = partial_sum[col] / count
        variance = partial_sum_squares[col] / count - mean ** 2 if count > 1 else 0
        # Rounding can make the difference slightly negative for
        # (near-)constant columns; clamp so the root is not NaN/complex.
        stdev = max(float(variance), 0.0) ** 0.5

        counts = value_counts[col]
        # idxmax returns the first index label holding the highest frequency.
        mode = counts.idxmax() if len(counts) else None

        # Store calculated statistics for the current column
        summary[col] = {
            'Mean': mean,
            'Mode': mode,
            'Max': max_values[col],
            'Min': min_values[col],
            'Standard Deviation': stdev,
        }
    return summary


# List of all CSV files in the folder, sorted so the processing order
# (and therefore the label codes) is reproducible
if os.path.isdir(folder_path):
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
else:
    print(f'Folder not found: {folder_path}')
    csv_files = []

# Dictionaries to store intermediate summaries
num_chunks = {}              # Tracks the number of non-null values processed per column
partial_sum = {}             # Accumulates the sum of values per column
partial_sum_squares = {}     # Accumulates the sum of squares of values per column (for variance)
max_values = {}              # Tracks the maximum value encountered per column
min_values = {}              # Tracks the minimum value encountered per column
value_counts = {}            # Stores frequency counts of unique values per column
label_mapping = {}           # Maps categorical labels to numerical values

# Process each CSV file in the folder
for csv_file in csv_files:
    # Get the full path of the current CSV file
    csv_path = os.path.join(folder_path, csv_file)
    # Display progress by printing the current file being processed
    print(f'Processing file: {csv_file}')

    # Read the CSV file in chunks to avoid memory overload
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        # Convert categorical labels to numerical
        if 'label' in chunk.columns:
            chunk['label'] = encode_labels(chunk['label'], label_mapping)

        accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                         max_values, min_values, value_counts)

# Calculate the statistics for each column from the accumulated totals
print("\nCalculating statistics now")
stats_summary = build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                                    max_values, min_values, value_counts)

# Print the summary statistics for numeric columns
print('\nSummary Statistics Per Column')
for col, stats in stats_summary.items():
    print(f'\n{col}:')
    for stat_name, value in stats.items():
        print(f'  {stat_name}: {value}')

# Print the label mapping
print('\nLabel Mapping:')
for label, num in label_mapping.items():
    print(f'  {label} -> {num}')


Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

In [14]:
#This one converts the labels from categorical to numerical and also performs the summary statistics on it.
# Additionally extracts and calculates more features that are commonly used.
import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = os.path.join(os.getcwd(), 'CIC')

# How many rows are loaded at one time -> 500k seems to run okay with 16 GB of RAM without maxing out.
# A lower number will be slower; this seems like a good balance, at least on my machine.
chunksize = 500000

# Small constant added to denominators so a zero duration does not divide by zero
EPSILON = 1e-6

# Full column names as printed in this notebook's earlier per-column output.
# The truncated spellings used before ('flow_dura', 'Header_Le', 'syn_flag_r',
# 'psh_flag', ...) match no column, so the features were silently never created.
FLAG_COLUMNS = ['fin_flag_number', 'syn_flag_number', 'rst_flag_number',
                'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number']
PROTOCOL_COLUMNS = ['HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'TCP', 'UDP']


def encode_labels(labels, label_mapping):
    """Map categorical labels to integer codes, extending the mapping as needed.

    ``label_mapping`` (label -> code) is updated in place: every label not yet
    present receives the next free code, in order of first appearance. Returns
    ``labels`` mapped through it; null labels stay null.
    """
    # The mapping must grow with every chunk. Building it from the first chunk
    # only would map labels that first show up later to NaN.
    for label in labels.dropna().unique():
        if label not in label_mapping:
            label_mapping[label] = len(label_mapping)
    return labels.map(label_mapping)


def add_derived_features(chunk):
    """Add the derived feature columns to ``chunk`` in place and return it.

    Each feature is only added when all of its source columns are present.
    NOTE(review): 'Tot sum', 'Max' and 'Min' are not visible in the notebook's
    (truncated) printed output; confirm they exist in the CSV files.
    """
    present = set(chunk.columns)

    # 1. Packets per Duration
    if {'Tot sum', 'Duration'} <= present:
        chunk['Packets_Duration_Ratio'] = chunk['Tot sum'] / (chunk['Duration'] + EPSILON)

    # 2. Bytes per Flow Duration
    if {'Tot sum', 'flow_duration'} <= present:
        chunk['Bytes_Flow_Ratio'] = chunk['Tot sum'] / (chunk['flow_duration'] + EPSILON)

    # 3. Header Ratio
    if {'Header_Length', 'flow_duration'} <= present:
        chunk['Header_Ratio'] = chunk['Header_Length'] / (chunk['flow_duration'] + EPSILON)

    # 4. TCP Flag Combinations
    if {'syn_flag_number', 'psh_flag_number'} <= present:
        chunk['syn_psh_combo'] = chunk['syn_flag_number'] + chunk['psh_flag_number']

    if {'ack_flag_number', 'rst_flag_number'} <= present:
        chunk['ack_rst_combo'] = chunk['ack_flag_number'] + chunk['rst_flag_number']

    if set(FLAG_COLUMNS) <= present:
        # skipna=False keeps the behavior of adding the columns with "+":
        # a missing flag makes the total missing too.
        chunk['Total_Flags'] = chunk[FLAG_COLUMNS].sum(axis=1, skipna=False)

    # 5. Active Protocol Count
    if set(PROTOCOL_COLUMNS) <= present:
        chunk['Active_Protocol_Count'] = chunk[PROTOCOL_COLUMNS].sum(axis=1)

    # 6. Range for Stats
    if {'Max', 'Min'} <= present:
        chunk['Range'] = chunk['Max'] - chunk['Min']

    return chunk


def accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                     max_values, min_values, value_counts):
    """Fold one DataFrame chunk into the per-column running summaries.

    Every dict is keyed by column name and updated in place:

    - ``num_chunks``: number of non-null values seen (values, not chunks)
    - ``partial_sum`` / ``partial_sum_squares``: running sum and sum of squares
    - ``max_values`` / ``min_values``: running extremes
    - ``value_counts``: Series of value -> frequency, used for the mode

    Non-numeric columns are skipped.
    """
    for col in chunk.columns:
        values = chunk[col]

        # Skip non-numeric columns
        if not pd.api.types.is_numeric_dtype(values):
            continue

        # Initialize column summary trackers if not set
        if col not in partial_sum:
            partial_sum[col] = 0
            partial_sum_squares[col] = 0
            num_chunks[col] = 0
            max_values[col] = float('-inf')
            min_values[col] = float('inf')
            value_counts[col] = pd.Series(dtype='float64')

        partial_sum[col] += values.sum()
        # Square in float64: squaring an int64 column wraps around silently
        # once a value exceeds roughly 3e9.
        partial_sum_squares[col] += (values.astype('float64') ** 2).sum()
        num_chunks[col] += values.count()
        # The running value is passed first on purpose: if the chunk's max/min is
        # NaN (all-null column), Python's max/min keeps the running value.
        max_values[col] = max(max_values[col], values.max())
        min_values[col] = min(min_values[col], values.min())
        # One entry per distinct value is kept, so this can grow large for ratio features.
        value_counts[col] = value_counts[col].add(values.value_counts(), fill_value=0)


def build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                        max_values, min_values, value_counts):
    """Turn the running summaries into ``{column: {statistic: value}}``.

    Columns with no non-null values are omitted. The standard deviation is the
    population form ``sqrt(E[x^2] - E[x]^2)`` and is 0 when only one value was
    seen.
    """
    summary = {}
    for col in partial_sum:
        count = num_chunks[col]
        if count <= 0:
            continue

        mean = partial_sum[col] / count
        variance = partial_sum_squares[col] / count - mean ** 2 if count > 1 else 0
        # Rounding can make the difference slightly negative for
        # (near-)constant columns; clamp so the root is not NaN/complex.
        stdev = max(float(variance), 0.0) ** 0.5

        counts = value_counts[col]
        # idxmax returns the first index label holding the highest frequency.
        mode = counts.idxmax() if len(counts) else None

        # Store calculated statistics for the current column
        summary[col] = {
            'Mean': mean,
            'Mode': mode,
            'Max': max_values[col],
            'Min': min_values[col],
            'Standard Deviation': stdev,
        }
    return summary


# List of all CSV files in the folder, sorted so the processing order
# (and therefore the label codes) is reproducible
if os.path.isdir(folder_path):
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
else:
    print(f'Folder not found: {folder_path}')
    csv_files = []

# Dictionaries to store intermediate summaries
num_chunks = {}              # Tracks the number of non-null values processed per column
partial_sum = {}             # Accumulates the sum of values per column
partial_sum_squares = {}     # Accumulates the sum of squares of values per column (for variance)
max_values = {}              # Tracks the maximum value encountered per column
min_values = {}              # Tracks the minimum value encountered per column
value_counts = {}            # Stores frequency counts of unique values per column
label_mapping = {}           # Maps categorical labels to numerical values

# Process each CSV file in the folder
for csv_file in csv_files:
    csv_path = os.path.join(folder_path, csv_file)
    print(f'Processing file: {csv_file}')

    # Read the CSV file in chunks
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        # Convert categorical labels to numerical
        if 'label' in chunk.columns:
            chunk['label'] = encode_labels(chunk['label'], label_mapping)

        # Feature extraction: compute new features, then summarize every numeric column
        add_derived_features(chunk)
        accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                         max_values, min_values, value_counts)

# Calculate the statistics for each column from the accumulated totals
print("\nCalculating statistics now")
stats_summary = build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                                    max_values, min_values, value_counts)

# Print the summary statistics for numeric columns
print('\nSummary Statistics Per Column')
for col, stats in stats_summary.items():
    print(f'\n{col}:')
    for stat_name, value in stats.items():
        print(f'  {stat_name}: {value}')

# Print the label mapping
print('\nLabel Mapping:')
for label, num in label_mapping.items():
    print(f'  {label} -> {num}')


Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

In [16]:
#This one converts the labels from categorical to numerical and also performs the summary statistics on it.
#additonally extracts and calculates more features that are commonly used
# as well as writing everything to a CSV file.
import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = os.path.join(os.getcwd(), 'CIC')

# How many rows will be loaded at one time -> 500k seems to run okay with 16GB RAM without maxing
chunksize = 500000

# Small constant added to denominators so a zero duration does not divide by zero
EPSILON = 1e-6

# Full column names as printed in this notebook's earlier per-column output.
# The truncated spellings used before ('flow_dura', 'Header_Le', 'syn_flag_r',
# 'psh_flag', ...) match no column, so the features were silently never created.
FLAG_COLUMNS = ['fin_flag_number', 'syn_flag_number', 'rst_flag_number',
                'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number']
PROTOCOL_COLUMNS = ['HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'TCP', 'UDP']


def encode_labels(labels, label_mapping):
    """Map categorical labels to integer codes, extending the mapping as needed.

    ``label_mapping`` (label -> code) is updated in place: every label not yet
    present receives the next free code, in order of first appearance. Returns
    ``labels`` mapped through it; null labels stay null.
    """
    # The mapping must grow with every chunk. Building it from the first chunk
    # only would map labels that first show up later to NaN.
    for label in labels.dropna().unique():
        if label not in label_mapping:
            label_mapping[label] = len(label_mapping)
    return labels.map(label_mapping)


def add_derived_features(chunk):
    """Add the derived feature columns to ``chunk`` in place and return it.

    Each feature is only added when all of its source columns are present.
    NOTE(review): 'Tot sum', 'Max' and 'Min' are not visible in the notebook's
    (truncated) printed output; confirm they exist in the CSV files.
    """
    present = set(chunk.columns)

    # 1. Packets per Duration
    if {'Tot sum', 'Duration'} <= present:
        chunk['Packets_Duration_Ratio'] = chunk['Tot sum'] / (chunk['Duration'] + EPSILON)

    # 2. Bytes per Flow Duration
    if {'Tot sum', 'flow_duration'} <= present:
        chunk['Bytes_Flow_Ratio'] = chunk['Tot sum'] / (chunk['flow_duration'] + EPSILON)

    # 3. Header Ratio
    if {'Header_Length', 'flow_duration'} <= present:
        chunk['Header_Ratio'] = chunk['Header_Length'] / (chunk['flow_duration'] + EPSILON)

    # 4. TCP Flag Combinations
    if {'syn_flag_number', 'psh_flag_number'} <= present:
        chunk['syn_psh_combo'] = chunk['syn_flag_number'] + chunk['psh_flag_number']

    if {'ack_flag_number', 'rst_flag_number'} <= present:
        chunk['ack_rst_combo'] = chunk['ack_flag_number'] + chunk['rst_flag_number']

    if set(FLAG_COLUMNS) <= present:
        # skipna=False keeps the behavior of adding the columns with "+":
        # a missing flag makes the total missing too.
        chunk['Total_Flags'] = chunk[FLAG_COLUMNS].sum(axis=1, skipna=False)

    # 5. Active Protocol Count
    if set(PROTOCOL_COLUMNS) <= present:
        chunk['Active_Protocol_Count'] = chunk[PROTOCOL_COLUMNS].sum(axis=1)

    # 6. Range for Stats
    if {'Max', 'Min'} <= present:
        chunk['Range'] = chunk['Max'] - chunk['Min']

    return chunk


def accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                     max_values, min_values, value_counts):
    """Fold one DataFrame chunk into the per-column running summaries.

    Every dict is keyed by column name and updated in place:

    - ``num_chunks``: number of non-null values seen (values, not chunks)
    - ``partial_sum`` / ``partial_sum_squares``: running sum and sum of squares
    - ``max_values`` / ``min_values``: running extremes
    - ``value_counts``: Series of value -> frequency, used for the mode

    Non-numeric columns are skipped.
    """
    for col in chunk.columns:
        values = chunk[col]

        # Skip non-numeric columns
        if not pd.api.types.is_numeric_dtype(values):
            continue

        # Initialize column summary trackers if not set
        if col not in partial_sum:
            partial_sum[col] = 0
            partial_sum_squares[col] = 0
            num_chunks[col] = 0
            max_values[col] = float('-inf')
            min_values[col] = float('inf')
            value_counts[col] = pd.Series(dtype='float64')

        partial_sum[col] += values.sum()
        # Square in float64: squaring an int64 column wraps around silently
        # once a value exceeds roughly 3e9.
        partial_sum_squares[col] += (values.astype('float64') ** 2).sum()
        num_chunks[col] += values.count()
        # The running value is passed first on purpose: if the chunk's max/min is
        # NaN (all-null column), Python's max/min keeps the running value.
        max_values[col] = max(max_values[col], values.max())
        min_values[col] = min(min_values[col], values.min())
        # One entry per distinct value is kept, so this can grow large for ratio features.
        value_counts[col] = value_counts[col].add(values.value_counts(), fill_value=0)


def build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                        max_values, min_values, value_counts):
    """Turn the running summaries into ``{column: {statistic: value}}``.

    Columns with no non-null values are omitted. The standard deviation is the
    population form ``sqrt(E[x^2] - E[x]^2)`` and is 0 when only one value was
    seen. Every entry carries ``'Metric': 'Summary'`` for the CSV export.
    """
    summary = {}
    for col in partial_sum:
        count = num_chunks[col]
        if count <= 0:
            continue

        mean = partial_sum[col] / count
        variance = partial_sum_squares[col] / count - mean ** 2 if count > 1 else 0
        # Rounding can make the difference slightly negative for
        # (near-)constant columns; clamp so the root is not NaN/complex.
        stdev = max(float(variance), 0.0) ** 0.5

        counts = value_counts[col]
        # idxmax returns the first index label holding the highest frequency.
        mode = counts.idxmax() if len(counts) else None

        # Store calculated statistics for the current column
        summary[col] = {
            'Metric': 'Summary',
            'Mean': mean,
            'Mode': mode,
            'Max': max_values[col],
            'Min': min_values[col],
            'Standard Deviation': stdev,
        }
    return summary


def build_output_tables(stats_summary, label_mapping):
    """Build the export tables.

    Returns ``(summary_df, label_mapping_df, combined_df)`` where
    ``combined_df`` is the summary rows followed by the label-mapping rows.
    """
    # Process the summary statistics
    summary_df = (
        pd.DataFrame.from_dict(stats_summary, orient='index')
        .reset_index()
        .rename(columns={'index': 'Feature'})
    )
    summary_df['Group'] = 'Summary'  # Add a group identifier

    # Process the label mapping
    label_mapping_df = pd.DataFrame(list(label_mapping.items()), columns=['Label', 'Mapped_Value'])
    label_mapping_df['Metric'] = 'Label Mapping'
    label_mapping_df['Feature'] = 'label'
    label_mapping_df['Group'] = 'Mapping'

    # No all-None placeholder columns are added for Mean/Mode/...: concat fills
    # columns missing from one frame by itself, and explicit all-NA columns are
    # presumably what triggered the pandas concat warning shown under this cell.
    combined_df = pd.concat([summary_df, label_mapping_df], ignore_index=True)
    return summary_df, label_mapping_df, combined_df


# List of all CSV files in the folder, sorted so the processing order
# (and therefore the label codes) is reproducible
if os.path.isdir(folder_path):
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
else:
    print(f'Folder not found: {folder_path}')
    csv_files = []

# Dictionaries to store intermediate summaries
num_chunks = {}              # Tracks the number of non-null values processed per column
partial_sum = {}             # Accumulates the sum of values per column
partial_sum_squares = {}     # Accumulates the sum of squares of values per column (for variance)
max_values = {}              # Tracks the maximum value encountered per column
min_values = {}              # Tracks the minimum value encountered per column
value_counts = {}            # Stores frequency counts of unique values per column
label_mapping = {}           # Maps categorical labels to numerical values

# Process each CSV file in the folder
for csv_file in csv_files:
    csv_path = os.path.join(folder_path, csv_file)
    print(f'Processing file: {csv_file}')

    # Read the CSV file in chunks
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        # Convert categorical labels to numerical
        if 'label' in chunk.columns:
            chunk['label'] = encode_labels(chunk['label'], label_mapping)

        # Feature extraction: compute new features, then summarize every numeric column
        add_derived_features(chunk)
        accumulate_chunk(chunk, num_chunks, partial_sum, partial_sum_squares,
                         max_values, min_values, value_counts)

# Calculate the statistics for each column from the accumulated totals
print("\nCalculating statistics now")
stats_summary = build_stats_summary(num_chunks, partial_sum, partial_sum_squares,
                                    max_values, min_values, value_counts)

# Combine everything into a single DataFrame
print('\nCombining summary statistics and label mapping into a single CSV...')
summary_df, label_mapping_df, combined_df = build_output_tables(stats_summary, label_mapping)

# Save the combined CSV, unless nothing was processed (avoids overwriting an
# earlier result with an empty file when the input folder is missing or empty)
output_path = os.path.join(os.getcwd(), 'output_combined.csv')
if stats_summary or label_mapping:
    combined_df.to_csv(output_path, index=False)
    print(f"\nCombined output saved to: {output_path}")
else:
    print("\nNo data processed; nothing was written.")
print("Done!")

Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

  combined_df = pd.concat([summary_df, label_mapping_df], ignore_index=True)
