In [2]:
# Preprocessing for network traffic EVSE-B (all files in the directory)

import pandas as pd
import os
import re

# Base directory for CSV files
# NOTE(review): this is a raw string, so every `\\` below is two literal
# backslashes in the path — presumably tolerated by Windows; confirm.
base_dir = r"J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-B\\csv"

# Prefix carried by the files the preprocessing step writes into base_dir.
_OUTPUT_PREFIX = 'Preprocessed_'

# Collect every raw CSV in the directory together with its attack-type label.
# Preprocessed outputs are saved next to their inputs, so files that already
# carry the output prefix are skipped; otherwise a second run would treat
# them as inputs and emit "Preprocessed_Preprocessed_..." copies.
file_paths = []
for file_name in os.listdir(base_dir):
    if not file_name.endswith('.csv') or file_name.startswith(_OUTPUT_PREFIX):
        continue
    # Extract attack type from file name: drop the "EVSE-B-<state>-" prefix
    # and the extension, then turn the dashed remainder into Title Case.
    attack_type = re.sub(r'EVSE-B-(charging|idle)-', '', file_name).replace('.csv', '').replace('-', ' ').title()
    file_paths.append({"file": os.path.join(base_dir, file_name), "attack_type": attack_type})

# Preprocessing function
def preprocess_network_traffic(file_path, attack_type):
    """Add count/time features to one traffic CSV and save the result.

    The file is read with pandas, its columns are renamed by position to
    ``time``, ``counts``, ``unit``, ``events`` (any further columns become
    ``extra_col_5``, ``extra_col_6``, ...), an ``attack_type`` column is
    added, and four derived columns are computed:

    * ``rolling_avg_counts`` - 10-row rolling mean of ``counts``
      (0 until the window is full),
    * ``counts_derivative`` - row-to-row difference of ``counts``,
    * ``time_diff`` - row-to-row difference of ``time``,
    * ``packets_per_second`` - ``counts`` divided by ``time_diff``, with a
      zero ``time_diff`` treated as 1 to avoid division by zero.

    The result is written next to the input as ``Preprocessed_<file name>``.

    Args:
        file_path: Path of the CSV file to process.
        attack_type: Label stored in every row of the ``attack_type`` column.

    Returns:
        The path of the written file, or ``None`` when the file was skipped
        (empty, or fewer than four columns) or processing failed.

    Errors are reported with ``print`` and never re-raised, so one bad file
    does not stop a batch run.
    """
    try:
        # Read with the default comma separator (no delimiter sniffing is
        # requested); malformed rows are dropped instead of aborting the file.
        df = pd.read_csv(file_path, engine='python', on_bad_lines='skip')

        # Ensure the file has data
        if df.empty:
            print(f'Skipping empty file: {file_path}')
            return None

        # Show what was parsed so the column layout can be checked by eye
        print(f"Columns detected in {file_path}: {df.columns.tolist()}")
        print(df.head())

        # A single column means the comma split found nothing to split on;
        # retry by splitting every row on whitespace.
        if len(df.columns) == 1:
            only_col = df.iloc[:, 0]
            if pd.api.types.is_numeric_dtype(only_col):
                # The .str accessor rejects numeric data; cast first so such
                # a file reaches the column-count check below instead of
                # surfacing as an AttributeError.
                only_col = only_col.astype(str)
            df = only_col.str.split(expand=True)

        # The feature columns below need at least the four leading columns
        num_cols = len(df.columns)
        if num_cols < 4:
            print(f"Unexpected column structure in {file_path}, skipping.")
            return None

        # NOTE(review): the rename is purely positional and discards whatever
        # header names the file had. If the CSV already has a named header
        # (e.g. id, expiration_id, src_ip, ...), the first two columns become
        # 'time' and 'counts' regardless of their meaning - confirm that this
        # mapping is intended for these files.
        df.columns = ['time', 'counts', 'unit', 'events'] + [f'extra_col_{i}' for i in range(5, num_cols + 1)]

        # Convert 'time' and 'counts' to numeric; unparseable values become NaN
        df['time'] = pd.to_numeric(df['time'], errors='coerce')
        df['counts'] = pd.to_numeric(df['counts'], errors='coerce')

        # Add attack type column
        df['attack_type'] = attack_type

        # Feature engineering: rolling averages and derivatives
        df['rolling_avg_counts'] = df['counts'].rolling(window=10).mean().fillna(0)
        df['counts_derivative'] = df['counts'].diff().fillna(0)

        # Time-based features; a zero gap is replaced by 1 before dividing
        df['time_diff'] = df['time'].diff().fillna(0)
        df['packets_per_second'] = df['counts'] / df['time_diff'].replace(0, 1)

        # Save preprocessed data to the same location
        output_file = os.path.join(os.path.dirname(file_path), 'Preprocessed_' + os.path.basename(file_path))
        df.to_csv(output_file, index=False)
        print(f"Preprocessed file saved to: {output_file}")
        return output_file
    except Exception as e:
        # Batch boundary: report and carry on with the next file
        print(f'Error processing {file_path}: {e}')
        return None

# Run the preprocessing step once for each discovered CSV, passing along the
# attack-type label that was derived from its file name.
for entry in file_paths:
    csv_path = entry['file']
    label = entry['attack_type']
    preprocess_network_traffic(csv_path, label)


Columns detected in J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-B\\csv\EVSE-B-charging-aggressive-scan.csv: ['id', 'expiration_id', 'src_ip', 'src_mac', 'src_oui', 'src_port', 'dst_ip', 'dst_mac', 'dst_oui', 'dst_port', 'protocol', 'ip_version', 'vlan_id', 'tunnel_id', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps', 'src2dst_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', 'b