In [None]:
# Scratch cell: a bare string expression, so the notebook simply echoes the path.
# The value is not assigned to any name. Because this is a raw string, each
# doubled "\\" below is two literal backslash characters, not one.
r"J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Host Events\\Individual Files"

In [None]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Base directory for CSV files.
# The default is a raw string with single backslashes: inside r"..." a doubled
# "\\" would be two literal backslash characters. Set the environment variable
# CICEVSE_HOST_EVENTS_DIR to run the notebook against a different location.
base_dir = os.environ.get(
    "CICEVSE_HOST_EVENTS_DIR",
    r"J:\R JEEVAN\B.TECH CYS\4th Sem\Machine Learning\ML Project\CICEVSE2024_Dataset\Host Events\Individual Files",
)
output_dir = os.path.join(base_dir, 'preprocessed')


def attack_type_from_filename(file_name):
    """Derive a human-readable attack label from a capture file name.

    The extension is dropped, the ``EVSE-B-charging-`` / ``EVSE-B-idle-``
    prefix is removed (matched case-insensitively, because the files on disk
    use upper case, e.g. ``EVSE-B-CHARGING-AGRESSIVE-SCAN.csv``), dashes become
    spaces and the result is title-cased.

    Parameters
    ----------
    file_name : str
        Base name of the file, e.g. ``"EVSE-B-idle-syn-flood.csv"``.

    Returns
    -------
    str
        The label, e.g. ``"Syn Flood"``.
    """
    # splitext removes only the trailing extension, whatever its case.
    stem = os.path.splitext(file_name)[0]
    stem = re.sub(r'EVSE-B-(charging|idle)-', '', stem, flags=re.IGNORECASE)
    return stem.replace('-', ' ').title()


# Get all CSV files in the directory
file_paths = []
if os.path.isdir(base_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Sorted so the processing order is the same on every run.
    for file_name in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, file_name)
        if file_name.lower().endswith('.csv') and os.path.isfile(full_path):
            file_paths.append({"file": full_path, "attack_type": attack_type_from_filename(file_name)})
else:
    # Do not create the data directory implicitly: a missing directory means a
    # wrong path, and silently listing a freshly created empty folder hides that.
    print(f"Data directory not found: {base_dir}")

# Preprocess and save each file separately
def preprocess_file(file_path, attack_type, out_dir=None, sep=',', rolling_window=10):
    """Clean one raw host-event file, derive features and save the result as CSV.

    The file is read without a header row and its columns are renamed
    ``col_1 .. col_N``. The first four columns (when present) are copied to
    ``time``, ``counts``, ``unit`` and ``events``. Derived features are
    ``time_diff``, ``packets_per_second``, ``rolling_avg_counts`` and
    ``counts_derivative``. ``unit`` and ``events`` are label-encoded, NaN and
    infinite values are replaced by 0, and the numeric features are
    standardized. The result is written as ``Preprocessed_<original name>``.

    Caveat: a new encoder and scaler are fitted on every call, so category
    codes and scaled values are not comparable between output files.

    Parameters
    ----------
    file_path : str
        Path of the raw file to read.
    attack_type : str
        Label stored in the ``attack_type`` column of every row.
    out_dir : str, optional
        Directory for the output file. Defaults to the notebook-level
        ``output_dir``. Created if it does not exist.
    sep : str, optional
        Field separator handed to ``pandas.read_csv``. Defaults to ``','``.
    rolling_window : int, optional
        Window length of the rolling mean over ``counts``. Defaults to 10.

    Returns
    -------
    str or None
        Path of the written file, or ``None`` if processing failed. The error
        is printed, not raised, so one bad file does not stop a batch run.
    """
    try:
        if out_dir is None:
            out_dir = output_dir
        os.makedirs(out_dir, exist_ok=True)

        # No header row is expected; anything after '#' is treated as a comment
        # and malformed rows are skipped. The separator is NOT auto-detected.
        # NOTE(review): the head() output recorded in this notebook looks like
        # whitespace-aligned columns whose counts contain thousands separators;
        # with sep=',' such counts would be split across columns. Confirm
        # against a raw file and pass sep=r'\s+' if that is the case.
        df = pd.read_csv(file_path, sep=sep, comment='#', header=None, on_bad_lines='skip')

        # Log detected columns
        print(f"Raw data from {file_path}:")
        print(df.head())

        # Give the positional columns stable names
        col_count = len(df.columns)
        df.columns = [f'col_{i+1}' for i in range(col_count)]

        # Copy the leading columns into named, typed columns
        df['attack_type'] = attack_type
        if col_count >= 1:
            df['time'] = pd.to_numeric(df['col_1'], errors='coerce')
        if col_count >= 2:
            # Strip thousands separators before converting; literal match, no regex needed.
            counts_text = df['col_2'].astype(str).str.replace(',', '', regex=False)
            df['counts'] = pd.to_numeric(counts_text, errors='coerce')
        if col_count >= 3:
            df['unit'] = df['col_3'].astype(str)
        if col_count >= 4:
            df['events'] = df['col_4'].astype(str)

        # Feature engineering
        if 'time' in df.columns:
            df['time_diff'] = df['time'].diff().fillna(0)
        if 'counts' in df.columns and 'time_diff' in df.columns:
            # A zero time step is replaced by 1 to avoid division by zero.
            df['packets_per_second'] = df['counts'] / df['time_diff'].replace(0, 1)
            # Rows before the first full window have no mean and are filled with 0.
            df['rolling_avg_counts'] = df['counts'].rolling(window=rolling_window).mean().fillna(0)
            df['counts_derivative'] = df['counts'].diff().fillna(0)

        # Encode categorical features (fitted on this file only)
        for col in ['unit', 'events']:
            if col in df.columns:
                df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        # Handle NaN and infinite values
        df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

        # Scale numerical features (fitted on this file only)
        numerical_columns = ['counts', 'time_diff', 'rolling_avg_counts', 'counts_derivative', 'packets_per_second']
        numerical_columns = [col for col in numerical_columns if col in df.columns]
        if numerical_columns:
            df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])

        # Save preprocessed file
        output_file = os.path.join(out_dir, 'Preprocessed_' + os.path.basename(file_path))
        df.to_csv(output_file, index=False)

        print(df.head())
        print(f"Preprocessed file saved to: {output_file}")
        return output_file
    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller
        # continue with the next file. The None return marks the failure.
        print(f"Error processing {file_path}: {e}")
        return None

# Process each file, remembering the ones that failed so the summary is accurate
failed_files = []
for file_info in file_paths:
    if preprocess_file(file_info['file'], file_info['attack_type']) is None:
        failed_files.append(file_info['file'])

if failed_files:
    print(f"{len(failed_files)} of {len(file_paths)} file(s) failed preprocessing:")
    for failed_file in failed_files:
        print(f"  {failed_file}")
elif file_paths:
    print("All files processed and saved separately!")
else:
    print("No CSV files found; nothing was processed.")


Raw data from J:\\R JEEVAN\\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Host Events\\Individual Files\EVSE-B-CHARGING-AGRESSIVE-SCAN.csv:
                             0    1  \
0       5.004696918         39  880   
1       5.004696918         90  371   
2       5.004696918        104  470   
3       5.004696918         11  927   
4       5.004696918         76  515   

                                                   2  
0  929      branch-misses                        ...  
1                881      cache-misses                
2                368      L1-dcache-load-misses       
3                733      L1-dcache-store-misses      
4                103      L1-icache-load-misses       
                         col_1 col_2  \
0       5.004696918         39   880   
1       5.004696918         90   371   
2       5.004696918        104   470   
3       5.004696918         11   927   
4       5.004696918         76   515   

                            