In [1]:
#preprocessing for netwrotk traffic EVSE-A (EVSE-A-charging-Aggressive-scan.csv)

import pandas as pd
import os

# List of network traffic files with attack labels
file_paths = [
    {"file": r"J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv\\EVSE-A-charging-Aggressive-scan.csv", "attack_type": "Aggressive-Scan"}
]

# Preprocessing function
def preprocess_network_traffic(file_path, attack_type):
    try:
        # Load CSV with auto-detection of delimiters
        df = pd.read_csv(file_path, engine='python', on_bad_lines='skip')
        
        # Ensure the file has data
        if df.empty:
            print(f'Skipping empty file: {file_path}')
            return
        
        # Check number of columns
        print(f"Columns detected in {file_path}: {df.columns.tolist()}")
        print(df.head())
        
        # If only one column, assume data is space or tab delimited
        if len(df.columns) == 1:
            df = df.iloc[:, 0].str.split(expand=True)
        
        # Adjust column renaming based on detected structure
        num_cols = len(df.columns)
        if num_cols >= 4:
            df.columns = ['time', 'counts', 'unit', 'events'] + [f'extra_col_{i}' for i in range(5, num_cols + 1)]
        else:
            print(f"Unexpected column structure in {file_path}, skipping.")
            return
        
        # Convert 'time' and 'counts' to numeric
        df['time'] = pd.to_numeric(df['time'], errors='coerce')
        df['counts'] = pd.to_numeric(df['counts'], errors='coerce')
        
        # Add attack type column
        df['attack_type'] = attack_type

        # Feature engineering: rolling averages and derivatives
        df['rolling_avg_counts'] = df['counts'].rolling(window=10).mean().fillna(0)
        df['counts_derivative'] = df['counts'].diff().fillna(0)

        # Time-based features
        df['time_diff'] = df['time'].diff().fillna(0)
        df['packets_per_second'] = df['counts'] / df['time_diff'].replace(0, 1)

        # Save preprocessed data to the same location
        output_file = os.path.join(os.path.dirname(file_path), 'Preprocessed_' + os.path.basename(file_path))
        df.to_csv(output_file, index=False)
        print(f"Preprocessed file saved to: {output_file}")
    except Exception as e:
        print(f'Error processing {file_path}: {e}')

# Apply preprocessing to all files
for file_info in file_paths:
    preprocess_network_traffic(file_info['file'], file_info['attack_type'])


Columns detected in J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv\\EVSE-A-charging-Aggressive-scan.csv: ['id', 'expiration_id', 'src_ip', 'src_mac', 'src_oui', 'src_port', 'dst_ip', 'dst_mac', 'dst_oui', 'dst_port', 'protocol', 'ip_version', 'vlan_id', 'tunnel_id', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps', 'src2dst_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', '

In [4]:
#preprocessing for netwrotk traffic EVSE-A(all files in the directory)

import pandas as pd
import os
import re

# Base directory for CSV files
base_dir = r"C:\\R JEEVAN\\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv"

# Get all CSV files in the directory
file_paths = []
for file_name in os.listdir(base_dir):
    if file_name.endswith('.csv'):
        # Extract attack type from file name
        attack_type = re.sub(r'EVSE-A-(charging|idle)-', '', file_name).replace('.csv', '').replace('-', ' ').title()
        file_paths.append({"file": os.path.join(base_dir, file_name), "attack_type": attack_type})

# Preprocessing function
def preprocess_network_traffic(file_path, attack_type):
    try:
        # Load CSV with auto-detection of delimiters
        df = pd.read_csv(file_path, engine='python', on_bad_lines='skip')
        
        # Ensure the file has data
        if df.empty:
            print(f'Skipping empty file: {file_path}')
            return
        
        # Check number of columns
        print(f"Columns detected in {file_path}: {df.columns.tolist()}")
        print(df.head())
        
        # If only one column, assume data is space or tab delimited
        if len(df.columns) == 1:
            df = df.iloc[:, 0].str.split(expand=True)
        
        # Adjust column renaming based on detected structure
        num_cols = len(df.columns)
        if num_cols >= 4:
            df.columns = ['time', 'counts', 'unit', 'events'] + [f'extra_col_{i}' for i in range(5, num_cols + 1)]
        else:
            print(f"Unexpected column structure in {file_path}, skipping.")
            return
        
        # Convert 'time' and 'counts' to numeric
        df['time'] = pd.to_numeric(df['time'], errors='coerce')
        df['counts'] = pd.to_numeric(df['counts'], errors='coerce')
        
        # Add attack type column
        df['attack_type'] = attack_type

        # Feature engineering: rolling averages and derivatives
        df['rolling_avg_counts'] = df['counts'].rolling(window=10).mean().fillna(0)
        df['counts_derivative'] = df['counts'].diff().fillna(0)

        # Time-based features
        df['time_diff'] = df['time'].diff().fillna(0)
        df['packets_per_second'] = df['counts'] / df['time_diff'].replace(0, 1)

        # Save preprocessed data to the same location
        output_file = os.path.join(os.path.dirname(file_path), 'Preprocessed_' + os.path.basename(file_path))
        df.to_csv(output_file, index=False)
        print(f"Preprocessed file saved to: {output_file}")
    except Exception as e:
        print(f'Error processing {file_path}: {e}')

# Apply preprocessing to all files
for file_info in file_paths:
    preprocess_network_traffic(file_info['file'], file_info['attack_type'])


Columns detected in C:\\R JEEVAN\\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv\EVSE-A-charging-Aggressive-scan.csv: ['id', 'expiration_id', 'src_ip', 'src_mac', 'src_oui', 'src_port', 'dst_ip', 'dst_mac', 'dst_oui', 'dst_port', 'protocol', 'ip_version', 'vlan_id', 'tunnel_id', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps', 'src2dst_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', '

In [6]:
## accuracy prediction after preprocessing (all files, using random forest)

import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Base directory for CSV files
base_dir = r"J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv"

# Get all preprocessed CSV files
file_paths = []
for file_name in os.listdir(base_dir):
    if file_name.startswith('Preprocessed_') and file_name.endswith('.csv'):
        attack_type = re.sub(r'Preprocessed_EVSE-A-(charging|idle)-', '', file_name).replace('.csv', '').replace('-', ' ').title()
        file_paths.append({"file": os.path.join(base_dir, file_name), "attack_type": attack_type})

# Load and combine all preprocessed network traffic data in chunks
combined_df = pd.DataFrame()
chunk_size = 10000
for file_info in file_paths:
    try:
        for chunk in pd.read_csv(file_info['file'], chunksize=chunk_size, low_memory=False):
            chunk['attack_type'] = file_info['attack_type']
            combined_df = pd.concat([combined_df, chunk], ignore_index=True)
    except Exception as e:
        print(f"Error loading {file_info['file']}: {e}")

# Check columns in combined_df
print("Columns in combined network traffic dataset:", combined_df.columns)

# Encode categorical features
label_encoders = {}
for col in ['attack_type', 'unit', 'events']:
    if col in combined_df.columns:
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col].astype(str))
        label_encoders[col] = le

# Select only numeric columns for features (X)
numeric_columns = combined_df.select_dtypes(include=['number']).columns.tolist()
X = combined_df[numeric_columns].drop(columns=['attack_type'], errors='ignore')
y = combined_df['attack_type']

# Scale numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Stratified split for balanced class representation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Train Random Forest Classifier with class balancing
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred, zero_division=1))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Columns in combined network traffic dataset: Index(['time', 'counts', 'unit', 'events', 'extra_col_5', 'extra_col_6',
       'extra_col_7', 'extra_col_8', 'extra_col_9', 'extra_col_10',
       ...
       'extra_col_87', 'extra_col_88', 'extra_col_89', 'extra_col_90',
       'extra_col_91', 'extra_col_92', 'extra_col_93', 'extra_col_94',
       'extra_col_95', 'extra_col_96'],
      dtype='object', length=118)
Model Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3700
           1       1.00      1.00      1.00        20
           2       0.00      0.00      0.00         1
           3       0.40      0.40      0.40         5
           4       1.00      1.00      1.00      2514
           5       1.00      1.00      1.00      3928
           6       0.95      0.92      0.94      3700
           7       1.00      1.00      1.00        20
           8       1.00      1.00      1.00         4
  

In [2]:
## finding precision,recall,f1-score, accuracy using random forest

import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Base directory for CSV files
base_dir = r"J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv"

# Define attack type to attack group mapping
attack_group_mapping = {
    'Aggressive Scan': 'Recon',
    'Benign': 'Benign',
    'Icmp Flood': 'DoS',
    'Icmp Fragmentation': 'DoS',
    'Os Fingerprinting': 'Recon',
    'Portscan': 'Recon',
    'Push Ack Flood': 'DoS',
    'Service Detection': 'Recon',
    'Slowloris Scan': 'DoS',
    'Syn Flood': 'DoS',
    'Synonymous Ip': 'Evasion',
    'Syn Stealth': 'DoS',
    'Tcp Flood': 'DoS',
    'Udp Flood': 'DoS',
    'Vulnerability Scan': 'Recon'
}

# Get all preprocessed CSV files
file_paths = []
for file_name in os.listdir(base_dir):
    if file_name.startswith('Preprocessed_') and file_name.endswith('.csv'):
        attack_type = re.sub(r'Preprocessed_EVSE-A-(charging|idle)-', '', file_name).replace('.csv', '').replace('-', ' ').title()
        attack_group = attack_group_mapping.get(attack_type, 'Unknown')
        file_paths.append({"file": os.path.join(base_dir, file_name), "attack_type": attack_type, "attack_group": attack_group})

# Load and combine all preprocessed network traffic data in chunks
combined_df = pd.DataFrame()
chunk_size = 10000
for file_info in file_paths:
    try:
        for chunk in pd.read_csv(file_info['file'], chunksize=chunk_size, low_memory=False):
            chunk['attack_type'] = file_info['attack_type']
            chunk['attack_group'] = file_info['attack_group']
            combined_df = pd.concat([combined_df, chunk], ignore_index=True)
    except Exception as e:
        print(f"Error loading {file_info['file']}: {e}")

# Check class distribution
print("Attack group distribution:")
print(combined_df['attack_group'].value_counts())

# Encode categorical features
label_encoders = {}
for col in ['attack_group', 'unit', 'events']:
    if col in combined_df.columns:
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col].astype(str))
        label_encoders[col] = le

# Select only numeric columns for features (X)
numeric_columns = combined_df.select_dtypes(include=['number']).columns.tolist()
X = combined_df[numeric_columns].drop(columns=['attack_group'], errors='ignore')
y = combined_df['attack_group']

# Scale numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Stratified split for balanced class representation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Train Random Forest Classifier with class balancing
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred, zero_division=1))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Attack group distribution:
attack_group
DoS        355937
Evasion    131093
Recon       53527
Unknown      7229
Benign         68
Name: count, dtype: int64
Model Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00    106782
           2       1.00      1.00      1.00     39328
           3       1.00      1.00      1.00     16058
           4       1.00      1.00      1.00      2169

    accuracy                           1.00    164357
   macro avg       1.00      1.00      1.00    164357
weighted avg       1.00      1.00      1.00    164357

Confusion Matrix:
[[    20      0      0      0      0]
 [     0 106777      0      5      0]
 [     0      7  39321      0      0]
 [     0      0      0  16058      0]
 [     0      0      0      0   2169]]


In [None]:
## finding precision,recall,f1-score, accuracy using all models


import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Base directory for CSV files
base_dir = r"J:\\R JEEVAN\B.TECH CYS\\4th Sem\\Machine Learning\\ML Project\\CICEVSE2024_Dataset\\Network Traffic\\EVSE-A\\csv"
# Define attack type to attack group mapping
attack_group_mapping = {
    'Aggressive Scan': 'Recon',
    'Benign': 'Benign',
    'Icmp Flood': 'DoS',
    'Icmp Fragmentation': 'DoS',
    'Os Fingerprinting': 'Recon',
    'Portscan': 'Recon',
    'Push Ack Flood': 'DoS',
    'Service Detection': 'Recon',
    'Slowloris Scan': 'DoS',
    'Syn Flood': 'DoS',
    'Synonymous Ip': 'Evasion',
    'Syn Stealth': 'DoS',
    'Tcp Flood': 'DoS',
    'Udp Flood': 'DoS',
    'Vulnerability Scan': 'Recon'
}

# Get all preprocessed CSV files
file_paths = []
for file_name in os.listdir(base_dir):
    if file_name.startswith('Preprocessed_') and file_name.endswith('.csv'):
        attack_type = re.sub(r'Preprocessed_EVSE-A-(charging|idle)-', '', file_name).replace('.csv', '').replace('-', ' ').title()
        attack_group = attack_group_mapping.get(attack_type, 'Unknown')
        file_paths.append({"file": os.path.join(base_dir, file_name), "attack_type": attack_type, "attack_group": attack_group})

# Load and combine all preprocessed network traffic data in chunks
combined_df = pd.DataFrame()
chunk_size = 10000
for file_info in file_paths:
    try:
        for chunk in pd.read_csv(file_info['file'], chunksize=chunk_size, low_memory=False):
            chunk['attack_type'] = file_info['attack_type']
            chunk['attack_group'] = file_info['attack_group']
            combined_df = pd.concat([combined_df, chunk], ignore_index=True)
    except Exception as e:
        print(f"Error loading {file_info['file']}: {e}")

# Check class distribution
print("Attack group distribution:")
print(combined_df['attack_group'].value_counts())

# Encode categorical features
label_encoders = {}
for col in ['attack_group', 'unit', 'events']:
    if col in combined_df.columns:
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col].astype(str))
        label_encoders[col] = le

# Select only numeric columns for features (X)
numeric_columns = combined_df.select_dtypes(include=['number']).columns.tolist()
X = combined_df[numeric_columns].drop(columns=['attack_group'], errors='ignore')
y = combined_df['attack_group']

# Impute missing values with median
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

# Scale numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Stratified split for balanced class representation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
}

# Train and evaluate models
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print(f"{model_name} F1 Score: {f1:.2f}")
    print('Classification Report:')
    print(classification_report(y_test, y_pred, zero_division=1))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
