## Step 1: Import Libraries

In [1]:
import math
import io
import shutil
import os
import sys
from os import path
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

## Step 2: Load the Cleaned Data

In [2]:
# Read both inputs with identical parser settings:
#   df  - main dataset (Backdoor activity is not part of this file)
#   dfb - Backdoor activity, kept in its own CSV
# low_memory=False makes pandas parse each file in one pass rather than in
# internal chunks, which avoids mixed-dtype inference within a column.
df, dfb = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("EVSE-B-HPC-Kernel-Events-cleaned.csv", "backdoors.csv")
)

## Step 3: Define Kernel Events of Interest

In [3]:
# Kernel event counters selected for analysis (order here defines the
# feature column order used downstream).
kernel_events = [
    "instructions",
    "cpu-migrations",
    "mem_access_rd",
    "mem_access_wr",
    "cache-misses",
    "L1-icache-loads",
    "dTLB-store-misses",
]

## Step 4: Identify Overlapping Features

In [4]:
# Column names available in each dataset
df_columns = set(df.columns)
dfb_columns = set(dfb.columns)

# An event is usable only if it is a column in BOTH datasets; iterating over
# kernel_events (not the set) keeps the events in their declared order.
shared_columns = df_columns & dfb_columns
overlapping_events = [name for name in kernel_events if name in shared_columns]

# Report which events survived the intersection
print("Overlapping kernel events:", overlapping_events)
print("Number of overlapping events:", len(overlapping_events))

# Nothing downstream can run without at least one shared event, so stop here
if len(overlapping_events) == 0:
    raise ValueError("No overlapping kernel events found between the two datasets. Cannot proceed.")

Overlapping kernel events: ['instructions', 'cpu-migrations', 'mem_access_rd', 'mem_access_wr', 'cache-misses', 'L1-icache-loads', 'dTLB-store-misses']
Number of overlapping events: 7


## Step 5: Ensure Numeric Columns for Kernel Events

In [5]:
# Force every shared kernel event column to a numeric dtype in both datasets.
# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising.
for frame in (df, dfb):
    for event in overlapping_events:
        frame[event] = pd.to_numeric(frame[event], errors='coerce')

## Step 6: Assign Labels for Backdoor Data

In [6]:
# dfb has no "Scenario" column of its own, so every one of its rows is
# labelled "Backdoor" explicitly.
dfb = dfb.assign(Scenario="Backdoor")

## Step 7: Select Relevant Columns for Merging

In [7]:
# Both datasets are trimmed to the same layout: the shared kernel events
# followed by the label ("Scenario") and the timestamp ("msec").
df_cols_to_keep = [*overlapping_events, "Scenario", "msec"]
dfb_cols_to_keep = list(df_cols_to_keep)
df = df.loc[:, df_cols_to_keep]
dfb = dfb.loc[:, dfb_cols_to_keep]

## Step 8: Merge the Datasets

In [8]:
# Stack the main rows and the backdoor rows vertically into one DataFrame;
# ignore_index=True discards both original indexes and renumbers rows 0..n-1.
df_combined = pd.concat(objs=[df, dfb], axis=0, ignore_index=True)

## Step 9: Sort Data by Timestamp

In [9]:
# Order the merged rows chronologically by 'msec'.
# A stable algorithm (mergesort) is requested explicitly: the default
# quicksort gives no guarantee about the relative order of rows that share
# the same 'msec' value. With a stable sort, tied rows keep their
# concatenation order (main dataset rows before backdoor rows), so the
# sample_index assigned below is well defined.
df_combined = df_combined.sort_values("msec", kind="mergesort")

# Sequential position of each row in the sorted order. The pre-sort row labels
# remain in the DataFrame index; sample_index is the chronological counter.
df_combined["sample_index"] = range(len(df_combined))

## Step 10: Convert Scenarios into Binary Classification

In [10]:
# Collapse the label to two classes: rows whose Scenario is exactly "Benign"
# keep that value, every other row becomes "Attack".
# Note: the comparison is an exact string match, so a missing (NaN) label is
# also mapped to "Attack".
df_combined["Scenario"] = df_combined["Scenario"].where(
    df_combined["Scenario"] == "Benign",
    "Attack",
)

## Step 11: Create Feature DataFrame

In [11]:
# Build the model-training frame as an independent copy of the relevant
# columns: the sample counter, the binary classification target ("Scenario"),
# then the raw kernel event values (no statistical computations applied).
feature_columns = ["sample_index", "Scenario", *overlapping_events]
feature_data = df_combined[feature_columns].copy()

## Step 12: Handle Missing Data

In [12]:
# Replace every remaining NaN with 0 so the feature table has no missing
# values. This includes entries that became NaN during numeric coercion of
# the kernel event columns.
feature_data = feature_data.fillna(value=0)

## Step 13: Display Sample Data

In [14]:
# Preview: heading line followed by the first five rows of the feature table
print("\nFeature DataFrame (first 5 rows):", feature_data.head(), sep="\n")


Feature DataFrame (first 5 rows):
      sample_index Scenario  instructions  cpu-migrations  mem_access_rd  \
5005             0   Attack  4.842734e+08             0.0    182447274.0   
4330             1   Attack  3.360893e+08             0.0     83614575.0   
5059             2   Attack  2.504586e+08             0.0     62673568.0   
5121             3   Attack  3.815842e+08             0.0     61085549.0   
4695             4   Attack  2.012276e+09             0.0    464448105.0   

      mem_access_wr  cache-misses  L1-icache-loads  dTLB-store-misses  
5005    157319220.0     2113406.0      245241558.0           127193.0  
4330     51690301.0     1521862.0      213434196.0           161965.0  
5059    123802350.0     1212126.0       69182309.0            18750.0  
5121     49477220.0      681121.0      158026571.0           226431.0  
4695    344314102.0    11346447.0      943011302.0           776176.0  


## Step 14: Save Processed Features to CSV

In [None]:
# Persist the feature table for model training. index=False leaves the
# DataFrame index out of the file; only the columns are written.
feature_data.to_csv(path_or_buf="evse_features.csv", index=False)
print("\nFeatures saved to 'evse_features.csv'")