# VANET Misbehavior Detection - Data Exploration

This notebook explores the VeReMi dataset for VANET misbehavior detection.

In [None]:
import sys
import os

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_preprocessing.data_loader import VANETDataLoader

## 1. Load the Dataset

In [None]:
# Directory holding the raw data, and a loader instance built for it.
# The loader instance is not used in this cell; the CSV is read with pandas.
data_path = '../data/raw'
data_loader = VANETDataLoader(data_path)

# Read the raw CSV into a DataFrame.
dataset_file = f'{data_path}/veremi_dataset.csv'
df = pd.read_csv(dataset_file)

# Report the frame's dimensions, then show the first rows as the cell output.
print(f'Dataset shape: {df.shape}')
df.head()

## 2. Data Overview

In [None]:
# Print the column dtypes and non-null counts.
df.info()

# Count the missing values in each column. The leading newline is written as
# the '\n' escape so the string literal stays on a single physical line
# (a literal line break inside a single-quoted string is a SyntaxError).
print('\nMissing values per column:')
print(df.isnull().sum())

# Summary statistics, displayed as the cell's output.
df.describe()

## 3. Attack Distribution

In [None]:
# Count how many rows fall under each attack type.
attack_counts = df['attack_type'].value_counts()
# The newline is written as the '\n' escape so the literal stays on one line.
print('Attack type distribution:\n', attack_counts)

# Bar chart of the per-type counts.
plt.figure(figsize=(12, 6))
sns.barplot(x=attack_counts.index, y=attack_counts.values)
plt.title('Attack Type Distribution')
plt.xlabel('Attack Type')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Pie chart for attack vs normal.
attack_binary = df['attack'].value_counts()

# value_counts() orders its result by frequency, not by label value, so each
# wedge's label is derived from its own index entry instead of assuming that
# the first wedge is the normal class. This also keeps the number of labels
# equal to the number of wedges when only one class is present.
# NOTE(review): assumes 0 encodes normal traffic and 1 encodes an attack;
# any other value is shown as its string form - confirm against the dataset.
binary_label_names = {0: 'Normal', 1: 'Attack'}
pie_labels = [binary_label_names.get(value, str(value)) for value in attack_binary.index]

plt.figure(figsize=(8, 8))
plt.pie(attack_binary, labels=pie_labels, autopct='%1.1f%%', startangle=90)
plt.title('Normal vs Attack Distribution')
plt.axis('equal')
plt.show()

## 4. Feature Analysis

In [None]:
# Select numerical features for analysis.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Drop the index column and the target variable when they are present.
# Filtering (rather than list.remove) avoids a ValueError when the CSV has
# no 'Unnamed: 0' column.
non_feature_columns = {'Unnamed: 0', 'attack'}
numerical_features = [
    column for column in numerical_features if column not in non_feature_columns
]

# Correlation matrix of the remaining numerical features.
plt.figure(figsize=(14, 12))
correlation = df[numerical_features].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Distribution of key features by attack type.
key_features = ['pos_0', 'pos_1', 'spd_0', 'spd_1', 'acl_0', 'acl_1']

# Draw one sample (at most 10,000 rows) with a fixed seed and reuse it for
# every box plot, so the plots are reproducible and describe the same rows.
boxplot_sample = df.sample(min(10000, len(df)), random_state=42)

for feature in key_features:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='attack_type', y=feature, data=boxplot_sample)
    plt.title(f'{feature} Distribution by Attack Type')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

## 5. Temporal Analysis

In [None]:
# Sort by receive time.
df_sorted = df.sort_values('rcvTime')

# Sample data for visualization (to avoid plotting millions of points).
# sample() returns rows in random order, so the sample is sorted again to
# keep it in receive-time order; the fixed seed makes the figure reproducible.
sample_size = min(10000, len(df))
df_sample = df_sorted.sample(sample_size, random_state=42).sort_values('rcvTime')

# Plot position over time: one panel per coordinate column, one scatter
# series per attack type.
position_panels = [('pos_0', 'Position X'), ('pos_1', 'Position Y')]
sampled_attack_types = df_sample['attack_type'].unique()

plt.figure(figsize=(14, 8))

for panel_index, (position_column, axis_label) in enumerate(position_panels, start=1):
    plt.subplot(2, 1, panel_index)
    for attack_type in sampled_attack_types:
        subset = df_sample[df_sample['attack_type'] == attack_type]
        plt.scatter(subset['rcvTime'], subset[position_column],
                    label=attack_type, alpha=0.6, s=10)
    plt.title(f'{axis_label} over Time by Attack Type')
    plt.xlabel('Receive Time')
    plt.ylabel(axis_label)
    if panel_index == 1:
        # Both panels plot the same series, so only the top panel
        # carries the legend, placed outside the axes.
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## 6. Feature Engineering Ideas

In [None]:
# Calculate derived features: the Euclidean magnitude of each pair of
# component columns. np.hypot(a, b) computes sqrt(a**2 + b**2) element-wise
# without overflowing in the intermediate squares.

# 1. Speed magnitude
df['speed_magnitude'] = np.hypot(df['spd_0'], df['spd_1'])

# 2. Acceleration magnitude
df['acceleration_magnitude'] = np.hypot(df['acl_0'], df['acl_1'])

# 3. Position noise magnitude
df['position_noise_magnitude'] = np.hypot(df['pos_noise_0'], df['pos_noise_1'])

# 4. Speed noise magnitude
df['speed_noise_magnitude'] = np.hypot(df['spd_noise_0'], df['spd_noise_1'])

# Visualize new features. A single seeded sample (at most 10,000 rows) feeds
# all four panels so they describe the same rows and the figure is
# reproducible.
engineered_sample = df.sample(min(10000, len(df)), random_state=42)

magnitude_panels = [
    ('speed_magnitude', 'Speed Magnitude by Attack Type'),
    ('acceleration_magnitude', 'Acceleration Magnitude by Attack Type'),
    ('position_noise_magnitude', 'Position Noise Magnitude by Attack Type'),
    ('speed_noise_magnitude', 'Speed Noise Magnitude by Attack Type'),
]

plt.figure(figsize=(14, 10))

for panel_index, (magnitude_column, panel_title) in enumerate(magnitude_panels, start=1):
    plt.subplot(2, 2, panel_index)
    sns.boxplot(x='attack_type', y=magnitude_column, data=engineered_sample)
    plt.title(panel_title)
    plt.xticks(rotation=90)

plt.tight_layout()
plt.show()

## 7. Conclusions and Next Steps

### Key Findings:

1. The dataset contains multiple attack types with varying distributions
2. Position, speed, and acceleration features show distinct patterns for different attack types
3. Derived features like magnitude calculations provide additional discriminative power

### Next Steps:

1. Implement a feature extraction pipeline based on these insights
2. Create temporal sequences for the LSTM model
3. Balance the dataset to address class imbalance
4. Develop and train the ensemble model architecture