# Data Processing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline

# Column groups, named by the kind of preprocessing each one receives
numerical_features = ['packet_count', 'byte_count', 'duration']
categorical_features = ['protocol', 'service']
port_features = ['src_port', 'dst_port']
ip_features = ['src_ip', 'dst_ip']  # turned into numeric octet columns later on
timestamp_feature = ['timestamp']  # not fed to the model in the code shown here

# Read the raw network records into memory
df = pd.read_csv('raw_network_anomaly_dataset.csv')

def extract_ip_octet(ip_series):
    """Return the last dot-separated field of every address as an integer.

    Args:
        ip_series: pandas Series of dotted address strings such as
            ``'192.168.1.10'``.

    Returns:
        A Series with the same index and name as ``ip_series`` whose values
        are the integer after the final ``'.'`` of each entry.

    Raises:
        AttributeError: if an entry is not a string (for example a missing
            value).
        ValueError: if the text after the final ``'.'`` is not an integer.
    """
    def _last_octet(address):
        # rpartition yields (head, sep, tail); tail is the whole string when
        # no '.' is present, exactly like split('.')[-1].
        return int(address.rpartition('.')[2])

    return ip_series.map(_last_octet)

# Column-wise preprocessing: numeric counts, ports and the derived IP octets
# are standardized; protocol/service are one-hot encoded (categories unseen
# at fit time encode as all zeros). Columns not named here are dropped,
# which is ColumnTransformer's default `remainder` behaviour.
column_transformers = [
    ('num', StandardScaler(), numerical_features),
    (
        'cat',
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        categorical_features,
    ),
    ('port', StandardScaler(), port_features),
    ('ip_src', StandardScaler(), ['src_ip_octet']),
    ('ip_dst', StandardScaler(), ['dst_ip_octet']),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Assemble the model input: the selected raw columns plus one numeric
# last-octet column per IP address column.
model_columns = (
    numerical_features + categorical_features + port_features + ip_features
)
X = df[model_columns].assign(
    src_ip_octet=extract_ip_octet(df['src_ip']),
    dst_ip_octet=extract_ip_octet(df['dst_ip']),
)

# Chain the preprocessing step with an Isolation Forest detector.
# NOTE: contamination=0.1 sets the score threshold so that roughly 10% of the
# fitted rows are flagged, so the anomaly share reported afterwards reflects
# this setting rather than something learned from the data.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'isolation_forest',
            IsolationForest(contamination=0.1, random_state=42),
        ),
    ]
)

# Fit on X and score the same rows; predict returns -1 for anomalies and
# 1 for normal points.
anomaly_labels = pipeline.fit(X).predict(X)

# Store the result on the dataset as a 0/1 flag (1 = anomaly)
df['is_anomaly'] = (anomaly_labels == -1).astype(int)

# Persist the dataset together with the predicted anomaly flag
df.to_csv('labeled_network_anomaly_dataset.csv', index=False)

# Report how many rows were flagged and what share of the data that is
anomaly_flags = df['is_anomaly']
print(f"Number of detected anomalies: {anomaly_flags.sum()}")
print(f"Percentage of anomalies: {anomaly_flags.mean() * 100:.2f}%")

# Show the first few flagged rows with their key traffic fields
preview_columns = [
    'timestamp', 'src_ip', 'dst_ip', 'packet_count', 'byte_count', 'duration',
]
flagged_rows = anomaly_flags == 1
print("\nSample of detected anomalies:")
print(df.loc[flagged_rows, preview_columns].head())

Number of detected anomalies: 9989
Percentage of anomalies: 10.00%

Sample of detected anomalies:
              timestamp           src_ip           dst_ip  packet_count  \
1   2025-01-01 07:14:22  184.214.112.115    151.142.3.195          6574   
8   2025-01-01 13:53:39  153.239.128.249     142.5.58.175          2827   
20  2025-01-01 07:45:38   130.135.67.239     90.35.125.95          9900   
34  2025-01-01 09:04:22    235.62.233.35  206.237.162.244          9270   
38  2025-01-01 12:48:25   77.199.214.138   192.112.249.57          7371   

    byte_count  duration  
1      5661907  1.639247  
8      5924115  4.404176  
20     6075395  6.155610  
34     8159521  8.322635  
38     6639317  6.721849  
