# Network Traffic Data Preprocessing
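This notebook loads and cleans the network traffic dataset (the Thursday morning WebAttacks CSV), then injects synthetic TCP steganography samples into benign flows to build a labeled dataset.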

In [27]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [28]:
from google.colab import drive
drive.mount('/content/drive')

# Location of the CSV in Google Drive. The file is parsed once, in the next
# cell, with low_memory=False; reading it here as well only doubled the load
# time and produced a frame that was immediately replaced.
file_path = "/content/drive/MyDrive/RT3/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# Parse the whole file in one pass (low_memory=False) so each column's dtype is
# inferred from all rows rather than chunk by chunk.
df = pd.read_csv(file_path, low_memory=False)

# Quick look at size, a few rows, and per-column dtypes / non-null counts.
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
display(df.head())
print("\nData types and missing values:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" to the output.
df.info()

Dataset shape: (170366, 79)

First few rows:


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,389,113095465,48,24,9668,10012,403,0,201.416667,203.548293,...,32,203985.5,575837.3,1629110,379,13800000.0,4277541.0,16500000,6737603,BENIGN
1,389,113473706,68,40,11364,12718,403,0,167.117647,171.919413,...,32,178326.875,503426.9,1424245,325,13800000.0,4229413.0,16500000,6945512,BENIGN
2,0,119945515,150,0,0,0,0,0,0.0,0.0,...,0,6909777.333,11700000.0,20400000,6,24400000.0,24300000.0,60100000,5702188,BENIGN
3,443,60261928,9,7,2330,4221,1093,0,258.888889,409.702161,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,53,269,2,2,102,322,51,51,51.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN



Data types and missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170366 entries, 0 to 170365
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             170366 non-null  int64  
 1    Flow Duration                170366 non-null  int64  
 2    Total Fwd Packets            170366 non-null  int64  
 3    Total Backward Packets       170366 non-null  int64  
 4   Total Length of Fwd Packets   170366 non-null  int64  
 5    Total Length of Bwd Packets  170366 non-null  int64  
 6    Fwd Packet Length Max        170366 non-null  int64  
 7    Fwd Packet Length Min        170366 non-null  int64  
 8    Fwd Packet Length Mean       170366 non-null  float64
 9    Fwd Packet Length Std        170366 non-null  float64
 10  Bwd Packet Length Max         170366 non-null  int64  
 11   Bwd Packet Length Min        170366 non-null  int64  
 12   Bwd Packet 

None

In [30]:
# Strip stray whitespace from the headers first, so the report below and every
# later lookup (e.g. df['Label']) use the cleaned names.
df.columns = df.columns.str.strip()

numeric_cols = df.select_dtypes(include='number').columns

# isnull() does not count +/-inf, so convert any infinities to NaN before
# reporting and imputing; otherwise they would pass through untouched.
df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0])

# Handle missing values - fill numerical columns with their median
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# For categorical columns, fill with the most frequent value. mode() is empty
# for an all-NaN column, in which case there is nothing to fill with.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Encode the 'BENIGN' class as 0 in the Label column only; any other label
# value is left exactly as it was.
df['Label'] = df['Label'].replace('BENIGN', 0)

# Verify no missing values remain
print("\nMissing values after handling:", df.isnull().sum().sum())

print(df.columns)

Missing values per column:
Flow Bytes/s    20
dtype: int64

Missing values after handling: 0
Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean'

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print("Basic statistics for numerical features:")
numeric_summary = df.describe()
display(numeric_summary)

Basic statistics for numerical features:


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,...,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0,170366.0
mean,7897.090599,12463540.0,15.12462,18.022276,556.9859,31831.47,167.775982,22.697786,48.23325,47.235628,...,11.925108,25.584506,74338.04,46760.38,164642.2,50126.99,3690478.0,131072.4,3784764.0,3543232.0
std,18235.489428,31938520.0,1123.107756,1494.492871,7710.431,3460816.0,461.299214,37.990783,94.947821,141.927429,...,1077.898793,6.354041,618204.8,368960.5,995867.3,560679.7,12988300.0,1733767.0,13251350.0,12841830.0
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,192.0,1.0,1.0,31.0,6.0,23.0,0.0,7.882314,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,31412.0,2.0,2.0,68.0,134.0,41.0,6.0,38.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,816981.8,4.0,2.0,148.0,328.0,60.0,41.0,50.0,17.897858,...,3.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65529.0,120000000.0,200755.0,270686.0,1197199.0,627000000.0,23360.0,1729.0,4183.057143,5463.49385,...,192491.0,60.0,103000000.0,63700000.0,103000000.0,103000000.0,120000000.0,72600000.0,120000000.0,120000000.0


# Steganography Injection

In [32]:
# Benign flows only (Label was set to 0 for 'BENIGN' in the cleaning cell).
benign_df = df[df['Label'] == 0]

# Carrier flows: the same draw inject_tcp_stego makes by default
# (n=20000, random_state=42), i.e. the rows that will receive a stego twin.
stego_samples = benign_df.sample(n=20000, random_state=42).copy()

# Clean (label 0) half of the final dataset. It is drawn from benign flows only:
# sampling from `df` would also pick up any non-BENIGN rows, whose Label is
# still the original string, and mix them into the 0/1 target. The carrier rows
# are excluded so no flow appears both as a clean sample and as a stego sample.
df_small = benign_df.drop(index=stego_samples.index).sample(n=20000, random_state=42)

In [33]:
from random import choice

# Fallback column names tried when a requested feature column is missing, so the
# injection edits an existing feature instead of silently creating a new column.
# NOTE(review): these are believed to be the names CICFlowMeter-style CSVs use
# for the initial TCP window and the CWR flag count -- confirm against df.columns.
_STEGO_COLUMN_ALIASES = {
    'TCP Window Size': 'Init_Win_bytes_forward',
    'CWR Flag Count': 'CWE Flag Count',
}


def _resolve_stego_column(available, name):
    """Return the column in `available` to write for `name`, or None if absent."""
    if name in available:
        return name
    alias = _STEGO_COLUMN_ALIASES.get(name)
    return alias if alias in available else None


def inject_tcp_stego(benign_df, num_samples=20000, random_state=42):
    """Create synthetic "stego" flows by perturbing a sample of benign rows.

    A sample of ``num_samples`` rows is copied from ``benign_df``. For every
    sampled row, one to three of the four techniques below are picked at random
    and the corresponding feature values are overwritten:

    * ``flag_anomalies``      - sets a group of ``<FLAG> Flag Count`` columns
    * ``header_manipulation`` - sets the TCP window and fwd/bwd header lengths
    * ``timing_channel``      - sets ``Flow IAT Mean`` and ``Idle Std``
    * ``payload_anomaly``     - sets ``Fwd Packet Length Max`` and
      ``Bwd Packet Length Std``

    Only columns that already exist are written. A requested column that is
    missing (and has no existing alias) is skipped and reported in a single
    warning; creating it on the fly would leave NaN in every untouched row and
    hand a classifier a column that gives the label away.

    Parameters
    ----------
    benign_df : pandas.DataFrame
        Source rows; it is not modified.
    num_samples : int, default 20000
        Number of rows to sample (without replacement).
    random_state : int, default 42
        Seed for both the row sample and the per-row random choices, so the
        same inputs always produce the same output.

    Returns
    -------
    pandas.DataFrame
        The perturbed copy, with ``Label`` set to 1 on every row and the index
        of the sampled source rows preserved.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``num_samples`` exceeds the number
        of rows in ``benign_df``.
    """
    import warnings

    techniques = (
        'flag_anomalies',
        'header_manipulation',
        'timing_channel',
        'payload_anomaly',
    )
    flag_combos = (
        {'SYN': 1, 'FIN': 1, 'ACK': 0},  # SYN+FIN
        {'URG': 1, 'PSH': 1},            # URG+PSH
        {'ECE': 1, 'CWR': 1},            # ECE+CWR
    )

    # One seeded generator drives every per-row draw; previously these used the
    # unseeded global `random` / `np.random` state and changed on every run.
    rng = np.random.default_rng(random_state)
    stego_samples = benign_df.sample(n=num_samples, random_state=random_state).copy()

    available = set(stego_samples.columns)
    skipped = set()

    def set_feature(idx, name, value):
        # Write only to an existing column; remember anything that was skipped.
        column = _resolve_stego_column(available, name)
        if column is None:
            skipped.add(name)
        else:
            stego_samples.at[idx, column] = value

    for idx in stego_samples.index:
        # Pick 1-3 distinct techniques for this row (each row is one flow record).
        n_techs = rng.integers(1, 4)
        picks = rng.choice(len(techniques), size=n_techs, replace=False)
        chosen_techs = {techniques[i] for i in picks}

        if 'flag_anomalies' in chosen_techs:
            # One of the unusual flag combinations, written to the flag counters.
            flag_combo = flag_combos[rng.integers(len(flag_combos))]
            for flag, val in flag_combo.items():
                set_feature(idx, f'{flag} Flag Count', val)

        if 'header_manipulation' in chosen_techs:
            # Window is either 0 or 5840; header lengths are drawn from [40, 100).
            set_feature(idx, 'TCP Window Size', rng.choice([0, 5840]))
            set_feature(idx, 'Fwd Header Length', rng.integers(40, 100))
            set_feature(idx, 'Bwd Header Length', rng.integers(40, 100))

        if 'timing_channel' in chosen_techs:
            # Fixed inter-arrival mean with near-zero idle-time variation.
            set_feature(idx, 'Flow IAT Mean', rng.choice([50, 100, 200]))
            set_feature(idx, 'Idle Std', rng.uniform(0, 0.1))

        if 'payload_anomaly' in chosen_techs:
            # Signature packet length with a small backward length spread.
            set_feature(idx, 'Fwd Packet Length Max', rng.choice([66, 666, 1337]))
            set_feature(idx, 'Bwd Packet Length Std', rng.uniform(0, 5))

    if skipped:
        warnings.warn(
            "inject_tcp_stego skipped columns not present in the data: "
            + ", ".join(sorted(skipped))
        )

    stego_samples['Label'] = 1  # Stego label
    return stego_samples

# Build the synthetic stego class (Label == 1) from the benign flows.
# The merged, shuffled dataset is assembled in the following cell; the
# intermediate concat of all benign rows that used to live here was overwritten
# there before ever being used.
stego_samples = inject_tcp_stego(benign_df)

In [34]:
# Stack the clean sample on top of the stego sample, shuffle the rows with a
# fixed seed, and renumber the index from zero.
combined_df = (
    pd.concat([df_small, stego_samples])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the labeled dataset without the index column.
combined_df.to_csv("labeled_stego_dataset.csv", index=False)