Preprocessing

In [1]:
import pandas as pd
import numpy as np

# Assuming your merged dataframe is loaded as 'df'
df = pd.read_csv('joined_dataset.csv')

# --- 1. Identify Categorical Columns (Objects) ---
# Find columns with 'object' datatype (usually strings/text)
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical Columns (to be encoded):", categorical_cols)

# --- 2. Handle NaN/Null Values ---
# Check for null values in each column
print("\nMissing values per column:\n", df.isnull().sum())

# A common, simple approach is to fill missing values with 0.
# For a complex dataset like this, filling with 0 or the median is better than dropping rows.
# Replace all NaN values with 0.
df = df.fillna(0)


# --- 3. Handle Infinite Values (inf) ---
# In network datasets, division by zero can create 'inf' values (e.g., in load, jitter, or rates).
# We must replace these with a sensible number, like the maximum finite value in that column.

# Replace all 'inf' and '-inf' values with NaN first
df = df.replace([np.inf, -np.inf], np.nan)

# Now, fill the new NaN values (which were previously inf) with a large finite number.
# Using a fixed large number like 1e9 is safer than using the column mean/median for inf.
df = df.fillna(1e9) # Fill with 1 billion, ensuring it is a large finite number

# Re-check for any remaining non-finite values (should be 0)
print("\nRemaining non-finite values (should be 0):", df.isin([np.inf, -np.inf, np.nan]).sum().sum())

  df = pd.read_csv('joined_dataset.csv')


Categorical Columns (to be encoded): Index(['Source IP', 'Source Port', 'Destination IP', 'Destination Port',
       'Protocol', 'Connection State', 'Service', 'FTP Command Count',
       'Attack Category'],
      dtype='object')

Missing values per column:
 Source IP                                                                     0
Source Port                                                                   0
Destination IP                                                                0
Destination Port                                                              0
Protocol                                                                      0
Connection State                                                              0
Duration                                                                      0
Source Bytes                                                                  0
Destination Bytes                                                             0
Source TTL           

ENCODING -  we use one hot encoding (even though it may increase the size than binary encoding)

OHE is better as it doesnt support ordinality
like if a no 1 is assigned to tcp, and 3 is for ip...it wont think ip is superior than tcp.

and we dont want ordinality when it comes to FE and NN

In [2]:
# --- 4. One-Hot Encode Categorical Features ---

# These are the columns that need encoding:
# ['Protocol', 'Connection State', 'Service', 'Attack Category']
# NOTE: 'Label' is already 0/1, so it does not need encoding.

cols_to_encode = ['Protocol', 'Connection State', 'Service', 'Attack Category']

# Use pandas get_dummies for One-Hot Encoding
df = pd.get_dummies(df, columns=cols_to_encode, prefix=cols_to_encode)

print("\nDataFrame Shape after One-Hot Encoding:", df.shape)
print("New Columns (a sample):", df.columns[-10:].tolist())


DataFrame Shape after One-Hot Encoding: (2540047, 223)
New Columns (a sample): ['Attack Category_ Shellcode ', 'Attack Category_Analysis', 'Attack Category_Backdoor', 'Attack Category_Backdoors', 'Attack Category_DoS', 'Attack Category_Exploits', 'Attack Category_Generic', 'Attack Category_Reconnaissance', 'Attack Category_Shellcode', 'Attack Category_Worms']


Save the dataset as a save point

In [3]:
# Save the cleaned and encoded dataframe for a checkpoint
df.to_csv("checkpoint_preprocessed_unsw.csv", index=False)
print("Saved checkpoint_preprocessed_unsw.csv")

Saved checkpoint_preprocessed_unsw.csv


Feature selection and stratifies sampling - as i dont enough power

In [4]:
# --- 1. Identify and Drop ID and Redundant Columns ---
# 'Source IP', 'Destination IP', 'Source Port', 'Destination Port' have too many unique values (high cardinality).
# The original Attack Category column has been replaced by the One-Hot encoded columns.
# We also drop other unique identifiers like TCP Base Seq and time stamps.

cols_to_drop = [
    'Source IP', 'Destination IP', 'Source Port', 'Destination Port',
    'Source TCP Base Seq', 'Destination TCP Base Seq', 'Start Time', 'End Time'
]

# Drop the columns from the dataframe
df = df.drop(columns=cols_to_drop, errors='ignore')

# Separate the target variable 'Label' (0 or 1)
X = df.drop('Label', axis=1) # Features
y = df['Label']             # Target

In [5]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y) again, after dropping the ID columns (Action 2.1)
# NOTE: We assume X and y were successfully created in Action 2.1
# Let's verify the size of the whole dataset first:
print(f"Original Dataset Size: {df.shape[0]} rows")

# We will sample 500,000 rows. Use train_test_split with stratify=y to ensure the
# sample maintains the same attack/normal ratio as the full dataset.
sample_size = 500000

# We use the test_size to determine the percentage needed for the sample_size
# sample_fraction = sample_size / len(df)

# Perform stratified sampling to get the final working dataset
X_sample, _, y_sample, _ = train_test_split(
    X, y,
    test_size=(1 - sample_size / len(df)),
    random_state=42, # Ensure reproducibility
    stratify=y       # CRITICAL: Preserves the ratio of 0s and 1s
)

# Recombine the sample for scaling
df_sampled = X_sample.copy()
df_sampled['Label'] = y_sample

print(f"Sampled Dataset Size: {df_sampled.shape[0]} rows")
print("New Sampled Label Distribution:\n", df_sampled['Label'].value_counts())

Original Dataset Size: 2540047 rows
Sampled Dataset Size: 500000 rows
New Sampled Label Distribution:
 Label
0    436756
1     63244
Name: count, dtype: int64


FEature Scaling - We will use MinMaxScaler to scale all continuous features to the range [0,1].

In [6]:
from sklearn.preprocessing import MinMaxScaler

# --- 1. Identify Numerical Features for Scaling ---
# We exclude the target 'Label' and all one-hot encoded (OHE) columns (which are already 0 or 1)

# Get column names that are NOT the Label and are NOT part of the OHE binary columns
# OHE columns end in things like '_tcp', '_http', etc., but for safety, we identify all non-OHE columns.
# A quick way is to select all columns that are NOT of type 'uint8' (typical OHE type)
# or are not the target 'Label'.

# Assuming your OHE columns are the only 'uint8' type:
cols_to_scale = df_sampled.select_dtypes(include=['int64', 'float64']).columns.drop('Label', errors='ignore')

print("Columns to Scale:", cols_to_scale.tolist())

# --- 2. Apply MinMaxScaler ---
scaler = MinMaxScaler()

# Apply scaler only to the selected continuous columns
df_sampled[cols_to_scale] = scaler.fit_transform(df_sampled[cols_to_scale])

# Now all features are clean, scaled, and numerical!

Columns to Scale: ['Duration', 'Source Bytes', 'Destination Bytes', 'Source TTL', 'Destination TTL', 'Source Packet Loss', 'Destination Packet Loss', 'Source Load', 'Destination Load', 'Source Packets', 'Destination Packets', 'Source Window Size', 'Destination Window Size', 'Source Mean Packet Size', 'Destination Mean Packet Size', 'Transaction Depth', 'Response Body Length', 'Source Jitter', 'Destination Jitter', 'Source Inter-Packet Arrival Time', 'Destination Inter-Packet Arrival Time', 'TCP Round Trip Time', 'SYN-ACK Time', 'ACK Data Time', 'Same IPs/Ports Flag', 'Count State TTL', 'HTTP Method Count', 'FTP Login Flag', 'Connections from Same Source to Same Service', 'Connections to Same Destination Service', 'Connections to Same Destination in Last Minute', 'Connections from Same Source in Last Minute', 'Connections from Same Source to Same Destination Port in Last Minute', 'Connections from Same Destination Source Port in Last Minute', 'Connections between Same Destination and So

In [7]:
# Assuming 'df_sampled' is your final, clean, scaled, and downsampled DataFrame
df_sampled.to_csv("final_golden_dataset_500k.csv", index=False)
print("Dataset saved as final_golden_dataset_500k.csv. This is your definitive starting point for all model training.")

Dataset saved as final_golden_dataset_500k.csv. This is your definitive starting point for all model training.
