In [1]:
import pandas as pd
import numpy as np
import os

# Paths to the parquet files (adjust if needed)
train_path = "UNSW_NB15_training-set.parquet"
test_path = "UNSW_NB15_testing-set.parquet"


In [3]:
# Check shape and preview
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)

# Display first few rows
display(train_df.head())
display(test_df.head())

Training set shape: (175341, 36)
Testing set shape: (82332, 36)


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.087486,14158.942383,...,0,0,1,1,0,0,0,0,Normal,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473373,8395.112305,...,0,0,1,1,0,0,0,0,Normal,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,1572.271851,...,0,0,1,1,0,0,0,0,Normal,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,2740.178955,...,0,0,1,1,1,1,0,0,Normal,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373825,8561.499023,...,0,0,2,1,0,0,0,0,Normal,0


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [4]:
train_df.describe(include="all").transpose().head(20)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
dur,175341.0,,,,1.359389,6.480249,0.0,8e-06,0.001582,0.668069,59.999989
proto,175341.0,133.0,tcp,79946.0,,,,,,,
service,175341.0,13.0,-,94168.0,,,,,,,
state,175341.0,9.0,INT,82275.0,,,,,,,
spkts,175341.0,,,,20.298664,136.887597,1.0,2.0,2.0,12.0,9616.0
dpkts,175341.0,,,,18.969591,110.258271,0.0,0.0,2.0,10.0,10974.0
sbytes,175341.0,,,,8844.843836,174765.644309,28.0,114.0,430.0,1418.0,12965233.0
dbytes,175341.0,,,,14928.918564,143654.217718,0.0,0.0,164.0,1102.0,14655550.0
rate,175341.0,,,,95406.179688,165400.96875,0.0,32.78614,3225.806641,125000.0,1000000.0
sload,175341.0,,,,73454032.0,188357440.0,0.0,13053.338867,879674.75,88888888.0,5988000256.0


In [5]:
categorical_cols = ["proto", "service", "state", "attack_cat"]

for col in categorical_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype("category")
        test_df[col] = test_df[col].astype("category")


In [6]:
skewed_cols = ["sbytes", "dbytes", "rate"]

for col in skewed_cols:
    if col in train_df.columns:
        train_df[col] = np.log1p(train_df[col])
        test_df[col] = np.log1p(test_df[col])


In [10]:
os.makedirs("processed", exist_ok=True)

train_df.to_parquet("processed/unsw_nb15_train_clean.parquet", index=False)
test_df.to_parquet("processed/unsw_nb15_test_clean.parquet", index=False)

print("✅ Cleaned data saved in /processed/")


✅ Cleaned data saved in /processed/


In [15]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ["proto", "service", "state", "attack_cat"]
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # fit on combined categories
    le.fit(pd.concat([train_df[col], test_df[col]], axis=0).astype(str))
    
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col]  = le.transform(test_df[col].astype(str))
    encoders[col] = le

print("✅ Re-encoded categorical columns (with train+test):", cat_cols)


✅ Re-encoded categorical columns (with train+test): ['proto', 'service', 'state', 'attack_cat']


In [16]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Separate features and target
X = train_df.drop(columns=["label"])
y = train_df["label"]

# Train a small random forest just for feature ranking
model = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
    n_jobs=-1
)
model.fit(X, y)

# Feature importances
importances = pd.Series(model.feature_importances_, index=X.columns)
top_feats = importances.sort_values(ascending=False).head(15)

print("Top 15 Important Features:\n")
print(top_feats)


Top 15 Important Features:

attack_cat    0.355176
rate          0.085982
dmean         0.047380
tcprtt        0.046817
synack        0.043692
dload         0.042897
sload         0.041465
dur           0.035146
sbytes        0.034233
dinpkt        0.031102
ackdat        0.028832
dbytes        0.028053
sinpkt        0.023483
dpkts         0.019895
smean         0.019766
dtype: float64
