In [84]:
"""Objective: Clean and preprocess network intrusion datasets for ML.
Contents:
- Dataset sources and loading
- Column inspection and cleaning
- Encoding categorical features
- Feature scaling
- Handling class imbalance (SMOTE, undersampling)
- Feature selection (mutual information, tree-based)
- Save preprocessed train/test splits
"""

'Objective: Clean and preprocess network intrusion datasets for ML.\nContents:\n- Dataset sources and loading\n- Column inspection and cleaning\n- Encoding categorical features\n- Feature scaling\n- Handling class imbalance (SMOTE, undersampling)\n- Feature selection (mutual information, tree-based)\n- Save preprocessed train/test splits\n'

In [86]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif
import joblib

In [88]:
# Config
# Input/output locations. Set the IDS_DATA_PATH / IDS_OUTPUT_DIR environment
# variables to point the notebook at another machine's folders without editing
# this cell; when they are unset the original local paths below are used.
DATA_PATH = os.environ.get(
    "IDS_DATA_PATH",
    "C:\\Users\\HarshaSri\\Desktop\\IDS_PROJECT\\data\\raw",
)
OUTPUT_DIR = os.environ.get(
    "IDS_OUTPUT_DIR",
    "C:\\Users\\HarshaSri\\Desktop\\IDS_PROJECT\\data\\processed",
)
# Create the output folder up front so the final save cell cannot fail on a missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [90]:
# Load dataset
train_file = os.path.join(DATA_PATH, "Train.csv")
if not os.path.exists(train_file):
    # Fail fast: every later cell needs `df`. Merely printing a message would
    # let the notebook continue into a NameError, or worse, silently reuse a
    # stale `df` left over in the kernel from an earlier run.
    raise FileNotFoundError(f"File not found: {train_file}")

df = pd.read_csv(train_file)
print("Loaded", df.shape)
display(df.head())

Loaded (25192, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [92]:
# Label mapping (binary)
def prepare_labels(df, label_col='class'):
    """Return a copy of ``df`` with an added integer ``binary_label`` column.

    A row is labelled 0 when its ``label_col`` value, converted to text,
    stripped of surrounding whitespace and lower-cased, is one of
    ``'normal'``, ``'0'`` or ``'benign'``; every other value is labelled 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``label_col``. It is not modified.
    label_col : str, default 'class'
        Name of the column holding the raw class labels.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``binary_label`` column (int64).

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    benign_tokens = ['normal', '0', 'benign']
    df = df.copy()
    # Strip before matching so labels such as " normal" or "BENIGN\n" are not
    # silently counted as attacks.
    normalized = df[label_col].astype(str).str.strip().str.lower()
    df['binary_label'] = (~normalized.isin(benign_tokens)).astype('int64')
    return df


# Attach the binary target, then show how many rows fall in each class.
df = prepare_labels(df, label_col='class')
print(df.loc[:, 'binary_label'].value_counts())

binary_label
0    13449
1    11743
Name: count, dtype: int64


In [94]:
# Preprocessing pipeline
def build_preprocessing_pipeline(df, label_col='binary_label',
                                 max_categorical_cardinality=50,
                                 exclude_cols=('class',)):
    """Build an unfitted ColumnTransformer and the feature lists it uses.

    Every column of ``df`` other than ``label_col`` and ``exclude_cols`` is a
    feature. A feature is treated as categorical (one-hot encoded) when its
    dtype is non-numeric or when it has fewer than
    ``max_categorical_cardinality`` distinct values; all remaining features
    are treated as numeric (standard-scaled).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used only to inspect column dtypes and cardinalities.
    label_col : str, default 'binary_label'
        Target column, excluded from the features.
    max_categorical_cardinality : int, default 50
        Features with fewer distinct values than this are one-hot encoded
        even when their dtype is numeric.
    exclude_cols : iterable of str, default ('class',)
        Additional non-feature columns to leave out.

    Returns
    -------
    tuple
        ``(preprocessor, num_feats, cat_feats)`` where ``preprocessor`` is the
        unfitted ColumnTransformer and the two lists hold column names in
        their original order.
    """
    excluded = {label_col, *exclude_cols}
    features = [c for c in df.columns if c not in excluded]

    def _is_categorical(col):
        series = df[col]
        # Test for "not numeric" rather than "== object" so pandas category
        # and string dtypes are never routed to the scaler.
        if not pd.api.types.is_numeric_dtype(series.dtype):
            return True
        return series.nunique() < max_categorical_cardinality

    cat_feats = [c for c in features if _is_categorical(c)]
    num_feats = [c for c in features if c not in cat_feats]

    numeric_transformer = Pipeline([('scaler', StandardScaler())])
    # handle_unknown='ignore' keeps transform() from failing on categories
    # that were not present when the encoder was fitted.
    categorical_transformer = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, num_feats),
        ('cat', categorical_transformer, cat_feats),
    ])
    return preprocessor, num_feats, cat_feats


# Build the (still unfitted) transformer and report how the features were split.
(preprocessor, num_feats, cat_feats) = build_preprocessing_pipeline(df)
print(f"Numeric: {len(num_feats)} Categorical: {len(cat_feats)}")

Numeric: 21 Categorical: 20


In [96]:
# Split + preprocess
# Features are everything except the raw and binary label columns.
X, y = df.drop(columns=['class', 'binary_label']), df['binary_label']

# Stratified 80/20 hold-out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit on the training portion only, then reuse the fitted transformer on the test portion.
X_train_t = preprocessor.fit_transform(X_train)
X_test_t = preprocessor.transform(X_test)

print('Train:', X_train_t.shape, 'Test:', X_test_t.shape)

Train: (20153, 256) Test: (5039, 256)


In [98]:
# Handle imbalance (SMOTE, or seeded random oversampling when imblearn is missing)
def _random_oversample(X, y, random_state=42):
    """Oversample every class with replacement up to the largest class size.

    All original rows are kept; extra rows are drawn at random (seeded) from
    classes that are smaller than the largest one, and the result is shuffled.
    Classes that already have the largest count are left as they are, so tied
    class counts are returned unchanged apart from the shuffle.

    Returns ``(X_resampled, y_resampled)`` with ``y_resampled`` as a NumPy array.
    """
    rng = np.random.default_rng(random_state)
    y_arr = np.asarray(y)
    classes, counts = np.unique(y_arr, return_counts=True)
    target = counts.max()
    picked = []
    for cls, count in zip(classes, counts):
        cls_idx = np.flatnonzero(y_arr == cls)
        if count < target:
            extra = rng.choice(cls_idx, size=target - count, replace=True)
            cls_idx = np.concatenate([cls_idx, extra])
        picked.append(cls_idx)
    order = rng.permutation(np.concatenate(picked))
    return X[order], y_arr[order]


try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    # Only a missing dependency selects the fallback; a genuine SMOTE failure
    # should surface instead of silently switching resampling method.
    print('imbalanced-learn not installed; using random oversampling instead of SMOTE')
    X_train_bal, y_train_bal = _random_oversample(X_train_t, y_train, random_state=42)
else:
    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train_t, y_train)


print('Balanced counts:', pd.Series(y_train_bal).value_counts().to_dict())

Balanced counts: {0: 10759, 1: 10759}


In [100]:
# Save artifacts
# Persist the fitted transformer and the (balanced train, untouched test) splits side by side.
preprocessor_path = os.path.join(OUTPUT_DIR, 'preprocessor.joblib')
splits_path = os.path.join(OUTPUT_DIR, 'data_splits.joblib')
data_splits = (X_train_bal, y_train_bal, X_test_t, y_test)

joblib.dump(preprocessor, preprocessor_path)
joblib.dump(data_splits, splits_path)
print('Saved preprocessor and data splits to', OUTPUT_DIR)

Saved preprocessor and data splits to C:\Users\HarshaSri\Desktop\IDS_PROJECT\data\processed
