In [12]:

#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns


#sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#imblearn
from imblearn.over_sampling import SMOTE


In [13]:
#Load Data
train_df = pd.read_csv("UNSW_NB15_training-set.csv")
test_df = pd.read_csv("UNSW_NB15_testing-set.csv")

In [14]:
#Encode Categorical Columns
cat_cols = train_df.select_dtypes(include=['object']).columns.tolist()

for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))


#Split Features/Labels
X_train = train_df.drop("label", axis=1)
y_train = train_df["label"]

X_test = test_df.drop("label", axis=1)
y_test = test_df["label"]

In [15]:
#Outlier Removal using Isolation Forest
iso = IsolationForest(contamination=0.02, random_state=42)
mask = iso.fit_predict(X_train) == 1

X_train = X_train[mask].reset_index(drop=True)
y_train = y_train[mask].reset_index(drop=True)

print("After outlier removal:", X_train.shape)

After outlier removal: (171834, 44)


In [16]:
#SMOTE Oversampling
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

X_train, y_train = X_train_res, y_train_res
print("After SMOTE:", X_train.shape, y_train.value_counts().to_dict())

After SMOTE: (235406, 44) {0: 117703, 1: 117703}


In [17]:
#Preprocessing (Standard Scaling)
numeric_cols = X_train.columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols)
], remainder='passthrough')


#PCA (Efficiency Gain - 95% Variance)
pca = PCA(n_components=0.95, random_state=42)

In [18]:
#Logistic Regression Model
log_reg = LogisticRegression(
    max_iter=500,
    solver='lbfgs',
    n_jobs=-1
)

#Pipeline
logreg_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('pca', pca),
    ('classifier', log_reg)
])


#Train the model
logreg_pipeline.fit(X_train, y_train)


#Predict
y_pred_lr = logreg_pipeline.predict(X_test)

In [20]:
#Evaluation Metrics
acc_lr  = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr, average='weighted', zero_division=0)
rec_lr  = recall_score(y_test, y_pred_lr, average='weighted', zero_division=0)
f1_lr   = f1_score(y_test, y_pred_lr, average='weighted', zero_division=0)

print("\nLogistic Regression with PCA:\n")
print("Accuracy: ", round(acc_lr, 4))
print("Precision:", round(prec_lr, 4))
print("Recall:   ", round(rec_lr, 4))
print("F1 Score: ", round(f1_lr, 4))


Logistic Regression with PCA:

Accuracy:  0.8182
Precision: 0.8263
Recall:    0.8182
F1 Score:  0.8186
