In [45]:
import pandas as pd
import numpy as np

In [46]:
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
# Core ML & preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, Binarizer,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.naive_bayes import  BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report





Basic info about the dataset

Encode categorical columns if any

Fill or drop missing values

In [48]:
# Load your training and testing CSVs
train_df = pd.read_csv("UNSW_NB15_training-set.csv")
test_df = pd.read_csv("UNSW_NB15_testing-set.csv")


Identify categorical columns

In [49]:
# Separate
# features and target
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

In [50]:
# 1. Identify column types
categorical_cols = X_train.select_dtypes(include=['object']).columns
numeric_cols = X_train.columns.difference(categorical_cols)

In [51]:
# 2. Preprocessor for BernoulliNB
preprocessor = ColumnTransformer([
    ('num_scale', StandardScaler(), numeric_cols),              # scale numeric
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')


Preprocessor

Build the pipeline

In [52]:

# 3. Full pipeline: scale → transform → binarize → BernoulliNB
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('binarize', Binarizer(threshold=0.0)),     # convert all values to 0/1
    ('classifier', BernoulliNB(alpha=0.5))       # smoothing improves performance
])

Train and predict

In [53]:

# Train and predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


 models used in the paper: bagging, xgb,  decision tree, logistic regression , two nbs

In [54]:
# Evaluate
# performance
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [55]:
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")

Accuracy:  0.9838
Precision: 0.9840
Recall:    0.9838
F1 Score:  0.9838
