In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [33]:
# import dataset
df = pd.read_csv("NetworkIntrusionDataset.csv")
df

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.000000,192.168.1.7,224.0.0.251,MDNS,152,Standard query 0x0295 PTR _%9E5E7C8F47989526C9...
1,2,0.246995,192.168.1.16,20.212.88.117,TCP,55,60904 > 443 [ACK] Seq=1 Ack=1 Win=258 Len=1 ...
2,3,0.358572,20.212.88.117,192.168.1.16,TCP,66,443 > 60904 [ACK] Seq=1 Ack=2 Win=251 Len=0 ...
3,4,0.946852,zte_59:ea:48,Intel_2a:2f:bf,ARP,42,Who has 192.168.1.16? Tell 192.168.1.1
4,5,0.946893,Intel_2a:2f:bf,zte_59:ea:48,ARP,42,192.168.1.16 is at 34:e6:ad:2a:2f:bf
...,...,...,...,...,...,...,...
20964,20965,303.573195,zte_59:ea:48,Intel_2a:2f:bf,ARP,42,192.168.1.1 is at 28:ff:3e:59:ea:48
20965,20966,310.286325,192.168.1.16,20.207.70.99,TLSv1.2,89,Application Data
20966,20967,310.365222,20.207.70.99,192.168.1.16,TLSv1.2,85,Application Data
20967,20968,310.408451,192.168.1.16,20.207.70.99,TCP,54,61333 > 443 [ACK] Seq=1635 Ack=6178 Win=6604...


In [34]:
# Fill missing values
df['Info'] = df['Info'].fillna('')

In [35]:
# dataset labelling function
def label_traffic(row):
	
    suspicious_protocols = {'ICMPv6', 'SSDP', 'MDNS', 'DHCP'}
    suspicious_keywords = {'malformed', 'error', 'unauthorized', 'crypto', 'cipher', 'spec', 'change', 'fin', 'syn', 'ping'}

    # 1 for suspicious protocols
    if row['Protocol'] in suspicious_protocols:
        return 1

    # 1 for suspicious keywords
    if any(keyword in str(row['Info']).lower() for keyword in suspicious_keywords):
        return 1

    # 0 for normal
    return 0

# labeling
df['Label'] = df.apply(label_traffic, axis=1)
df

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info,Label
0,1,0.000000,192.168.1.7,224.0.0.251,MDNS,152,Standard query 0x0295 PTR _%9E5E7C8F47989526C9...,1
1,2,0.246995,192.168.1.16,20.212.88.117,TCP,55,60904 > 443 [ACK] Seq=1 Ack=1 Win=258 Len=1 ...,0
2,3,0.358572,20.212.88.117,192.168.1.16,TCP,66,443 > 60904 [ACK] Seq=1 Ack=2 Win=251 Len=0 ...,0
3,4,0.946852,zte_59:ea:48,Intel_2a:2f:bf,ARP,42,Who has 192.168.1.16? Tell 192.168.1.1,0
4,5,0.946893,Intel_2a:2f:bf,zte_59:ea:48,ARP,42,192.168.1.16 is at 34:e6:ad:2a:2f:bf,0
...,...,...,...,...,...,...,...,...
20964,20965,303.573195,zte_59:ea:48,Intel_2a:2f:bf,ARP,42,192.168.1.1 is at 28:ff:3e:59:ea:48,0
20965,20966,310.286325,192.168.1.16,20.207.70.99,TLSv1.2,89,Application Data,0
20966,20967,310.365222,20.207.70.99,192.168.1.16,TLSv1.2,85,Application Data,0
20967,20968,310.408451,192.168.1.16,20.207.70.99,TCP,54,61333 > 443 [ACK] Seq=1635 Ack=6178 Win=6604...,0


In [36]:
print(df['Label'].value_counts())

Label
0    18947
1     2022
Name: count, dtype: int64


In [37]:
# Encode the 'Protocol' column
le = LabelEncoder()
df['Protocol'] = le.fit_transform(df['Protocol'])

# Standardize 'Time' and 'Length' columns
scaler = StandardScaler()
df[['Time', 'Length']] = scaler.fit_transform(df[['Time', 'Length']])

df

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info,Label
0,1,-1.126336,192.168.1.7,224.0.0.251,4,-0.824470,Standard query 0x0295 PTR _%9E5E7C8F47989526C9...,1
1,2,-1.123471,192.168.1.16,20.212.88.117,7,-0.977811,60904 > 443 [ACK] Seq=1 Ack=1 Win=258 Len=1 ...,0
2,3,-1.122177,20.212.88.117,192.168.1.16,7,-0.960422,443 > 60904 [ACK] Seq=1 Ack=2 Win=251 Len=0 ...,0
3,4,-1.115354,zte_59:ea:48,Intel_2a:2f:bf,0,-0.998362,Who has 192.168.1.16? Tell 192.168.1.1,0
4,5,-1.115354,Intel_2a:2f:bf,zte_59:ea:48,0,-0.998362,192.168.1.16 is at 34:e6:ad:2a:2f:bf,0
...,...,...,...,...,...,...,...,...
20964,20965,2.394372,zte_59:ea:48,Intel_2a:2f:bf,0,-0.998362,192.168.1.1 is at 28:ff:3e:59:ea:48,0
20965,20966,2.472228,192.168.1.16,20.207.70.99,8,-0.924063,Application Data,0
20966,20967,2.473143,20.207.70.99,192.168.1.16,8,-0.930386,Application Data,0
20967,20968,2.473644,192.168.1.16,20.207.70.99,7,-0.979392,61333 > 443 [ACK] Seq=1635 Ack=6178 Win=6604...,0


In [38]:
# Define features and target
X = df[['Time', 'Length', 'Protocol']]
y = df['Label']

In [39]:
# Split data into testing	and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

In [None]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(X_resampled.shape)
print(y_resampled.value_counts())

In [41]:
# Train CatBoost model
catboost_model = CatBoostClassifier(verbose=0)
catboost_model.fit(X_resampled, y_resampled)

<catboost.core.CatBoostClassifier at 0x218f10a1b80>

In [42]:
# Train LightGBM model
lgbm_model = LGBMClassifier()
lgbm_model.fit(X_resampled, y_resampled)

[LightGBM] [Info] Number of positive: 13263, number of negative: 13263
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001685 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 26526, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [44]:
# Train TabNet model
tabnet_model = TabNetClassifier(verbose=0)
tabnet_model.fit(X_resampled.values, y_resampled.values, max_epochs=10)



In [45]:
# Define Stacking Model with CatBoost and LightGBM
stacking_model = StackingClassifier(
    estimators=[('catboost', catboost_model), ('lgbm', lgbm_model)],
    final_estimator=LogisticRegression()
)
stacking_model.fit(X_resampled, y_resampled)

[LightGBM] [Info] Number of positive: 13263, number of negative: 13263
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 26526, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 10610, number of negative: 10610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 519
[LightGBM] [Info] Number of data points in the train set: 21220, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.00000

In [46]:
# Evaluation
models = {'CatBoost': catboost_model, 'LightGBM': lgbm_model, 'TabNet': tabnet_model, 'Stacking': stacking_model}

for name, model in models.items():
    y_pred = model.predict(X_test.values)
    print(f" Model: {name}")
    accuracy = accuracy_score(y_test, y_pred)
    print(f" {name} Accuracy: {accuracy:.4f}")
    print(f" Classification Report:\n{classification_report(y_test, y_pred)}") # includes precision, recall, f1-score
    y_proba = model.predict_proba(X_test.values)[:, 1]
    roc_auc_score(y_test, y_proba)

    print(f" ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    print(" " * 50)

 Model: CatBoost
 CatBoost Accuracy: 0.8943
 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.89      0.94      5684
           1       0.48      0.91      0.62       607

    accuracy                           0.89      6291
   macro avg       0.73      0.90      0.78      6291
weighted avg       0.94      0.89      0.91      6291

 ROC-AUC Score: 0.9619
                                                  
 Model: LightGBM
 LightGBM Accuracy: 0.8886
 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.89      0.93      5684
           1       0.46      0.90      0.61       607

    accuracy                           0.89      6291
   macro avg       0.72      0.90      0.77      6291
weighted avg       0.94      0.89      0.90      6291

 ROC-AUC Score: 0.9589
                                                  
 Model: TabNet
 TabNet Accuracy: 0.7617
 Classification Report

In [47]:
import joblib

# Save models
joblib.dump(catboost_model, 'catboost_model.pkl')
joblib.dump(lgbm_model, 'lgbm_model.pkl')
joblib.dump(stacking_model, 'stacking_model.pkl')

# Save preprocessing tools
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']