In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
total_data = pd.read_csv("./FinalDatasets/TotalDataset_Engineered.csv")

In [3]:
for col in ['Proto', 'Src IP Addr', 'Dst IP Addr']:
    total_data[col] = total_data[col].astype('category')

In [4]:
features = list(total_data.columns)
features.remove("Unnamed: 0")
features.remove("Date first seen")
features.remove("class")
features.remove("attackType")

# Dropping these columns for now
features.remove('Proto')
features.remove('Src IP Addr')
features.remove('Dst IP Addr')

print(features)

target = "class" 

['Duration', 'Src Pt', 'Dst Pt', 'Packets', 'Bytes', 'U', 'A', 'P', 'R', 'S', 'F', 'year', 'month', 'days', 'hours', 'minutes', 'seconds']


In [5]:
# Label Encoding our target variable column
le = LabelEncoder()
total_data[target] = le.fit_transform(total_data[target])

In [6]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'attacker': 0, 'normal': 1, 'victim': 2}


### I might want to merge 'attacker' and 'victim' class later to make it a simple binary case

In [7]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39643009 entries, 0 to 39643008
Data columns (total 24 columns):
 #   Column           Dtype   
---  ------           -----   
 0   Unnamed: 0       int64   
 1   Date first seen  object  
 2   Duration         float64 
 3   Proto            category
 4   Src IP Addr      category
 5   Src Pt           int64   
 6   Dst IP Addr      category
 7   Dst Pt           float64 
 8   Packets          int64   
 9   Bytes            int64   
 10  U                int64   
 11  A                int64   
 12  P                int64   
 13  R                int64   
 14  S                int64   
 15  F                int64   
 16  class            int64   
 17  attackType       object  
 18  year             int64   
 19  month            int64   
 20  days             int64   
 21  hours            int64   
 22  minutes          int64   
 23  seconds          float64 
dtypes: category(3), float64(3), int64(16), object(2)
memory usage: 6.7+ GB


# Model

In [8]:
xgb = XGBClassifier(objective='multi:softprob',
                   enable_categorical=True)

cat = CatBoostClassifier(loss_function='MultiClass', # MultiClass, MultiClassOneVsAll
                         eval_metric =  'Accuracy', # AUC
                         verbose=10,
                         depth = 5,
                         early_stopping_rounds=10,
                         # cat_features=[1, 2, 4]
                        )

lr = LogisticRegression()

# Training and Testing (3 classes)

In [10]:
X, y = total_data[features], total_data[target]
models = []
acc = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train, y_train)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold - {i+1}")
    print("----------------------")
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = CatBoostClassifier(loss_function='MultiClass', # MultiClass, MultiClassOneVsAll
                         eval_metric =  'Accuracy', # AUC
                         verbose=10,
                         depth = 5,
                         early_stopping_rounds=20,
                         # cat_features=[1, 2, 4]
                        )
    
    eval_set = [(X_valid, y_valid)]
    # Training 
    print("Training")
    print("----------------------")
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    # model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="merror", eval_set=eval_set, verbose=10)
    models.append(model)

    # Predict
    pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, pred)
    acc.append(accuracy)

    print("Fold - {} Accuracy: {:.2f}%".format(i+1, accuracy*100))
    print("----------------------")

print("Average accuracy: {:.2f}%".format(np.mean(acc)*100))

Fold - 1
----------------------
Training
----------------------
Learning rate set to 0.132847
0:	learn: 0.9413106	test: 0.7917912	best: 0.7917912 (0)	total: 5.53s	remaining: 1h 32m 7s
10:	learn: 0.9804483	test: 0.7823097	best: 0.7917912 (0)	total: 43.9s	remaining: 1h 5m 47s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.7917911632
bestIteration = 0

Shrink model to first 1 iterations.
Fold - 1 Accuracy: 79.18%
----------------------
Fold - 2
----------------------
Training
----------------------
Learning rate set to 0.132847
0:	learn: 0.9468745	test: 0.9390557	best: 0.9390557 (0)	total: 5.09s	remaining: 1h 24m 43s
10:	learn: 0.9753420	test: 0.9831597	best: 0.9831712 (5)	total: 47.7s	remaining: 1h 11m 25s
20:	learn: 0.9844000	test: 0.9839571	best: 0.9839571 (20)	total: 1m 28s	remaining: 1h 8m 26s
30:	learn: 0.9846177	test: 0.9841425	best: 0.9841425 (30)	total: 2m 10s	remaining: 1h 7m 47s
40:	learn: 0.9873536	test: 0.9842904	best: 0.9843304 (36)	total: 2m 51s	remaini

In [12]:
# Testing 

acc = []
    
for model in models:
    pred_new = model.predict(X_test)
    accuracy_new = accuracy_score(y_test, pred_new)
    acc.append(accuracy_new)

print("The average accuracy is: {:.2f}% on the testing dataset".format(np.mean(acc)*100) )

The average accuracy is: 93.70% on the testing dataset


# Training and Testing (2 classes)

In [9]:
# 1 - normal & 0 - attack

for i in tqdm(range(len(total_data))):
    if (total_data.iloc[i]['class']) != 1:
        total_data.at[i, 'class'] = 0

100%|████████████████████████████| 39643009/39643009 [19:30<00:00, 33867.52it/s]


In [10]:
total_data[target].value_counts()

class
1    33703458
0     5939551
Name: count, dtype: int64

In [12]:
X, y = total_data[features], total_data[target]
models = []
acc = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train, y_train)

X, y = X_train, y_train

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold - {i+1}")
    print("----------------------")
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = CatBoostClassifier(loss_function='MultiClass', # MultiClass, MultiClassOneVsAll
                         eval_metric =  'Accuracy', # AUC
                         verbose=10,
                         depth = 5,
                         early_stopping_rounds=10,
                         # cat_features=[1, 2, 4]
                        )
    
    eval_set = [(X_valid, y_valid)]
    # Training 
    print("Training")
    print("----------------------")
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    # model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="merror", eval_set=eval_set, verbose=10)
    models.append(model)

    # Predict
    pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, pred)
    acc.append(accuracy)

    print("Fold - {} Accuracy: {:.2f}%".format(i+1, accuracy*100))
    print("----------------------")

print("Average accuracy: {:.2f}%".format(np.mean(acc)*100))

Fold - 1
----------------------
Training
----------------------
Learning rate set to 0.132256
0:	learn: 0.9496127	test: 0.9495496	best: 0.9495496 (0)	total: 2.57s	remaining: 42m 44s
10:	learn: 0.9771610	test: 0.9772540	best: 0.9772540 (10)	total: 27.7s	remaining: 41m 26s
20:	learn: 0.9825737	test: 0.9826275	best: 0.9826275 (20)	total: 52.7s	remaining: 40m 58s
30:	learn: 0.9863377	test: 0.9864057	best: 0.9864057 (30)	total: 1m 15s	remaining: 39m 14s
40:	learn: 0.9893809	test: 0.9894229	best: 0.9895946 (39)	total: 1m 37s	remaining: 38m 8s
50:	learn: 0.9900350	test: 0.9900698	best: 0.9900698 (50)	total: 2m	remaining: 37m 18s
60:	learn: 0.9901103	test: 0.9901438	best: 0.9901866 (57)	total: 2m 21s	remaining: 36m 18s
70:	learn: 0.9907915	test: 0.9908119	best: 0.9908143 (65)	total: 2m 42s	remaining: 35m 21s
80:	learn: 0.9909643	test: 0.9909845	best: 0.9909964 (76)	total: 3m 3s	remaining: 34m 43s
90:	learn: 0.9915754	test: 0.9916067	best: 0.9917206 (87)	total: 3m 25s	remaining: 34m 17s
100:	le

In [13]:
# Testing 

acc = []
    
for model in models:
    pred_new = model.predict(X_test)
    accuracy_new = accuracy_score(y_test, pred_new)
    acc.append(accuracy_new)

print("The average accuracy is: {:.2f}% on the testing dataset".format(np.mean(acc)*100) )

The average accuracy is: 99.06% on the testing dataset
