In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm

In [2]:
total_data = pd.read_csv("./FinalDatasets/TotalDataset_Engineered.csv")

In [3]:
total_data.head(5)

Unnamed: 0.1,Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,...,S,F,class,attackType,year,month,days,hours,minutes,seconds
0,0,2017-03-15 00:01:16.632,0.0,TCP,192.168.100.5,445,192.168.220.16,58844.0,1,108,...,0,0,normal,none,2017,3,15,0,1,16.632
1,1,2017-03-15 00:01:16.552,0.0,TCP,192.168.100.5,445,192.168.220.15,48888.0,1,108,...,0,0,normal,none,2017,3,15,0,1,16.552
2,2,2017-03-15 00:01:16.551,0.004,TCP,192.168.220.15,48888,192.168.100.5,445.0,2,174,...,0,0,normal,none,2017,3,15,0,1,16.551
3,3,2017-03-15 00:01:16.631,0.004,TCP,192.168.220.16,58844,192.168.100.5,445.0,2,174,...,0,0,normal,none,2017,3,15,0,1,16.631
4,4,2017-03-15 00:01:17.432,0.0,TCP,192.168.220.9,37884,192.168.100.5,445.0,1,66,...,0,0,normal,none,2017,3,15,0,1,17.432


In [4]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39643009 entries, 0 to 39643008
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Unnamed: 0       int64  
 1   Date first seen  object 
 2   Duration         float64
 3   Proto            object 
 4   Src IP Addr      object 
 5   Src Pt           int64  
 6   Dst IP Addr      object 
 7   Dst Pt           float64
 8   Packets          int64  
 9   Bytes            int64  
 10  U                int64  
 11  A                int64  
 12  P                int64  
 13  R                int64  
 14  S                int64  
 15  F                int64  
 16  class            object 
 17  attackType       object 
 18  year             int64  
 19  month            int64  
 20  days             int64  
 21  hours            int64  
 22  minutes          int64  
 23  seconds          float64
dtypes: float64(3), int64(15), object(6)
memory usage: 7.1+ GB


In [5]:
for col in ['Proto', 'Src IP Addr', 'Dst IP Addr']:
    total_data[col] = total_data[col].astype('category')

#### One concern I have with some of these features is that the featuers with IP Addresses I believe should be removed. 
#### Because what happens if we test on new data and that data contains an IP address that the model has never seen before and we are feeding the IP Address as a categorical column. 
#### So either we modify the data/model to take in the IP Address as a varying non-categorical value OR we drop the IP Addresses feature totally.

# Prepping for modeling

In [6]:
features = list(total_data.columns)
features.remove("Unnamed: 0")
features.remove("Date first seen")
features.remove("class")
features.remove("attackType")

features.remove('Proto')
features.remove('Src IP Addr')
features.remove('Dst IP Addr')

print(features)

['Duration', 'Src Pt', 'Dst Pt', 'Packets', 'Bytes', 'U', 'A', 'P', 'R', 'S', 'F', 'year', 'month', 'days', 'hours', 'minutes', 'seconds']


In [7]:
from sklearn.preprocessing import LabelEncoder
# Label Encoding our target variable column
le = LabelEncoder()
total_data['attackType'] = le.fit_transform(total_data['attackType'])

In [8]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'blasterWorm': 0, 'bruteForce': 1, 'dos': 2, 'fragmentation': 3, 'httpFlood': 4, 'icmpFlood': 5, 'landAttack': 6, 'none': 7, 'pingScan': 8, 'portScan': 9, 'reaperWorm': 10, 'redWorm': 11, 'scanning': 12, 'smurf': 13, 'spam': 14, 'synFlood': 15, 'udpFlood': 16}


In [9]:
target = "attackType" # This is going to be a multiclass classification task 

#### Since I will be using CatBoostClassifier first, we won't need to undergo any feature normalization of sorts as it is a tree based model.
#### However, there are some articles and papers out there mentioning since this is a gradient boosting model, we would still need to normalize data. For now, I will just proceed on but would be a good attempt to try and normalize the data in the future and observe if there are any meaningful changes to the result.

# Splitting data up

In [10]:
class Split():
    
    def __init__(self, num = 5):  # num refers to the number of datasets you wanna split the original total dataset into
        self.total_data = total_data
        self.target = le.classes_.tolist()
        self._0 = []
        self._1 = []
        self._2 = []
        self._3 = []
        self._4 = []
        self._5 = []
        self._6 = []
        self._7 = []
        self._8 = []
        self._9 = []
        self._10 = []
        self._11 = []
        self._12 = []
        self._13 = []
        self._14 = []
        self._15 = []
        self._16 = []
        self.cv = num

    def generate(self):
        self._split()

        res = []
        
        for i in tqdm(range(self.cv)):
            if i != self.cv-1:
                tmp = self._0[i*(len(self._0)//self.cv):(i+1)*(len(self._0)//self.cv)] + self._1[i*(len(self._1)//self.cv):(i+1)*(len(self._1)//self.cv)] + self._2[i*(len(self._2)//self.cv):(i+1)*(len(self._2)//self.cv)] + self._3[i*(len(self._3)//self.cv):(i+1)*(len(self._3)//self.cv)] + self._4[i*(len(self._4)//self.cv):(i+1)*(len(self._4)//self.cv)] + self._5[i*(len(self._5)//self.cv):(i+1)*(len(self._5)//self.cv)] + self._6[i*(len(self._6)//self.cv):(i+1)*(len(self._6)//self.cv)] + self._7[i*(len(self._7)//self.cv):(i+1)*(len(self._7)//self.cv)] + self._8[i*(len(self._8)//self.cv):(i+1)*(len(self._8)//self.cv)] + self._9[i*(len(self._9)//self.cv):(i+1)*(len(self._9)//self.cv)] + self._10[i*(len(self._10)//self.cv):(i+1)*(len(self._10)//self.cv)] + self._11[i*(len(self._11)//self.cv):(i+1)*(len(self._11)//self.cv)] + self._12[i*(len(self._12)//self.cv):(i+1)*(len(self._12)//self.cv)] + self._13[i*(len(self._13)//self.cv):(i+1)*(len(self._13)//self.cv)] + self._14[i*(len(self._14)//self.cv):(i+1)*(len(self._14)//self.cv)] + self._15[i*(len(self._15)//self.cv):(i+1)*(len(self._15)//self.cv)] + self._16[i*(len(self._16)//self.cv):(i+1)*(len(self._16)//self.cv)]
            elif i == self.cv-1:
                tmp = self._0[i*(len(self._0)//self.cv):-1] + self._1[i*(len(self._1)//self.cv):-1] + self._2[i*(len(self._2)//self.cv):-1] + self._3[i*(len(self._3)//self.cv):-1] + self._4[i*(len(self._4)//self.cv):-1] + self._5[i*(len(self._5)//self.cv):-1] + self._6[i*(len(self._6)//self.cv):-1] + self._7[i*(len(self._7)//self.cv):-1] + self._8[i*(len(self._8)//self.cv):-1] + self._9[i*(len(self._9)//self.cv):-1] + self._10[i*(len(self._10)//self.cv):-1] + self._11[i*(len(self._11)//self.cv):-1] + self._12[i*(len(self._12)//self.cv):-1] + self._13[i*(len(self._13)//self.cv):-1] + self._14[i*(len(self._14)//self.cv):-1] + self._15[i*(len(self._15)//self.cv):-1] + self._16[i*(len(self._16)//self.cv):-1]
            res.append(tmp)

        return res # Returns arrays of indices
        
    def _split(self):
        length = len(self.total_data)
        print("Splitting...")
        for i in tqdm(range(length)):
            type = self.total_data.iloc[i].attackType
            match type:
                case 0:
                    self._0.append(i)
                case 1:
                    self._1.append(i)
                case 2:
                    self._2.append(i)
                case 3:
                    self._3.append(i)
                case 4:
                    self._4.append(i)
                case 5: 
                    self._5.append(i)
                case 6:
                    self._6.append(i)
                case 7:
                    self._7.append(i)
                case 8:
                    self._8.append(i)
                case 9:
                    self._9.append(i)
                case 10:
                    self._10.append(i)
                case 11:
                    self._11.append(i)
                case 12:
                    self._12.append(i)
                case 13:
                    self._13.append(i)
                case 14:
                    self._14.append(i)
                case 15:
                    self._15.append(i)
                case 16:
                    self._16.append(i)

        print("Splitting done!")

In [11]:
split = Split(num = 10) # num refers to the number of datasets you wanna split the original total dataset into
split_data = split.generate()

Splitting...


100%|████████████████████████████| 39643009/39643009 [20:15<00:00, 32610.14it/s]


Splitting done!


100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  5.34it/s]


In [12]:
X = total_data.iloc[split_data[0]][features]
y = total_data.iloc[split_data[0]][target]

# Model

In [40]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

xgb = XGBClassifier(objective='multi:softprob',
                   enable_categorical=True)

cat = CatBoostClassifier(loss_function='MultiClass',
                         eval_metric =  'Accuracy', 
                         verbose=10,
                         early_stopping_rounds=10,
                         # cat_features=[1, 2, 4]
                        )

lr = LogisticRegression()

In [41]:
from sklearn.model_selection import StratifiedKFold
import time

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)
acc = []

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    start_time = time.time()
    print(f"\nFold-{i+1}")
    print("----------------------")
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train
    print("Training...")
    print("----------------------")
    cat.fit(X_train, y_train, eval_set=(X_test, y_test))
    print("Training done!")
    print("----------------------")

    # Predict
    pred = cat.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    acc.append(accuracy)
    # auc = roc_auc_score(y_test, cat.predict_proba(X_test)[:, 1], multi_class='ovr')

    # Printing
    print("Accuracy: {:.2f}%".format(accuracy*100))
    # print("AUC: {:.2f}".format(auc))
    print("Time taken: {:.0f}s".format(time.time()-start_time))
    print("----------------------")

print("----------------------")
print("Average accuracy: {:.2f}%".format(np.mean(acc)*100))


Fold-1
----------------------
Training...
----------------------
Learning rate set to 0.126868
0:	learn: 0.9914433	test: 0.9922180	best: 0.9922180 (0)	total: 3.36s	remaining: 55m 55s
10:	learn: 0.9958883	test: 0.9938854	best: 0.9949966 (8)	total: 32.4s	remaining: 48m 31s
20:	learn: 0.9962039	test: 0.9955553	best: 0.9955553 (20)	total: 1m 1s	remaining: 48m
30:	learn: 0.9970887	test: 0.9965404	best: 0.9965404 (30)	total: 1m 34s	remaining: 49m 1s
40:	learn: 0.9973987	test: 0.9935310	best: 0.9965404 (30)	total: 2m 7s	remaining: 49m 34s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.9965403685
bestIteration = 30

Shrink model to first 31 iterations.
Training done!
----------------------
Accuracy: 99.65%
Time taken: 128s
----------------------

Fold-2
----------------------
Training...
----------------------
Learning rate set to 0.126868
0:	learn: 0.9915007	test: 0.9919885	best: 0.9919885 (0)	total: 4.05s	remaining: 1h 7m 25s
10:	learn: 0.9964281	test: 0.5668852	best: 0

# Testing on a different subset of dataset (never seen before by the model)

In [46]:
X_new = total_data.iloc[split_data[1]][features]
y_new = total_data.iloc[split_data[1]][target]

pred_new = cat.predict(X_new)
accuracy_new = accuracy_score(y_new, pred_new)

print("Accuracy: {:.2f}%".format(accuracy_new*100))

Accuracy: 69.22%
