In [26]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold


In [3]:
total_data = pd.read_csv("./FinalDatasets/TotalDataset_Engineered.csv")

In [4]:
total_data.head(5)

Unnamed: 0.1,Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,...,S,F,class,attackType,year,month,days,hours,minutes,seconds
0,0,2017-03-15 00:01:16.632,0.0,TCP,192.168.100.5,445,192.168.220.16,58844.0,1,108,...,0,0,normal,none,2017,3,15,0,1,16.632
1,1,2017-03-15 00:01:16.552,0.0,TCP,192.168.100.5,445,192.168.220.15,48888.0,1,108,...,0,0,normal,none,2017,3,15,0,1,16.552
2,2,2017-03-15 00:01:16.551,0.004,TCP,192.168.220.15,48888,192.168.100.5,445.0,2,174,...,0,0,normal,none,2017,3,15,0,1,16.551
3,3,2017-03-15 00:01:16.631,0.004,TCP,192.168.220.16,58844,192.168.100.5,445.0,2,174,...,0,0,normal,none,2017,3,15,0,1,16.631
4,4,2017-03-15 00:01:17.432,0.0,TCP,192.168.220.9,37884,192.168.100.5,445.0,1,66,...,0,0,normal,none,2017,3,15,0,1,17.432


In [5]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39643009 entries, 0 to 39643008
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Unnamed: 0       int64  
 1   Date first seen  object 
 2   Duration         float64
 3   Proto            object 
 4   Src IP Addr      object 
 5   Src Pt           int64  
 6   Dst IP Addr      object 
 7   Dst Pt           float64
 8   Packets          int64  
 9   Bytes            int64  
 10  U                int64  
 11  A                int64  
 12  P                int64  
 13  R                int64  
 14  S                int64  
 15  F                int64  
 16  class            object 
 17  attackType       object 
 18  year             int64  
 19  month            int64  
 20  days             int64  
 21  hours            int64  
 22  minutes          int64  
 23  seconds          float64
dtypes: float64(3), int64(15), object(6)
memory usage: 7.1+ GB


In [6]:
# Removing none type attackType

total_data = total_data[total_data.attackType != "none"]

In [7]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5939551 entries, 14 to 39643001
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Unnamed: 0       int64  
 1   Date first seen  object 
 2   Duration         float64
 3   Proto            object 
 4   Src IP Addr      object 
 5   Src Pt           int64  
 6   Dst IP Addr      object 
 7   Dst Pt           float64
 8   Packets          int64  
 9   Bytes            int64  
 10  U                int64  
 11  A                int64  
 12  P                int64  
 13  R                int64  
 14  S                int64  
 15  F                int64  
 16  class            object 
 17  attackType       object 
 18  year             int64  
 19  month            int64  
 20  days             int64  
 21  hours            int64  
 22  minutes          int64  
 23  seconds          float64
dtypes: float64(3), int64(15), object(6)
memory usage: 1.1+ GB


In [8]:
for col in ['Proto', 'Src IP Addr', 'Dst IP Addr']:
    total_data[col] = total_data[col].astype('category')

#### One concern I have with some of these features is that the featuers with IP Addresses I believe should be removed. 
#### Because what happens if we test on new data and that data contains an IP address that the model has never seen before and we are feeding the IP Address as a categorical column. 
#### So either we modify the data/model to take in the IP Address as a varying non-categorical value OR we drop the IP Addresses feature totally.

# Prepping for modeling

In [9]:
features = list(total_data.columns)
features.remove("Unnamed: 0")
features.remove("Date first seen")
features.remove("class")
features.remove("attackType")

# Dropping these columns for now
features.remove('Proto')
features.remove('Src IP Addr')
features.remove('Dst IP Addr')

print(features)

['Duration', 'Src Pt', 'Dst Pt', 'Packets', 'Bytes', 'U', 'A', 'P', 'R', 'S', 'F', 'year', 'month', 'days', 'hours', 'minutes', 'seconds']


In [10]:
from sklearn.preprocessing import LabelEncoder
# Label Encoding our target variable column
le = LabelEncoder()
total_data['attackType'] = le.fit_transform(total_data['attackType'])

In [11]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'blasterWorm': 0, 'bruteForce': 1, 'dos': 2, 'fragmentation': 3, 'httpFlood': 4, 'icmpFlood': 5, 'landAttack': 6, 'pingScan': 7, 'portScan': 8, 'reaperWorm': 9, 'redWorm': 10, 'scanning': 11, 'smurf': 12, 'spam': 13, 'synFlood': 14, 'udpFlood': 15}


In [12]:
target = "attackType" # This is going to be a multiclass classification task 

#### Since I will be using CatBoostClassifier first, we won't need to undergo any feature normalization of sorts as it is a tree based model.
#### However, there are some articles and papers out there mentioning since this is a gradient boosting model, we would still need to normalize data. For now, I will just proceed on but would be a good attempt to try and normalize the data in the future and observe if there are any meaningful changes to the result.

# Splitting data up

In [13]:
class Split():
    
    def __init__(self, num = 5):  # num refers to the number of datasets you wanna split the original total dataset into
        self.total_data = total_data
        self.target = le.classes_.tolist()
        self._0 = []
        self._1 = []
        self._2 = []
        self._3 = []
        self._4 = []
        self._5 = []
        self._6 = []
        self._7 = []
        self._8 = []
        self._9 = []
        self._10 = []
        self._11 = []
        self._12 = []
        self._13 = []
        self._14 = []
        self._15 = []
        self._16 = []
        self.cv = num

    def generate(self):
        self._split()

        res = []
        
        for i in tqdm(range(self.cv)):
            if i != self.cv-1:
                tmp = self._0[i*(len(self._0)//self.cv):(i+1)*(len(self._0)//self.cv)] + self._1[i*(len(self._1)//self.cv):(i+1)*(len(self._1)//self.cv)] + self._2[i*(len(self._2)//self.cv):(i+1)*(len(self._2)//self.cv)] + self._3[i*(len(self._3)//self.cv):(i+1)*(len(self._3)//self.cv)] + self._4[i*(len(self._4)//self.cv):(i+1)*(len(self._4)//self.cv)] + self._5[i*(len(self._5)//self.cv):(i+1)*(len(self._5)//self.cv)] + self._6[i*(len(self._6)//self.cv):(i+1)*(len(self._6)//self.cv)] + self._7[i*(len(self._7)//self.cv):(i+1)*(len(self._7)//self.cv)] + self._8[i*(len(self._8)//self.cv):(i+1)*(len(self._8)//self.cv)] + self._9[i*(len(self._9)//self.cv):(i+1)*(len(self._9)//self.cv)] + self._10[i*(len(self._10)//self.cv):(i+1)*(len(self._10)//self.cv)] + self._11[i*(len(self._11)//self.cv):(i+1)*(len(self._11)//self.cv)] + self._12[i*(len(self._12)//self.cv):(i+1)*(len(self._12)//self.cv)] + self._13[i*(len(self._13)//self.cv):(i+1)*(len(self._13)//self.cv)] + self._14[i*(len(self._14)//self.cv):(i+1)*(len(self._14)//self.cv)] + self._15[i*(len(self._15)//self.cv):(i+1)*(len(self._15)//self.cv)] + self._16[i*(len(self._16)//self.cv):(i+1)*(len(self._16)//self.cv)]
            elif i == self.cv-1:
                tmp = self._0[i*(len(self._0)//self.cv):-1] + self._1[i*(len(self._1)//self.cv):-1] + self._2[i*(len(self._2)//self.cv):-1] + self._3[i*(len(self._3)//self.cv):-1] + self._4[i*(len(self._4)//self.cv):-1] + self._5[i*(len(self._5)//self.cv):-1] + self._6[i*(len(self._6)//self.cv):-1] + self._7[i*(len(self._7)//self.cv):-1] + self._8[i*(len(self._8)//self.cv):-1] + self._9[i*(len(self._9)//self.cv):-1] + self._10[i*(len(self._10)//self.cv):-1] + self._11[i*(len(self._11)//self.cv):-1] + self._12[i*(len(self._12)//self.cv):-1] + self._13[i*(len(self._13)//self.cv):-1] + self._14[i*(len(self._14)//self.cv):-1] + self._15[i*(len(self._15)//self.cv):-1] + self._16[i*(len(self._16)//self.cv):-1]
            res.append(tmp)

        return res # Returns arrays of indices
        
    def _split(self):
        length = len(self.total_data)
        print("Splitting...")
        for i in tqdm(range(length)):
            ttype = self.total_data.iloc[i].attackType
            match ttype:
                case 0:
                    self._0.append(i)
                case 1:
                    self._1.append(i)
                case 2:
                    self._2.append(i)
                case 3:
                    self._3.append(i)
                case 4:
                    self._4.append(i)
                case 5: 
                    self._5.append(i)
                case 6:
                    self._6.append(i)
                case 7:
                    self._7.append(i)
                case 8:
                    self._8.append(i)
                case 9:
                    self._9.append(i)
                case 10:
                    self._10.append(i)
                case 11:
                    self._11.append(i)
                case 12:
                    self._12.append(i)
                case 13:
                    self._13.append(i)
                case 14:
                    self._14.append(i)
                case 15:
                    self._15.append(i)
                case 16:
                    self._16.append(i)

        print("Splitting done!")

In [14]:
subset_count = 5
split = Split(num = subset_count) # num refers to the number of datasets you wanna split the original total dataset into
split_data = split.generate()

Splitting...


100%|██████████| 5939551/5939551 [03:01<00:00, 32663.46it/s]


Splitting done!


100%|██████████| 5/5 [00:00<00:00, 28.33it/s]


# Model

In [15]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Take the models from here and insert it below

xgb = XGBClassifier(objective='multi:softprob',
                   enable_categorical=True)

cat = CatBoostClassifier(loss_function='MultiClass', # MultiClass, MultiClassOneVsAll
                         eval_metric =  'Accuracy', # AUC
                         verbose=10,
                         depth = 5,
                         early_stopping_rounds=10,
                         # cat_features=[1, 2, 4]
                        )

lr = LogisticRegression()

# Training

In [33]:
train_validation_data = split_data[0] + split_data[1] + split_data[2] + split_data[3]
test_data = split_data[-1]

In [45]:
X, y = total_data.iloc[train_validation_data][features], total_data.iloc[train_validation_data][target]
models = []
acc = []

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold - {i+1}")
    print("----------------------")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = CatBoostClassifier(loss_function='MultiClass', # MultiClass, MultiClassOneVsAll
                         eval_metric =  'Accuracy', # AUC
                         verbose=10,
                         depth = 5,
                         early_stopping_rounds=10,
                         # cat_features=[1, 2, 4]
                        )
    
    eval_set = [(X_test, y_test)]
    # Training 
    print("Training")
    print("----------------------")
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    # model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="merror", eval_set=eval_set, verbose=10)
    models.append(model)

    # Predict
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    acc.append(accuracy)

    print("Fold - {} Accuracy: {:.2f}%".format(i+1, accuracy*100))
    print("----------------------")

print("Average accuracy: {:.2f}%".format(np.mean(acc)*100))

Fold - 1
----------------------
Training
----------------------
Learning rate set to 0.127329
0:	learn: 0.9776758	test: 0.9774835	best: 0.9774835 (0)	total: 2.3s	remaining: 38m 19s
10:	learn: 0.9847252	test: 0.9845979	best: 0.9846379 (7)	total: 20.1s	remaining: 30m 11s
20:	learn: 0.9867629	test: 0.9859185	best: 0.9859185 (19)	total: 36.7s	remaining: 28m 32s
30:	learn: 0.9871178	test: 0.9862152	best: 0.9862184 (26)	total: 53s	remaining: 27m 35s
40:	learn: 0.9874669	test: 0.9863373	best: 0.9863373 (38)	total: 1m 7s	remaining: 26m 24s
50:	learn: 0.9877544	test: 0.9863583	best: 0.9863583 (50)	total: 1m 21s	remaining: 25m 17s
60:	learn: 0.9878789	test: 0.9864078	best: 0.9864099 (56)	total: 1m 35s	remaining: 24m 33s
70:	learn: 0.9879144	test: 0.9864352	best: 0.9864457 (65)	total: 1m 49s	remaining: 23m 57s
80:	learn: 0.9879267	test: 0.9864688	best: 0.9864836 (77)	total: 2m 4s	remaining: 23m 27s
90:	learn: 0.9879559	test: 0.9862826	best: 0.9865772 (83)	total: 2m 17s	remaining: 22m 53s
Stopped 

In [46]:
# Testing 

X, y = total_data.iloc[test_data][features], total_data.iloc[test_data][target]

acc = []
    
for model in models:
    pred_new = model.predict(X)
    accuracy_new = accuracy_score(y, pred_new)
    acc.append(accuracy_new)

print("The average accuracy is: {:.2f}% on the testing dataset".format(np.mean(acc)*100) )

The average accuracy is: 98.02%
