In [None]:
# Package Import

import numpy as np 
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import pickle
import time

In [None]:
# NOTE: This cell is disabled - its entire body is one bare triple-quoted string, so none of the
# code inside it executes when the cell runs.
# The quoted code reads ./FinalDatasets/TotalDataset_Engineered.csv, label-encodes the 'class'
# column and collapses every encoded value other than 1 to 0, label-encodes 'attackType', and
# writes the result to ./FinalDatasets/FinalDataset.csv (the file the next cell loads).
# NOTE(review): if this is ever re-enabled, the row-by-row loop over total_data will be slow on a
# large frame; a single masked assignment such as
#   total_data.loc[total_data['class'] != 1, 'class'] = 0
# is presumably equivalent when the frame has the default RangeIndex - TODO confirm.
# NOTE(review): to_csv is called without index=False, which presumably explains the "Unnamed: 0"
# column removed from the feature list - TODO confirm.
# NOTE(review): the heading "Date Engineering" looks like a typo for "Data Engineering".
"""
# Date Engineering

total_data = pd.read_csv("./FinalDatasets/TotalDataset_Engineered.csv")
for col in ['Proto', 'Src IP Addr', 'Dst IP Addr']:
    total_data[col] = total_data[col].astype('category')

# Removing redundant features

features = list(total_data.columns)
features.remove("Unnamed: 0")
features.remove("Date first seen")
features.remove("class")
features.remove("attackType")

# Dropping these columns for now
features.remove('Proto')
features.remove('Src IP Addr')
features.remove('Dst IP Addr')

target = 'class'

# Label Encoding our target variable column
le = LabelEncoder()
total_data[target] = le.fit_transform(total_data[target])

le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

# 1 - normal & 0 - attack
for i in tqdm(range(len(total_data))):
    if (total_data.iloc[i]['class']) != 1:
        total_data.at[i, 'class'] = 0

target = "attackType" 
# Label Encoding our target variable column
le = LabelEncoder()
total_data[target] = le.fit_transform(total_data[target])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

total_data.to_csv("./FinalDatasets/FinalDataset.csv")
"""

In [None]:
# Load the label-encoded dataset from disk into a DataFrame.
total_data = pd.read_csv(filepath_or_buffer="./FinalDatasets/FinalDataset.csv")

# Pipelining

In [None]:
class FullModel:
    """Two-stage prediction pipeline over a pandas DataFrame.

    Stage 1 is a binary model and stage 2 a multiclass model; both are applied
    to the same feature columns. Rows that stage 1 labels ``1`` receive
    ``normal_label`` as their final prediction, every other row receives its
    stage-2 prediction.

    Per the original author's notes, stage 1 encodes
    {'anomaly': 0, 'normal': 1} and label 7 is what the attackType encoder
    assigned to "no attack" - verify both against your own label encoders.
    """

    # Default final label for rows stage 1 marks as normal; __init__ can
    # override it per instance.
    normal_label = 7

    def __init__(self, 
                data, 
                model_1, 
                model_2,
                features = ['Duration', 'Src Pt', 'Dst Pt', 'Packets', 'Bytes', 'U', 'A', 'P', 'R', 'S', 'F', 'year', 'month', 'days', 'hours', 'minutes', 'seconds'],
                normal_label = 7
                ):
        """
        Parameters
        ----------
        data : pandas DataFrame
            Data containing rows of data you wish to predict on. It is kept by
            reference and ``run`` adds columns to it in place.
        model_1 : filepath
            Pickle file of the model predicting binary (normal vs anomaly)
        model_2 : filepath
            Pickle file of the model predicting multiclass
        features: list
            List of the feature columns we wish to use
        normal_label : object, default 7
            Final prediction assigned to rows stage 1 labels as normal (1).

        Raises
        ------
        ValueError
            If any entry of ``features`` is not a column of ``data``.
        """
        self.data = data
        # Copy so neither the shared default list nor the caller's list is aliased.
        self.features = list(features)
        self.normal_label = normal_label

        # Validate the cheap thing first so a bad feature list fails before
        # the (potentially large) model files are read.
        missing = [feature for feature in self.features if feature not in self.data.columns]
        if missing:
            raise ValueError("Check feature list again - missing columns: {}".format(missing))

        self.model_1 = self._load_model(model_1)
        self.model_2 = self._load_model(model_2)

    @staticmethod
    def _load_model(path):
        """Unpickle a model from ``path`` and close the file afterwards."""
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file - only point this at model files you trust.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def _predict_stage(self, model, stage_name):
        """Predict with ``model`` on the feature columns.

        Best-effort: on failure the cause is printed and 0 is returned, which
        the caller broadcasts to every row of the stage column.
        """
        try:
            return model.predict(self.data[self.features])
        except Exception as exc:  # not a bare except: let Ctrl-C / SystemExit through
            print("Error in {} prediction:".format(stage_name), repr(exc))
            return 0

    def _stage_1_predict(self): # {'anomaly': 0, 'normal': 1}
        """Return the binary stage-1 predictions (or 0 if prediction failed)."""
        return self._predict_stage(self.model_1, "stage 1")

    def _stage_2_predict(self):
        """Return the multiclass stage-2 predictions (or 0 if prediction failed)."""
        return self._predict_stage(self.model_2, "stage 2")

    def _obtain_final_res(self):
        """Combine the 'stage_1' and 'stage_2' columns into one label per row.

        Returns
        -------
        list
            ``normal_label`` where stage_1 == 1, otherwise the row's stage_2 value.
        """
        # Keep stage-2 values where an anomaly was detected (stage_1 != 1) and
        # substitute normal_label elsewhere. The mask is passed as a plain
        # array so the selection is purely positional, like the original loop.
        is_anomaly = (self.data['stage_1'] != 1).to_numpy()
        return self.data['stage_2'].where(is_anomaly, self.normal_label).tolist()

    def run(self):
        """Run both stages and attach the results to the DataFrame.

        Adds the columns 'stage_1', 'stage_2' and 'final pred' to ``self.data``
        in place, prints the elapsed time in minutes, and returns that same
        DataFrame object.
        """
        start_time = time.perf_counter()
        
        # Stage 1 prediction
        self.data['stage_1'] = self._stage_1_predict()

        # Stage 2 prediction
        self.data['stage_2'] = self._stage_2_predict()

        # Obtaining final pred
        self.data['final pred'] = self._obtain_final_res()
        
        print("Program took", (time.perf_counter() - start_time)/60 , "minutes to run")
        
        return self.data

In [None]:
# Build the two-stage pipeline over the loaded dataset from the pickled
# stage-1 (binary) and stage-2 (multiclass) model files.
algo = FullModel(
    data=total_data,
    model_1='binary_xgb_1.pkl',
    model_2='multiclass_xgb_1.pkl',
)

In [None]:
# Run both prediction stages. run() adds the 'stage_1', 'stage_2' and 'final pred' columns to the
# DataFrame the pipeline was built with and returns that same object, so new_data and total_data
# refer to one and the same frame.
new_data = algo.run()

In [None]:
# Preview the first five rows, including the newly added prediction columns.
new_data.head(n=5)

In [None]:
# Percentage of rows whose combined two-stage prediction equals the attackType label.
# NOTE(review): "biased" presumably because this is scored on data the models have
# already seen - TODO confirm.
biased_accuracy = (
    int(new_data['attackType'].eq(new_data['final pred']).sum())
    / len(new_data)
    * 100
)

In [None]:
# Display the accuracy percentage computed in the previous cell.
biased_accuracy