## Case Study 1 : Final Submission
<br>
<li> This notebook has two functions. function1 takes one or more inputs and predicts the output for those points. </li>
<li> function2 takes X and Y as inputs in numpy array format and gives AUC score for prediction with the pretrained model. </li>
<li> Ensemble model with 10 base learners was used since this was the most efficient model to detect attacks. Base learners are Decision tree models. </li>
<li> Each base learner is trained on samples from huge data with 1.5 million datapoints. </li>
<li> pickle file with all model and preprocessing details was generated wtih the last cell in Model Testing Notebook </li>

In [1]:
# importing required Libraries

import pickle
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy.sparse import hstack
import pandas as pd
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

### 1. Defining function1 and function2

#### 1.1 "function1"

In [2]:
def function1(X):
    '''
        1.  This function takes input in numpy array format and predicts
            whether the input datapoint is an attack or not.
        2.  The input can be a single point or an array of points.
    '''
    
    # Loading all saved models, scalar objects and encoder objects
    # All these objects were saved in a list structured as follows:
    #    [ref_dict,[encoder,scalar],[base_learners]]
    
    all_saved_data = pickle.load(open("all_saved_data.pkl","rb"))
    ref_dict = all_saved_data[0]      # Loading ref_dict which has column index for each feature
    preprocessing = all_saved_data[1] # loading encoder and scalar objects
    sig_clf = all_saved_data[2]       # loading 10 base learners
    
    
    def predict_point(test_point):
        '''
            This function performs all preprocessing and predicts output for a single point
        '''
        
        # encoding Categorical Features
        proto_test = preprocessing[0]['proto'].transform(
            np.array(test_point[ref_dict['proto']],dtype='object').reshape(-1,1))


        state_test = preprocessing[0]['state'].transform(
            np.array(test_point[ref_dict['state']],dtype='object').reshape(-1,1))


        service_test = preprocessing[0]['service'].transform(
            np.array(test_point[ref_dict['service']],dtype='object').reshape(-1,1))


        is_sm_ips_ports_test = preprocessing[0]['is_sm_ips_ports'].transform(
            np.array(test_point[ref_dict['is_sm_ips_ports']],dtype='object').reshape(-1,1))


        ct_state_ttl_test = preprocessing[0]['ct_state_ttl'].transform(
            np.array(test_point[ref_dict['ct_state_ttl']],dtype='object').reshape(-1,1))


        # sclaling numerical features

        sport_test = preprocessing[1]['sport'].transform(
            np.array(test_point[ref_dict['sport']],dtype='object').reshape(-1,1))

        dsport_test = preprocessing[1]['dsport'].transform(
            np.array(test_point[ref_dict['dsport']],dtype='object').reshape(-1,1))

        dur_test = preprocessing[1]['dur'].transform(
            np.array(test_point[ref_dict['dur']],dtype='object').reshape(-1,1))

        sbytes_test = preprocessing[1]['sbytes'].transform(
            np.array(test_point[ref_dict['sbytes']],dtype='object').reshape(-1,1))

        dbytes_test = preprocessing[1]['dbytes'].transform(
            np.array(test_point[ref_dict['dbytes']],dtype='object').reshape(-1,1))

        sttl_test = preprocessing[1]['sttl'].transform(
            np.array(test_point[ref_dict['sttl']],dtype='object').reshape(-1,1))

        dttl_test = preprocessing[1]['dttl'].transform(
            np.array(test_point[ref_dict['dttl']],dtype='object').reshape(-1,1))

        sbytes_test = preprocessing[1]['sbytes'].transform(
            np.array(test_point[ref_dict['sbytes']],dtype='object').reshape(-1,1))

        sloss_test = preprocessing[1]['sloss'].transform(
            np.array(test_point[ref_dict['sloss']],dtype='object').reshape(-1,1))

        dloss_test = preprocessing[1]['dloss'].transform(
            np.array(test_point[ref_dict['dloss']],dtype='object').reshape(-1,1))

        Sload_test = preprocessing[1]['Sload'].transform(
            np.array(test_point[ref_dict['Sload']],dtype='object').reshape(-1,1))

        Dload_test = preprocessing[1]['Dload'].transform(
            np.array(test_point[ref_dict['Dload']],dtype='object').reshape(-1,1))

        Spkts_test = preprocessing[1]['Spkts'].transform(
            np.array(test_point[ref_dict['Spkts']],dtype='object').reshape(-1,1))

        Dpkts_test = preprocessing[1]['Dpkts'].transform(
            np.array(test_point[ref_dict['Dpkts']],dtype='object').reshape(-1,1))

        swin_test = preprocessing[1]['swin'].transform(
            np.array(test_point[ref_dict['swin']],dtype='object').reshape(-1,1))

        dwin_test = preprocessing[1]['dwin'].transform(
            np.array(test_point[ref_dict['dwin']],dtype='object').reshape(-1,1))

        stcpb_test = preprocessing[1]['stcpb'].transform(
            np.array(test_point[ref_dict['stcpb']],dtype='object').reshape(-1,1))

        dtcpb_test = preprocessing[1]['dtcpb'].transform(
            np.array(test_point[ref_dict['dtcpb']],dtype='object').reshape(-1,1))

        smeansz_test = preprocessing[1]['smeansz'].transform(
            np.array(test_point[ref_dict['smeansz']],dtype='object').reshape(-1,1))

        dmeansz_test = preprocessing[1]['dmeansz'].transform(
            np.array(test_point[ref_dict['dmeansz']],dtype='object').reshape(-1,1))

        trans_depth_test = preprocessing[1]['trans_depth'].transform(
            np.array(test_point[ref_dict['trans_depth']],dtype='object').reshape(-1,1))

        res_bdy_len_test = preprocessing[1]['res_bdy_len'].transform(
            np.array(test_point[ref_dict['res_bdy_len']],dtype='object').reshape(-1,1))

        Sjit_test = preprocessing[1]['Sjit'].transform(
            np.array(test_point[ref_dict['Sjit']],dtype='object').reshape(-1,1))

        Djit_test = preprocessing[1]['Djit'].transform(
            np.array(test_point[ref_dict['Djit']],dtype='object').reshape(-1,1))

        Sintpkt_test = preprocessing[1]['Sintpkt'].transform(
            np.array(test_point[ref_dict['Sintpkt']],dtype='object').reshape(-1,1))

        Dintpkt_test = preprocessing[1]['Dintpkt'].transform(
            np.array(test_point[ref_dict['Dintpkt']],dtype='object').reshape(-1,1))

        tcprtt_test = preprocessing[1]['tcprtt'].transform(
            np.array(test_point[ref_dict['tcprtt']],dtype='object').reshape(-1,1))

        synack_test = preprocessing[1]['synack'].transform(
            np.array(test_point[ref_dict['synack']],dtype='object').reshape(-1,1))

        ackdat_test = preprocessing[1]['ackdat'].transform(
            np.array(test_point[ref_dict['ackdat']],dtype='object').reshape(-1,1))

        ct_srv_src_test = preprocessing[1]['ct_srv_src'].transform(
            np.array(test_point[ref_dict['ct_srv_src']],dtype='object').reshape(-1,1))

        ct_srv_dst_test = preprocessing[1]['ct_srv_dst'].transform(
            np.array(test_point[ref_dict['ct_srv_dst']],dtype='object').reshape(-1,1))

        ct_dst_ltm_test = preprocessing[1]['ct_dst_ltm'].transform(
            np.array(test_point[ref_dict['ct_dst_ltm']],dtype='object').reshape(-1,1))

        ct_src_ltm_test = preprocessing[1]['ct_src_ltm'].transform(
            np.array(test_point[ref_dict['ct_src_ltm']],dtype='object').reshape(-1,1))

        ct_src_dport_ltm_test = preprocessing[1]['ct_src_dport_ltm'].transform(
            np.array(test_point[ref_dict['ct_src_dport_ltm']],dtype='object').reshape(-1,1))

        ct_dst_sport_ltm_test = preprocessing[1]['ct_dst_sport_ltm'].transform(
            np.array(test_point[ref_dict['ct_dst_sport_ltm']],dtype='object').reshape(-1,1))

        
        # Stacking all the generated features
        X_test = hstack((proto_test,state_test,service_test,is_sm_ips_ports_test,ct_state_ttl_test,
                                sport_test,dsport_test,dur_test,sbytes_test,dbytes_test,sttl_test,dttl_test,
                                sloss_test,dloss_test,Sload_test,Dload_test,Spkts_test,Dpkts_test,swin_test,
                                dwin_test,stcpb_test,dtcpb_test,smeansz_test,dmeansz_test,trans_depth_test,
                                res_bdy_len_test,Sjit_test,Djit_test,Sintpkt_test,Dintpkt_test,
                                tcprtt_test,synack_test,ackdat_test,ct_srv_src_test,ct_srv_dst_test,ct_dst_ltm_test,
                                ct_src_ltm_test,ct_src_ltm_test,ct_src_dport_ltm_test,ct_dst_sport_ltm_test))

        
        # Predicting X_test points with all base learners.
        y_pred = []
        for j in range(10):  # iterating through all 10 base learners
            y_pred.append(sig_clf[j].predict_proba(X_test))
        
        # taking mean of predictions from all base learners
        y_pred = np.array(y_pred)[:,0]
        y_pred = np.mean(y_pred[:,1])>0.5 # flag to indicate attack or not

        if y_pred:
            return "The given datapoint is an attack"
        else:
            return "The given datapoint is not an attack"
    
    
    try:
        # If single datapoint is provied then shape length will be 1
        if len(X.shape) == 1:
            return predict_point(X)
        # else length will be 2 and a loop is written to iterate through all datapoints
        elif len(X.shape) == 2:
            prediction = []
            for point in X:
                prediction.append(predict_point(point))
            return prediction
        else:
            print("Please provide a Numpy array of one or more datapoints")
    except:
        print("Please provide a valid input")

#### 1.2 "function2"

In [3]:
def function2(X,Y):
    '''
        1.  This function takes datapoints and their Label as input and
            calculates AUC score
        2.  Both X & Y should be numpy arrays
    '''
    
    # Loading all saved models, scalar objects and encoder objects
    # All these objects were saved in a list structured as follows:
    #    [ref_dict,[encoder,scalar],[base_learners]]
    
    all_saved_data = pickle.load(open("all_saved_data.pkl","rb"))
    ref_dict = all_saved_data[0]
    preprocessing = all_saved_data[1]
    sig_clf = all_saved_data[2]
    
    
    def predict_proba(test_points):
        '''
            This function takes datapoints as inputs and performs all preprocessing
            and returns the prediction probability for those input points
        '''
        
        # encoding Categorical Features
        proto_test = preprocessing[0]['proto'].transform(
            np.array(test_points[:,ref_dict['proto']],dtype='object').reshape(-1,1))


        state_test = preprocessing[0]['state'].transform(
            np.array(test_points[:,ref_dict['state']],dtype='object').reshape(-1,1))


        service_test = preprocessing[0]['service'].transform(
            np.array(test_points[:,ref_dict['service']],dtype='object').reshape(-1,1))


        is_sm_ips_ports_test = preprocessing[0]['is_sm_ips_ports'].transform(
            np.array(test_points[:,ref_dict['is_sm_ips_ports']],dtype='object').reshape(-1,1))


        ct_state_ttl_test = preprocessing[0]['ct_state_ttl'].transform(
            np.array(test_points[:,ref_dict['ct_state_ttl']],dtype='object').reshape(-1,1))


        # scaling numerical features

        sport_test = preprocessing[1]['sport'].transform(
            np.array(test_points[:,ref_dict['sport']],dtype='object').reshape(-1,1))

        dsport_test = preprocessing[1]['dsport'].transform(
            np.array(test_points[:,ref_dict['dsport']],dtype='object').reshape(-1,1))

        dur_test = preprocessing[1]['dur'].transform(
            np.array(test_points[:,ref_dict['dur']],dtype='object').reshape(-1,1))

        sbytes_test = preprocessing[1]['sbytes'].transform(
            np.array(test_points[:,ref_dict['sbytes']],dtype='object').reshape(-1,1))

        dbytes_test = preprocessing[1]['dbytes'].transform(
            np.array(test_points[:,ref_dict['dbytes']],dtype='object').reshape(-1,1))

        sttl_test = preprocessing[1]['sttl'].transform(
            np.array(test_points[:,ref_dict['sttl']],dtype='object').reshape(-1,1))

        dttl_test = preprocessing[1]['dttl'].transform(
            np.array(test_points[:,ref_dict['dttl']],dtype='object').reshape(-1,1))

        sbytes_test = preprocessing[1]['sbytes'].transform(
            np.array(test_points[:,ref_dict['sbytes']],dtype='object').reshape(-1,1))

        sloss_test = preprocessing[1]['sloss'].transform(
            np.array(test_points[:,ref_dict['sloss']],dtype='object').reshape(-1,1))

        dloss_test = preprocessing[1]['dloss'].transform(
            np.array(test_points[:,ref_dict['dloss']],dtype='object').reshape(-1,1))

        Sload_test = preprocessing[1]['Sload'].transform(
            np.array(test_points[:,ref_dict['Sload']],dtype='object').reshape(-1,1))

        Dload_test = preprocessing[1]['Dload'].transform(
            np.array(test_points[:,ref_dict['Dload']],dtype='object').reshape(-1,1))

        Spkts_test = preprocessing[1]['Spkts'].transform(
            np.array(test_points[:,ref_dict['Spkts']],dtype='object').reshape(-1,1))

        Dpkts_test = preprocessing[1]['Dpkts'].transform(
            np.array(test_points[:,ref_dict['Dpkts']],dtype='object').reshape(-1,1))

        swin_test = preprocessing[1]['swin'].transform(
            np.array(test_points[:,ref_dict['swin']],dtype='object').reshape(-1,1))

        dwin_test = preprocessing[1]['dwin'].transform(
            np.array(test_points[:,ref_dict['dwin']],dtype='object').reshape(-1,1))

        stcpb_test = preprocessing[1]['stcpb'].transform(
            np.array(test_points[:,ref_dict['stcpb']],dtype='object').reshape(-1,1))

        dtcpb_test = preprocessing[1]['dtcpb'].transform(
            np.array(test_points[:,ref_dict['dtcpb']],dtype='object').reshape(-1,1))

        smeansz_test = preprocessing[1]['smeansz'].transform(
            np.array(test_points[:,ref_dict['smeansz']],dtype='object').reshape(-1,1))

        dmeansz_test = preprocessing[1]['dmeansz'].transform(
            np.array(test_points[:,ref_dict['dmeansz']],dtype='object').reshape(-1,1))

        trans_depth_test = preprocessing[1]['trans_depth'].transform(
            np.array(test_points[:,ref_dict['trans_depth']],dtype='object').reshape(-1,1))

        res_bdy_len_test = preprocessing[1]['res_bdy_len'].transform(
            np.array(test_points[:,ref_dict['res_bdy_len']],dtype='object').reshape(-1,1))

        Sjit_test = preprocessing[1]['Sjit'].transform(
            np.array(test_points[:,ref_dict['Sjit']],dtype='object').reshape(-1,1))

        Djit_test = preprocessing[1]['Djit'].transform(
            np.array(test_points[:,ref_dict['Djit']],dtype='object').reshape(-1,1))

        Sintpkt_test = preprocessing[1]['Sintpkt'].transform(
            np.array(test_points[:,ref_dict['Sintpkt']],dtype='object').reshape(-1,1))

        Dintpkt_test = preprocessing[1]['Dintpkt'].transform(
            np.array(test_points[:,ref_dict['Dintpkt']],dtype='object').reshape(-1,1))

        tcprtt_test = preprocessing[1]['tcprtt'].transform(
            np.array(test_points[:,ref_dict['tcprtt']],dtype='object').reshape(-1,1))

        synack_test = preprocessing[1]['synack'].transform(
            np.array(test_points[:,ref_dict['synack']],dtype='object').reshape(-1,1))

        ackdat_test = preprocessing[1]['ackdat'].transform(
            np.array(test_points[:,ref_dict['ackdat']],dtype='object').reshape(-1,1))

        ct_srv_src_test = preprocessing[1]['ct_srv_src'].transform(
            np.array(test_points[:,ref_dict['ct_srv_src']],dtype='object').reshape(-1,1))

        ct_srv_dst_test = preprocessing[1]['ct_srv_dst'].transform(
            np.array(test_points[:,ref_dict['ct_srv_dst']],dtype='object').reshape(-1,1))

        ct_dst_ltm_test = preprocessing[1]['ct_dst_ltm'].transform(
            np.array(test_points[:,ref_dict['ct_dst_ltm']],dtype='object').reshape(-1,1))

        ct_src_ltm_test = preprocessing[1]['ct_src_ltm'].transform(
            np.array(test_points[:,ref_dict['ct_src_ltm']],dtype='object').reshape(-1,1))

        ct_src_dport_ltm_test = preprocessing[1]['ct_src_dport_ltm'].transform(
            np.array(test_points[:,ref_dict['ct_src_dport_ltm']],dtype='object').reshape(-1,1))

        ct_dst_sport_ltm_test = preprocessing[1]['ct_dst_sport_ltm'].transform(
            np.array(test_points[:,ref_dict['ct_dst_sport_ltm']],dtype='object').reshape(-1,1))


        X_test = hstack((proto_test,state_test,service_test,is_sm_ips_ports_test,ct_state_ttl_test,
                                sport_test,dsport_test,dur_test,sbytes_test,dbytes_test,sttl_test,dttl_test,
                                sloss_test,dloss_test,Sload_test,Dload_test,Spkts_test,Dpkts_test,swin_test,
                                dwin_test,stcpb_test,dtcpb_test,smeansz_test,dmeansz_test,trans_depth_test,
                                res_bdy_len_test,Sjit_test,Djit_test,Sintpkt_test,Dintpkt_test,
                                tcprtt_test,synack_test,ackdat_test,ct_srv_src_test,ct_srv_dst_test,ct_dst_ltm_test,
                                ct_src_ltm_test,ct_src_ltm_test,ct_src_dport_ltm_test,ct_dst_sport_ltm_test))

        
        # Predicting X_test points with all base learners.
        y_pred = []
        for j in range(10):
            y_pred.append(sig_clf[j].predict_proba(X_test))
        
        # Getting final prediction for each datapoint with taking mean of 
        # predictions with all base_learners.
        predicted_y = []  # empty list to store predictions
        for i in tqdm(range(X_test.shape[0])):   # Loop to iterate through all datapoints
            temp = []
            for j in range(10):                # Loop to iterate through all base learners
                temp.append(y_pred[j][i][1])
            predicted_y.append(np.mean(temp))  # Taking mean of all base learner predictions

        return predicted_y
    
    if len(X.shape) == 1:
        print("AUC Score can not be calculated for one point")
    elif len(X.shape) == 2:
        prediction = predict_proba(X)
        return "AUC Score for given datapoints is : {}".format(roc_auc_score(Y,prediction))

### 2. Testing Functions

In [4]:
data = pd.read_csv("final_data.csv",nrows=1000000)
data = data.dropna()

In [5]:
X = data.drop(['Label'],axis=1).values
Y = data['Label'].values

In [6]:
# prediction for one point with function1

function1(X[0])

'The given datapoint is not an attack'

In [7]:
# prediction for multiple points with function1

function1(X[0:10])

['The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack',
 'The given datapoint is not an attack']

In [8]:
# Getting AUC for complete data with function2

function2(X,Y)

100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:13<00:00, 72660.91it/s]


'AUC Score for given datapoints is : 0.9999989400247669'