## Mount Drive for Colab

In [33]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [35]:
%cd gdrive/My\ Drive/1000ml/Project\ 6\ -\ Intrusion\ Detection/1000ml_Project6

[Errno 2] No such file or directory: 'gdrive/My Drive/1000ml/Project 6 - Intrusion Detection/1000ml_Project6'
/content/gdrive/My Drive/1000ml/Project 6 - Intrusion Detection/1000ml_Project6


# Imports

In [0]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import chi2, SelectKBest, f_classif

from sklearn.metrics import precision_score, f1_score, recall_score

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

rand = 18

# Set up

In [0]:
intrusion_df = pd.read_csv('data/variable_reduced_data.csv')

In [0]:
reduced_num_cols = ['wrong_fragment',
                    'hot',
                    'num_failed_logins',
                    'num_compromised',
                    'num_root',
                    'count',
                    'num_file_creations',
                    'num_shells', 
                    'num_access_files',
                    'srv_count',
                    'serror_rate',
                    'srv_serror_rate',
                    'dst_host_count',
                    'dst_host_srv_count',
                    'dst_host_same_srv_rate',
                    'dst_host_diff_srv_rate',
                    'dst_host_same_src_port_rate',
                    'dst_host_srv_diff_host_rate',
                    'dst_host_serror_rate',
                    'dst_host_srv_serror_rate',
                    'dst_host_rerror_rate',
                    'dst_host_srv_rerror_rate']

reduced_cat_cols = ['protocol_type = tcp',
                    'protocol_type = icmp',
                    'service = domain_u',
                    'service = http',
                    'service = smtp',
                    'service = ftp_data',
                    'service = ftp',
                    'service = other',
                    'service = ecr_i',
                    'service = telnet',
                    'service = tim_i',
                    'service = uucp',
                    'service = courier',
                    'service = private',
                    'flag = SF',
                    'flag = REJ',
                    'flag = SH',
                    'logged_in',
                    'is_host_login',
                    'is_guest_login',
                    'root_shell']

In [0]:
def print_scores(model, train_x, validation_x, train_y, validation_y):
    '''
    This function takes in a Classifier model that has previously been fit to a set of train data.
    It then calculates the predicted classifications and outputs the recall, precision and f1 scores
    for train and validation data sets
    '''
    y_train_pred = model.predict(train_x)
    y_val_pred = model.predict(validation_x)
    
    train_prec = precision_score(train_y, y_train_pred)
    val_prec = precision_score(validation_y, y_val_pred)
    
    train_f1 = f1_score(train_y, y_train_pred)
    val_f1 = f1_score(validation_y, y_val_pred)
    
    train_recall = recall_score(train_y, y_train_pred)
    val_recall = recall_score(validation_y, y_val_pred)
    
    print(f'''    Precision:
    Train = {train_prec.round(3)}
    Validation = {val_prec.round(3)}
    
    F1:
    Train = {train_f1.round(3)}
    Validation = {val_f1.round(3)}
    
    Recall:
    Train = {train_recall.round(3)}
    Validation = {val_recall.round(3)}    
    ''')

# Reset Models

In [0]:
# set, train test split and scale
# Set data
x = intrusion_df[reduced_num_cols+reduced_cat_cols]
y = intrusion_df['target']

# train val test split
x_rest, x_test, y_rest, y_test = TTS(x,y,test_size=0.20,random_state=3)
x_train, x_val, y_train, y_val = TTS(x_rest,y_rest,test_size=0.25,random_state=3)

transformed_cols = reduced_num_cols+reduced_cat_cols

# scale the numeric data
ct = ColumnTransformer(
            [("Num_Cols", StandardScaler(), reduced_num_cols),
             ("Cat_Cols", 'passthrough', reduced_cat_cols)
             ])

# Define scaled data as dataframes
x_train_scale = pd.DataFrame(ct.fit_transform(x_train), columns=transformed_cols)
x_val_scale = pd.DataFrame(ct.transform(x_val), columns=transformed_cols)
x_test_scale = pd.DataFrame(ct.transform(x_test), columns=transformed_cols)

## RandomForest Model: Default Class Balancing

In [53]:
# Create and fit model, print scores
rf_model = RandomForestClassifier(class_weight='balanced',
                                  random_state=rand,
                                  n_estimators=200,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  max_features=None,
                                  max_depth=20,
                                  criterion='gini'
                                 )
rf_model.fit(x_train_scale,y_train)

print_scores(rf_model, x_train_scale, x_val_scale, y_train, y_val)

    Precision:
    Train = 1.0
    Validation = 0.808
    
    F1:
    Train = 1.0
    Validation = 0.712
    
    Recall:
    Train = 1.0
    Validation = 0.636    
    


## RandomForest Model: SMOTETomek Class Balancing

In [0]:
combine = SMOTETomek(sampling_strategy='all', # resamples all the data 
                     random_state=rand, 
                     n_jobs=-1,
                     smote=SMOTE(),  # default parameters used
                     tomek=TomekLinks()  # default params used
                    )
                   
# fit and resample
x_train_rs, y_train_rs = combine.fit_resample(x_train,y_train)

In [0]:
# Scale the data with the column transformer
x_train_rs_scale = pd.DataFrame(ct.fit_transform(x_train_rs), columns=transformed_cols)

In [0]:
# Pick model based on the grid search from the RF section below.
rf_model_combine = RandomForestClassifier(class_weight='balanced',
                                          random_state=rand,
                                          n_estimators=150,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          max_features=None,
                                          max_depth=22,
                                          criterion='gini',
                                          n_jobs=-1
                                         )

In [56]:
# fit model
rf_model_combine.fit(x_train_rs_scale, y_train_rs)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=22, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=-1, oob_score=False, random_state=18, verbose=0,
                       warm_start=False)

In [57]:
# Print the scores
print_scores(rf_model_combine, x_train_rs_scale, x_val_scale, y_train_rs, y_val)

    Precision:
    Train = 1.0
    Validation = 0.122
    
    F1:
    Train = 1.0
    Validation = 0.211
    
    Recall:
    Train = 1.0
    Validation = 0.788    
    


### Now with a Pipeline

In [0]:
# (re)Define Pipeline Steps

# Combination method for over/under sampling
combine = SMOTETomek(sampling_strategy='all', # resamples all the data 
                     random_state=rand, 
                     n_jobs=-1, 
                     smote=SMOTE(),  # default parameters used
                     tomek=TomekLinks()  # default params used
                    )

# The column transformer - leaves discrete columns alone and standard scales the cts columns
ct = ColumnTransformer(
            [("Num_Cols", StandardScaler(), reduced_num_cols),
             ("Cat_Cols", 'passthrough', reduced_cat_cols)
             ])

# Random Forest model 
rf_model_combine = RandomForestClassifier(class_weight='balanced',
                                          random_state=rand,
                                          n_estimators=150,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          max_features=None,
                                          max_depth=22,
                                          criterion='gini',
                                          n_jobs=-1
                                         )

combine_pipe = Pipeline(steps=[('combo_sampler',combine),
                              ('scaler',ct),
                              ('model',rf_model_combine)])

combine_pipe.fit(x_train,y_train)

In [59]:
print_scores(combine_pipe, x_train, x_val, y_train, y_val)

    Precision:
    Train = 1.0
    Validation = 0.542
    
    F1:
    Train = 1.0
    Validation = 0.642
    
    Recall:
    Train = 1.0
    Validation = 0.788    
    


# Boosting

In [0]:
import xgboost as xgb