## Imports

In [92]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import chi2, SelectKBest, f_classif

from sklearn.metrics import precision_score, f1_score, recall_score

In [2]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
intrusion_df = pd.read_csv('data/cleaned_intrusion_data.csv')

# Variable Reduction

In [6]:
# The columns of use

num_cols = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
            'num_compromised', 'num_root', 'count','num_file_creations', 'num_shells', 'num_access_files',
            'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate','srv_rerror_rate', 'same_srv_rate', 
            'diff_srv_rate','srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
            'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 
            'dst_host_srv_diff_host_rate','dst_host_serror_rate', 'dst_host_srv_serror_rate',
            'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
           ]

cat_cols = ['protocol_type = udp', 'protocol_type = tcp', 'protocol_type = icmp',  
            'service = domain_u', 'service = http','service = smtp', 'service = ftp_data', 'service = ftp',
            'service = eco_i', 'service = other', 'service = auth', 'service = ecr_i', 'service = IRC', 
            'service = X11', 'service = finger', 'service = time', 'service = telnet', 
            'service = ntp_u','service = tim_i', 'service = remote_job', 'service = link',
            'service = urp_i', 'service = pop_3', 'service = tftp_u','service = imap4', 
            'service = nnsp', 'service = uucp', 'service = courier', 'service = login', 
            'service = icmp', 'service = domain', 'service = private',
            'flag = SF', 'flag = RSTR', 'flag = S1', 'flag = REJ', 'flag = S3', 
            'flag = S2', 'flag = RSTOS0', 'flag = RSTO','flag = SH',
            'logged_in', 'is_host_login', 'is_guest_login', 'root_shell', 'su_attempted',
           ]

target = 'target'

## $\chi^2$ - Categorical Selection

In [7]:
# Set Categorical data
x_cat = intrusion_df[cat_cols]
y = intrusion_df['target']

# train test split rest and test
x_cat_rest, x_cat_test, y_rest, y_test = TTS(x_cat, y, test_size=0.2, random_state=3)

# TTS train and val
x_cat_train, x_cat_val, y_train, y_val = TTS(x_cat_rest, y_rest, test_size=0.25, random_state =3)

In [13]:
selector_chi2 = SelectKBest(score_func = chi2, k = 21)
selector_chi2.fit(x_cat_train,y_train);

In [14]:
selector_chi2.transform(x_cat_train)

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [17]:
selector_chi2.pvalues_.round(4)

array([0.3231, 0.0029, 0.    , 0.0062, 0.    , 0.0031, 0.0239, 0.    ,
       0.6733, 0.    , 0.6139, 0.    , 0.8446, 0.9219, 0.4818, 0.8265,
       0.    , 0.6692, 0.    ,    nan, 0.9447, 0.7238, 0.8265, 0.9447,
          nan,    nan, 0.    , 0.    ,    nan, 0.9219,    nan, 0.    ,
       0.    , 0.7187, 0.8353, 0.0128, 0.9044, 0.8545, 0.9447, 0.9447,
       0.    , 0.    , 0.    , 0.    , 0.    , 0.9044])

In [27]:
selector_chi2.scores_

array([9.76275322e-01, 8.86252280e+00, 1.07973156e+03, 7.50308051e+00,
       1.09095897e+02, 8.76640329e+00, 5.09935920e+00, 1.16680737e+02,
       1.77729820e-01, 3.13077984e+01, 2.54585958e-01, 1.42882090e+03,
       3.84280692e-02, 9.60701730e-03, 4.94761391e-01, 4.80350865e-02,
       1.19679236e+03, 1.82533329e-01, 6.24543478e+02,            nan,
       4.80350865e-03, 1.24891225e-01, 4.80350865e-02, 4.80350865e-03,
                  nan,            nan, 2.08181159e+02, 2.08181159e+02,
                  nan, 9.60701730e-03,            nan, 1.14071287e+03,
       2.24339103e+01, 1.29694734e-01, 4.32315778e-02, 6.19935970e+00,
       1.44105259e-02, 3.36245605e-02, 4.80350865e-03, 4.80350865e-03,
       1.14499638e+04, 7.11338193e+01, 4.66908810e+02, 2.05902769e+01,
       2.06058782e+03, 1.44105259e-02])

In [18]:
selector_chi2.get_support()

array([False,  True,  True,  True,  True,  True,  True,  True, False,
        True, False,  True, False, False, False, False,  True, False,
        True, False, False, False, False, False, False, False,  True,
        True, False, False, False,  True,  True, False, False,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
       False])

In [19]:
x_chi2 = x_cat.iloc[:,selector_chi2.get_support(indices=True)]
x_chi2.columns

Index(['protocol_type = tcp', 'protocol_type = icmp', 'service = domain_u',
       'service = http', 'service = smtp', 'service = ftp_data',
       'service = ftp', 'service = other', 'service = ecr_i',
       'service = telnet', 'service = tim_i', 'service = uucp',
       'service = courier', 'service = private', 'flag = SF', 'flag = REJ',
       'flag = SH', 'logged_in', 'is_host_login', 'is_guest_login',
       'root_shell'],
      dtype='object')

### Exploring the NaN values

For each column above that has been assiged a `NaN` $\chi^2$ score and p value, the reason is that there exists exactly 1 record that is positive, while the rest are negative. Therefore these columns can be ignored for feature selection. See calculations below

In [29]:
print(cat_cols[19], cat_cols[24], cat_cols[25], cat_cols[28], cat_cols[30]) 

service = remote_job service = imap4 service = nnsp service = login service = domain


In [31]:
intrusion_df['service = remote_job'].value_counts(normalize=True)

0.0    0.999979
1.0    0.000021
Name: service = remote_job, dtype: float64

In [32]:
intrusion_df['service = imap4'].value_counts(normalize=True)

0.0    0.999979
1.0    0.000021
Name: service = imap4, dtype: float64

In [33]:
intrusion_df['service = nnsp'].value_counts(normalize=True)

0.0    0.999979
1.0    0.000021
Name: service = nnsp, dtype: float64

In [34]:
intrusion_df['service = login'].value_counts(normalize=True)

0.0    0.999979
1.0    0.000021
Name: service = login, dtype: float64

In [35]:
intrusion_df['service = domain'].value_counts(normalize=True)

0.0    0.999979
1.0    0.000021
Name: service = domain, dtype: float64

In [36]:
0.000021*len(intrusion_df)

1.010373

## ANOVA - Numerical Selection

In [37]:
scaler = StandardScaler()

# Set Categorical data
x_num = intrusion_df[num_cols]
y = intrusion_df['target']

# train test split
x_num_rest, x_num_test, y_rest, y_test = TTS(x_num, y, test_size=0.2, random_state=3)

x_num_train, x_num_val, y_train, y_val = TTS(x_num_rest, y_rest, test_size=0.25, random_state=3)

In [50]:
x_num_train_scale = scaler.fit_transform(x_num_train)

# Use Select K Best
selector_anova = SelectKBest(score_func= f_classif, k = 22)

# fit the selector
selector_anova.fit(x_num_train_scale, y_train)

  f = msb / msw


SelectKBest(k=22, score_func=<function f_classif at 0x7f9a9b19b950>)

In [51]:
selector_anova.pvalues_.round(3)

array([0.42 , 0.648, 0.767, 0.   ,   nan, 0.   , 0.   , 0.   , 0.006,
       0.007, 0.   , 0.   , 0.   , 0.001, 0.   , 0.   , 0.153, 0.181,
       0.991, 0.559, 0.672, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   ])

In [52]:
selector_anova.get_support()

array([False, False, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [53]:
x_anova = x_num.iloc[:,selector_anova.get_support(indices=True)]
list(x_anova.columns)

['wrong_fragment',
 'hot',
 'num_failed_logins',
 'num_compromised',
 'num_root',
 'count',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

### Exploring Warnings

We find the same problem as before for one of the columns. The other warning seems strange, but likely follows with the error from the `NaN`, given that the calculation comes in the line directly after the previous warning.

In [43]:
intrusion_df[num_cols].iloc[:,4].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: urgent, dtype: float64

In [46]:
intrusion_df['urgent'].value_counts(normalize=True)

0.0    0.999958
3.0    0.000021
1.0    0.000021
Name: urgent, dtype: float64

In [47]:
intrusion_df[num_cols].iloc[:,5].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: hot, dtype: float64

In [48]:
intrusion_df['hot'].value_counts(normalize=True)

0.0      0.995739
4.0      0.001102
1.0      0.000935
5.0      0.000499
2.0      0.000416
6.0      0.000270
7.0      0.000249
3.0      0.000187
18.0     0.000145
19.0     0.000125
22.0     0.000083
24.0     0.000062
14.0     0.000042
30.0     0.000042
10.0     0.000042
11.0     0.000021
101.0    0.000021
15.0     0.000021
Name: hot, dtype: float64

# Initial Modelling

In [54]:
# columns to use:
reduced_cat_cols = list(x_chi2.columns)
reduced_num_cols = list(x_anova.columns)

# set, train test split and scale
# Set data
x = intrusion_df[reduced_num_cols+reduced_cat_cols]
y = intrusion_df['target']

# train test split
x_rest, x_test, y_rest, y_test = TTS(x,y,test_size=0.20,random_state=3)
x_train, x_val, y_train, y_val = TTS(x_rest,y_rest,test_size=0.25,random_state=3)

#reduced_cat_cols.remove('district_id')
transformed_cols = reduced_num_cols+reduced_cat_cols

# scale the numeric data
ct = ColumnTransformer(
            [("Num_Cols", StandardScaler(), reduced_num_cols),
             ("Cat_Cols", 'passthrough', reduced_cat_cols)
             ])

# Define scaled data as dataframes
x_train_scale = pd.DataFrame(ct.fit_transform(x_train), columns=transformed_cols)
x_val_scale = pd.DataFrame(ct.fit_transform(x_val), columns=transformed_cols)
x_test_scale = pd.DataFrame(ct.fit_transform(x_test), columns=transformed_cols)

In [118]:
def print_scores(model, train_x, validation_x, train_y, validation_y):
    '''
    This function takes in a Classifier model that has previously been fit to a set of train data.
    It then calculates the predicted classifications and outputs the recall, precision and f1 scores
    for train and validation data sets
    '''
    y_train_pred = model.predict(train_x)
    y_val_pred = model.predict(validation_x)
    
    train_prec = precision_score(train_y, y_train_pred)
    val_prec = precision_score(validation_y, y_val_pred)
    
    train_f1 = f1_score(train_y, y_train_pred)
    val_f1 = f1_score(validation_y, y_val_pred)
    
    train_recall = recall_score(train_y, y_train_pred)
    val_recall = recall_score(validation_y, y_val_pred)
    
    print(f'''    Precision:
    Train = {train_prec.round(3)}
    Validation = {val_prec.round(3)}
    
    F1:
    Train = {train_f1.round(3)}
    Validation = {val_f1.round(3)}
    
    Recall:
    Train = {train_recall.round(3)}
    Validation = {val_recall.round(3)}    
    ''')

## Logistic Regression

The grid search found some hot garbage. We should consider doing a more robust version of class balancing

### Grid Search - LR

In [76]:
paramsl1 = {'penalty':['l1'],
            'C':np.logspace(-2,4,7),
            'solver':['newton-cg','lbfgs','sag','saga']
           }
paramsl2 = {'penalty':['l2'],
            'C':np.logspace(-2,4,7),
            'solver':['liblinear','saga']
           }
# params = [paramsl1, paramsl2]

params_lr = {'penalty':['l2'],
             'C':np.logspace(2,5,8),
             'solver':['liblinear']
            }
scores = {'prec':'precision',
          'F1':'f1'
         }

gs_lrc = GridSearchCV(LogisticRegression(class_weight='balanced',random_state=3,max_iter=200,n_jobs=-1), 
                            params_lr,
                            scoring = scores,
                            refit = 'prec',
                            verbose = 1,
                            n_jobs=-1)

In [77]:
gs_lrc.fit(x_train_scale,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished
  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=200, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=3, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([   100.        ,    268.26957953,    719.685673  ,   1930.69772888,
         5179.47467923,  13894.95494373,  37275.93720315, 100000.        ]),
                         'penalty': ['l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit='prec', return_train_score=False,
             scoring

In [78]:
gs_lrc_results = pd.DataFrame(gs_lrc.cv_results_)
gs_lrc_results.sort_values(by='rank_test_prec').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_prec,split1_test_prec,split2_test_prec,split3_test_prec,split4_test_prec,mean_test_prec,std_test_prec,rank_test_prec,split0_test_F1,split1_test_F1,split2_test_F1,split3_test_F1,split4_test_F1,mean_test_F1,std_test_F1,rank_test_F1
4,9.209244,3.260351,0.012641,0.001671,5179.47,l2,liblinear,"{'C': 5179.474679231213, 'penalty': 'l2', 'sol...",0.342105,0.436364,0.407407,0.333333,0.361111,0.376064,0.03955,1,0.5,0.578313,0.54321,0.495238,0.52,0.527352,0.030601,1
7,8.384768,2.466114,0.014908,0.005334,100000.0,l2,liblinear,"{'C': 100000.0, 'penalty': 'l2', 'solver': 'li...",0.346667,0.436364,0.4,0.325,0.361111,0.373828,0.039697,2,0.504854,0.578313,0.536585,0.485981,0.52,0.525147,0.0314,2
6,10.138232,4.260757,0.012932,0.000621,37275.9,l2,liblinear,"{'C': 37275.93720314938, 'penalty': 'l2', 'sol...",0.342105,0.428571,0.4,0.329114,0.357143,0.371387,0.037245,3,0.5,0.571429,0.536585,0.490566,0.510204,0.521757,0.029213,4
2,6.822868,2.9665,0.012253,0.001185,719.686,l2,liblinear,"{'C': 719.6856730011522, 'penalty': 'l2', 'sol...",0.329114,0.428571,0.4,0.337662,0.356164,0.370302,0.03805,4,0.485981,0.571429,0.536585,0.5,0.514851,0.521769,0.02996,3
3,8.87933,4.163402,0.014302,0.002555,1930.7,l2,liblinear,"{'C': 1930.6977288832495, 'penalty': 'l2', 'so...",0.282828,0.428571,0.415094,0.333333,0.356164,0.363198,0.053605,5,0.440945,0.571429,0.55,0.495238,0.514851,0.514493,0.045349,5


## Bagging Classifier - Logistic Regression

### Grid Search - Bagging - LR

In [166]:
params_bag_l1 = {'base_estimator__penalty':['l1'],
                 'base_estimator__class_weight':['balanced'],
                 'base_estimator__random_state':[3],
                 'base_estimator__max_iter':[200],
                 'base_estimator__n_jobs':[-1],
                 'base_estimator__C':np.logspace(-2,4,7),
                 'base_estimator__solver':['newton-cg','lbfgs','sag','saga'],
                 'n_estimators':[10,50,100],
                 'max_features':[0.6,0.8,1.0],
                 'bootstrap_features':[True,False]
                }
params_bag_l2 = {'base_estimator__penalty':['l2'],
                 'base_estimator__class_weight':['balanced'],
                 'base_estimator__random_state':[3],
                 'base_estimator__max_iter':[200],
                 'base_estimator__n_jobs':[-1],
                 'base_estimator__C':np.logspace(-2,4,7),
                 'base_estimator__solver':['liblinear','saga'],
                 'n_estimators':[10,50,100],
                 'max_features':[0.6,0.8,1.0],
                 'bootstrap_features':[True,False]
                }
params_bag_lr = [params_bag_l1, params_bag_l2]

# params_lr = {'penalty':['l2'],
#              'C':np.logspace(2,5,8),
#              'solver':['liblinear']
#             }
scores = {'prec':'precision',
          'Recall':'recall'
         }

gs_bag_lrc = RandomizedSearchCV(BaggingClassifier(LogisticRegression(),
                                              n_jobs=-1,
                                              random_state=3
                                             ),
                            params_bag_lr,
                            scoring = scores,
                            n_iter=50,     
                            refit = 'prec',
                            verbose = 1,
                            n_jobs=-1)

In [None]:
gs_bag_lrc.fit(x_train_scale,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.


In [None]:
gs_bag_lrc_results = pd.DataFrame(gs_lrc.cv_results_)
gs_bag_lrc_results.sort_values(by='rank_test_prec').head()

## Decision Tree
Slightly better than logistic regression but still pretty crap. 

In [157]:
# Best model from Grid Search
dt_model = DecisionTreeClassifier(random_state=3,
                                  class_weight='balanced',
                                  criterion = 'entropy',
                                  max_depth = 20,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  min_impurity_decrease = 0.0
                                 )

In [158]:
dt_model.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=20, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=3, splitter='best')

In [159]:
print_scores(dt_model, x_train,x_val,y_train,y_val)

    Precision:
    Train = 0.986
    Validation = 0.625
    
    F1:
    Train = 0.993
    Validation = 0.615
    
    Recall:
    Train = 1.0
    Validation = 0.606    
    


### Grid Search - DT

In [73]:
params_dt = {'criterion':['entropy'],
             'max_depth':[18,19,20,21],
             'max_features':[None],
             'min_samples_split':[2,4,6,8,10],
             'min_samples_leaf':[1,2,3,4,5],
             'min_impurity_decrease':[0.0,0.05],
             'min_weight_fraction_leaf':[0.0,0.05]
             }

scores = {'prec':'precision',
          'F1':'f1'
         }

gs_dtc = GridSearchCV(DecisionTreeClassifier(class_weight='balanced',random_state=3), 
                        params_dt,
                        scoring = scores,
                        refit = 'prec',
                        verbose = 1,
                        n_jobs=-1)

In [74]:
gs_dtc.fit(x_train_scale,y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 382 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 882 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 1582 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done 1995 out of 2000 | elapsed:  1.1min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  1.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=3, splitter='best'),
             i...cated', n_jobs=-1,
             param_grid={'criterion': ['entropy'],
                         'max

In [83]:
gs_dtc.best_params_

{'criterion': 'entropy',
 'max_depth': 19,
 'max_features': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [75]:
gs_dtc_results = pd.DataFrame(gs_dtc.cv_results_)
gs_dtc_results.sort_values(by='rank_test_prec').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_min_impurity_decrease,param_min_samples_leaf,param_min_samples_split,param_min_weight_fraction_leaf,params,split0_test_prec,split1_test_prec,split2_test_prec,split3_test_prec,split4_test_prec,mean_test_prec,std_test_prec,rank_test_prec,split0_test_F1,split1_test_F1,split2_test_F1,split3_test_F1,split4_test_F1,mean_test_F1,std_test_F1,rank_test_F1
100,0.088042,0.01141,0.011406,0.000695,entropy,19,,0,1,2,0,"{'criterion': 'entropy', 'max_depth': 19, 'max...",0.923077,0.833333,0.793103,0.677419,0.807692,0.806925,0.078933,1,0.888889,0.769231,0.821429,0.724138,0.777778,0.796293,0.055655,1
300,0.085779,0.010372,0.010775,0.001223,entropy,21,,0,1,2,0,"{'criterion': 'entropy', 'max_depth': 21, 'max...",0.923077,0.833333,0.793103,0.677419,0.807692,0.806925,0.078933,1,0.888889,0.769231,0.821429,0.724138,0.777778,0.796293,0.055655,1
200,0.084589,0.008084,0.011377,0.000975,entropy,20,,0,1,2,0,"{'criterion': 'entropy', 'max_depth': 20, 'max...",0.923077,0.833333,0.793103,0.677419,0.807692,0.806925,0.078933,1,0.888889,0.769231,0.821429,0.724138,0.777778,0.796293,0.055655,1
0,0.136441,0.028817,0.021758,0.01262,entropy,18,,0,1,2,0,"{'criterion': 'entropy', 'max_depth': 18, 'max...",0.923077,0.833333,0.741935,0.677419,0.785714,0.792296,0.083143,4,0.888889,0.769231,0.793103,0.724138,0.785714,0.792215,0.053947,4
302,0.085977,0.010413,0.011425,0.000507,entropy,21,,0,1,4,0,"{'criterion': 'entropy', 'max_depth': 21, 'max...",0.851852,0.769231,0.75,0.677419,0.733333,0.756367,0.056722,5,0.836364,0.740741,0.813559,0.724138,0.758621,0.774684,0.043083,5


## Random Forest

In [86]:
gs_rfc.best_params_

{'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': None,
 'max_depth': 20,
 'criterion': 'gini'}

In [115]:
rf_model = RandomForestClassifier(class_weight='balanced',
                                  random_state=3,
                                  n_estimators=150,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  max_features=None,
                                  max_depth=20,
                                  criterion='gini'
                                 )

In [116]:
rf_model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=20, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)

In [119]:
print_scores(rf_model, x_train, x_val, y_train, y_val)

    Precision:
    Train = 1.0
    Validation = 0.87
    
    F1:
    Train = 1.0
    Validation = 0.714
    
    Recall:
    Train = 1.0
    Validation = 0.606    
    


### Grid Search - RF

In [79]:
params_rf = {'criterion':['gini'],
          'n_estimators':[150,200,250],
          'max_depth':[18,19,20],
          'max_features':[None],
          'min_samples_split':[2,4,6],
          'min_samples_leaf':[1,2,3],
          #'min_impurity_decrease':[0.0,0.05,0.1],
          #'min_weight_fraction_leaf':[0.0,0.05,0.1]
         }
scores = {'prec':'precision',
          'F1':'f1'
         }
gs_rfc = RandomizedSearchCV(RandomForestClassifier(class_weight='balanced',random_state=3, n_jobs=-1), 
                            params_rf,
                            scoring = scores,
                            n_iter=100,
                            refit = 'prec',
                            verbose = 1,
                            n_jobs=-1)

In [80]:
gs_rfc.fit(x_train_scale,y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 18.6min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight='balanced',
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
      

In [85]:
gs_rfc_results = pd.DataFrame(gs_rfc.cv_results_)
gs_rfc_results.sort_values(by='rank_test_F1').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,params,split0_test_prec,split1_test_prec,split2_test_prec,split3_test_prec,split4_test_prec,mean_test_prec,std_test_prec,rank_test_prec,split0_test_F1,split1_test_F1,split2_test_F1,split3_test_F1,split4_test_F1,mean_test_F1,std_test_F1,rank_test_F1
68,10.136246,0.583543,0.580342,0.149497,250,4,2,,20,gini,"{'n_estimators': 250, 'min_samples_split': 4, ...",0.954545,1.0,0.904762,0.869565,0.863636,0.918502,0.052029,4,0.84,0.697674,0.791667,0.8,0.76,0.777868,0.04752,1
65,9.844708,0.46487,0.478826,0.04348,250,2,2,,20,gini,"{'n_estimators': 250, 'min_samples_split': 2, ...",0.954545,1.0,0.904762,0.869565,0.863636,0.918502,0.052029,4,0.84,0.697674,0.791667,0.8,0.76,0.777868,0.04752,1
54,5.664063,0.31584,0.255237,0.008975,150,2,1,,20,gini,"{'n_estimators': 150, 'min_samples_split': 2, ...",0.952381,1.0,0.9,0.952381,0.904762,0.941905,0.036688,1,0.816327,0.697674,0.765957,0.833333,0.77551,0.77776,0.047189,3
63,6.074953,0.625996,0.336723,0.043669,150,2,2,,20,gini,"{'n_estimators': 150, 'min_samples_split': 2, ...",0.913043,1.0,0.904762,0.909091,0.826087,0.910597,0.055095,6,0.823529,0.697674,0.791667,0.816327,0.745098,0.774859,0.047358,4
66,5.749916,0.475423,0.303976,0.05427,150,4,2,,20,gini,"{'n_estimators': 150, 'min_samples_split': 4, ...",0.913043,1.0,0.904762,0.909091,0.826087,0.910597,0.055095,6,0.823529,0.697674,0.791667,0.816327,0.745098,0.774859,0.047358,4
