# DECISION TREES

In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [12]:
data = pd.read_csv("FeatureSelectionOutput.csv")

In [13]:
data.head()

Unnamed: 0,class,radiotap.datarate,wlan.fc.moredata,wlan.fc.protected,wlan.fc.pwrmgt,wlan.wep.key,wlan_mgt.fixed.auth_seq,wlan_mgt.fixed.capabilities.preamble,wlan_mgt.fixed.capabilities.short_slot_time,wlan_mgt.fixed.timestamp,wlan_mgt.rsn.akms.type
0,0,2.043483,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.048053,0.0
3,0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,1.0,0.1683,1.0
4,0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.048054,0.0


In [14]:
data.shape

(97044, 11)

In [15]:
# split values into inpits and outputs.
values = data.values
X = values[:,1:11]
y = values[:,0]

In [16]:
# Initiate a DT model using default hyperparameters. Set random state value "2019" for reproducibility.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=2019)

In [17]:
# List all available hyperparameters for decision tree model.
dt.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 2019,
 'splitter': 'best'}

In [18]:
# Fit the model using default hyperparameters.
dt.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=2019, splitter='best')

In [9]:
# CV approach will be used to see the cross-validation score of the model accuracy.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=3, random_state=2019)
result = cross_val_score(dt, X, y, cv=kfold, scoring="accuracy")
print("Model accuracy with default hyperparameters is: ", result.mean())

Model accuracy with default hyperparameters is:  0.9874696014179136


# Grid Search of optimal hyperparameters

In [10]:
# Selecting best hyperparameters using grid search approach.
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters:
dt_params = {'max_depth': [2, 3, 4, 5, 6], 
             'min_samples_leaf': [0.04, 0.06, 0.08], 
             'max_features': [0.2, 0.4,0.6, 0.8]}

# Run the grid search.
dt_grid = GridSearchCV(estimator=dt, param_grid=dt_params, scoring='accuracy',cv=10,n_jobs=-1)
import time
start_time = time.time()
dt_grid_result = dt_grid.fit(X, y)
finish_time = time.time()

# Summarize results.
print("Best: %f using %s" % (dt_grid_result.best_score_, dt_grid_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

KeyboardInterrupt: 

# Random Search of optimal hyperparameters

In [None]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
dt_random = RandomizedSearchCV(estimator=dt, param_distributions=dt_params, cv = 10, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
dt_random_result = dt_random.fit(X, y)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (dt_random_result.best_score_, dt_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# RandomizedSearchCV executional time is significantly smaller than GridSearchCV.
# Whilst accuracy value is the same, hyperparameter values are different.

# Results
Default hyperparameters model: accuracy = 0.9875618694607151

Predefined hyperparameters grid search model: accuracy 0.948559 using {'max_depth': 2, 'max_features': 0.4, 'min_samples_leaf': 0.04}

Predefined hyperparameters random search model: accuracy 0.948559 using {'min_samples_leaf': 0.08, 'max_features': 0.4, 'max_depth': 5}

In [None]:
dt_best_model = dt_random.best_estimator_

# RANDOM FORESTS

In [19]:
# Initiate a RF model using default hyperparameters. Set random state value "2019" for reproducibility.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state= 2019)

In [20]:
# List all available hyperparameters for decision tree model.
rf.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2019,
 'verbose': 0,
 'warm_start': False}

In [21]:
# Parameters tuning.
# Define a grid of hyperparameter 'params_rf'
rf_params = { 'n_estimators': [300, 400, 500], 
             'max_depth': [4, 6, 8], 
             'min_samples_leaf': [0.1, 0.2], 
             'max_features': ['log2', 'sqrt']
            }

In [22]:
# Run the grid search.
from sklearn.model_selection import GridSearchCV
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

import time
start_time = time.time()
rf_grid_result = rf_grid.fit(X, y)
finish_time = time.time()

# Summarize results.
print("Best: %f using %s" % (rf_grid_result.best_score_, rf_grid_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  2.8min finished


Best: -0.239289 using {'max_depth': 4, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 500}
Execution time: 171.11470413208008


In [24]:
# Run the random search.
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
rf_random_result = rf_random.fit(X, y)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (rf_random_result.best_score_, rf_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.205104 using {'n_estimators': 500, 'min_samples_leaf': 0.1, 'max_features': 'log2', 'max_depth': 6}
Execution time: 49.217713594436646
