# REFINING LOGISTIC REGRESSION

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv("FeatureSelectionOutput.csv")

In [3]:
data.head()

Unnamed: 0,class,radiotap.datarate,wlan.fc.moredata,wlan.fc.protected,wlan.fc.pwrmgt,wlan.wep.key,wlan_mgt.fixed.auth_seq,wlan_mgt.fixed.capabilities.preamble,wlan_mgt.fixed.capabilities.short_slot_time,wlan_mgt.fixed.timestamp,wlan_mgt.rsn.akms.type
0,0,2.043483,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.048053,0.0
3,0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,1.0,0.1683,1.0
4,0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.048054,0.0


In [4]:
data.shape

(97044, 11)

In [5]:
# split values into inpits and outputs.
values = data.values
X = values[:,1:11]
y = values[:,0]

In [6]:
# Initiate the LR model. Random state value is "2019" for results reporodubility.
lr = LogisticRegression(random_state=2019)

In [7]:
# List all available hyperparameters for Logistic regression model.
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 2019,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [16]:
# Fit the model using default hyperparameters.
lr.fit(X,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# CV approach will be used to see the cross-validation score of the model accuracy.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=3, random_state=2019)
result = cross_val_score(lr, X, y, cv=kfold, scoring="accuracy")
print("Model accuracy with default hyperparameters is: ", result.mean())



Model accuracy with default hyperparameters is:  0.9417274638308397




# Grid Search of optimal hyperparameters

In [11]:
# Now we will look for the best performing combination of pre-defined set of hyperparameters.
# GridSearchCV checks every possible combination, this approach is computationally extensive.
from sklearn.model_selection import GridSearchCV

In [12]:
# Create arrays of values for each hyperparameter:
dual=[True,False]
max_iter=[100, 110, 120, 130, 140]
C = [1.0, 1.5, 2.0, 2.5]
tol = [1e-10, 1e-4, 1e-3, 1e-2, 1e-1]

# Put all arrays into the grid (dictionary form).
lr_params = dict(dual=dual,max_iter=max_iter,C=C, tol=tol)

In [17]:
# Run the grid search.
lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, cv = 3, n_jobs=-1)
import time
start_time = time.time()
lr_grid_result = lr_grid.fit(X, y)
finish_time = time.time()

# Summarize results.
print("Best: %f using %s" % (lr_grid_result.best_score_, lr_grid_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))



Best: 0.982297 using {'C': 2.0, 'dual': True, 'max_iter': 100, 'tol': 1e-10}
Execution time: 106.3335657119751


# Random Search of optimal hyperparameters

In [18]:
# Another way of selecting best performing combination of pre-defined set of hyperparameters is random search.
# RandomizedSearchCV will randomly choose 
from sklearn.model_selection import RandomizedSearchCV

In [20]:
# Run random search.
lr_random = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
lr_random_result = lr_random.fit(X, y)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (lr_random_result.best_score_, lr_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))



Best: 0.982297 using {'tol': 0.01, 'max_iter': 140, 'dual': True, 'C': 2.5}
Execution time: 5.036901950836182


In [21]:
# RandomizedSearchCV executional time is significantly smaller than GridSearchCV.
# Whilst accuracy value is the same, hyperparameter values are different.

# RESULTS

Default hyperparameters model: accuracy = 0.982297

Predefined hyperparameters grid search model: 
accuracy 0.982297,
parameters = {'C': 2.0, 'dual': True, 'max_iter': 100, 'tol': 1e-10}

Predefined hyperparameters random search model: 
accuracy 0.982297,
parameters = {'tol': 0.01, 'max_iter': 140, 'dual': True, 'C': 2.5}

In [26]:
lr_best_model = lr_random.best_estimator_