# Notes for the Project Report

Same algorithms to be used: LR, DT, RF and SVM.

Another set of features to use. Pre-processed files "FS_DT10_train_output.csv" and "FS_DT10_test_output.csv".

# LOADING TRAIN AND TEST DATA

In [1]:
# Train data.
import pandas as pd
data_train = pd.read_csv("FS_DT10_train_output.csv")

In [2]:
data_train.head()

Unnamed: 0,radiotap.channel.type.ofdm,wlan.fc.pwrmgt,wlan.fc.protected,radiotap.datarate,wlan.fc.ds,frame.cap_len,wlan.fc.type,wlan.fc.subtype,wlan.seq,radiotap.mactime,class
0,1.0,1.0,-1.0,2.043483,-0.5,-1.692227,0.0,0.0,3.586345,-3.42419,0
1,1.0,0.0,-1.0,1.0,-1.0,-2.23067,-1.0,1.125012,-0.37181,-3.424014,0
2,0.0,0.0,-1.0,0.0,-1.0,1.923032,-2.0,0.5,0.696617,-3.423662,0
3,0.0,0.0,-1.0,0.0,-1.0,3.346063,-2.0,0.5,-0.067415,-3.423221,0
4,0.0,0.0,-1.0,0.0,-1.0,1.923032,-2.0,0.5,0.702766,-3.422868,0


In [4]:
data_train.shape

(97044, 11)

In [6]:
# split values into inpits and outputs.
values_train = data_train.values
X_train = values_train[:,0:10]
y_train = values_train[:,10]

In [7]:
# Test data.
data_test_full = pd.read_csv("FS_DT10_test_output.csv")

# Create new dataset with features previously selected.
columns_needed = list(data_train.columns)
data_test = data_test_full[columns_needed].copy()

# split values into inpits and outputs.
values_test = data_test.values
X_test = values_test[:,0:10]
y_test = values_test[:,10]

data_test.shape

(40158, 11)

# LOGISTIC REGRESSION

## LR with default hyperparameters

In [8]:
# Initiate the LR model with defualt hyperparameters.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [9]:
# Fit the model using default hyperparameters.
# K, you don't split into train and validate sets??
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Run predictions on TEST set and see the accuracy.
lr.score(X_test,y_test)

0.9563474276607401

In [11]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
lr_predicted = lr.predict(X_test)
print(confusion_matrix(y_test, lr_predicted))

[[19799   280]
 [ 1473 18606]]


## LR hyperparameters tuning (Random Search)

In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Create array of values for tuned hyperparameters.
lr_params = {'dual' : [True,False], 
             'C' : [0.1, 0.5, 1.0, 1.5, 2.0, 2.5], 
             'max_iter' : [100, 150, 200, 300, 500, 1000]
             }

In [13]:
# Run random search and initiate the model with tuned parameters.
lr_random = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
lr_random_result = lr_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (lr_random_result.best_score_, lr_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))



Best: 0.900746 using {'max_iter': 500, 'dual': True, 'C': 0.1}
Execution time: 18.19162154197693


In [14]:
# Apply best values of hyperparameters to the model.
lr_random = lr_random.best_estimator_

In [15]:
# Train the tuned model on TRAIN set and check the accuracy
lr_random.fit(X_train, y_train)
lr_random.score(X_test,y_test)



0.9562478211066289

In [16]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
lr_random_predicted = lr_random.predict(X_test)
print(confusion_matrix(y_test, lr_random_predicted))

[[19795   284]
 [ 1473 18606]]


## LR tuning Results

In [17]:
print("LR default hyperparameters test accuracy: ", lr.score(X_test,y_test),', parameters: ', '\n', lr.get_params(),'\n')
print("LR tuned hyperparameters test accuracy: ", lr_random.score(X_test,y_test),', parameters: ', '\n', lr_random.get_params())

LR default hyperparameters test accuracy:  0.9563474276607401 , parameters:  
 {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} 

LR tuned hyperparameters test accuracy:  0.9562478211066289 , parameters:  
 {'C': 0.1, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


# DECISION TREE

## DT with default hyperparameters

In [18]:
# Initiate a DT model using default hyperparameters.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [19]:
# Train model on train data.
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [20]:
# Check model accuracy on the TEST set.
dt.score(X_test, y_test)

0.485855869316201

In [21]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, dt.predict(X_test)))

[[19052  1027]
 [19620   459]]


## DT hyperparameters tuning (Random Search)

In [22]:
# Create array of values for tuned hyperparameters.
dt_params = {'max_depth': [None, 0.1, 1, 3, 5, 10], 
             'min_samples_leaf': [0.04, 0.06, 0.08, 1], 
             'max_features': [None, 0.2, 0.4,0.6, 0.8]}

In [23]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
dt_random = RandomizedSearchCV(estimator=dt, param_distributions=dt_params, cv = 10, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
dt_random_result = dt_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (dt_random_result.best_score_, dt_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.929496 using {'min_samples_leaf': 0.06, 'max_features': 0.4, 'max_depth': 5}
Execution time: 4.551194667816162


In [24]:
# Apply best values of hyperparameters to the model.
dt_random = dt_random.best_estimator_

In [25]:
# Train the tuned model on TRAIN set and check the accuracy
dt_random.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=0.4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.06, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [26]:
# Train the tuned model on TRAIN set and check the accuracy
dt_random.fit(X_train, y_train)
dt_random.score(X_test,y_test)

0.8167737437123362

In [27]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, dt_random.predict(X_test)))

[[17889  2190]
 [ 5168 14911]]


## DT tuning Results

In [28]:
print("DT default hyperparameters test accuracy: ", dt.score(X_test,y_test),', parameters: ', '\n', dt.get_params(),'\n')
print("DT tuned hyperparameters test accuracy: ", dt_random.score(X_test,y_test),', parameters: ', '\n', dt_random.get_params())

DT default hyperparameters test accuracy:  0.485855869316201 , parameters:  
 {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'} 

DT tuned hyperparameters test accuracy:  0.8167737437123362 , parameters:  
 {'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 0.4, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 0.06, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}


# RANDOM FOREST

## RF with default hyperparameters

In [29]:
# Initiate a RF model using default hyperparameters.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [30]:
# Train model on train data.
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
# Check model accuracy on the TEST set.
rf.score(X_test, y_test)

0.8610737586533194

In [32]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, rf.predict(X_test)))

[[20069    10]
 [ 5569 14510]]


## RF hyperparameters tuning (Random Search)

In [33]:
# Define a grid of hyperparameters.
rf_params = { 'n_estimators': [1, 5, 10, 30, 50, 100, 300, 400, 500], 
             'max_depth': [None, 4, 6, 8], 
             'min_samples_leaf': [0.1, 0.2, 0.5, 1], 
             'max_features': ['auto', 'log2', 'sqrt']
            }

In [34]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
rf_random_result = rf_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (rf_random_result.best_score_, rf_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.909103 using {'n_estimators': 500, 'min_samples_leaf': 0.1, 'max_features': 'sqrt', 'max_depth': 4}
Execution time: 63.70573377609253


In [35]:
# Apply best values of hyperparameters to the model.
rf_random = rf_random.best_estimator_

In [36]:
# Train the tuned model on TRAIN set and check the accuracy
rf_random.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## RF tuning Results

In [37]:
print("RF default hyperparameters test accuracy: ", rf.score(X_test,y_test),', parameters: ', '\n', rf.get_params(),'\n')
print("RF tuned hyperparameters test accuracy: ", rf_random.score(X_test,y_test),', parameters: ', '\n', rf_random.get_params())

RF default hyperparameters test accuracy:  0.8610737586533194 , parameters:  
 {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 

RF tuned hyperparameters test accuracy:  0.9624981323771105 , parameters:  
 {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 0.1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


# SVM (SVC)

## SVC with default hyperparameters

In [38]:
from sklearn import svm
svclassifier = svm.SVC()

In [39]:
svclassifier.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [40]:
svclassifier.score(X_test, y_test)

0.952313362219234

In [41]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, svclassifier.predict(X_test)))

[[19637   442]
 [ 1473 18606]]


 ## SVC hyperparameters tuning (Random Search)

In [42]:
# Define a grid of hyperparameters.
svc_params = { 'C': [0.1, 0.5, 1, 3, 5],  
              'gamma': [0.01, 0.1, 1, 10]
            }

In [None]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
svc_random = RandomizedSearchCV(estimator=svclassifier, n_iter=3, param_distributions=svc_params, cv = 3, n_jobs=-1, 
                                random_state = 2019)

import time
start_time = time.time()
svc_random_result = svc_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (svc_random_result.best_score_, svc_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Apply best values of hyperparameters to the model.
svc_random = svc_random.best_estimator_

In [None]:
# Train the tuned model on TRAIN set and check the accuracy
svc_random.fit(X_train, y_train)
svc_random.score(X_test,y_test)

In [None]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, svc_random.predict(X_test)))

## SVC tuning Results

In [None]:
print("SVC default hyperparameters test accuracy: ", svclassifier.score(X_test,y_test), 
      ', parameters: ', '\n', svclassifier.get_params(),'\n')
print("SVC tuned hyperparameters test accuracy: ", svc_random.score(X_test,y_test), 
      ', parameters: ', '\n', svc_random.get_params())

# Compare Algorithms Performance

In [None]:
print("LR tuned hyperparameters test accuracy: ", lr_random.score(X_test,y_test))
print(confusion_matrix(y_test, lr_random_predicted))
print()

print("DT tuned hyperparameters test accuracy: ", dt_random.score(X_test,y_test))
print(confusion_matrix(y_test, dt_random.predict(X_test)))
print()

print("RF tuned hyperparameters test accuracy: ", rf_random.score(X_test,y_test))
print(confusion_matrix(y_test, rf_random.predict(X_test)))
print()

print("SVC tuned hyperparameters test accuracy: ", svc_random.score(X_test,y_test))
print(confusion_matrix(y_test, svc_random.predict(X_test)))