# LOADING TRAIN AND TEST DATA

In [1]:
# Train data.
import pandas as pd
data_train = pd.read_csv("FeatureSelectionOutput.csv")

# split values into inpits and outputs.
values_train = data_train.values
X_train = values_train[:,1:11]
y_train = values_train[:,0]

data_train.shape

(97044, 11)

In [2]:
# Test data.
data_test_full = pd.read_csv("ScaledTestDataSet.csv")

# Create new dataset with features previously selected.
columns_needed = list(data_train.columns)
data_test = data_test_full[columns_needed].copy()

# split values into inpits and outputs.
values_test = data_test.values
X_test = values_test[:,1:11]
y_test = values_test[:,0]

data_test.shape

(40158, 11)

# LOGISTIC REGRESSION

## LR with default hyperparameters

In [3]:
# Initiate the LR model with defualt hyperparameters.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [4]:
# Fit the model using default hyperparameters.
# K, you don't split into train and validate sets??
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [5]:
# Run predictions on TEST set and see the accuracy.
lr.score(X_test,y_test)

0.9626973454853329

In [6]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
lr_predicted = lr.predict(X_test)
print(confusion_matrix(y_test, lr_predicted))

[[18584  1495]
 [    3 20076]]


## LR hyperparameters tuning (Random Search)

In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Create array of values for tuned hyperparameters.
lr_params = {'dual' : [True,False], 
             'max_iter' : [100,110,120,130,140],
             'C' : [1.0,1.5,2.0,2.5]
            }

In [8]:
# Run random search and initiate the model with tuned parameters.
lr_random = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
lr_random_result = lr_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (lr_random_result.best_score_, lr_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))



Best: 0.982297 using {'max_iter': 130, 'dual': False, 'C': 2.0}
Execution time: 23.146897554397583


In [9]:
# Apply best values of hyperparameters to the model.
lr_random = lr_random.best_estimator_

In [11]:
# Train the tuned model on TRAIN set and check the accuracy
lr_random.fit(X_train, y_train)
lr_random.score(X_test,y_test)



0.9750983614721849

In [12]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
lr_random_predicted = lr_random.predict(X_test)
print(confusion_matrix(y_test, lr_predicted))

[[18584  1495]
 [    3 20076]]


## LR tuning Results

In [38]:
print("LR default hyperparameters test accuracy: ", lr.score(X_test,y_test),', parameters: ', '\n', lr.get_params(),'\n')
print("LR tuned hyperparameters test accuracy: ", lr_random.score(X_test,y_test),', parameters: ', '\n', lr_random.get_params())

LR default hyperparameters test accuracy:  0.9626973454853329 , parameters:  
 {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} 

LR tuned hyperparameters test accuracy:  0.9750983614721849 , parameters:  
 {'C': 2.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 130, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


# DECISION TREE

## DT with default hyperparameters

In [14]:
# Initiate a DT model using default hyperparameters.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [15]:
# Train model on train data.
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [16]:
# Check model accuracy on the TEST set.
dt.score(X_test, y_test)

0.9761442302903531

In [50]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, dt.predict(X_test)))

[[19124   955]
 [    3 20076]]


## DT hyperparameters tuning (Random Search)

In [23]:
# Create array of values for tuned hyperparameters.
dt_params = {'max_depth': [None, 0.1, 1, 3, 5, 10], 
             'min_samples_leaf': [0.04, 0.06, 0.08, 1], 
             'max_features': [None, 0.2, 0.4,0.6, 0.8]}

In [24]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
dt_random = RandomizedSearchCV(estimator=dt, param_distributions=dt_params, cv = 10, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
dt_random_result = dt_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (dt_random_result.best_score_, dt_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.987552 using {'min_samples_leaf': 1, 'max_features': 0.8, 'max_depth': None}
Execution time: 3.8826050758361816


In [25]:
# Apply best values of hyperparameters to the model.
dt_random = dt_random.best_estimator_

In [26]:
# Train the tuned model on TRAIN set and check the accuracy
dt_random.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=0.8, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [51]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, dt_random.predict(X_test)))

[[19124   955]
 [    3 20076]]


## DT tuning Results

In [37]:
print("DT default hyperparameters test accuracy: ", dt.score(X_test,y_test),', parameters: ', '\n', dt.get_params(),'\n')
print("DT tuned hyperparameters test accuracy: ", dt_random.score(X_test,y_test),', parameters: ', '\n', dt_random.get_params())

DT default hyperparameters test accuracy:  0.9761442302903531 parameters:  
 {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'} 

DT tuned hyperparameters test accuracy:  0.9761442302903531 parameters:  
 {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 0.8, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}


# RANDOM FOREST

## RF with default hyperparameters

In [40]:
# Initiate a RF model using default hyperparameters.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [41]:
# Train model on train data.
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [42]:
# Check model accuracy on the TEST set.
rf.score(X_test, y_test)

0.9066098444011766

In [52]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, rf.predict(X_test)))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

## RF hyperparameters tuning (Random Search)

In [54]:
# Define a grid of hyperparameters.
rf_params = { 'n_estimators': [1, 5, 10, 30, 50, 100, 300, 400, 500], 
             'max_depth': [None, 4, 6, 8], 
             'min_samples_leaf': [0.1, 0.2, 0.5, 1], 
             'max_features': ['auto', 'log2', 'sqrt']
            }

In [44]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
rf_random_result = rf_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (rf_random_result.best_score_, rf_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.286732 using {'n_estimators': 500, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 8}
Execution time: 43.41524076461792


In [45]:
# Apply best values of hyperparameters to the model.
rf_random = rf_random.best_estimator_

In [46]:
# Train the tuned model on TRAIN set and check the accuracy
rf_random.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

## RF tuning Results

In [55]:
print("RF default hyperparameters test accuracy: ", rf.score(X_test,y_test),', parameters: ', '\n', rf.get_params(),'\n')
print("RF tuned hyperparameters test accuracy: ", rf_random.score(X_test,y_test),', parameters: ', '\n', rf_random.get_params())

RF default hyperparameters test accuracy:  0.9066098444011766 , parameters:  
 {'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 

RF tuned hyperparameters test accuracy:  0.8517453940637931 , parameters:  
 {'bootstrap': True, 'criterion': 'mse', 'max_depth': 8, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


# SVM (SVC)

## SVC with default hyperparameters

In [62]:
from sklearn import svm
svclassifier = svm.SVC()

In [57]:
svclassifier.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [58]:
svclassifier.score(X_test, y_test)

0.9625728372926938

In [59]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, svclassifier.predict(X_test)))

[[18582  1497]
 [    6 20073]]


 ## SVC hyperparameters tuning (Random Search)

In [60]:
# Define a grid of hyperparameters.
svc_params = { 'C': [0.1, 0.5, 1, 3, 5], 
             'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
             'degree': [2, 3, 4], 
              'gamma': [0.01, 0.1, 1, 10, 100]
            }

In [61]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
svc_random = RandomizedSearchCV(estimator=svclassifier, param_distributions=svc_params, cv = 3, n_jobs=-1, 
                                random_state = 2019)

import time
start_time = time.time()
svc_random_result = svc_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (svc_random_result.best_score_, svc_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.987397 using {'kernel': 'rbf', 'gamma': 10, 'degree': 3, 'C': 3}
Execution time: 373.02566170692444


In [63]:
# Build confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, svc_random.predict(X_test)))

[[19099   980]
 [ 4740 15339]]
