# Second Iteration. Notes for the Project Report

The same algorithms are used: LR, DT, RF and SVM.

A different set of features is used in this iteration. It is read from the pre-processed files "FeatureSelectionDTOutput.csv" (train) and "FeatureSelectionDTTestOutput.csv" (test).

# LOADING TRAIN AND TEST DATA

In [1]:
# Load the training set from the pre-processed CSV file.
import pandas as pd

data_train = pd.read_csv("FeatureSelectionDTOutput.csv")

# Split the underlying array into inputs (columns 0..9) and the output
# (column 10).
values_train = data_train.values
X_train, y_train = values_train[:, :10], values_train[:, 10]

data_train.shape

(97044, 11)

In [2]:
# Load the full test set from the pre-processed CSV file.
data_test_full = pd.read_csv("FeatureSelectionDTTestOutput.csv")

# Keep only the columns present in the training frame, in the same order.
columns_needed = data_train.columns.tolist()
data_test = data_test_full[columns_needed].copy()

# Split the underlying array into inputs (columns 0..9) and the output
# (column 10).
values_test = data_test.values
X_test, y_test = values_test[:, :10], values_test[:, 10]

data_test.shape

(40158, 11)

# LOGISTIC REGRESSION

## LR with default hyperparameters

In [None]:
# Initiate the LR model with default hyperparameters
# (the constructor is called without arguments).
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Fit the default-hyperparameter model on the training arrays.
# NOTE(review): open question kept from the original notes - the data is not
# split into train and validation sets here; all of X_train is used to fit.
lr.fit(X=X_train, y=y_train)

In [None]:
# Score the default model on the TEST set (accuracy).
lr.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the default LR model on the TEST set.
from sklearn.metrics import confusion_matrix

lr_predicted = lr.predict(X=X_test)
print(confusion_matrix(y_true=y_test, y_pred=lr_predicted))

## LR hyperparameters tuning (Random Search)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the random search over LR hyperparameters.
# The dual formulation is only implemented for the liblinear solver with an
# l2 penalty, so the solver is pinned to liblinear. Without this, every
# sampled candidate with dual=True fails to fit whenever the estimator's
# default solver is something other than liblinear.
lr_params = {
    'solver': ['liblinear'],
    'dual': [True, False],
    'C': [0.1, 0.5, 1.0, 1.5, 2.0, 2.5],
    'max_iter': [100, 150, 200, 300, 500, 1000],
}

In [None]:
# Configure a randomized search around the LR estimator (3-fold CV, all
# cores, fixed seed) and time how long fitting it takes.
lr_random = RandomizedSearchCV(
    estimator=lr,
    param_distributions=lr_params,
    cv=3,
    n_jobs=-1,
    random_state=2019,
)

import time

start_time = time.time()
lr_random_result = lr_random.fit(X_train, y_train)
finish_time = time.time()

# Report the best score found, the parameters that produced it, and the
# elapsed time in seconds.
print("Best: %f using %s" % (lr_random_result.best_score_, lr_random_result.best_params_))
print("Execution time: " + str(finish_time - start_time))

In [None]:
# Apply best values of hyperparameters to the model.
# From this point on `lr_random` no longer names the search object; it names
# the estimator taken from the search's `best_estimator_` attribute.
lr_random = lr_random.best_estimator_

In [None]:
# Fit the tuned estimator on the TRAIN set, then report its accuracy on the
# TEST set.
lr_random.fit(X=X_train, y=y_train)
lr_random.score(X=X_test, y=y_test)

In [None]:
# Build the confusion matrix for the TUNED LR model on the TEST set.
from sklearn.metrics import confusion_matrix
lr_random_predicted = lr_random.predict(X_test)
# Use the tuned model's predictions here; passing `lr_predicted` would print
# the default model's matrix a second time.
print(confusion_matrix(y_test, lr_random_predicted))

## LR tuning Results

In [None]:
# Report test accuracy and the full parameter set for the default LR model
# and for the tuned LR model.
print(
    "LR default hyperparameters test accuracy: ",
    lr.score(X_test, y_test),
    ', parameters: ',
    '\n',
    lr.get_params(),
    '\n',
)
print(
    "LR tuned hyperparameters test accuracy: ",
    lr_random.score(X_test, y_test),
    ', parameters: ',
    '\n',
    lr_random.get_params(),
)

# DECISION TREE

## DT with default hyperparameters

In [None]:
# Initiate a DT model using default hyperparameters
# (the constructor is called without arguments, so no random_state is set).
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [None]:
# Fit the default decision tree on the training arrays.
dt.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the default decision tree on the TEST set.
dt.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the default decision tree on the TEST set.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=y_test, y_pred=dt.predict(X=X_test)))

## DT hyperparameters tuning (Random Search)

In [None]:
# Candidate values for the random search over DT hyperparameters.
# - max_depth must be None (no limit) or a positive integer; a fractional
#   depth such as 0.1 is not valid, so only None and integer depths are listed.
# - min_samples_leaf: floats are fractions of the training samples, the
#   integer 1 is an absolute count.
# - max_features: None uses all features, floats are fractions of the features.
dt_params = {
    'max_depth': [None, 1, 3, 5, 10],
    'min_samples_leaf': [0.04, 0.06, 0.08, 1],
    'max_features': [None, 0.2, 0.4, 0.6, 0.8],
}

In [None]:
# Configure a randomized search around the DT estimator (10-fold CV, all
# cores, fixed seed) and time how long fitting it takes.
from sklearn.model_selection import RandomizedSearchCV

dt_random = RandomizedSearchCV(
    estimator=dt,
    param_distributions=dt_params,
    cv=10,
    n_jobs=-1,
    random_state=2019,
)

import time

start_time = time.time()
dt_random_result = dt_random.fit(X_train, y_train)
finish_time = time.time()

# Report the best score found, the parameters that produced it, and the
# elapsed time in seconds.
print("Best: %f using %s" % (dt_random_result.best_score_, dt_random_result.best_params_))
print("Execution time: " + str(finish_time - start_time))

In [None]:
# Apply best values of hyperparameters to the model.
# From this point on `dt_random` no longer names the search object; it names
# the estimator taken from the search's `best_estimator_` attribute.
dt_random = dt_random.best_estimator_

In [None]:
# Fit the tuned decision tree on the TRAIN set.
dt_random.fit(X=X_train, y=y_train)

In [None]:
# Fit the tuned decision tree on the TRAIN set, then report its accuracy on
# the TEST set.
dt_random.fit(X=X_train, y=y_train)
dt_random.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the tuned decision tree on the TEST set.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=y_test, y_pred=dt_random.predict(X=X_test)))

## DT tuning Results

In [None]:
# Report test accuracy and the full parameter set for the default DT model
# and for the tuned DT model.
print(
    "DT default hyperparameters test accuracy: ",
    dt.score(X_test, y_test),
    ', parameters: ',
    '\n',
    dt.get_params(),
    '\n',
)
print(
    "DT tuned hyperparameters test accuracy: ",
    dt_random.score(X_test, y_test),
    ', parameters: ',
    '\n',
    dt_random.get_params(),
)

# RANDOM FOREST

## RF with default hyperparameters

In [None]:
# Initiate a RF model using default hyperparameters
# (the constructor is called without arguments, so no random_state is set).
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [None]:
# Fit the default random forest on the training arrays.
rf.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the default random forest on the TEST set.
rf.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the default random forest on the TEST set.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=y_test, y_pred=rf.predict(X=X_test)))

## RF hyperparameters tuning (Random Search)

In [None]:
# Candidate values for the random search over RF hyperparameters.
# - min_samples_leaf: floats are fractions of the training samples, the
#   integer 1 is an absolute count.
# - max_features: 'auto' is intentionally not listed. For a classifier forest
#   it was only an alias of 'sqrt', and newer scikit-learn releases no longer
#   accept it, so listing it either duplicates 'sqrt' or makes fits fail.
rf_params = {
    'n_estimators': [1, 5, 10, 30, 50, 100, 300, 400, 500],
    'max_depth': [None, 4, 6, 8],
    'min_samples_leaf': [0.1, 0.2, 0.5, 1],
    'max_features': ['log2', 'sqrt'],
}

In [None]:
# Configure a randomized search around the RF estimator (3-fold CV, all
# cores, fixed seed) and time how long fitting it takes.
from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    cv=3,
    n_jobs=-1,
    random_state=2019,
)

import time

start_time = time.time()
rf_random_result = rf_random.fit(X_train, y_train)
finish_time = time.time()

# Report the best score found, the parameters that produced it, and the
# elapsed time in seconds.
print("Best: %f using %s" % (rf_random_result.best_score_, rf_random_result.best_params_))
print("Execution time: " + str(finish_time - start_time))

In [None]:
# Apply best values of hyperparameters to the model.
# From this point on `rf_random` no longer names the search object; it names
# the estimator taken from the search's `best_estimator_` attribute.
rf_random = rf_random.best_estimator_

In [None]:
# Train the tuned model on the TRAIN set and check its accuracy on the TEST
# set. The score call makes this cell actually report the accuracy, in line
# with the equivalent LR, DT and SVC cells.
rf_random.fit(X_train, y_train)
rf_random.score(X_test, y_test)

## RF tuning Results

In [None]:
# Report test accuracy and the full parameter set for the default RF model
# and for the tuned RF model.
print(
    "RF default hyperparameters test accuracy: ",
    rf.score(X_test, y_test),
    ', parameters: ',
    '\n',
    rf.get_params(),
    '\n',
)
print(
    "RF tuned hyperparameters test accuracy: ",
    rf_random.score(X_test, y_test),
    ', parameters: ',
    '\n',
    rf_random.get_params(),
)

# SVM (SVC)

## SVC with default hyperparameters

In [None]:
# Initiate an SVC model using default hyperparameters
# (the constructor is called without arguments).
from sklearn import svm
svclassifier = svm.SVC()

In [None]:
# Fit the default SVC on the training arrays.
svclassifier.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the default SVC on the TEST set.
svclassifier.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the default SVC on the TEST set.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=y_test, y_pred=svclassifier.predict(X=X_test)))

 ## SVC hyperparameters tuning (Random Search)

In [None]:
# Candidate values for the random search over SVC hyperparameters:
# regularisation strength, kernel type, polynomial degree and kernel
# coefficient.
svc_params = {
    'C': [0.1, 0.5, 1, 3, 5],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': [0.01, 0.1, 1, 10],
}

In [None]:
# Configure a randomized search around the SVC estimator (only 3 sampled
# candidates, 3-fold CV, all cores, fixed seed) and time how long fitting it
# takes.
from sklearn.model_selection import RandomizedSearchCV

svc_random = RandomizedSearchCV(
    estimator=svclassifier,
    n_iter=3,
    param_distributions=svc_params,
    cv=3,
    n_jobs=-1,
    random_state=2019,
)

import time

start_time = time.time()
svc_random_result = svc_random.fit(X_train, y_train)
finish_time = time.time()

# Report the best score found, the parameters that produced it, and the
# elapsed time in seconds.
print("Best: %f using %s" % (svc_random_result.best_score_, svc_random_result.best_params_))
print("Execution time: " + str(finish_time - start_time))

In [None]:
# Apply best values of hyperparameters to the model.
# From this point on `svc_random` no longer names the search object; it names
# the estimator taken from the search's `best_estimator_` attribute.
svc_random = svc_random.best_estimator_

In [None]:
# Fit the tuned SVC on the TRAIN set, then report its accuracy on the TEST
# set.
svc_random.fit(X=X_train, y=y_train)
svc_random.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the tuned SVC on the TEST set.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=y_test, y_pred=svc_random.predict(X=X_test)))

## SVC tuning Results

In [None]:
# Report test accuracy and the full parameter set for the default SVC model
# and for the tuned SVC model.
print(
    "SVC default hyperparameters test accuracy: ",
    svclassifier.score(X_test, y_test),
    ', parameters: ',
    '\n',
    svclassifier.get_params(),
    '\n',
)
print(
    "SVC tuned hyperparameters test accuracy: ",
    svc_random.score(X_test, y_test),
    ', parameters: ',
    '\n',
    svc_random.get_params(),
)

# Compare Algorithms Performance

In [None]:
# Print the TEST-set accuracy of each tuned model, one line per algorithm.
print(
    "LR tuned hyperparameters test accuracy: ",
    lr_random.score(X=X_test, y=y_test),
)
print(
    "DT tuned hyperparameters test accuracy: ",
    dt_random.score(X=X_test, y=y_test),
)
print(
    "RF tuned hyperparameters test accuracy: ",
    rf_random.score(X=X_test, y=y_test),
)
print(
    "SVC tuned hyperparameters test accuracy: ",
    svc_random.score(X=X_test, y=y_test),
)