# Notes for the Project Report

# LOADING TRAIN AND TEST DATA

In [None]:
# Load the TRAIN split from its CSV file; the cell output is the
# (rows, columns) shape of the loaded frame.
import pandas as pd
data_train = pd.read_csv(filepath_or_buffer="FS_pca5_train_output.csv")
data_train.shape

In [None]:
# Preview the first rows of the training data.
data_train.head()

In [None]:
# Split the TRAIN values into inputs and outputs:
# columns 0-4 are the inputs, column 5 is the output.
values_train = data_train.values
X_train, y_train = values_train[:, :5], values_train[:, 5]

data_train.shape

In [None]:
# Load the TEST split from its CSV file.
data_test = pd.read_csv(filepath_or_buffer="FS_pca5_test_output.csv")

# Split the TEST values into inputs and outputs:
# columns 0-4 are the inputs, column 5 is the output.
values_test = data_test.values
X_test, y_test = values_test[:, :5], values_test[:, 5]

data_test.shape

# LOGISTIC REGRESSION

## LR with default hyperparameters

In [None]:
# Initiate the LR model with default hyperparameters.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Fit the model using default hyperparameters.
# NOTE(review): the fit uses the full TRAIN set; no separate validation split
# is made at this point -- confirm that this is intended.
lr.fit(X_train, y_train)

In [None]:
# Accuracy of the default LR model on the TEST set.
lr_score = lr.score(X=X_test, y=y_test)
print(lr_score)

In [None]:
# Confusion matrix of the default LR model: true TEST labels compared with
# the labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
lr_cm = confusion_matrix(y_true=y_test, y_pred=lr.predict(X=X_test))
print(lr_cm)

## LR hyperparameters tuning (Random Search)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each LR hyperparameter explored by the random search.
lr_params = dict(
    C=[0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0],
    max_iter=[5, 10, 50, 100, 150, 200, 300],
)

In [None]:
# Run a random search over lr_params (3-fold cross-validation, all CPU cores)
# starting from the default LR model.
lr_random = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
lr_random.fit(X_train, y_train)
finish_time = time.perf_counter()

# Summarize results: best cross-validation score, its parameters, and the
# search duration in seconds.
print("Best: %f using %s" % (lr_random.best_score_, lr_random.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Take the best estimator found by the random search as the tuned LR model.
lr_tuned = lr_random.best_estimator_

In [None]:
# Check the accuracy of the tuned model on the TEST set.
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole TRAIN set with the best parameters; calling
# fit() again here would only repeat that work.
lr_tuned_score = lr_tuned.score(X_test, y_test)
print(lr_tuned_score)

In [None]:
# Confusion matrix of the tuned LR model: true TEST labels compared with the
# labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
lr_tuned_cm = confusion_matrix(y_true=y_test, y_pred=lr_tuned.predict(X=X_test))
print(lr_tuned_cm)

## LR tuning Results

In [None]:
# Default vs. tuned LR: test accuracy, full parameter set and confusion matrix.
_lr_report = (
    ("LR default hyperparameters test accuracy: ", lr_score, lr, lr_cm),
    ("LR tuned hyperparameters test accuracy: ", lr_tuned_score, lr_tuned, lr_tuned_cm),
)
for _pos, (_title, _acc, _model, _matrix) in enumerate(_lr_report):
    if _pos > 0:
        print()  # blank line between the two reports
    print(_title, _acc, ', parameters: ', '\n', _model.get_params())
    print('Confusion matrix: ', '\n', _matrix)

# DECISION TREE

## DT with default hyperparameters

In [None]:
# Initiate a decision tree (DT) classifier with default hyperparameters.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [None]:
# Fit the default DT model on the TRAIN inputs and outputs.
dt.fit(X_train, y_train)

In [None]:
# Accuracy of the default DT model on the TEST set.
dt_score = dt.score(X=X_test, y=y_test)
print(dt_score)

In [None]:
# Confusion matrix of the default DT model: true TEST labels compared with
# the labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
dt_cm = confusion_matrix(y_true=y_test, y_pred=dt.predict(X=X_test))
print(dt_cm)

## DT hyperparameters tuning (Random Search)

In [None]:
# Background reading on the tuned parameters:
# https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3
# Candidate values for each DT hyperparameter explored by the random search.
dt_params = dict(
    max_depth=[None, 1, 3, 5, 10, 50, 100, 300],
    min_samples_split=[2, 5, 10, 50, 100],
    min_samples_leaf=[1, 2, 5, 10, 50, 100],
)

In [None]:
# Run a random search over dt_params (10-fold cross-validation, all CPU cores)
# starting from the default DT model.
from sklearn.model_selection import RandomizedSearchCV
dt_random = RandomizedSearchCV(estimator=dt, param_distributions=dt_params, cv = 10, n_jobs=-1, random_state = 2019)

import time
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
dt_random.fit(X_train, y_train)
finish_time = time.perf_counter()

# Summarize results: best cross-validation score, its parameters, and the
# search duration in seconds.
print("Best: %f using %s" % (dt_random.best_score_, dt_random.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Take the best estimator found by the random search as the tuned DT model.
dt_tuned = dt_random.best_estimator_

In [None]:
# Check the accuracy of the tuned model on the TEST set.
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole TRAIN set with the best parameters; calling
# fit() again here would only repeat that work.
dt_tuned_score = dt_tuned.score(X_test, y_test)
print(dt_tuned_score)

In [None]:
# Confusion matrix of the tuned DT model: true TEST labels compared with the
# labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
dt_tuned_cm = confusion_matrix(y_true=y_test, y_pred=dt_tuned.predict(X=X_test))
print(dt_tuned_cm)

## DT tuning Results

In [None]:
# Default vs. tuned DT: test accuracy, full parameter set and confusion matrix.
_dt_report = (
    ("DT default hyperparameters test accuracy: ", dt_score, dt, dt_cm),
    ("DT tuned hyperparameters test accuracy: ", dt_tuned_score, dt_tuned, dt_tuned_cm),
)
for _pos, (_title, _acc, _model, _matrix) in enumerate(_dt_report):
    if _pos > 0:
        print()  # blank line between the two reports
    print(_title, _acc, ', parameters: ', '\n', _model.get_params())
    print('Confusion matrix: ', '\n', _matrix)

# RANDOM FOREST

## RF with default hyperparameters

In [None]:
# Initiate a random forest (RF) classifier with default hyperparameters.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [None]:
# Fit the default RF model on the TRAIN inputs and outputs.
rf.fit(X_train, y_train)

In [None]:
# Accuracy of the default RF model on the TEST set.
rf_score = rf.score(X=X_test, y=y_test)
print(rf_score)

In [None]:
# Confusion matrix of the default RF model: true TEST labels compared with
# the labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf.predict(X=X_test))
print(rf_cm)

## RF hyperparameters tuning (Random Search)

In [None]:
# Define a grid of hyperparameters for the RF random search.
# 'sqrt' is used instead of 'auto': for classifiers 'auto' meant
# sqrt(n_features), and the 'auto' alias was deprecated in scikit-learn 1.1
# and removed in 1.3, where it makes the candidate fail parameter validation.
rf_params = {
    'n_estimators': [1, 5, 10, 30, 50, 100, 200, 500],
    'max_depth': [None, 1, 2, 4, 8, 20, 50, 100],
    'min_samples_leaf': [1, 5, 10, 50, 100],
    'max_features': [None, 'sqrt', 'log2'],
}

In [None]:
# Run a random search over rf_params (3-fold cross-validation, all CPU cores)
# starting from the default RF model.
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
rf_random.fit(X_train, y_train)
finish_time = time.perf_counter()

# Summarize results: best cross-validation score, its parameters, and the
# search duration in seconds.
print("Best: %f using %s" % (rf_random.best_score_, rf_random.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Take the best estimator found by the random search as the tuned RF model.
rf_tuned = rf_random.best_estimator_

In [None]:
# Check the accuracy of the tuned model on the TEST set.
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole TRAIN set with the best parameters; calling
# fit() again here would only repeat that work.
rf_tuned_score = rf_tuned.score(X_test, y_test)
print(rf_tuned_score)

In [None]:
# Confusion matrix of the tuned RF model: true TEST labels compared with the
# labels the model predicts for the TEST inputs.
rf_tuned_cm = confusion_matrix(y_true=y_test, y_pred=rf_tuned.predict(X=X_test))
print(rf_tuned_cm)

## RF tuning Results

In [None]:
# Default vs. tuned RF: test accuracy, full parameter set and confusion matrix.
_rf_report = (
    ("RF default hyperparameters test accuracy: ", rf_score, rf, rf_cm),
    ("RF tuned hyperparameters test accuracy: ", rf_tuned_score, rf_tuned, rf_tuned_cm),
)
for _pos, (_title, _acc, _model, _matrix) in enumerate(_rf_report):
    if _pos > 0:
        print()  # blank line between the two reports
    print(_title, _acc, ', parameters: ', '\n', _model.get_params())
    print('Confusion matrix: ', '\n', _matrix)

# SVM (SVC)

## SVC with default hyperparameters

In [None]:
# Initiate a support vector classifier (SVC) with default hyperparameters.
from sklearn import svm
svc = svm.SVC()

In [None]:
# Fit the default SVC model on the TRAIN inputs and outputs.
svc.fit(X_train, y_train)

In [None]:
# Accuracy of the default SVC model on the TEST set.
svc_score = svc.score(X=X_test, y=y_test)
print(svc_score)

In [None]:
# Confusion matrix of the default SVC model: true TEST labels compared with
# the labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
svc_cm = confusion_matrix(y_true=y_test, y_pred=svc.predict(X=X_test))
print(svc_cm)

## SVC hyperparameters tuning (Random Search)

In [None]:
# Candidate values for each SVC hyperparameter explored by the random search.
svc_params = dict(
    C=[0.1, 0.5, 1, 3, 5],
    gamma=['scale', 'auto', 0.01, 0.1, 1, 10],
)

In [None]:
# Run a random search over svc_params (10 sampled candidates, 3-fold
# cross-validation, all CPU cores) starting from the default SVC model.
from sklearn.model_selection import RandomizedSearchCV
svc_random = RandomizedSearchCV(estimator=svc, n_iter=10, param_distributions=svc_params, cv = 3, n_jobs=-1,
                                random_state = 2019)

import time
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
svc_random.fit(X_train, y_train)
finish_time = time.perf_counter()

# Summarize results: best cross-validation score, its parameters, and the
# search duration in seconds.
print("Best: %f using %s" % (svc_random.best_score_, svc_random.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Take the best estimator found by the random search as the tuned SVC model.
svc_tuned = svc_random.best_estimator_

In [None]:
# Check the accuracy of the tuned model on the TEST set.
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole TRAIN set with the best parameters; calling
# fit() again here would only repeat that work.
svc_tuned_score = svc_tuned.score(X_test, y_test)
print(svc_tuned_score)

In [None]:
# Confusion matrix of the tuned SVC model: true TEST labels compared with the
# labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
svc_tuned_cm = confusion_matrix(y_true=y_test, y_pred=svc_tuned.predict(X=X_test))
print(svc_tuned_cm)

## SVC tuning Results

In [None]:
# Default vs. tuned SVC: test accuracy, full parameter set and confusion matrix.
_svc_report = (
    ("SVC default hyperparameters test accuracy: ", svc_score, svc, svc_cm),
    ("SVC tuned hyperparameters test accuracy: ", svc_tuned_score, svc_tuned, svc_tuned_cm),
)
for _pos, (_title, _acc, _model, _matrix) in enumerate(_svc_report):
    if _pos > 0:
        print()  # blank line between the two reports
    print(_title, _acc, ', parameters: ', '\n', _model.get_params())
    print('Confusion matrix: ', '\n', _matrix)

# KNN classifier

## KNN with default hyperparameters

In [None]:
# Initiate a k-nearest-neighbours (KNN) classifier with default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [None]:
# Fit the default KNN model on the TRAIN inputs and outputs.
knn.fit(X_train, y_train)

In [None]:
# Accuracy of the default KNN model on the TEST set.
knn_score = knn.score(X=X_test, y=y_test)
print(knn_score)

In [None]:
# Confusion matrix of the default KNN model: true TEST labels compared with
# the labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
knn_cm = confusion_matrix(y_true=y_test, y_pred=knn.predict(X=X_test))
print(knn_cm)

## KNN hyperparameters tuning (Random Search)

In [None]:
# Candidate values for each KNN hyperparameter explored by the random search.
knn_params = dict(
    n_neighbors=[3, 5, 10, 20, 50, 100],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[10, 30, 50, 100],
    p=[1, 2],
)

In [None]:
# Run a random search over knn_params (10 sampled candidates, 3-fold
# cross-validation, all CPU cores) starting from the default KNN model.
from sklearn.model_selection import RandomizedSearchCV
knn_random = RandomizedSearchCV(estimator=knn, n_iter=10, param_distributions=knn_params, cv = 3, n_jobs=-1,
                                random_state = 2019)

import time
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
knn_random.fit(X_train, y_train)
finish_time = time.perf_counter()

# Summarize results: best cross-validation score, its parameters, and the
# search duration in seconds.
print("Best: %f using %s" % (knn_random.best_score_, knn_random.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Take the best estimator found by the random search as the tuned KNN model.
knn_tuned = knn_random.best_estimator_

In [None]:
# Check the accuracy of the tuned model on the TEST set.
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole TRAIN set with the best parameters; calling
# fit() again here would only repeat that work.
knn_tuned_score = knn_tuned.score(X_test, y_test)
print(knn_tuned_score)

In [None]:
# Confusion matrix of the tuned KNN model: true TEST labels compared with the
# labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
knn_tuned_cm = confusion_matrix(y_true=y_test, y_pred=knn_tuned.predict(X=X_test))
print(knn_tuned_cm)

## KNN tuning Results

In [None]:
# Default vs. tuned KNN: test accuracy, full parameter set and confusion matrix.
_knn_report = (
    ("KNN default hyperparameters test accuracy: ", knn_score, knn, knn_cm),
    ("KNN tuned hyperparameters test accuracy: ", knn_tuned_score, knn_tuned, knn_tuned_cm),
)
for _pos, (_title, _acc, _model, _matrix) in enumerate(_knn_report):
    if _pos > 0:
        print()  # blank line between the two reports
    print(_title, _acc, ', parameters: ', '\n', _model.get_params())
    print('Confusion matrix: ', '\n', _matrix)

# Naive Bayes Classifier (NBC)

In [None]:
# Import the Gaussian Naive Bayes model and initiate it with default
# hyperparameters.
from sklearn.naive_bayes import GaussianNB
nbc = GaussianNB()

In [None]:
# Fit the default NBC model on the TRAIN inputs and outputs.
# (The score is computed separately, after this fit.)
nbc.fit(X_train, y_train)

In [None]:
# Accuracy of the default NBC model on the TEST set.
nbc_score = nbc.score(X=X_test, y=y_test)
print(nbc_score)

In [None]:
# Confusion matrix of the default NBC model: true TEST labels compared with
# the labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
nbc_cm = confusion_matrix(y_true=y_test, y_pred=nbc.predict(X=X_test))
print(nbc_cm)

## NBC hyperparameters tuning (Random Search)

In [None]:
# Candidate values for the single NBC hyperparameter explored by the search.
nbc_params = dict(
    var_smoothing=[1e-12, 1e-10, 1e-09, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1],
)

In [None]:
# Run a random search over nbc_params (3-fold cross-validation, all CPU cores)
# starting from the default NBC model.
from sklearn.model_selection import RandomizedSearchCV
# The grid holds fewer candidates than the 10 iterations used for the other
# models; asking for more iterations than candidates makes scikit-learn emit a
# warning and fall back to the grid size, so cap n_iter explicitly.
nbc_n_iter = min(10, len(nbc_params['var_smoothing']))
nbc_random = RandomizedSearchCV(estimator=nbc, n_iter=nbc_n_iter, param_distributions=nbc_params, cv = 3, n_jobs=-1,
                                random_state = 2019)

import time
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
nbc_random.fit(X_train, y_train)
finish_time = time.perf_counter()

# Summarize results: best cross-validation score, its parameters, and the
# search duration in seconds.
print("Best: %f using %s" % (nbc_random.best_score_, nbc_random.best_params_))
print("Execution time: " + str((finish_time - start_time)))

In [None]:
# Take the best estimator found by the random search as the tuned NBC model.
nbc_tuned = nbc_random.best_estimator_

In [None]:
# Check the accuracy of the tuned model on the TEST set.
# The search was created with the default refit=True, so best_estimator_ has
# already been refit on the whole TRAIN set with the best parameters; calling
# fit() again here would only repeat that work.
nbc_tuned_score = nbc_tuned.score(X_test, y_test)
print(nbc_tuned_score)

In [None]:
# Confusion matrix of the tuned NBC model: true TEST labels compared with the
# labels the model predicts for the TEST inputs.
from sklearn.metrics import confusion_matrix
nbc_tuned_cm = confusion_matrix(y_true=y_test, y_pred=nbc_tuned.predict(X=X_test))
print(nbc_tuned_cm)

## NBC tuning Results

In [None]:
# Default vs. tuned NBC: test accuracy, full parameter set and confusion matrix.
_nbc_report = (
    ("NBC default hyperparameters test accuracy: ", nbc_score, nbc, nbc_cm),
    ("NBC tuned hyperparameters test accuracy: ", nbc_tuned_score, nbc_tuned, nbc_tuned_cm),
)
for _pos, (_title, _acc, _model, _matrix) in enumerate(_nbc_report):
    if _pos > 0:
        print()  # blank line between the two reports
    print(_title, _acc, ', parameters: ', '\n', _model.get_params())
    print('Confusion matrix: ', '\n', _matrix)

# Compare Algorithms Performance

In [None]:
# Final comparison of all tuned models on the TEST set: accuracy followed by
# the confusion matrix of each model.
# Every report ends with a blank line so that all six sections are separated
# in the same way (previously only the first three were).
_tuned_results = (
    ("LR", lr_tuned_score, lr_tuned_cm),
    ("DT", dt_tuned_score, dt_tuned_cm),
    ("RF", rf_tuned_score, rf_tuned_cm),
    ("SVC", svc_tuned_score, svc_tuned_cm),
    ("KNN", knn_tuned_score, knn_tuned_cm),
    ("NBC", nbc_tuned_score, nbc_tuned_cm),
)
for _model_name, _model_score, _model_cm in _tuned_results:
    print(_model_name + " tuned hyperparameters test accuracy: ", _model_score)
    print('Confusion matrix: ', '\n', _model_cm, '\n')