# LOADING TRAIN AND TEST DATA

In [1]:
# Train data.
import pandas as pd
data_train = pd.read_csv("FeatureSelectionOutput.csv")

# split values into inpits and outputs.
values_train = data_train.values
X_train = values_train[:,1:11]
y_train = values_train[:,0]

In [2]:
# Test data.
data_test_full = pd.read_csv("ScaledTestDataSet.csv")

# Create new dataset with features previously selected.
columns_needed = list(data_train.columns)
data_test = data_test_full[columns_needed].copy()

# split values into inpits and outputs.
values_test = data_test.values
X_test = values_test[:,1:11]
y_test = values_test[:,0]

# LOGISTIC REGRESSION

## LR with default hyperparameters

In [3]:
# Initiate the LR model with defualt hyperparameters.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [4]:
# Fit the model using default hyperparameters.
# K, you don't split into train and validate sets??
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [5]:
# Run predictions on TEST set and see the accuracy.
lr.score(X_test,y_test)

0.9626973454853329

## LR hyperparameters tuning (Random Search)

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Create array of values for tuned hyperparameters.
lr_params = {'dual' : [True,False], 
             'max_iter' : [100,110,120,130,140],
             'C' : [1.0,1.5,2.0,2.5]
            }

In [7]:
# Run random search and initiate the model with tuned parameters.
lr_random = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
lr_random_result = lr_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (lr_random_result.best_score_, lr_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))



Best: 0.982297 using {'max_iter': 130, 'dual': False, 'C': 2.0}
Execution time: 8.03005862236023


In [8]:
# Apply best values of hyperparameters to the model.
lr_random = lr_random.best_estimator_

In [9]:
# Train the tuned model on TRAIN set and check the accuracy
lr_random.fit(X_train, y_train)



LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=130,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## LR tuning Results

In [10]:
print("LR default hyperparameters test accuracy: ", lr.score(X_test,y_test))
print("LR tuned hyperparameters test accuracy: ", lr_random.score(X_test,y_test))

LR default hyperparameters test accuracy:  0.9626973454853329
LR tuned hyperparameters test accuracy:  0.9750983614721849


# DECISION TREE

## DT with default hyperparameters

In [11]:
# Initiate a DT model using default hyperparameters.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [12]:
# Train model on train data.
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [13]:
# Check model accuracy on the TEST set.
dt.score(X_test, y_test)

0.9761442302903531

## DT hyperparameters tuning (Random Search)

In [14]:
# Create array of values for tuned hyperparameters.
dt_params = {'max_depth': [None, 2, 3, 4, 5, 6], 
             'min_samples_leaf': [0.04, 0.06, 0.08, 1], 
             'max_features': [None, 0.2, 0.4,0.6, 0.8]}

In [15]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
dt_random = RandomizedSearchCV(estimator=dt, param_distributions=dt_params, cv = 10, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
dt_random_result = dt_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (dt_random_result.best_score_, dt_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.987552 using {'min_samples_leaf': 1, 'max_features': 0.8, 'max_depth': None}
Execution time: 5.144834518432617


In [16]:
# Apply best values of hyperparameters to the model.
dt_random = dt_random.best_estimator_

In [17]:
# Train the tuned model on TRAIN set and check the accuracy
dt_random.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=0.8, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## DT tuning Results

In [18]:
print("DT default hyperparameters test accuracy: ", dt.score(X_test,y_test))
print("DT tuned hyperparameters test accuracy: ", dt_random.score(X_test,y_test))

DT default hyperparameters test accuracy:  0.9761442302903531
DT tuned hyperparameters test accuracy:  0.9761442302903531


# RANDOM FOREST

## RF with default hyperparameters

In [19]:
# Initiate a RF model using default hyperparameters.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [20]:
# Train model on train data.
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [21]:
# Check model accuracy on the TEST set.
rf.score(X_test, y_test)

0.9065910708863533

## RF hyperparameters tuning (Random Search)

In [22]:
# Define a grid of hyperparameters.
rf_params = { 'n_estimators': [1, 5, 10, 30, 50, 100, 300, 400, 500], 
             'max_depth': [None, 4, 6, 8], 
             'min_samples_leaf': [0.1, 0.2, 0.5, 1], 
             'max_features': ['auto', 'log2', 'sqrt']
            }

In [26]:
# Run random search.
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_params, cv = 3, n_jobs=-1, random_state = 2019)

import time
start_time = time.time()
rf_random_result = rf_random.fit(X_train, y_train)
finish_time = time.time()

# Summarize results
print("Best: %f using %s" % (rf_random_result.best_score_, rf_random_result.best_params_))
print("Execution time: " + str((finish_time - start_time)))

Best: 0.287255 using {'n_estimators': 30, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 8}
Execution time: 29.560807943344116


In [28]:
# Apply best values of hyperparameters to the model.
rf_random = rf_random.best_estimator_

In [29]:
# Train the tuned model on TRAIN set and check the accuracy
rf_random.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
                      max_features='log2', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

## RF tuning Results

In [30]:
print("RF default hyperparameters test accuracy: ", rf.score(X_test,y_test))
print("RF tuned hyperparameters test accuracy: ", rf_random.score(X_test,y_test))

RF default hyperparameters test accuracy:  0.9065910708863533
RF tuned hyperparameters test accuracy:  0.833857919115706
