# Train a logistic regression and gradient booster on the newly extracted features for 3 species, to get baseline to compare with neural network

Tuesday, May 15, 2018

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
import os
import sys

#Store the relative and absolute path to the home directory of the project
project_directory = '../..'
project_path = os.path.abspath(os.path.join(project_directory))

#Add the project path to the system path if necessesary, to be able to load modules from subdirectories
if project_path not in sys.path:
    sys.path.append(project_path)

In [4]:
import src.image_manager as im
import src.model_eval as meval

## Load the features dataframe, and pull out features (X) and labels (y)

In [5]:
feature_df_path = os.path.join(project_path, 'data', 'features_3species_20180514.csv')

In [8]:
tree_features_df = im.load_df(feature_df_path)
tree_features_df.head()

Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,0.180042,0.057599,0.632024,0.021612,0.297033,0.011966,0.903157,...,0.36049,0.009282,0.055258,0.110056,0.058888,0.201237,0.198077,1.394304,0.155102,0.0
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,0.667066,0.249269,0.074132,0.177577,0.52058,0.167225,0.233005,...,1.446164,0.920353,0.823285,0.617185,0.121126,0.065866,0.876259,0.226926,0.274808,0.0
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,0.159188,0.011783,0.187351,0.211521,0.466182,0.387081,0.217815,...,1.198391,0.460702,0.157247,0.267633,0.006349,0.466702,0.641492,1.066418,0.145887,0.223863
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,0.658278,0.010192,0.344359,0.192652,0.342696,0.362848,0.562928,...,0.257585,0.077631,0.057183,0.411228,0.417154,0.048589,0.421282,0.057896,0.325496,0.145896
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,0.056695,0.255299,0.407071,0.066481,7.6e-05,0.568202,0.362414,...,1.289151,0.578341,0.963826,0.028785,0.310883,0.001386,1.154405,0.034431,0.768076,0.217323


In [9]:
X = tree_features_df.drop(['p_hash', 'filename', 'folder'], axis=1)
X.head()

Unnamed: 0,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,incv3_out_7,incv3_out_8,incv3_out_9,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
0,0.180042,0.057599,0.632024,0.021612,0.297033,0.011966,0.903157,1.185745,0.70598,0.150624,...,0.36049,0.009282,0.055258,0.110056,0.058888,0.201237,0.198077,1.394304,0.155102,0.0
1,0.667066,0.249269,0.074132,0.177577,0.52058,0.167225,0.233005,1.123716,0.54497,0.793712,...,1.446164,0.920353,0.823285,0.617185,0.121126,0.065866,0.876259,0.226926,0.274808,0.0
2,0.159188,0.011783,0.187351,0.211521,0.466182,0.387081,0.217815,0.4709,0.553186,0.176517,...,1.198391,0.460702,0.157247,0.267633,0.006349,0.466702,0.641492,1.066418,0.145887,0.223863
3,0.658278,0.010192,0.344359,0.192652,0.342696,0.362848,0.562928,0.13362,0.420587,0.228698,...,0.257585,0.077631,0.057183,0.411228,0.417154,0.048589,0.421282,0.057896,0.325496,0.145896
4,0.056695,0.255299,0.407071,0.066481,7.6e-05,0.568202,0.362414,0.189681,0.153133,0.169399,...,1.289151,0.578341,0.963826,0.028785,0.310883,0.001386,1.154405,0.034431,0.768076,0.217323


In [10]:
y = tree_features_df['folder']
y.head()

0    acer_macrophyllum
1    acer_macrophyllum
2    acer_macrophyllum
3    acer_macrophyllum
4    acer_macrophyllum
Name: folder, dtype: object

## Get a train-test split of the data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=359)
y_train.head()

4859    platanus_acerifolia
430       acer_macrophyllum
3723       picea_sitchensis
4893    platanus_acerifolia
5016    platanus_acerifolia
Name: folder, dtype: object

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1311 entries, 4859 to 4027
Columns: 2048 entries, incv3_out_0 to incv3_out_2047
dtypes: float64(2048)
memory usage: 20.5 MB


In [14]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 328 entries, 244 to 4999
Columns: 2048 entries, incv3_out_0 to incv3_out_2047
dtypes: float64(2048)
memory usage: 5.1 MB


In [16]:
y_train.value_counts()

picea_sitchensis       461
platanus_acerifolia    438
acer_macrophyllum      412
Name: folder, dtype: int64

In [17]:
y_test.value_counts()

acer_macrophyllum      114
picea_sitchensis       111
platanus_acerifolia    103
Name: folder, dtype: int64

## Try a logistic regression with multinomial probabilities

In [35]:
log_model = LogisticRegression(multi_class='multinomial', solver='sag', C=0.001, max_iter=2000)

In [36]:
log_model.fit(X_train, y_train)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
logeval = meval.ModelEvaluator(log_model)
logeval.print_classifier_metrics(X_train, X_test, y_train, y_test)

Train log_loss: 0.5944925029488596, Test log_loss: 0.6829306517753826
Train accuracy: 0.8283752860411899, Test accuracy: 0.7439024390243902


In [38]:
logeval.confusion_df(X_test, y_test)

Unnamed: 0,Predicted acer_macrophyllum,Predicted picea_sitchensis,Predicted platanus_acerifolia
Actual acer_macrophyllum,68,18,28
Actual picea_sitchensis,9,93,9
Actual platanus_acerifolia,8,12,83


In [39]:
logeval.confusion_df(X_train, y_train)

Unnamed: 0,Predicted acer_macrophyllum,Predicted picea_sitchensis,Predicted platanus_acerifolia
Actual acer_macrophyllum,289,38,85
Actual picea_sitchensis,34,414,13
Actual platanus_acerifolia,40,15,383


Wow, 74% accuracy for the simplest possible model! That's way better than my original models.

## Try logistic regression with lasso regularization

You must use the SAGA solver to use both L1 regularization and multinomial multi-class predictions (instead of one-vs-rest).

In [45]:
log_lasso_model = LogisticRegression(penalty='l1', multi_class='multinomial', solver='saga', C=0.1, max_iter=2000)

In [46]:
log_lasso_model.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='multinomial',
          n_jobs=1, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [47]:
loglasso_eval = meval.ModelEvaluator(log_lasso_model)
loglasso_eval.print_classifier_metrics(X_train, X_test, y_train, y_test)

Train log_loss: 0.4498591818552623, Test log_loss: 0.5908337379635094
Train accuracy: 0.8405797101449275, Test accuracy: 0.7469512195121951


In [49]:
loglasso_eval.confusion_df(X_test, y_test)

Unnamed: 0,Predicted acer_macrophyllum,Predicted picea_sitchensis,Predicted platanus_acerifolia
Actual acer_macrophyllum,69,10,35
Actual picea_sitchensis,11,93,7
Actual platanus_acerifolia,9,11,83


In [48]:
loglasso_eval.confusion_df(X_train, y_train)

Unnamed: 0,Predicted acer_macrophyllum,Predicted picea_sitchensis,Predicted platanus_acerifolia
Actual acer_macrophyllum,300,28,84
Actual picea_sitchensis,31,419,11
Actual platanus_acerifolia,40,15,383


## Try a gradient booster

This takes about 3 minutes to train with parameters (learning_rate=0.01, n_estimators=200, subsample=0.5, max_depth=5)

In [50]:
gb_model = GradientBoostingClassifier(learning_rate=0.01, n_estimators=200, subsample=0.5, max_depth=5)

In [52]:
gb_model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=0.5, verbose=0,
              warm_start=False)

In [53]:
gbeval = meval.ModelEvaluator(gb_model)
gbeval.print_classifier_metrics(X_train, X_test, y_train, y_test)

Train log_loss: 0.2741725023525165, Test log_loss: 0.6391813639627187
Train accuracy: 0.996186117467582, Test accuracy: 0.7713414634146342


Woo hoo, lookin' good!

In [54]:
gbeval.confusion_df(X_test, y_test)

Unnamed: 0,Predicted acer_macrophyllum,Predicted picea_sitchensis,Predicted platanus_acerifolia
Actual acer_macrophyllum,69,14,31
Actual picea_sitchensis,7,98,6
Actual platanus_acerifolia,7,10,86


In [55]:
gbeval.confusion_df(X_train, y_train)

Unnamed: 0,Predicted acer_macrophyllum,Predicted picea_sitchensis,Predicted platanus_acerifolia
Actual acer_macrophyllum,409,0,3
Actual picea_sitchensis,2,459,0
Actual platanus_acerifolia,0,0,438
