# Train a logistic regression and gradient booster on 7 species!

Sunday, May 20, 2018

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
import os
import sys

#Store the relative and absolute path to the home directory of the project
project_directory = '../..'
project_path = os.path.abspath(os.path.join(project_directory))

#Add the project path to the system path if necessesary, to be able to load modules from subdirectories
if project_path not in sys.path:
    sys.path.append(project_path)

In [4]:
import src.image_manager as im
import src.model_eval as meval

## Load the feature data frames and append them together

In [5]:
acer_picea_platanus_path = os.path.join(project_path, 'data', 'features_3species_20180514.csv')
alnus_thuja_path = os.path.join(project_path, 'data', 'features_alnus_thuja.csv')
features_cedrus_pseudotsuga_path = os.path.join(project_path, 'data', 'features_cedrus_pseudotsuga.csv')

In [7]:
acer_picea_platanus_df = im.load_df(acer_picea_platanus_path)
alnus_thuja_df = im.load_df(alnus_thuja_path)
cedrus_pseudotsuga_df = im.load_df(features_cedrus_pseudotsuga_path)

In [8]:
features_df = acer_picea_platanus_df.append(alnus_thuja_df).append(cedrus_pseudotsuga_df)
features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3933 entries, 0 to 3556
Columns: 2051 entries, p_hash to incv3_out_2047
dtypes: float64(2048), object(3)
memory usage: 61.6+ MB


In [9]:
features_df.head()

Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
0,9bb759cb29b01a25,image_bigleaf_maple_57.png,acer_macrophyllum,0.180042,0.057599,0.632024,0.021612,0.297033,0.011966,0.903157,...,0.36049,0.009282,0.055258,0.110056,0.058888,0.201237,0.198077,1.394304,0.155102,0.0
1,99c20b3b74d53de0,image_bigleaf_maple_43.png,acer_macrophyllum,0.667066,0.249269,0.074132,0.177577,0.52058,0.167225,0.233005,...,1.446164,0.920353,0.823285,0.617185,0.121126,0.065866,0.876259,0.226926,0.274808,0.0
2,a1e1e0d4e646e765,image_big_leaf_maple_bark_48.png,acer_macrophyllum,0.159188,0.011783,0.187351,0.211521,0.466182,0.387081,0.217815,...,1.198391,0.460702,0.157247,0.267633,0.006349,0.466702,0.641492,1.066418,0.145887,0.223863
3,abee94c19cb28b0e,image_acer_macrophylum_tree_49.png,acer_macrophyllum,0.658278,0.010192,0.344359,0.192652,0.342696,0.362848,0.562928,...,0.257585,0.077631,0.057183,0.411228,0.417154,0.048589,0.421282,0.057896,0.325496,0.145896
4,d3c161631f2b4b4e,image_bigleaf_maple_94.png,acer_macrophyllum,0.056695,0.255299,0.407071,0.066481,7.6e-05,0.568202,0.362414,...,1.289151,0.578341,0.963826,0.028785,0.310883,0.001386,1.154405,0.034431,0.768076,0.217323


In [10]:
features_df.tail()

Unnamed: 0,p_hash,filename,folder,incv3_out_0,incv3_out_1,incv3_out_2,incv3_out_3,incv3_out_4,incv3_out_5,incv3_out_6,...,incv3_out_2038,incv3_out_2039,incv3_out_2040,incv3_out_2041,incv3_out_2042,incv3_out_2043,incv3_out_2044,incv3_out_2045,incv3_out_2046,incv3_out_2047
3552,f28e1ca12dc9956b,image_pseudotsuga_menziesii_tree_77.png,pseudotsuga_menziesii,0.0946,0.061064,0.438699,0.062858,0.182481,0.185814,0.167514,...,0.178922,0.057688,0.062832,1.061262,0.107008,0.072353,0.160739,0.071515,0.23769,0.342455
3553,b64af14559635e52,image_douglas_fir_needles_75.png,pseudotsuga_menziesii,0.414487,0.138089,0.428809,0.093233,0.279266,0.325341,0.363008,...,0.777865,0.040818,0.263513,0.201849,0.734593,0.856769,1.216158,0.154447,0.888354,0.514273
3554,bc2d91738a768b62,image_pseudotsuga_menziesii_tree_63.png,pseudotsuga_menziesii,0.141211,0.025822,0.279416,0.242231,0.240942,0.029662,0.001973,...,0.051108,0.095992,0.701401,0.898264,0.051929,0.375273,0.293792,0.064906,0.07829,0.308257
3555,d414e491bc7e0b4f,image_douglas_fir_needles_61.png,pseudotsuga_menziesii,0.924379,0.03228,0.729098,0.103906,0.834279,0.702138,0.192425,...,0.513257,0.053753,0.019497,0.148256,0.009697,0.741581,0.619497,0.536302,0.356287,0.106888
3556,ab2aa82adfa954a5,image_douglas_fir_needles_49.png,pseudotsuga_menziesii,0.439043,0.13689,0.371879,0.072276,0.09001,0.177785,0.196581,...,0.95216,0.090075,0.051675,0.002637,0.025259,0.62077,0.016263,0.04299,0.006773,0.591485


In [11]:
features_df['folder'].value_counts()

cedrus_libani            650
thuja_plicata            575
picea_sitchensis         572
alnus_rubra              567
platanus_acerifolia      541
acer_macrophyllum        526
pseudotsuga_menziesii    502
Name: folder, dtype: int64

## Pull out features (X) and labels (y), and get a train-test split¶

In [12]:
non_feature_cols = ['p_hash', 'filename', 'folder']
X = features_df.drop(non_feature_cols, axis=1)
y = features_df['folder']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=876)

In [13]:
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

Train: 3146, Test: 787


In [14]:
y_train.value_counts()

cedrus_libani            515
picea_sitchensis         467
thuja_plicata            456
alnus_rubra              449
platanus_acerifolia      445
acer_macrophyllum        413
pseudotsuga_menziesii    401
Name: folder, dtype: int64

In [15]:
y_test.value_counts()

cedrus_libani            135
thuja_plicata            119
alnus_rubra              118
acer_macrophyllum        113
picea_sitchensis         105
pseudotsuga_menziesii    101
platanus_acerifolia       96
Name: folder, dtype: int64

In [16]:
np.log(7) #Baseline for log loss

1.9459101490553132

## Try Ridge Logistic Regression with best C from 4 species model

In [18]:
log_model = LogisticRegression(multi_class='multinomial', class_weight='balanced', solver='sag', C=0.1, max_iter=4000, verbose=1)

In [19]:
log_model.fit(X_train, y_train)

convergence after 229 epochs took 38 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.7s finished


LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=4000,
          multi_class='multinomial', n_jobs=1, penalty='l2',
          random_state=None, solver='sag', tol=0.0001, verbose=1,
          warm_start=False)

In [20]:
logeval = meval.ModelEvaluator(log_model)
logeval.print_classifier_metrics(X_train, X_test, y_train, y_test)

Train log_loss: 0.3173700267721209, Test log_loss: 1.002594676140932
Train accuracy: 0.9551811824539097, Test accuracy: 0.6442185514612452


In [21]:
logeval.confusion_df(X_test, y_test)

Unnamed: 0,Predicted acer_macrophyllum,Predicted alnus_rubra,Predicted cedrus_libani,Predicted picea_sitchensis,Predicted platanus_acerifolia,Predicted pseudotsuga_menziesii,Predicted thuja_plicata
Actual acer_macrophyllum,69,9,9,1,14,5,6
Actual alnus_rubra,10,75,5,7,13,4,4
Actual cedrus_libani,6,5,84,13,2,15,10
Actual picea_sitchensis,5,4,3,69,1,15,8
Actual platanus_acerifolia,15,3,0,3,74,1,0
Actual pseudotsuga_menziesii,1,6,19,14,2,51,8
Actual thuja_plicata,4,8,10,7,0,5,85


## Do a grid search to find the best C

My guess is that the best C will be greater than or equal to the best for 4 species (0.1), i.e. less regularization since we have more data. In particular, we now have more data points than features for the first time.

In [22]:
param_grid = {'C': [0.01, 0.1, 1, 10]}
gridsearch = GridSearchCV(estimator=log_model,
                          param_grid=param_grid, 
                          scoring=['neg_log_loss', 'accuracy'],
                          refit='neg_log_loss')

In [23]:
gridsearch.fit(X_train, y_train)

convergence after 192 epochs took 21 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.8s finished


convergence after 204 epochs took 22 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.9s finished


convergence after 193 epochs took 21 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.0s finished


convergence after 216 epochs took 24 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.4s finished


convergence after 272 epochs took 29 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.5s finished


convergence after 308 epochs took 33 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.8s finished


convergence after 344 epochs took 38 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   38.1s finished


convergence after 360 epochs took 41 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.7s finished


convergence after 368 epochs took 40 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.0s finished


convergence after 736 epochs took 81 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min finished


convergence after 784 epochs took 83 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min finished


convergence after 761 epochs took 80 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min finished


convergence after 220 epochs took 35 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   34.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=4000,
          multi_class='multinomial', n_jobs=1, penalty='l2',
          random_state=None, solver='sag', tol=0.0001, verbose=1,
          warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10]}, pre_dispatch='2*n_jobs',
       refit='neg_log_loss', return_train_score='warn',
       scoring=['neg_log_loss', 'accuracy'], verbose=0)