### Notebook for Model Training and Exporting

This notebook:
1. loads the calibration data from CSV files containing the RGB values of each colorimetric spot and the corresponding class
2. explores the hyperparameter space to train an OpenCV Logistic Regression model on the colorimetric data
3. exports the selected model to an XML file that can be imported into the mobile phone app and used on the colorimetric data collected in the field

**Copyright 2020- IBM Inc. All rights reserved
SPDX-License-Identifier: BSD-3-Clause**

#### Import Libraries

In [1]:
# Operating system
import os
from pathlib import Path

# Data processing
import pandas as pd
import numpy as np

# library to build models 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import itertools

# Open CV Version 4.4
import cv2

%matplotlib inline
%autosave 120

Autosaving every 120 seconds


#### Load Calibration Data

File and path definition

In [2]:
# Calibration data -- input files ---------------------------
# Two calibration tables (suffixes "BG" and "BP") are read from the same folder.

Data_folder = Path("***") # Placeholder: replace "***" with the path to your data folder

Calib_file_to_open_BG = Data_folder / "CalibrationData_BG.csv"
Calib_file_to_open_BP = Data_folder / "CalibrationData_BP.csv"


# Output file names for the exported models (one per calibration table).
# These are bare file names with no folder component, so they are not placed
# under Data_folder.
savefilename_BP =  "SOILSEP20_BP.xml"
savefilename_BG =  "SOILSEP20_BG.xml"

In [3]:
# Read both calibration tables into DataFrames.
df_data_BP = pd.read_csv(Calib_file_to_open_BP)
df_data_BG = pd.read_csv(Calib_file_to_open_BG)
# Preview the first rows of the BG table to check the available columns.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved row index; it is not among the feature columns used for training.
df_data_BG.head()

Unnamed: 0,R,B,G,Class,pH,R_std,B_std,G_std
0,106.471489,76.027084,106.281227,0,3.4,8.080424,5.923917,5.572097
1,113.196025,73.857769,109.753036,0,3.4,9.216805,6.825532,6.741241
2,107.77087,77.055856,106.962039,0,3.5,8.451906,6.670146,5.677071
3,107.784614,76.703508,109.346665,0,3.5,8.51435,6.99912,5.692544
4,113.367287,77.232398,111.999227,0,3.5,7.281277,5.79986,5.601136


#### Function Definition

Cross-validation function for OpenCV models

In [4]:
def CVLR_cross_val(model, DF, features, label, split_size, Nb, scaled=False, verbose=False, random_state=None):
    """Estimate held-out accuracy of an OpenCV ML model over repeated random splits.

    The data is split ``Nb`` times into a random train/test partition; the model
    is re-trained on each training part and scored on the matching test part.

    Parameters
    ----------
    model : object
        Model exposing ``train(samples, layout, responses)`` and
        ``predict(samples)`` returning ``(retval, predictions)``, such as the
        object returned by ``cv2.ml.LogisticRegression_create()``.
        The same object is re-trained on every repetition.
    DF : pandas.DataFrame
        Table holding the feature and label columns.
    features : list of str
        Names of the columns used as model inputs.
    label : str or list of str
        Name of the target column; the extracted values are flattened to 1-D.
    split_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    Nb : int
        Number of random train/test repetitions.
    scaled : bool, default False
        If True, standardize the features. The scaler is fitted on the training
        part of each split only and then applied to the test part, so that no
        statistics of the held-out samples leak into training.
    verbose : bool, default False
        If True, print the mean score.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator controlling the splits. ``None`` keeps the previous
        behavior (splits drawn from NumPy's global random state). An int makes
        the whole sequence of ``Nb`` splits reproducible.

    Returns
    -------
    float
        Mean accuracy on the held-out parts over the ``Nb`` repetitions.
        (The verbose message labels it 'Training Mean Score', but it is
        computed on the test part of each split.)
    """
    X = np.asarray(DF[features])
    y = np.asarray(DF[label]).ravel()

    # Use ONE generator for all repetitions: passing the same integer seed to
    # every train_test_split call would produce Nb identical splits.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        split_rng = random_state
    else:
        split_rng = np.random.RandomState(random_state)

    scores_array = np.ones([Nb])

    for testNb in range(Nb):
        cv_X_train, cv_X_test, cv_y_train, cv_y_test = train_test_split(
            X, y, test_size=split_size, shuffle=True, random_state=split_rng)

        if scaled:
            # Fit on the training part only; the test part is merely transformed.
            scaler = StandardScaler().fit(cv_X_train.astype(float))
            cv_X_train = scaler.transform(cv_X_train.astype(float))
            cv_X_test = scaler.transform(cv_X_test.astype(float))

        # The OpenCV model is fed float32 samples and float32 responses.
        model.train(cv_X_train.astype(np.float32), cv2.ml.ROW_SAMPLE, cv_y_train.astype(np.float32))

        ret, cv_yhat = model.predict(cv_X_test.astype(np.float32))
        scores_array[testNb] = metrics.accuracy_score(cv_y_test, cv_yhat)

    Final_score = np.mean(scores_array)
    if verbose: print('\t\t Training Mean Score = ', np.round(Final_score,3),'\n')

    return Final_score
   

In [5]:
def cv_lr_build(model, DF, features, label, split_size, Nb, scaled=False, verbose=False):
    """Train ``model`` on the complete data set and report its in-sample accuracy.

    Parameters
    ----------
    model : object
        Model exposing ``train(samples, layout, responses)`` and
        ``predict(samples)`` returning ``(retval, predictions)``.
    DF : pandas.DataFrame
        Table holding the feature and label columns.
    features : list of str
        Names of the columns used as model inputs.
    label : str or list of str
        Name of the target column; the extracted values are flattened to 1-D.
    split_size, Nb
        Accepted so the signature matches ``CVLR_cross_val``; not used here.
    scaled : bool, default False
        If True, standardize the features before training. The fitted scaler
        is neither stored nor returned by this function.
    verbose : bool, default False
        If True, print the score.

    Returns
    -------
    tuple
        ``(model, score)`` where ``model`` is the same object that was passed
        in, now trained, and ``score`` is the accuracy on the very samples it
        was trained on.
    """
    samples = np.asarray(DF[features])
    targets = np.asarray(DF[label]).ravel()

    if scaled:
        samples = StandardScaler().fit_transform(samples.astype(float))

    # Training and prediction both use the same float32 view of the samples.
    samples_f32 = samples.astype(np.float32)
    model.train(samples_f32, cv2.ml.ROW_SAMPLE, targets.astype(np.float32))
    _, predicted = model.predict(samples_f32)

    Final_score = metrics.accuracy_score(targets, predicted)
    if verbose:
        print('\t\t Training Mean Score = ', np.round(Final_score,3),'\n')

    return model, Final_score

Grid search over parameter space

In [6]:
def cv_lr_SearchGrid(data, label, SearchGrid, verbose=False):
    """Score every hyperparameter combination of ``SearchGrid`` by cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Calibration table passed on to ``CVLR_cross_val``.
    label : str or list of str
        Name of the target column.
    SearchGrid : sequence of sequences
        One entry per case, laid out as
        ``[train_method, mini_batch_size, max_iterations, learning_rate,
        regularization, split_size, Nb]``. ``train_method`` and
        ``regularization`` are strings naming ``cv2.ml`` constants
        (e.g. ``'cv2.ml.LogisticRegression_BATCH'``).
    verbose : bool, default False
        If True, print the parameters and outcome of every case.

    Returns
    -------
    numpy.ndarray
        Float64 array with the mean cross-validation accuracy of each case, in
        the order of ``SearchGrid``. A case whose training/scoring raises an
        exception is recorded as 0.

    Notes
    -----
    The feature columns are taken from the module-level variable ``features``,
    which must be defined before this function is called.
    """
    cv_lr_gridScore = np.zeros(len(SearchGrid), dtype='f8')

    for case, params in enumerate(SearchGrid):
        #display case number and model parameters
        if verbose: print('case Nb:', case, '\n' , 'method:', params[0], ', batch:', params[1],
                          ', iterations: ', params[2], '\n', 'rate:', params[3],', reg:', params[4])

        #Initialize model
        cv_lr = cv2.ml.LogisticRegression_create()
        #Define parameters
        # NOTE(review): eval() executes arbitrary code; it is acceptable only
        # because the grid strings are written in this notebook. Never feed it
        # strings coming from an external source.
        cv_lr.setTrainMethod(eval(params[0]))
        cv_lr.setMiniBatchSize(params[1])
        cv_lr.setTermCriteria((cv2.TermCriteria_MAX_ITER + cv2.TermCriteria_EPS, params[2], 1e-9))
        cv_lr.setLearningRate(params[3])
        cv_lr.setRegularization(eval(params[4]))

        #Train model
        try:
            LR_acc = CVLR_cross_val(cv_lr, data,features, label, split_size=params[5], Nb=params[6], scaled=False, verbose=verbose)
            cv_lr_gridScore[case] = LR_acc
        except Exception as err:
            # `except Exception` (not a bare `except:`) keeps the sweep
            # best-effort while letting KeyboardInterrupt/SystemExit through,
            # so a long search can still be interrupted from the notebook.
            if verbose:
                print('model calibration did not converge\n')
                # Show the real cause: not every failure is a convergence issue.
                print('\t reason:', type(err).__name__, '-', err, '\n')
            cv_lr_gridScore[case] = 0

    return cv_lr_gridScore

#### Define Features and search parameter space

Features and label 

In [7]:
# Columns of the calibration tables used as model inputs.
# cv_lr_SearchGrid reads this module-level name directly, so it must be
# defined before the grid search cells are run.
features = ['R', 'G', 'B']
# Target column, given as a one-element list; the training helpers flatten
# the selected values to a 1-D array with ravel().
label = ['Class']

Hyperparameters to sweep during model training

In [8]:
# Training methods and regularization modes are given as strings naming
# cv2.ml constants; cv_lr_SearchGrid turns them into values with eval().
TrainingMethods= ['cv2.ml.LogisticRegression_MINI_BATCH' , 'cv2.ml.LogisticRegression_BATCH']  
# Values passed to setMiniBatchSize().
BatchSize = [10, 5, 1] 
# Iteration caps used in the termination criteria (setTermCriteria()).
NbIterations = [100000, 1000000]
Regularization = ['cv2.ml.LogisticRegression_REG_DISABLE', 'cv2.ml.LogisticRegression_REG_L2']
# Values passed to setLearningRate().
LearningRate = [0.1, 0.01, 0.001,  0.0001,  0.000001]
# Finer learning-rate sweep, currently disabled:
# LearningRate = [0.1, 0.05, 0.01, 0.002, 0.001, 0.0005,   0.0001, 0.00001, 0.000001]

# Cross-validation settings: test_size handed to train_test_split, and the
# number of random train/test repetitions averaged per grid case.
crossValSize = 0.25
crossValNb = 4

In [9]:
# Build one row per hyperparameter combination. Each row is a list laid out as
# [method, batch size, iterations, learning rate, regularization,
#  cross-validation test size, number of cross-validation repetitions].
SearchGrid = [
    list(combo) + [crossValSize, crossValNb]
    for combo in itertools.product(
        TrainingMethods, BatchSize, NbIterations, LearningRate, Regularization
    )
]
print('Nb cases to search:', len(SearchGrid))

Nb cases to search: 120


### Train Models

Search hyperparameters over the parameter space defined above

BG Data

In [10]:
# Run the full hyperparameter sweep on the BG calibration table.
# The result holds one mean cross-validation accuracy per SearchGrid row.
cv_lr_gridScore_BG  = cv_lr_SearchGrid(df_data_BG, label, SearchGrid, verbose = False)

In [11]:
# Show the BG scores, ordered like SearchGrid. A value of 0 is also what
# cv_lr_SearchGrid records for a case whose training raised an exception.
cv_lr_gridScore_BG

array([0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.99831081, 1.        , 1.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.99493243, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.78547297,
       0.74155405, 0.72804054, 0.76182432, 0.69425676, 0.74324324,
       1.        , 1.        , 1.        , 1.        , 0.79560

BP Data

In [12]:
# Run the full hyperparameter sweep on the BP calibration table.
# The result holds one mean cross-validation accuracy per SearchGrid row.
cv_lr_gridScore_BP  = cv_lr_SearchGrid(df_data_BP, label, SearchGrid, verbose = False)

In [13]:
# Show the BP scores, ordered like SearchGrid. A value of 0 is also what
# cv_lr_SearchGrid records for a case whose training raised an exception.
cv_lr_gridScore_BP 

array([0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.9084507 , 0.90669014,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.99823944, 0.99471831,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.95246479, 0.94190141,
       0.        , 0.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.99647887, 0.99647887,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.97183099, 1.        , 1.        , 0.99471831, 0.98943662,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       1.        , 1.        , 0.99471831, 1.        , 1.        ,
       0.99647887, 0.99647887, 0.96302817, 0.96302817, 0.81866197,
       0.81338028, 0.83626761, 0.81866197, 0.82746479, 0.82570423,
       0.99647887, 0.99647887, 0.9471831 , 0.9471831 , 0.83978

#### Final model selection based on performance on a separate test dataset (not publicly released)

Building final model with selected parameters

BP data

In [14]:
#Initialize model
cv_lr = cv2.ml.LogisticRegression_create()
#Define parameters
cv_lr.setTrainMethod(cv2.ml.LogisticRegression_MINI_BATCH)
cv_lr.setMiniBatchSize(10)
cv_lr.setTermCriteria((cv2.TermCriteria_MAX_ITER + cv2.TermCriteria_EPS, 10000000, 1e-9))
cv_lr.setLearningRate(0.000001)
cv_lr.setRegularization(cv2.ml.LogisticRegression_REG_L2)          


#Train model
cv_lr_BP, Final_score_BP = cv_lr_build(cv_lr, df_data_BP,features, label, split_size=crossValSize, Nb=crossValNb, scaled=False, verbose=True)


		 Training Mean Score =  1.0 



BG data

In [15]:
#Initialize model
cv_lr = cv2.ml.LogisticRegression_create()
#Define parameters
cv_lr.setTrainMethod(cv2.ml.LogisticRegression_MINI_BATCH)
cv_lr.setMiniBatchSize(5)
cv_lr.setTermCriteria((cv2.TermCriteria_MAX_ITER + cv2.TermCriteria_EPS, 1000000, 1e-9))
cv_lr.setLearningRate(0.0001)
cv_lr.setRegularization(cv2.ml.LogisticRegression_REG_L2)          

#Train model
cv_lr_BG, Final_score_BG = cv_lr_build(cv_lr, df_data_BG,features, label, split_size=crossValSize, Nb=crossValNb, scaled=False, verbose=True)


		 Training Mean Score =  1.0 



### Saving models

In [16]:
# Export both trained models to the XML file names defined in the
# configuration cell (savefilename_BG / savefilename_BP).
print('Saving model for BG data ...')
cv_lr_BG.save(savefilename_BG)

print('Saving model for BP data ...')
cv_lr_BP.save(savefilename_BP)

Saving model for BG data ...
Saving model for BP data ...
