# Soil Spectral Data Inference for cost-effective monitoring

In [8]:
#!pip install spectres
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from spectres import spectres

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import time

Load `soil-spectra.csv`. <br>
The dataset contains spectral data and measured soil properties. <br>
First, transform the reflectance spectra into absorbance spectra. <br>
Second, smooth the data using a Savitzky-Golay digital filter. <br>
Third, resample the spectra to 500 to 2450 nm with a spectral resolution of 10 nm. <br>
Fourth, normalize the spectral data using the Standard Normal Variate (SNV).

Standard Normal Variate applies two steps: <br>
    1. Mean-centre each spectrum $ X_i $ by subtracting its mean $ \bar{X_i} $ <br>
    2. Divide the mean-centred spectrum by its standard deviation $\sigma_i$, so that $X_{snv} = \frac{X_i - \bar{X_i}}{\sigma_i} $

In [3]:
# Read the raw table and slice out the two column groups by position.
# assumes columns 2..6 hold the measured soil properties and columns 8..2158
# hold one reflectance value (in percent) per nanometre -- TODO confirm
df = pd.read_csv("soil-spectra.csv")
soil = df.iloc[:, 2:7]
spectra = df.iloc[:, 8:2159]

# Wavelength grids: `wave` labels the 2151 sliced spectral columns
# (350..2500 in steps of 1); `new_wave` is the coarser grid to resample onto.
# NOTE(review): np.arange(500, 2470, 10) ends at 2460, while the notebook text
# says the spectra are resampled to 500-2450 nm -- confirm the intended bound.
wave = np.arange(350, 2501, 1)
new_wave = np.arange(500, 2470, 10)

# Step 1: reflectance -> absorbance (np.log is the natural logarithm).
ab = np.log(100 / spectra)

# Step 2: Savitzky-Golay smoothing (window of 11 points, 2nd-order polynomial,
# no derivative), applied along the last axis, i.e. across wavelengths.
abs_sg = savgol_filter(ab, 11, 2, deriv=0)

# Step 3: resample the smoothed spectra from `wave` onto `new_wave`.
new_abs = spectres(new_wave, wave, abs_sg)

def snv(input_data):
    """Apply Standard Normal Variate (SNV) scaling to each row of a 2-D array.

    Every row is centred on its own mean and divided by its own standard
    deviation (``np.std`` default, i.e. population standard deviation).

    Parameters
    ----------
    input_data : array-like, shape (n_rows, n_cols)
        Values to normalise; anything ``np.asarray`` accepts.

    Returns
    -------
    numpy.ndarray
        Floating-point array with the same shape as ``input_data``.

    Notes
    -----
    A row whose standard deviation is zero (a constant row) yields NaN,
    because the correction divides 0 by 0.
    """
    data = np.asarray(input_data)
    # The result is fractional by construction. Writing it into an array that
    # inherits an integer dtype from the input would silently truncate it, so
    # promote non-floating input to float64 first.
    if not np.issubdtype(data.dtype, np.inexact):
        data = data.astype(np.float64)

    # keepdims=True keeps the per-row statistics as column vectors so they
    # broadcast against every element of their own row.
    row_mean = data.mean(axis=1, keepdims=True)
    row_std = data.std(axis=1, keepdims=True)
    return (data - row_mean) / row_std

abs_std = snv(new_abs)   # fourth step: row-wise SNV normalisation of the resampled spectra
abs_std.shape  # last expression of the cell, so the array shape is shown as the cell output

(197, 197)

Split the data into 70% training and 30% testing

In [4]:
# Features: the normalised spectra as a DataFrame with one column per
# resampled wavelength.
abs_dat = pd.DataFrame(abs_std, columns=new_wave)
X = abs_dat

# Target: the 'SOC (%)' column, kept as a one-column DataFrame.
y = soil.loc[:, ['SOC (%)']]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)



Make a list of models and a list of their hyperparameter grids <br>
Run the code over both lists to select the best model and hyperparameter setting <br>
Note: .values will give the values in a numpy array (shape: (n,1)) and .ravel will convert that array shape to (n, ) (i.e. flatten it) <br>
    
    ========Please add the code to measure the algorithm efficiency in the for loop (or make another for loop)=========

In [9]:
# Candidate regressors; models[k] is tuned over the grid in params[k].
# Both tree ensembles are seeded so that a re-run gives the same result.
models = [GradientBoostingRegressor(random_state=0),
          RandomForestRegressor(random_state=0),
          KNeighborsRegressor()]

params = [{'n_estimators': [10, 50, 100, 150], 'learning_rate': [0.01, 0.25, 1, 1.3], 'max_depth': [1, 2]},
          {'n_estimators': [10, 50, 100, 200], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
          {'n_neighbors': [1, 2, 3, 4]}]

# Best candidate by test-set score (higher is better).
# np.inf is used instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
max_i = -1
max_accuracy = -np.inf
max_mod = None

# Best candidate by test-set mean squared error (lower is better).
bestModelIndex = -1
bestMSE = np.inf
bestModel = None

# Wall-clock seconds spent fitting each grid search, in the order of `models`.
times = list()

# NOTE(review): the winner is chosen by its score on the held-out test set, so
# the reported test metrics are optimistic. Selecting on the cross-validation
# score (mod.best_score_) and only reporting the test score would avoid this.
for i, (estimator, grid) in enumerate(zip(models, params)):
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    mod = GridSearchCV(estimator, grid)
    # .values.ravel() flattens the one-column target to shape (n,).
    mod.fit(X_train, y_train.values.ravel())
    end = time.perf_counter()
    diff = end - start
    times.append(diff)

    # With no `scoring` given, score() uses the estimator's default scorer,
    # which for scikit-learn regressors is R^2 (reported below as "Accuracy").
    test_score = mod.score(X_test, y_test)
    if test_score >= max_accuracy:
        max_accuracy = test_score
        max_i = i
        max_mod = mod.best_estimator_

    MSE = mean_squared_error(y_test, mod.predict(X_test))
    if MSE < bestMSE:
        bestModelIndex = i
        bestModel = mod
        bestMSE = MSE

print("i:", max_i)
print("Accuracy: ", max_accuracy)
print("Mean Square Error (MSE): ", bestMSE)
print("Root Mean Square Error (RMSE): ", np.sqrt(bestMSE))
print("The Best Model and Hyperparameter settings:", max_mod)

i: 2
Accuracy:  0.923644085989511
Mean Square Error (MSE):  0.25185498336225154
Root Mean Square Error (RMSE):  0.5018515551059411
The Best Model and Hyperparameter settings: KNeighborsRegressor(n_neighbors=1)


In [13]:
# Fit time of the selected model. A bare expression is only displayed when it
# is the last line of a cell, so it is printed explicitly here.
print(f"{times[max_i]} seconds to run the best model ({models[max_i]})")

# Grid-search fit time of every candidate, in the order they were tried.
for elapsed, model in zip(times, models):
    print(f"{elapsed} seconds to run {model}")

23.80384063720703 seconds to run GradientBoostingRegressor(random_state=0)
65.34340405464172 seconds to run RandomForestRegressor()
0.08364105224609375 seconds to run KNeighborsRegressor()
