## Import packages

In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sys import stdout
from scipy.signal import savgol_filter #for derivative calculaiton

import numpy as np
import sklearn.cross_decomposition
from sklearn.cross_decomposition import  PLSRegression
from sklearn import linear_model
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR


## Read, normalize and split traning/test data

In [2]:
# load dataset, 243 sample X dimension (243, 202), Y dimension (243,1)
#df = pd.read_csv('test.csv')
xls = pd.ExcelFile('CA_MO_combined.xlsx')
df1 = pd.read_excel(xls, 'CA_MO_combined_raw')
df2 = pd.read_excel(xls, 'CA_MO_combined_1nm')

dataset = df1.values
# split into input (X) and output (Y) variables, and training/test data
X = dataset[:, 1:202]
Y = dataset[:, 205]   # Gs

# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
#X = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 9)

num_features = X_train.shape[1]

print (X_train.shape,Y_train.shape,Y_test.shape,X_test.shape, num_features)

(170, 201) (170,) (73,) (73, 201) 201


## Model evaluation

In [3]:
def evaluate(model,X_train, Y_train, X_test, Y_test):
    
    y_pred = model.predict(X_test)
    # The mean squared error
    MSE = mean_squared_error(Y_test, y_pred)
    RMSE = np.sqrt (MSE)
    print("RMSE: %.3f" % RMSE)
    testY_mean = np.mean(Y_test)
    RRMSE = 100*RMSE/testY_mean
    print("Relative RMSE: %.3f" % RRMSE)
    R2 = r2_score(Y_test, y_pred)
    print('Testing R2: %.3f' % R2)

    ## Training accuracy metrics
    y_pred_train = model.predict(X_train) 
    MSE = mean_squared_error(Y_train, y_pred_train)
    RMSE = np.sqrt (MSE)
    print("RMSE of Training: %.3f" % RMSE)
    trainY_mean = np.mean(Y_train)
    RRMSE = 100*RMSE/trainY_mean
    print("Relative RMSE of Trainging: %.3f" % RRMSE)
    # Explained variance score: 1 is perfect prediction
    print('Traing R2: %.3f' % r2_score(Y_train, y_pred_train))
    
    return RRMSE

## PLSR

In [4]:
pls = PLSRegression()
parl = GridSearchCV(estimator=pls, param_grid={'n_components': range(1,40)}, scoring='r2', cv=5)
parl.fit(X, Y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=PLSRegression(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': range(1, 40)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='r2', verbose=0)

In [5]:
pls_best_grid = parl.best_estimator_
grid_accuracy = evaluate(pls_best_grid, X_train, Y_train, X_test, Y_test)
pls_best_grid

RMSE: 0.097
Relative RMSE: 41.740
Testing R2: 0.510
RMSE of Training: 0.098
Relative RMSE of Trainging: 38.134
Traing R2: 0.460


PLSRegression(copy=True, max_iter=500, n_components=15, scale=True, tol=1e-06)

## RF

In [6]:

# Create the parameter grid based on the results of random search 
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 42)

param_grid = {
    'bootstrap': [True],
    #'max_depth': [80, 90, 100, 110],
    'max_depth': [165],
    #'max_features': range(10, 40, 1),
    #'max_features' : ['auto', 'sqrt','log2'],
    'max_features' : ['sqrt'],
    #'min_samples_leaf': [3, 4, 5],
    'min_samples_leaf': [3],
    #'min_samples_split': [2, 4, 6, 8, 10, 12],
    'min_samples_split': [6],
    #'n_estimators': range (1, 1000, 1),
    'n_estimators': [38]
                     #200, 300, 400, 500, 600, 700, 800, 900, 1000] 
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

# fit RF and pring the best parameters
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.8s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [165], 'max_features': ['sqrt'], 'min_samples_leaf': [3], 'min_samples_split': [6], 'n_estimators': [38]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [7]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_train, Y_train, X_test, Y_test)
best_grid

RMSE: 0.129
Relative RMSE: 55.550
Testing R2: 0.132
RMSE of Training: 0.070
Relative RMSE of Trainging: 27.233
Traing R2: 0.725


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=165,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=6,
           min_weight_fraction_leaf=0.0, n_estimators=38, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

## SVM 

In [8]:
np.random.seed(0)

parameters = {'kernel': ('linear', 'rbf','poly'),
              'C':range (1, 10),
              'gamma': [1e-7, 1e-4],
              'epsilon':[0.1,0.2,0.5,0.3]}
svr = SVR()
grid_search = GridSearchCV(svr, parameters, cv=5, n_jobs = -1, verbose = 2)

grid_search.fit(X_train,Y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:    1.0s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ('linear', 'rbf', 'poly'), 'C': range(1, 10), 'gamma': [1e-07, 0.0001], 'epsilon': [0.1, 0.2, 0.5, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [9]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_train, Y_train, X_test, Y_test)
best_grid

RMSE: 0.125
Relative RMSE: 54.032
Testing R2: 0.178
RMSE of Training: 0.108
Relative RMSE of Trainging: 42.082
Traing R2: 0.343


SVR(C=9, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=1e-07,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)