In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Load every crop sheet with a single read_excel call. Passing a list of sheet
# names makes pandas return a {sheet_name: DataFrame} dict, so the workbook is
# opened once instead of five times and the path is written in one place only.
crop_sheet_names = ['Cowpea', 'Maize', 'Rice', 'Chickpea', 'Mustard']
sheets_by_name = pd.read_excel('../data/prepared.xlsx', sheet_name=crop_sheet_names)
cowpea, maize, rice, chickpea, mustard = (
    sheets_by_name[sheet] for sheet in crop_sheet_names
)

In [3]:
# Preview the first rows of the Mustard sheet to sanity-check the loaded columns.
mustard.head()

Unnamed: 0,Date,Time,GSR,CT,Rn,ST_5cm,ST_10cm,ST_15cm
0,2008-11-21,08:00:00,293.72,22.9,227.012,0.0,25.5,
1,2008-11-21,09:00:00,325.19,26.3,279.908,22.5,23.5,
2,2008-11-21,10:00:00,398.62,22.8,297.54,22.5,23.5,
3,2008-11-21,11:00:00,440.58,27.7,328.396,24.0,25.0,
4,2008-11-21,12:00:00,608.42,26.6,209.38,25.5,26.0,


In [4]:
# Summary statistics for the numeric columns of the Mustard sheet.
# NOTE(review): the recorded output shows a GSR max (1730.85) far above its 75th
# percentile (618.91) and an ST_5cm min of 0.0 -- possibly outliers or
# placeholder values; confirm before modelling.
mustard.describe()

Unnamed: 0,GSR,CT,Rn,ST_5cm,ST_10cm,ST_15cm
count,198.0,198.0,198.0,198.0,117.0,81.0
mean,450.487222,20.341237,190.816758,20.94697,21.775214,19.512346
std,258.278877,4.069704,136.756249,3.812851,3.110713,2.640141
min,31.47,8.75,-8.816,0.0,15.5,14.5
25%,254.3825,17.425,77.691,18.5,20.0,17.5
50%,472.05,20.8,155.382,21.0,21.4,19.5
75%,618.91,23.2375,303.8765,23.0,23.5,21.5
max,1730.85,30.25,531.164,29.5,27.5,25.7


In [5]:
# Stack the per-crop frames row-wise into one modelling table with a fresh
# 0..n-1 index (the per-sheet row labels are discarded).
data = pd.concat(
    [cowpea, rice, maize, chickpea, mustard],
    ignore_index=True,
)

In [6]:
# Standardize the two predictors and the target with separate scalers; the
# target scaler is kept so predictions can be mapped back with
# scalery.inverse_transform later on.
# NOTE(review): `data` is overwritten in place, so re-running this cell without
# first rebuilding `data` refits both scalers on already-standardized values and
# inverse_transform would then no longer restore the original units.
# NOTE(review): the scalers are fitted on all rows, including rows that later
# act as validation folds -- confirm this is acceptable for the reported scores.
feature_cols = ['GSR', 'CT']
target_cols = ['Rn']

scalerx = StandardScaler().fit(data[feature_cols])
scalery = StandardScaler().fit(data[target_cols])

data[feature_cols] = scalerx.transform(data[feature_cols])
data[target_cols] = scalery.transform(data[target_cols])

In [7]:
# Feature matrix (GSR and CT columns) and target vector (Rn column) taken from
# `data`; all three columns were standardized in place by the scaling cell.
X = data[['GSR','CT']]
y = data['Rn']

In [8]:
# Base learners as (name, estimator) pairs. The same list is evaluated model by
# model below and is also handed to StackingRegressor as its `estimators`.
# Seeds are fixed wherever a random_state is passed, for repeatable runs.
estimators = [
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('SVR', SVR()),
]

In [9]:
# Use 5-fold KFold cross-validation for the manual evaluation loops below.
# No shuffle/random_state is passed; with scikit-learn's default (shuffle=False)
# the folds are contiguous slices of the rows in concatenation order.
kfold = KFold(n_splits=5)

In [10]:
# Train all models
def train(estimators, X, y, cv, scoring, verbose):
    """Fit every named estimator on (X, y) and optionally report CV scores.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples, or a single such tuple
        Models to fit. Each estimator is fitted in place.
    X, y : array-like
        Training features and target, passed to ``fit`` and ``cross_val_score``.
    cv : int or cross-validation splitter
        Forwarded to ``cross_val_score``.
    scoring : str or callable
        Forwarded to ``cross_val_score``.
    verbose : bool or int
        When truthy, print the run settings and each model's cross-validation
        scores. When falsy, nothing is printed and the cross-validation is
        skipped altogether, because its scores are only ever printed.

    Returns
    -------
    None
    """
    named_models = estimators if isinstance(estimators, list) else [estimators]

    if verbose:
        print("Scoring criteria:", str(scoring))
        print("CV:", cv)
        print("y std:", np.std(y))
        print('\n')

    for name, estimator in named_models:
        estimator.fit(X, y)
        if not verbose:
            # The scores are only reported, never returned, so computing them
            # for a silent call would be discarded work.
            continue
        cross_scores = cross_val_score(estimator, X, y, scoring=scoring, cv=cv)
        print(name, "mean cv score:", np.mean(cross_scores))
        print(name, "all cv scores:", cross_scores)
        print('\n')

In [11]:
# Out-of-fold evaluation of every base model, reported in the original (unscaled)
# units of the target by undoing the target scaling before scoring.
all_mse = {name: [] for name, _ in estimators}
all_rmse = {name: [] for name, _ in estimators}

for train_idx, val_idx in kfold.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    # Validate on the same feature frame the models are fitted on, rather than
    # re-selecting hard-coded column names at prediction time.
    X_val = X.iloc[val_idx]
    # The held-out targets are identical for every model, so undo the target
    # scaling once per fold instead of once per model.
    y_true = scalery.inverse_transform(y.iloc[val_idx].values.reshape(-1, 1))

    for name, estimator in estimators:
        # Only a model fitted on the training fold is needed here; no inner
        # cross-validation is required for the out-of-fold prediction below.
        estimator.fit(X_train, y_train)
        y_pred = scalery.inverse_transform(estimator.predict(X_val).reshape(-1, 1))

        mse = mean_squared_error(y_true, y_pred)
        all_mse[name].append(mse)
        all_rmse[name].append(np.sqrt(mse))

In [12]:
# Print the per-fold and averaged errors collected for each base model.
for name, _ in estimators:
    fold_mse = all_mse[name]
    fold_rmse = all_rmse[name]
    print(name, ":")
    print("All folds MSE :", fold_mse)
    print("All folds RMSE :", fold_rmse)
    print("Mean MSE :", np.mean(fold_mse))
    print("Mean RMSE :", np.mean(fold_rmse))
    print("\n")

GBR :
All folds MSE : [13418.350606960164, 17920.13475667132, 6600.033337596822, 30445.896923514487, 28228.030088044823]
All folds RMSE : [115.83760445969247, 133.86610757272103, 81.24058922482543, 174.48752655566668, 168.0119938815227]
Mean MSE : 19322.489142557522
Mean RMSE : 134.68876433888565


RF :
All folds MSE : [13204.193140745263, 17855.41240373199, 8085.523341773924, 30013.83749998624, 18642.462449647846]
All folds RMSE : [114.9094997845925, 133.62414603555746, 89.91953815369563, 173.24502157345313, 136.53740311595152]
Mean MSE : 17560.285767177054
Mean RMSE : 129.64712173265005


Ridge :
All folds MSE : [10388.054698705251, 14507.949011850664, 3725.9693137295217, 23592.91444548301, 16262.608116704323]
All folds RMSE : [101.92180678689547, 120.44894774073646, 61.04071848962397, 153.5998517104851, 127.52493135345858]
Mean MSE : 13695.499117294554
Mean RMSE : 112.90725121623991


SVR :
All folds MSE : [12770.728601897907, 13444.619293551568, 4702.305154831597, 30401.89120377626

In [13]:
# Stack the base learners from `estimators`; a GradientBoostingRegressor with a
# fixed seed is used as the final (meta) estimator. No `cv` argument is passed,
# so StackingRegressor uses its library-default internal cross-validation.
stacked_estimator = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42)
)

In [15]:
# Cross-validate the stacked model on the shared `kfold` splitter, so this cell
# scores the same folds as the manual fold loops instead of repeating a
# hard-coded fold count.
# NOTE: `y` is the standardized target, so these negative-RMSE scores are in
# standardized units; multiply their magnitude by scalery.scale_[0] to compare
# them with errors reported in the original Rn units.
cross_scores = cross_val_score(
    stacked_estimator,
    X,
    y,
    scoring='neg_root_mean_squared_error',
    cv=kfold
)

print("Stacked estimator mean cv score:", np.mean(cross_scores))
print("Stacked estimator all cv scores:", cross_scores)

Stacked estimator mean cv score: -0.8389544212060474
Stacked estimator all cv scores: [-0.90988616 -0.83757739 -0.4591576  -1.08814751 -0.90000344]


In [20]:
# Out-of-fold evaluation of the stacked model, reported in the original
# (unscaled) units of the target.
all_stacked_mses = []
all_stacked_rmses = []
for train_idx, val_idx in kfold.split(X, y):
    stacked_estimator.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Predict on the same feature frame the model was fitted on, rather than
    # re-selecting hard-coded column names at prediction time.
    scaled_pred = stacked_estimator.predict(X.iloc[val_idx])
    # Undo the target scaling on both predictions and ground truth before scoring.
    y_pred = scalery.inverse_transform(scaled_pred.reshape(-1, 1))
    y_true = scalery.inverse_transform(y.iloc[val_idx].values.reshape(-1, 1))

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    all_stacked_mses.append(mse)
    all_stacked_rmses.append(rmse)
    print("Stacked estimator MSE:", mse)
    print("Stacked estimator RMSE:", rmse)
    print("\n")

print("Stacked estimator mean MSE:", np.mean(all_stacked_mses))
print("Stacked estimator mean RMSE:", np.mean(all_stacked_rmses))

Stacked estimator MSE: 18772.06901363579
Stacked estimator RMSE: 137.01120032185614


Stacked estimator MSE: 15906.986414633322
Stacked estimator RMSE: 126.12290202272276


Stacked estimator MSE: 4780.370795337611
Stacked estimator RMSE: 69.14022559507316


Stacked estimator MSE: 26848.100861180395
Stacked estimator RMSE: 163.8539009641833


Stacked estimator MSE: 18366.49819065585
Stacked estimator RMSE: 135.5230540928585


Stacked estimator mean MSE: 16934.805055088596
Stacked estimator mean RMSE: 126.33025659933878
