In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
cowpea = pd.read_excel('../data/prepared.xlsx', sheet_name='Cowpea')
maize = pd.read_excel('../data/prepared.xlsx', sheet_name='Maize')
rice = pd.read_excel('../data/prepared.xlsx', sheet_name='Rice')
chickpea = pd.read_excel('../data/prepared.xlsx', sheet_name='Chickpea')
mustard = pd.read_excel('../data/prepared.xlsx', sheet_name='Mustard')

In [3]:
mustard.head()

Unnamed: 0,Date,Time,GSR,CT,Rn,ST_5cm,ST_10cm,ST_15cm
0,2008-11-21,08:00:00,293.72,22.9,227.012,0.0,25.5,
1,2008-11-21,09:00:00,325.19,26.3,279.908,22.5,23.5,
2,2008-11-21,10:00:00,398.62,22.8,297.54,22.5,23.5,
3,2008-11-21,11:00:00,440.58,27.7,328.396,24.0,25.0,
4,2008-11-21,12:00:00,608.42,26.6,209.38,25.5,26.0,


In [4]:
mustard.describe()

Unnamed: 0,GSR,CT,Rn,ST_5cm,ST_10cm,ST_15cm
count,198.0,198.0,198.0,198.0,117.0,81.0
mean,450.487222,20.341237,190.816758,20.94697,21.775214,19.512346
std,258.278877,4.069704,136.756249,3.812851,3.110713,2.640141
min,31.47,8.75,-8.816,0.0,15.5,14.5
25%,254.3825,17.425,77.691,18.5,20.0,17.5
50%,472.05,20.8,155.382,21.0,21.4,19.5
75%,618.91,23.2375,303.8765,23.0,23.5,21.5
max,1730.85,30.25,531.164,29.5,27.5,25.7


In [7]:
scalerx = StandardScaler()
scalery = StandardScaler()
mustard[['GSR','CT']] = scalerx.fit_transform(mustard[['GSR','CT']])
mustard[['Rn']] = scalery.fit_transform(mustard[['Rn']])

In [9]:
X_mustard = mustard[['GSR','CT']]
y_mustard = mustard['Rn']

In [10]:
X_mustard_train, X_mustard_test, y_mustard_train, y_mustard_test = train_test_split(X_mustard, y_mustard, test_size=0.2, random_state=42)

In [11]:
estimators = [
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('SVR', SVR()),
]

In [12]:
# Train all models
def train(estimators, X, y, cv, scoring, verbose):
    if verbose:
        print("Scoring criteria:", str(scoring))
        print("CV:", cv)
        print("y std:", np.std(y))
        print('\n')
    for model in estimators if isinstance(estimators, list) else [estimators]:
        model[1].fit(X_mustard, y_mustard)
        cross_scores = cross_val_score(model[1], X, y, scoring=scoring, cv=cv)
        print(model[0], "mean cv score:", np.mean(cross_scores))
        if verbose:
            print(model[0], "all cv scores:", cross_scores)
        print('\n')

In [13]:
train(estimators=estimators, X=X_mustard_train, y=y_mustard_train, cv=5, scoring='neg_root_mean_squared_error', verbose=1)

Scoring criteria: neg_root_mean_squared_error
CV: 5
y std: 1.024665922621388


GBR mean cv score: -0.7594736938552862
GBR all cv scores: [-0.86226684 -0.76854876 -0.71246479 -0.65722327 -0.79686481]


RF mean cv score: -0.7364727878630781
RF all cv scores: [-0.84159371 -0.77273797 -0.72363229 -0.54474599 -0.79965398]


Ridge mean cv score: -0.7548053374441211
Ridge all cv scores: [-0.82394489 -0.77201445 -0.7125823  -0.6281433  -0.83734175]


SVR mean cv score: -0.7064847829610803
SVR all cv scores: [-0.88867492 -0.63337739 -0.72073456 -0.51310636 -0.77653068]




In [14]:
stacked_estimator = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42)
)

In [15]:
stacked_estimator.fit(X_mustard_train, y_mustard_train)

StackingRegressor(estimators=[('GBR',
                               GradientBoostingRegressor(random_state=42)),
                              ('RF', RandomForestRegressor(random_state=42)),
                              ('Ridge', Ridge(random_state=42)),
                              ('SVR', SVR())],
                  final_estimator=GradientBoostingRegressor(random_state=42))

In [16]:
cross_scores = cross_val_score(
    stacked_estimator,
    X_mustard_train,
    y_mustard_train,
    scoring='neg_root_mean_squared_error',
    cv=5
)

print("Stacked estimator mean cv score:", np.mean(cross_scores))
print("Stacked estimator all cv scores:", cross_scores)

Stacked estimator mean cv score: -0.7258064413628826
Stacked estimator all cv scores: [-0.89314263 -0.73354853 -0.70456084 -0.47643561 -0.8213446 ]


In [21]:
for model in estimators:
    y_pred = scalery.inverse_transform(model[1].predict(X_mustard_test[['GSR','CT']]).reshape(-1,1))
    y_true = scalery.inverse_transform(y_mustard_test.values.reshape(-1,1))

    print(model[0], "MSE:", mean_squared_error(y_true, y_pred))
    print(model[0], "RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print("\n")

GBR MSE: 2130.960568295718
GBR RMSE: 46.16232845400802


RF MSE: 994.40377474996
RF RMSE: 31.53416836940464


Ridge MSE: 9840.446164720697
Ridge RMSE: 99.19902300285369


SVR MSE: 8303.809564632631
SVR RMSE: 91.12524109505901




In [20]:
y_pred = scalery.inverse_transform(stacked_estimator.predict(X_mustard_test[['GSR','CT']]).reshape(-1,1))
y_true = scalery.inverse_transform(y_mustard_test.values.reshape(-1,1))

print("Stacked estimator MSE:", mean_squared_error(y_true, y_pred))
print("Stacked estimator RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
print("\n")

Stacked estimator MSE 7480.706361672111
Stacked estimator RMSE 86.49107677484487


