In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import StackingRegressor, AdaBoostRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

In [58]:
# Load every crop sheet in a single pass over the workbook (passing a list to
# `sheet_name` returns a dict of DataFrames keyed by sheet name) instead of
# re-opening and re-parsing the same file once per crop.
crop_sheets = pd.read_excel(
    '../data/prepared.xlsx',
    sheet_name=['Cowpea', 'Maize', 'Rice', 'Chickpea', 'Mustard'],
)

# Tag each frame with its crop label so the origin survives concatenation.
cowpea = crop_sheets['Cowpea'].assign(crop='cowpea')
maize = crop_sheets['Maize'].assign(crop='maize')
rice = crop_sheets['Rice'].assign(crop='rice')
chickpea = crop_sheets['Chickpea'].assign(crop='chickpea')
mustard = crop_sheets['Mustard'].assign(crop='mustard')

In [59]:
# Stack the per-crop frames row-wise and renumber the rows 0..n-1.
data = pd.concat(
    [cowpea, rice, maize, chickpea, mustard],
    axis=0,
    ignore_index=True,
)

In [60]:
# Remove outliers: keep only rows whose GSR reading is at most 1000.
# `.copy()` makes the result an independent frame, so the in-place column
# assignments in the following cells do not raise SettingWithCopyWarning.
data = data[data['GSR'] <= 1000].copy()

In [61]:
# Reduce each Time value to its hour component.
# The column is replaced outright rather than written through `.loc[:, ...]`:
# on pandas >= 2.0 a `.loc` write is done in place and keeps the existing dtype,
# so an object-dtype column (e.g. datetime.time values read from Excel) would
# stay object and break the NumPy ufuncs applied to it in the next cell.
# NOTE: not idempotent - run once per kernel; integers have no `.hour` attribute.
data['Time'] = data['Time'].apply(lambda stamp: stamp.hour)

In [62]:
# Cyclical encoding of the hour: project it onto a circle with a period of 12.
# NOTE(review): with a period of 12, hours that are 12 apart get the same
# (sin, cos) pair - confirm whether a 24-hour period was intended.
data['timesin'] = np.sin(data['Time'] * (2 * np.pi) / 12)
data['timecos'] = np.cos(data['Time'] * (2 * np.pi) / 12)

In [64]:
# One-hot encode the crop label with pandas (this replaced an earlier
# OneHotEncoder-based attempt) and put the indicator columns in front of the
# original columns. The raw `crop` column is kept; it is excluded later when
# the feature list is built.
# NOTE: re-running this cell prepends a second set of `crop_*` columns.
df = pd.get_dummies(data[['crop']])
data = pd.concat(objs=[df, data], axis='columns')

In [66]:
# Standardise the two continuous inputs together ...
# NOTE(review): both scalers are fitted on every row before any CV split is
# made, so validation folds contribute to the scaling statistics.
scalerx = StandardScaler()
data[['GSR', 'CT']] = scalerx.fit_transform(data[['GSR', 'CT']])

# ... and the target with its own scaler, kept so that predictions can be
# mapped back to the original Rn scale via `scalery.inverse_transform`.
# NOTE: re-running this cell refits `scalery` on already-scaled values, after
# which `inverse_transform` no longer recovers the original Rn scale.
scalery = StandardScaler()
data[['Rn']] = scalery.fit_transform(data[['Rn']])

In [67]:
# Show the column index after encoding, scaling and the time features,
# as a reference for choosing the feature columns below.
data.columns

Index(['crop_chickpea', 'crop_cowpea', 'crop_maize', 'crop_mustard',
       'crop_rice', 'Date', 'Time', 'GSR', 'CT', 'Rn', 'crop', 'ST_5cm',
       'ST_10cm', 'ST_15cm', 'timesin', 'timecos'],
      dtype='object')

In [68]:
# Features are every column except the soil-temperature columns (ST_*), the raw
# date/time/crop columns and the target itself; column order follows `data`.
feature_cols = [
    column
    for column in data.columns
    if column not in ('ST_5cm', 'ST_10cm', 'ST_15cm', 'Date', 'Time', 'crop', 'Rn')
]

# Design matrix and (standardised) target.
X = data.loc[:, feature_cols]
y = data.loc[:, 'Rn']

In [74]:
# Candidate base models as (name, estimator) pairs; the same list is later
# handed to StackingRegressor. Every stochastic model is seeded so that a
# Restart & Run All reproduces the reported errors.
estimators = [
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    # Ridge on pairwise interaction terms (no squared terms: interaction_only=True).
    ('Ridge pipeline', Pipeline(steps=[
        ('poly2', PolynomialFeatures(degree=2, interaction_only=True)),
        ('ridgef', Ridge(alpha=50, random_state=42))
    ])),
    ('ridge', Ridge(alpha=40, random_state=42)),
    ('SVR', SVR(kernel='rbf', gamma='auto', C=0.05)),
    # Seeded like the other ensembles; it was previously the only unseeded model.
    ('adaboost', AdaBoostRegressor(random_state=42))
]

In [75]:
# Use KFold cross-validation: 5 contiguous, unshuffled folds.
# NOTE(review): rows are stacked crop by crop, so without shuffling each fold
# covers a contiguous slice of that ordering - confirm this is intended.
kfold = KFold(n_splits=5, shuffle=False)

In [76]:
# Train all models
def train(estimators, X, y, cv, scoring, verbose):
    """Fit every estimator on ``(X, y)`` and optionally report its CV scores.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples, or a single such tuple
        Models to fit; each estimator is fitted in place.
    X, y : array-like
        Training features and target, passed to ``fit`` and ``cross_val_score``.
    cv : int or CV splitter
        Forwarded to ``cross_val_score``.
    scoring : str or callable
        Forwarded to ``cross_val_score``.
    verbose : bool or int
        When truthy, print the run settings and each model's CV scores.

    Returns
    -------
    None

    Notes
    -----
    The cross-validation scores are only ever printed, so they are computed
    only when ``verbose`` is truthy. Previously they were computed and then
    discarded on every call, costing extra refits of each model for nothing.
    """
    models = estimators if isinstance(estimators, list) else [estimators]

    if verbose:
        print("Scoring criteria:", str(scoring))
        print("CV:", cv)
        print("y std:", np.std(y))
        print('\n')

    for name, estimator in models:
        estimator.fit(X, y)
        if not verbose:
            # Nothing would consume the scores; skip the redundant CV refits.
            continue
        cross_scores = cross_val_score(estimator, X, y, scoring=scoring, cv=cv)
        print(name, "mean cv score:", np.mean(cross_scores))
        print(name, "all cv scores:", cross_scores)
        print('\n')

In [77]:
# One list of per-fold errors for every model name.
all_mse = {name: [] for name, _unused in estimators}
all_rmse = {name: [] for name, _unused in estimators}

for train_idx, val_idx in kfold.split(X, y):
    # Refit every base model on this fold's training rows.
    train(
        estimators=estimators,
        X=X.iloc[train_idx],
        y=y.iloc[train_idx],
        cv=5,
        scoring='neg_root_mean_squared_error',
        verbose=0,
    )

    for name, estimator in estimators:
        # Undo the target standardisation so errors are on the original Rn scale.
        fold_pred = estimator.predict(X.iloc[val_idx][feature_cols])
        y_pred = scalery.inverse_transform(fold_pred.reshape(-1, 1))
        y_true = scalery.inverse_transform(y.iloc[val_idx].values.reshape(-1, 1))

        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)

        all_mse[name].append(mse)
        all_rmse[name].append(rmse)

In [78]:
# Report per-fold and averaged errors for every base model.
for name, estimator in estimators:
    fold_mse = all_mse[name]
    fold_rmse = all_rmse[name]
    print(f"{name} :")
    print(f"All folds MSE : {fold_mse}")
    print(f"All folds RMSE : {fold_rmse}")
    print(f"Mean MSE : {np.mean(fold_mse)}")
    print(f"Mean RMSE : {np.mean(fold_rmse)}")
    print("\n")

GBR :
All folds MSE : [12598.311095915818, 14664.53684179979, 8039.526879477582, 25509.508318974044, 24455.278495339215]
All folds RMSE : [112.24219837438956, 121.09722061963186, 89.66340881026989, 159.71696315349237, 156.38183556711186]
Mean MSE : 17053.43232630129
Mean RMSE : 127.82032530497911


RF :
All folds MSE : [11549.189954335452, 15154.338020048152, 8660.470328089932, 24205.887371169345, 19341.267843071128]
All folds RMSE : [107.46715756144037, 123.10295699148803, 93.06164799792626, 155.58241343792474, 139.07288680066696]
Mean MSE : 15782.230703342802
Mean RMSE : 123.65741255788927


Ridge pipeline :
All folds MSE : [14360.286162650207, 14981.365432442382, 5223.845766509721, 26006.147738467404, 21153.424651108457]
All folds RMSE : [119.83441142948134, 122.39838819380908, 72.27617703302882, 161.2642171669444, 145.44216943895074]
Mean MSE : 16345.013950235636
Mean RMSE : 124.24307265244288


ridge :
All folds MSE : [12424.740578982974, 11445.88013725188, 4934.790297498309, 2665

## Stacked estimator

In [None]:
# Stack all base models under a random-forest meta-learner.
# The defaults are spelled out: the meta-learner is trained on the base models'
# cross-validated predictions only (passthrough=False), using the default
# internal CV (cv=None).
stacked_estimator = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(random_state=42),
    cv=None,
    passthrough=False,
)

In [None]:
# 5-fold CV of the stacked model; scores are negated RMSE on the standardised
# target, since `y` holds the scaled Rn values.
cross_scores = cross_val_score(
    stacked_estimator, X, y,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

print(f"Stacked estimator mean cv score: {np.mean(cross_scores)}")
print(f"Stacked estimator all cv scores: {cross_scores}")

In [None]:
# Outer K-fold evaluation of the stacked model, with errors mapped back to the
# original Rn scale through `scalery`.
all_stacked_mses = []
all_stacked_rmses = []

for train_idx, val_idx in kfold.split(X, y):
    stacked_estimator.fit(X.iloc[train_idx], y.iloc[train_idx])

    fold_pred = stacked_estimator.predict(X.iloc[val_idx][feature_cols])
    y_pred = scalery.inverse_transform(fold_pred.reshape(-1, 1))
    y_true = scalery.inverse_transform(y.iloc[val_idx].values.reshape(-1, 1))

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    all_stacked_mses.append(mse)
    all_stacked_rmses.append(rmse)

    print(f"Stacked estimator MSE: {mse}")
    print(f"Stacked estimator RMSE: {rmse}")
    print("\n")

print(f"Stacked estimator mean MSE: {np.mean(all_stacked_mses)}")
print(f"Stacked estimator mean RMSE: {np.mean(all_stacked_rmses)}")