## VotingRegressor 
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html

Estimate surface soil moisture based on the VotingRegressor 

In [5]:
"""
--------------------------
File Name:  Voting_Regressor.py
Original author: zhang (FZJ/IBG3)
Contact: leojayak@gmail.com
Date: 10-June-2022

Adapted by: Ting Duan
Contact: duanting35@gmail.com


Description: Estimate surface soil moisture based on the voting Regressor model.
AdaBoost (AB)
GradientBoosting (GB)
KNR
RFR,
XGBoosting (XB)
--------------------------
"""
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
from sklearn.tree import DecisionTreeRegressor

In [2]:
def format_val_file(file_val, index_col):
    """
    The columns is kid of different from the 'shuffled_20220317.csv'
    :param file_val: File for validate.
    :return: X_val and y_val
    """
    df_val = pd.read_csv(file_val, index_col=index_col)
    dataset2 = pd.to_datetime(df_val['Date'])
    DOY = dataset2.dt.dayofyear
    Year = dataset2.dt.year
    df_val.insert(0, 'DOY', DOY)
    df_val.insert(0, 'Year', Year)
    df_val.drop(labels=['Date', 'station', 'ESA-CCI', 'network'], axis=1, inplace=True)
    y_val = df_val.iloc[:, -2].values
    df_val.drop(labels='Soil Moisture', axis=1, inplace=True)
    X_val = df_val.iloc[:, :].values

    return X_val, y_val

In [8]:
def reg_VotingRegressor(reg_1, reg_2, reg_3, X_train, y_train, X_test, y_test, file_val, folder_independent):
    """
    :param reg_1: regressor a
    :param reg_2: regressor b
    :param reg_3: regressor c
    :param X_train: X_train
    :param y_train: y_train
    :param X_test: X_test
    :param y_test: y_test
    :param file_val: Validation dataset
    :param folder_independent: folder contains independent stations.
    :return: Trained VotingRegressor
    """
    # Train the VotingRegressor
    ereg = VotingRegressor(estimators=[('reg_1', reg_1), ('reg_2',reg_2), ('reg_3',reg_3)])
    ereg = ereg.fit(X_train, y_train)

    # ----------------------------
    # predict the test dataset.
    # ----------------------------
    pred = ereg.predict(X_test)
    result = pd.DataFrame([y_test,pred],index=["y_test","y_pred"])
    result.T.to_csv('test_random.csv')
    pearson_r = pearsonr(y_test, pred)[0]
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, pred)
    print('Testing: ')
    print(f'MSE: {mse}, RMSE: {rmse}, r2: {r2}, Pearson_r:, {pearson_r}')
    print('--------------- \n\n')

    # -----------------------
    # The validation dataset
    # ------------------------
    X_val, y_val = format_val_file(file_val, index_col=None)
    pred_val = ereg.predict(X_val)
    result = pd.DataFrame([y_val, pred_val],index=["y_test","y_pred"])
    result.T.to_csv('test_temporal.csv')
    mse = mean_squared_error(y_val, pred_val)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, pred_val)
    pearson_r = pearsonr(y_val, pred_val)[0]
    print('Validation: ')
    print(f'MSE: {mse}, RMSE: {rmse}, r2: {r2}, Pearson_r:, {pearson_r}')
    print('--------------- \n\n')

    # -----------------------------
    # Independent stations
    # -----------------------------
    files = os.listdir(folder_independent)
    df_independent_metrics = pd.DataFrame(columns=['network', 'station', 'MSE', 'RMSE', 'R2', 'Pearson_r', 'n_size'],
                                          dtype='object')
    for idx, file in enumerate(files):
        # Read the data from independent stations.
        X_val_in, y_val_in = format_val_file(os.path.join(folder_independent, file), index_col=0)
        pred_val_in = ereg.predict(X_val_in)
        result = pd.DataFrame([y_val_in,pred_val_in],index=["y_test","y_pred"])
        result.T.to_csv(file[:-4] + '_voting.csv')
        mse = mean_squared_error(y_val_in, pred_val_in)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_val_in, pred_val_in)
        pearson_r = pearsonr(y_val_in, pred_val_in)[0]

        df_val_in = pd.read_csv(os.path.join(folder_independent, file))

        s_val_in = pd.Series(index=['network', 'station', 'MSE', 'RMSE', 'R2', 'Pearson_r', 'n_size'], dtype='object')
        s_val_in['network'] = file.split('_')[1]
        s_val_in['station'] = file.split('_')[2]
        s_val_in['MSE'] = mse
        s_val_in['RMSE'] = rmse
        s_val_in['R2'] = r2
        s_val_in['Pearson_r'] = pearson_r
        s_val_in['n_size'] = len(df_val_in)
        # print(idx, s_val_in)

        df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
        del s_val_in

    return df_independent_metrics



In [6]:
# -----------------------------------------------
# Work directory, open the file and read the data
# -----------------------------------------------
# work_dir = r'/home/zhang/SSM/Input_data/ML_SSM_dataset_v1_20220317'
#work_dir = r'/home/jovyan/shared/ML_DL_SoilMoisture/Input_data/ML_SSM_dataset_v1_20220317'
#os.chdir(work_dir)

file = 'ML_training&testing_v01shuffled_20220317.csv'
file_val = 'ML_validating_v01_20220303.csv'
folder_independent = 'output'
df = pd.read_csv(file)

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split the training&testing data into training dataset and testing dataset, respectively.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


reg_AB = AdaBoostRegressor(DecisionTreeRegressor(criterion='squared_error',max_depth=10),n_estimators=30,learning_rate=0.2)
reg_GB = GradientBoostingRegressor(n_estimators=120, max_depth=5, learning_rate=0.5)
reg_KNR = KNeighborsRegressor(n_neighbors=4, weights='distance', p=1, leaf_size=20, algorithm='ball_tree')
reg_RFR = RandomForestRegressor(n_estimators=10, max_depth=None, min_samples_split=4, min_samples_leaf=2)
reg_XB = XGBRegressor(n_estimator=800, max_depth=10, min_child_weight=1, gamma=0, subscample=0.8, colsample_bytree=0.9,
                      reg_alpha=0.05, reg_lambda=0.1)

In [9]:
%%time
df_AB_GB_KNR = reg_VotingRegressor(reg_AB, reg_GB, reg_KNR, X_train, y_train, X_test, y_test, file_val, folder_independent)
df_AB_GB_KNR.to_csv('VotingRegressor_Independent_stations_AB_GB_KNR.csv')
print(df_AB_GB_KNR.head(5))

Testing: 
MSE: 0.002085793518738297, RMSE: 0.04567048848806302, r2: 0.8322969184488851, Pearson_r:, 0.9189409022128588
--------------- 


Validation: 
MSE: 0.003459045216891797, RMSE: 0.05881364821953997, r2: 0.7225197022150474, Pearson_r:, 0.8525664217693448
--------------- 




  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.appe

     network             station       MSE      RMSE        R2  Pearson_r  \
0     SNOTEL    GRANDTARGHEE.csv  0.007324  0.085578  0.129635   0.794083   
1  SOILSCAPE         node509.csv  0.006037  0.077696  0.515027   0.737012   
2        ARM            Vici.csv  0.020207  0.142152 -7.920947   0.719724   
3      RISMA             MB2.csv  0.007902  0.088896 -4.767140   0.198627   
4     SNOTEL  EFBLACKSFORKGS.csv  0.021309  0.145976 -2.238719   0.291039   

  n_size  
0    454  
1    193  
2    762  
3    196  
4    204  
CPU times: user 14min 9s, sys: 1.14 s, total: 14min 10s
Wall time: 14min 26s


  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)


In [10]:
%%time
df_AB_GB_RFR = reg_VotingRegressor(reg_AB, reg_GB, reg_RFR, X_train, y_train, X_test, y_test, file_val, folder_independent)
df_AB_GB_RFR.to_csv('VotingRegressor_Independent_stations_AB_GB_RFR.csv')
print(df_AB_GB_RFR.head(5))

Testing: 
MSE: 0.00219278114797262, RMSE: 0.046827141146696324, r2: 0.8236948420931692, Pearson_r:, 0.9143882961091739
--------------- 


Validation: 
MSE: 0.003495604319390535, RMSE: 0.05912363587762964, r2: 0.7195869765604184, Pearson_r:, 0.8515528390777504
--------------- 




  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.appe

     network             station       MSE      RMSE        R2  Pearson_r  \
0     SNOTEL    GRANDTARGHEE.csv  0.007232  0.085042  0.140512   0.752997   
1  SOILSCAPE         node509.csv  0.005664  0.075259  0.544966   0.746297   
2        ARM            Vici.csv  0.014203  0.119175 -5.270188   0.710418   
3      RISMA             MB2.csv  0.006911  0.083130 -4.043319   0.218397   
4     SNOTEL  EFBLACKSFORKGS.csv  0.022759  0.150862 -2.459162   0.511181   

  n_size  
0    454  
1    193  
2    762  
3    196  
4    204  
CPU times: user 12min 9s, sys: 994 ms, total: 12min 10s
Wall time: 12min 27s


  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)


In [11]:
%%time
df_AB_GB_XB = reg_VotingRegressor(reg_AB, reg_GB, reg_XB, X_train, y_train, X_test, y_test, file_val, folder_independent)
df_AB_GB_XB.to_csv('VotingRegressor_Independent_stations_AB_GB_XB.csv')
print(df_AB_GB_XB.head(5))

Parameters: { "n_estimator", "subscample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Testing: 
MSE: 0.0021369814772943595, RMSE: 0.046227496982795414, r2: 0.8281812769383318, Pearson_r:, 0.9161317012482071
--------------- 


Validation: 
MSE: 0.0034216667143302758, RMSE: 0.058495014439952704, r2: 0.7255181591218485, Pearson_r:, 0.8541454136019169
--------------- 




  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.appe

     network             station       MSE      RMSE        R2  Pearson_r  \
0     SNOTEL    GRANDTARGHEE.csv  0.007348  0.085722  0.126707   0.661960   
1  SOILSCAPE         node509.csv  0.006392  0.079953  0.486440   0.709740   
2        ARM            Vici.csv  0.015285  0.123633 -5.748026   0.726746   
3      RISMA             MB2.csv  0.006791  0.082409 -3.956157   0.272791   
4     SNOTEL  EFBLACKSFORKGS.csv  0.025287  0.159020 -2.843368   0.550036   

  n_size  
0    454  
1    193  
2    762  
3    196  
4    204  
CPU times: user 15min 37s, sys: 4.96 s, total: 15min 42s
Wall time: 12min 9s


  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)


In [13]:
%%time
df_AB_KNR_RFR = reg_VotingRegressor(reg_AB, reg_KNR, reg_RFR, X_train, y_train, X_test, y_test, file_val, folder_independent)
df_AB_KNR_RFR.to_csv('VotingRegressor_Independent_stations_AB_KNR_RFR.csv')
print(df_AB_KNR_RFR.head(5))

Testing: 
MSE: 0.001813698749848599, RMSE: 0.042587542190746334, r2: 0.8541740269962083, Pearson_r:, 0.9310371476161966
--------------- 


Validation: 
MSE: 0.0035193170839720052, RMSE: 0.059323832343940876, r2: 0.71768476812867, Pearson_r:, 0.8496756596499422
--------------- 




  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.appe

     network             station       MSE      RMSE        R2  Pearson_r  \
0     SNOTEL    GRANDTARGHEE.csv  0.008089  0.089939  0.038661   0.755774   
1  SOILSCAPE         node509.csv  0.005932  0.077022  0.523404   0.736118   
2        ARM            Vici.csv  0.019791  0.140679 -7.737097   0.698628   
3      RISMA             MB2.csv  0.008179  0.090436 -4.968784   0.205100   
4     SNOTEL  EFBLACKSFORKGS.csv  0.021371  0.146187 -2.248087   0.247401   

  n_size  
0    454  
1    193  
2    762  
3    196  
4    204  
CPU times: user 7min 57s, sys: 674 ms, total: 7min 58s
Wall time: 8min 9s


  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)


In [14]:
%%time
df_AB_KNR_XB = reg_VotingRegressor(reg_AB, reg_KNR, reg_XB, X_train, y_train, X_test, y_test, file_val, folder_independent)
df_AB_KNR_XB.to_csv('VotingRegressor_Independent_stations_AB_KNR_XB.csv')
print(df_AB_KNR_XB.head(5))

Parameters: { "n_estimator", "subscample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Testing: 
MSE: 0.0017350746009993558, RMSE: 0.04165422668828886, r2: 0.8604956076933846, Pearson_r:, 0.9341848564436687
--------------- 


Validation: 
MSE: 0.0034083146388666605, RMSE: 0.05838077285259814, r2: 0.7265892459806148, Pearson_r:, 0.8544352626255103
--------------- 




  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
  df_independent_metrics = df_independent_metrics.appe

     network             station       MSE      RMSE        R2  Pearson_r  \
0     SNOTEL    GRANDTARGHEE.csv  0.007879  0.088761  0.063674   0.694753   
1  SOILSCAPE         node509.csv  0.006175  0.078580  0.503927   0.722561   
2        ARM            Vici.csv  0.020105  0.141793 -7.876038   0.703444   
3      RISMA             MB2.csv  0.008303  0.091120 -5.059323   0.226500   
4     SNOTEL  EFBLACKSFORKGS.csv  0.022512  0.150041 -2.421626   0.282980   

  n_size  
0    454  
1    193  
2    762  
3    196  
4    204  
CPU times: user 11min 2s, sys: 4.46 s, total: 11min 6s
Wall time: 7min 33s


  df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
