In [None]:
import pandas as pd
import numpy as np
import scipy as sc
from matplotlib import pyplot
from numpy.polynomial.polynomial import polyfit
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
df = pd.read_csv('CSV_file')

# Column 0 of the CSV is the response; every remaining column is a predictor.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# 70/30 hold-out split with a fixed seed, stratified on the third CSV column.
# NOTE(review): stratification uses the single column df.iloc[:, 2] - confirm
# that it encodes the intended strata (day of the year and vineyard).
X_train_o, X_test_o, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=19, stratify=df.iloc[:, 2])

# Displayed as the cell output: (mean train, mean test, std train, std test)
# of the response, to compare its distribution across the two splits.
tuple(stat(part) for stat in (np.mean, np.std) for part in (y_train, y_test))

In [None]:
# Standardization
# The scaler is fitted on the training split only and then reused for the test
# split. The first predictor column is excluded from scaling and is put back,
# unchanged, as column 0 of the scaled arrays.
scaler_st = StandardScaler()
X_train_st = np.insert(scaler_st.fit_transform(X_train_o[:, 1:]),
                       0, X_train_o[:, 0], axis=1)
X_test_st = np.insert(scaler_st.transform(X_test_o[:, 1:]),
                      0, X_test_o[:, 0], axis=1)

# make train_test array back to dataframe
# (column labels are the predictor names, i.e. every CSV column but the first)
df_train = pd.DataFrame(X_train_st, columns=df.columns[1:])
df_test = pd.DataFrame(X_test_st, columns=df.columns[1:])

In [None]:
# make combinations of training sets
# Each group G1..G8 holds the candidate feature columns for one slot, stored as
# plain Python lists. Every group ends with the 'zero' column.
# NOTE(review): 'zero' is presumably a placeholder meaning "no feature from
# this group" - confirm against the CSV.
G1 = [list(df_train[name]) for name in ('d_RHmean', 'd_Rtotal', 'zero')]
G2 = [list(df_train[name]) for name in ('Slope', 'zero')]
G3 = [list(df_train[name]) for name in ('DOY', 'w_IRtotal', 'zero')]
G4 = [list(df_train[name])
      for name in ('w_WSmean', 'd_WSmean', 'd_IRtotal', 'zero')]
G5 = [list(df_train[name])
      for name in ('TCARI', 'd_Tmean', 'w_RHmean', 'w_Rtotal', 'zero')]
G6 = [list(df_train[name]) for name in ('EC', 'zero')]
G7 = [list(df_train[name]) for name in ('Elevation', 'zero')]
G8 = [list(df_train[name]) for name in ('w_Tmean', 'zero')]
combi_train = [G1, G2, G3, G4, G5, G6, G7, G8]

# Cartesian product: picking one candidate per group gives one feature set.
data_combi_train = list(product(*combi_train))

# One row per combination (string labels '1', '2', ...), one column per group;
# every cell holds the chosen feature column as a list.
index_train = list(map('{}'.format, range(1, len(data_combi_train) + 1)))
df_combi_train = pd.DataFrame(
    data_combi_train,
    index=index_train,
    columns=list(map('G{}'.format, range(1, 9))),
)

In [None]:
# make combinations of test sets
# Each group G1_..G8_ holds the candidate feature columns for one slot, taken
# from the test split and stored as plain Python lists. Every group ends with
# the 'zero' column.
# NOTE(review): 'zero' is presumably a placeholder meaning "no feature from
# this group" - confirm against the CSV.
G1_ = [list(df_test[name]) for name in ('d_RHmean', 'd_Rtotal', 'zero')]
G2_ = [list(df_test[name]) for name in ('Slope', 'zero')]
G3_ = [list(df_test[name]) for name in ('DOY', 'w_IRtotal', 'zero')]
G4_ = [list(df_test[name])
       for name in ('w_WSmean', 'd_WSmean', 'd_IRtotal', 'zero')]
G5_ = [list(df_test[name])
       for name in ('TCARI', 'd_Tmean', 'w_RHmean', 'w_Rtotal', 'zero')]
G6_ = [list(df_test[name]) for name in ('EC', 'zero')]
G7_ = [list(df_test[name]) for name in ('Elevation', 'zero')]
G8_ = [list(df_test[name]) for name in ('w_Tmean', 'zero')]
combi_test = [G1_, G2_, G3_, G4_, G5_, G6_, G7_, G8_]

# Cartesian product: picking one candidate per group gives one feature set.
data_combi_test = list(product(*combi_test))

# One row per combination (string labels '1', '2', ...), one column per group;
# every cell holds the chosen feature column as a list.
index_test = list(map('{}'.format, range(1, len(data_combi_test) + 1)))
df_combi_test = pd.DataFrame(
    data_combi_test,
    index=index_test,
    columns=list(map('G{}'.format, range(1, 9))),
)

In [None]:
# run regression algorithms to determine the best combination of features based on RMSE
# Elastic net
# For every feature combination: tune an ElasticNet with a 10-fold
# cross-validated grid search (scored by R2) on the training split, then record
# the train R2, test R2 and test RMSE of the refit best model.
en_list2 = []
# The hyper-parameter grid is identical for every combination, so build it once.
en_para = {'alpha':[1e-2, 1e-1, 1, 10, 100],
           'l1_ratio':np.arange(0.1, 1, 0.1)}
for i in range(1, len(data_combi_train)+1):
    # stack the per-group feature lists column-wise: one column per group
    X_train_extract = np.stack(df_combi_train.iloc[i-1,:],axis = 1)
    X_test_extract = np.stack(df_combi_test.iloc[i-1,:],axis = 1)
    # Elastic net
    en = ElasticNet(max_iter=10000, tol=0.001, warm_start=True)
    en_gs = GridSearchCV(en, en_para, cv = 10, scoring='r2', n_jobs=8)
    en_gs.fit(X_train_extract, y_train)
    en_train_r2 = en_gs.score(X_train_extract, y_train)
    en_test_r2 = en_gs.score(X_test_extract, y_test)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    en_test_rmse = np.sqrt(mean_squared_error(y_test,
                                              en_gs.predict(X_test_extract)))
    en_output = [i,en_train_r2, en_test_r2, en_test_rmse]
    en_list2.append(en_output)
# one row per combination; `combination` is the 1-based row label of df_combi_train
df_en2 = pd.DataFrame(en_list2, columns =['combination','r2_train','r2_test','rmse_test'])
df_en2.to_csv('CSV_EN')

In [None]:
# Random forest regression
# For every feature combination: tune a shallow random forest with a 10-fold
# cross-validated grid search (scored by R2) on the training split, then record
# the train R2, test R2 and test RMSE of the refit best model.
rf_list2 = []
# The hyper-parameter grid is identical for every combination, so build it once.
# 1.0 (= use all features) replaces 'auto', which meant the same thing for
# regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
rf_para = {'max_features':[1.0, 'sqrt', 'log2']}
for i in range(1, len(data_combi_train)+1):
    # stack the per-group feature lists column-wise: one column per group
    X_train_extract = np.stack(df_combi_train.iloc[i-1,:],axis = 1)
    X_test_extract = np.stack(df_combi_test.iloc[i-1,:],axis = 1)
    # Random forest regression
    rf = RandomForestRegressor(max_depth=2, n_estimators=100, 
                               random_state=0, n_jobs=8)
    rf_gs = GridSearchCV(rf, rf_para, cv = 10, scoring='r2', n_jobs=8)
    rf_gs.fit(X_train_extract, y_train)
    rf_train_r2 = rf_gs.score(X_train_extract, y_train)
    rf_test_r2 = rf_gs.score(X_test_extract, y_test)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    rf_test_rmse = np.sqrt(mean_squared_error(y_test,
                                              rf_gs.predict(X_test_extract)))
    rf_output = [i,rf_train_r2, rf_test_r2, rf_test_rmse]
    rf_list2.append(rf_output)
# one row per combination; `combination` is the 1-based row label of df_combi_train
df_rf2 = pd.DataFrame(rf_list2, columns =['combination','r2_train','r2_test','rmse_test'])
df_rf2.to_csv('CSV_RFR')

In [None]:
# Support vector regression
# For every feature combination: tune an SVR with a 10-fold cross-validated
# grid search (scored by R2) on the training split, then record the train R2,
# test R2 and test RMSE of the refit best model.
svr_list2 = []
# The hyper-parameter grid is identical for every combination, so build it once.
svr_para = {'kernel':['linear','poly','rbf'], 'C':[0.01,0.1,1,10,100], 
                    'gamma':['scale','auto'], 'epsilon':[0.1,0.5,0.9]}
for i in range(1, len(data_combi_train)+1):
    # stack the per-group feature lists column-wise: one column per group
    X_train_extract = np.stack(df_combi_train.iloc[i-1,:], axis = 1)
    X_test_extract = np.stack(df_combi_test.iloc[i-1,:], axis = 1)
    # Support vector regression
    svr = SVR()
    svr_gs = GridSearchCV(svr, svr_para, cv = 10, scoring='r2', n_jobs=8)
    svr_gs.fit(X_train_extract, y_train)
    svr_train_r2 = svr_gs.score(X_train_extract, y_train)
    svr_test_r2 = svr_gs.score(X_test_extract, y_test)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    svr_test_rmse = np.sqrt(mean_squared_error(y_test,
                                               svr_gs.predict(X_test_extract)))
    svr_output = [i,svr_train_r2, svr_test_r2, svr_test_rmse]
    svr_list2.append(svr_output)
# one row per combination; `combination` is the 1-based row label of df_combi_train
df_svr2 = pd.DataFrame(svr_list2, columns =['combination','r2_train','r2_test','rmse_test'])
df_svr2.to_csv('CSV_SVR')