In [None]:
import pandas as pd
import numpy as np
import scipy as sc
from matplotlib import pyplot
from numpy.polynomial.polynomial import polyfit
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import shap

In [None]:
# Load the full table: column 0 is the target, the remaining columns are predictors.
df = pd.read_csv('CSV_file')

# 70/30 train/test split, stratified on the values of the third CSV column.
# NOTE(review): the intent is stratification by day of the year and vineyard --
# confirm that column 2 of the CSV actually encodes that grouping.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values
X_train_o, X_test_o, y_train, y_test = train_test_split(
    X, y, stratify=df.iloc[:, 2], test_size=0.3, random_state=19)

# Standardization: the scaler is fitted on the training rows only and then
# re-used for the test rows. The first predictor column is excluded from
# scaling and re-inserted, unchanged, in front of the scaled columns.
scaler_st = StandardScaler()
X_train_st = np.insert(scaler_st.fit_transform(X_train_o[:, 1:]),
                       0, X_train_o[:, 0], axis=1)
X_test_st = np.insert(scaler_st.transform(X_test_o[:, 1:]),
                      0, X_test_o[:, 0], axis=1)

# Wrap the arrays back into data frames that carry the predictor column names.
predictor_names = df.columns[1:]
df_train = pd.DataFrame(X_train_st, columns=predictor_names)
df_test = pd.DataFrame(X_test_st, columns=predictor_names)

In [None]:
# Make combinations of training sets.
#
# The candidate predictors are arranged in eight groups (G1..G8); a combination
# takes exactly one member from each group, so there is one combination per
# element of the Cartesian product of the groups.
# NOTE(review): 'zero' is presumably an all-zero placeholder column that lets a
# group contribute no real predictor to a combination -- confirm against the CSV.
combi_r = [
    ['d_RHmean', 'd_Rtotal', 'zero'],                       # G1
    ['Slope', 'zero'],                                      # G2
    ['DOY', 'w_IRtotal', 'zero'],                           # G3
    ['w_WSmean', 'd_WSmean', 'd_IRtotal', 'zero'],          # G4
    ['TCARI', 'd_Tmean', 'w_RHmean', 'w_Rtotal', 'zero'],   # G5
    ['ECa', 'zero'],                                        # G6
    ['Elevation', 'zero'],                                  # G7
    ['w_Tmean', 'zero'],                                    # G8
]

# Column names of every combination, in the same order as data_combi_train
# below (both come from product() over identically ordered groups). Later cells
# index this list with the combination number to recover the selected feature
# names; it was previously referenced there without ever being defined.
data_combi_r = list(product(*combi_r))

# Same groups, but holding the standardized training values of each column.
combi_train = [[list(df_train[name]) for name in group] for group in combi_r]
G1, G2, G3, G4, G5, G6, G7, G8 = combi_train
data_combi_train = list(product(*combi_train))

# One row per combination, labelled '1'..'N'; each cell holds a full column of values.
index_train = [str(n) for n in range(1, len(data_combi_train) + 1)]
df_combi_train = pd.DataFrame(data_combi_train, index=index_train,
                              columns=['G{}'.format(n) for n in range(1, 9)])

In [None]:
# Build every test-set feature combination: one member per group, with the
# groups listed in the same order and with the same members as for the
# training set, so that a combination number selects the same columns in both.
test_column_groups = (
    ('d_RHmean', 'd_Rtotal', 'zero'),
    ('Slope', 'zero'),
    ('DOY', 'w_IRtotal', 'zero'),
    ('w_WSmean', 'd_WSmean', 'd_IRtotal', 'zero'),
    ('TCARI', 'd_Tmean', 'w_RHmean', 'w_Rtotal', 'zero'),
    ('ECa', 'zero'),
    ('Elevation', 'zero'),
    ('w_Tmean', 'zero'),
)

# Replace each column name by the list of its test-set values.
combi_test = [[list(df_test[col]) for col in cols] for cols in test_column_groups]
G1_, G2_, G3_, G4_, G5_, G6_, G7_, G8_ = combi_test

# Cartesian product over the groups: each element is a tuple of eight value lists.
data_combi_test = list(product(*combi_test))

# Rows are labelled '1'..'N'; columns are 'G1'..'G8'.
index_test = [str(k) for k in range(1, len(data_combi_test) + 1)]
group_labels = ['G{}'.format(k) for k in range(1, 9)]
df_combi_test = pd.DataFrame(data_combi_test, index=index_test,
                             columns=group_labels)

In [None]:
# Retrieve the selected features and the scores of the chosen combination(s).
pd.set_option("display.precision", 5)
feature_list = []
for i in [2526]:  # combination No. of the model with the best performance
    # Row i-1 of the combination tables holds eight value lists; stacking them
    # along axis 1 yields one column per group.
    X_train_extract = np.stack(df_combi_train.iloc[i-1, :], axis=1)
    X_test_extract = np.stack(df_combi_test.iloc[i-1, :], axis=1)

    # Hyper-parameter search for the SVR with 10-fold CV, scored by R^2.
    svr = SVR()
    svr_para = {'kernel': ['linear', 'poly', 'rbf'], 'C': [0.01, 0.1, 1, 10, 100],
                'gamma': ['scale', 'auto'], 'epsilon': [0.1, 0.5, 0.9]}
    svr_gs = GridSearchCV(svr, svr_para, cv=10, scoring='r2', n_jobs=8)
    svr_gs.fit(X_train_extract, y_train)
    svr_train_r2 = svr_gs.score(X_train_extract, y_train)
    svr_test_r2 = svr_gs.score(X_test_extract, y_test)
    # RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and no longer accepted by recent
    # scikit-learn releases, whereas this form works on every version.
    svr_test_rmse = np.sqrt(
        mean_squared_error(y_test, svr_gs.predict(X_test_extract)))
    # data_combi_r (defined outside this cell) is expected to hold, for each
    # combination, the tuple of its eight column names; convert it to a list
    # so it can be appended to the score columns.
    feature = [i, svr_train_r2, svr_test_r2, svr_test_rmse] + list(data_combi_r[i-1])
    feature_list.append(feature)

# Build the summary table once, after all requested combinations were evaluated.
df_feature = pd.DataFrame(feature_list,
                          columns=['combination', 'train_r2', 'test_r2', 'test_rmse',
                                   '1', '2', '3', '4', '5', '6', '7', '8'])
df_feature

In [None]:
# Make regression graph (observed vs. predicted) for the best combination.
# `j` was previously used here without being defined anywhere; it is the
# combination No. of the best model (same number as in the feature-retrieval loop).
j = 2526
X_train_extract = np.stack(df_combi_train.iloc[j-1, :], axis=1)
X_test_extract = np.stack(df_combi_test.iloc[j-1, :], axis=1)

# Refit the grid search so this cell does not depend on a model left over
# from another cell.
svr = SVR()
svr_para = {'kernel': ['linear', 'poly', 'rbf'], 'C': [0.01, 0.1, 1, 10, 100],
            'gamma': ['scale', 'auto'], 'epsilon': [0.1, 0.5, 0.9]}
svr_gs = GridSearchCV(svr, svr_para, cv=10, scoring='r2', n_jobs=8)
svr_gs.fit(X_train_extract, y_train)

# x = predicted, y = observed; suffix 1 = training set, suffix 2 = test set.
graph_x1 = svr_gs.predict(X_train_extract)
graph_y1 = y_train
graph_x2 = svr_gs.predict(X_test_extract)
graph_y2 = y_test

# Points used for the fitted line. `graph_x` / `graph_y` were referenced but
# never assigned, which raised a NameError.
# NOTE(review): pooling training and test points is an assumption -- confirm
# which subset the fitted line is meant to describe.
graph_x = np.concatenate([graph_x1, graph_x2])
graph_y = np.concatenate([graph_y1, graph_y2])

pyplot.figure(dpi=600)
pyplot.plot([0, 1400], [0, 1400], 'r:')  # 1:1 reference line
scatter1 = pyplot.scatter(graph_x1, graph_y1, s=5)
scatter2 = pyplot.scatter(graph_x2, graph_y2, s=5)
pyplot.legend((scatter1, scatter2), ('training set', 'test set'), loc='lower right')
# numpy.polynomial.polynomial.polyfit returns coefficients from low to high
# degree, i.e. (intercept, slope) for a degree-1 fit.
b, m = polyfit(graph_x.ravel(), graph_y, 1)
pyplot.plot(graph_x, m*graph_x + b, 'b-')
pyplot.ylabel('observed Ѱstem (kPa)')
pyplot.xlabel('predicted Ѱstem (kPa)')
pyplot.axis([0, 1400, 0, 1400])
# NOTE: the SHAP plot cells save to this same 'plot_file' path and overwrite it.
pyplot.savefig('plot_file', dpi=600)
pyplot.show()

In [None]:
# Compute SHapley Additive exPlanations (SHAP) values for the best model.
# `j` is defined here explicitly so the cell does not rely on a variable left
# over in the kernel; it is the combination No. of the best model (same number
# as in the feature-retrieval loop).
j = 2526
X_train_extract = np.stack(df_combi_train.iloc[j-1, :], axis=1)
X_test_extract = np.stack(df_combi_test.iloc[j-1, :], axis=1)

# Refit the tuned SVR on the selected combination.
svr = SVR()
svr_para = {'kernel': ['linear', 'poly', 'rbf'], 'C': [0.01, 0.1, 1, 10, 100],
            'gamma': ['scale', 'auto'], 'epsilon': [0.1, 0.5, 0.9]}
svr_gs = GridSearchCV(svr, svr_para, cv=10, scoring='r2', n_jobs=8)
svr_gs.fit(X_train_extract, y_train)

# The training matrix is used as the background data set of the explainer.
explainer_svr = shap.KernelExplainer(svr_gs.predict, X_train_extract)
# Total standardized data: training rows followed by test rows.
df_st = pd.concat([df_train, df_test], ignore_index=True)
# data_combi_r (defined outside this cell) is expected to hold the column-name
# tuple of each combination; select those columns and compute SHAP values for
# every observation.
shap_svr = explainer_svr.shap_values(df_st[list(data_combi_r[j-1])])

In [None]:
# SHAP summary plot, limited to the three top-ranked features.
# Columns of the standardized data that belong to combination j.
summary_columns = list(data_combi_r[j-1])
summary_features = df_st[summary_columns]

pyplot.figure(dpi=600)
# show=False keeps the figure open so it can be laid out and saved below.
shap.summary_plot(shap_svr, summary_features, show=False, max_display=3)
pyplot.tight_layout()
# NOTE: other plotting cells save to the same 'plot_file' path.
pyplot.savefig('plot_file', dpi=600)

In [None]:
# SHAP summary plot in bar form, limited to the three top-ranked features.
# Columns of the standardized data that belong to combination j.
bar_columns = list(data_combi_r[j-1])
bar_features = df_st[bar_columns]

pyplot.figure(dpi=600)
# show=False keeps the figure open so it can be laid out and saved below.
shap.summary_plot(shap_svr, bar_features, show=False, max_display=3,
                  plot_type="bar")
pyplot.tight_layout()
# NOTE: other plotting cells save to the same 'plot_file' path.
pyplot.savefig('plot_file', dpi=600)