In [None]:
import pandas as pd
import numpy as np
import scipy as sc
from matplotlib import pyplot
from numpy.polynomial.polynomial import polyfit
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import pickle

In [None]:
# Determine the random state (train/test split seed) for the multilayer perceptron.
# Every seed in 0..44 gets its own stratified 70/30 split of the 2021 data, a freshly
# fitted scaler and a 10-fold grid search; the scores of each seed are written to a CSV.
df_2021 = pd.read_csv('CSV_2021')

# Column 0 is the target and columns 1.. are kept as X. These arrays do not depend on
# the seed, so they are built once instead of on every iteration.
X_2021, y_2021 = df_2021.iloc[:,1:].values, df_2021.iloc[:,0].values
# NOTE(review): the stratification column (df column 2) is X_2021[:, 1] and therefore
# stays in the standardized feature matrix below -- confirm this is intended.

# NOTE(review): X_2022_st and y_2022 are not defined in any cell of this notebook, so
# this cell fails on a fresh kernel unless they are created elsewhere. Because the
# scaler is re-fitted for every seed, a fixed X_2022_st cannot have been transformed
# with this iteration's scaler -- ideally transform the raw 2022 features inside the loop.
test_list = []
for i in range(0, 45):
    # train_test set splitting (stratified with day of the year and the vineyards)
    X_train_2021, X_test_2021, y_train_2021, y_test_2021 = train_test_split(X_2021, y_2021,
                                                                            test_size=0.3,
                                                                            random_state=i,
                                                                            stratify=df_2021.iloc[:,2])
    # Standardization: fit on the training part only; the first X column is dropped
    scaler_st = StandardScaler()
    X_train_2021_st = scaler_st.fit_transform(X_train_2021[:,1:])
    X_test_2021_st = scaler_st.transform(X_test_2021[:,1:])

    # Multi-layer Perceptron regressor
    mlp = MLPRegressor(random_state=0, max_iter=10000)
    mlp_para = {'hidden_layer_sizes': [(24,),(24,12)],
                'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'],
                'learning_rate': ['constant','adaptive'],
                'alpha': [0.0001,0.001,0.01]}
    mlp_gs = GridSearchCV(mlp, mlp_para, cv = 10, scoring='r2', n_jobs=8)
    mlp_gs.fit(X_train_2021_st, y_train_2021)
    mlp_train_r2 = mlp_gs.score(X_train_2021_st, y_train_2021)
    mlp_test_r2_2021 = mlp_gs.score(X_test_2021_st, y_test_2021)
    # RMSE = sqrt(MSE); the `squared=False` keyword was removed in scikit-learn 1.6,
    # taking the square root explicitly works on every version.
    mlp_test_rmse_2021 = np.sqrt(mean_squared_error(y_test_2021,
                                                    mlp_gs.predict(X_test_2021_st)))
    mlp_test_r2_2022 = mlp_gs.score(X_2022_st, y_2022)
    mlp_test_rmse_2022 = np.sqrt(mean_squared_error(y_2022,
                                                    mlp_gs.predict(X_2022_st)))
    output = [i, mlp_train_r2, mlp_test_r2_2021, mlp_test_rmse_2021, mlp_test_r2_2022,
              mlp_test_rmse_2022]
    test_list.append(output)

# One row per seed.
# NOTE(review): the random forest and SVR seed-search cells write to the same
# 'CSV_file' path; use distinct file names to keep all three result tables.
df_test_list = pd.DataFrame(test_list, columns =['random state','train_r2_2021','test_r2_2021',
                                                 'test_rmse_2021', 'test_r2_2022','test_rmse_2022'])
df_test_list.to_csv('CSV_file')

In [None]:
# Determine the random state (train/test split seed) for random forest regression.
# Every seed in 0..44 gets its own stratified 70/30 split of the 2021 data, a freshly
# fitted scaler and a 10-fold grid search; the scores of each seed are written to a CSV.
df_2021 = pd.read_csv('CSV_2021')

# Column 0 is the target and columns 1.. are kept as X. These arrays do not depend on
# the seed, so they are built once instead of on every iteration.
X_2021, y_2021 = df_2021.iloc[:,1:].values, df_2021.iloc[:,0].values

# NOTE(review): X_2022_st and y_2022 are not defined in any cell of this notebook, so
# this cell fails on a fresh kernel unless they are created elsewhere. Because the
# scaler is re-fitted for every seed, a fixed X_2022_st cannot have been transformed
# with this iteration's scaler -- ideally transform the raw 2022 features inside the loop.
test_list = []
for i in range(0, 45):
    # train_test set splitting (stratified on df column 2)
    X_train_2021, X_test_2021, y_train_2021, y_test_2021 = train_test_split(X_2021, y_2021,
                                                                            test_size=0.3,
                                                                            random_state=i,
                                                                            stratify=df_2021.iloc[:,2])
    # Standardization: fit on the training part only; the first X column is dropped
    scaler_st = StandardScaler()
    X_train_2021_st = scaler_st.fit_transform(X_train_2021[:,1:])
    X_test_2021_st = scaler_st.transform(X_test_2021[:,1:])

    # Random forest
    rf = RandomForestRegressor(max_depth=2, n_estimators=500, random_state=0,
                               n_jobs=8)
    # 'auto' was removed in scikit-learn 1.3; for regressors it meant "use all
    # features", which is spelled 1.0 and is accepted by old and new versions alike.
    rf_para = {'max_features':[1.0, 'sqrt', 'log2']}
    rf_gs = GridSearchCV(rf, rf_para, cv = 10, scoring='r2', n_jobs=8)
    rf_gs.fit(X_train_2021_st, y_train_2021)
    rf_train_r2 = rf_gs.score(X_train_2021_st, y_train_2021)
    rf_test_r2_2021 = rf_gs.score(X_test_2021_st, y_test_2021)
    # RMSE = sqrt(MSE); the `squared=False` keyword was removed in scikit-learn 1.6,
    # taking the square root explicitly works on every version.
    rf_test_rmse_2021 = np.sqrt(mean_squared_error(y_test_2021,
                                                   rf_gs.predict(X_test_2021_st)))
    rf_test_r2_2022 = rf_gs.score(X_2022_st, y_2022)
    rf_test_rmse_2022 = np.sqrt(mean_squared_error(y_2022,
                                                   rf_gs.predict(X_2022_st)))
    output = [i, rf_train_r2, rf_test_r2_2021, rf_test_rmse_2021, rf_test_r2_2022,
              rf_test_rmse_2022]
    test_list.append(output)

# One row per seed.
# NOTE(review): the MLP and SVR seed-search cells write to the same 'CSV_file' path;
# use distinct file names to keep all three result tables.
df_test_list = pd.DataFrame(test_list, columns =['random state','train_r2_2021','test_r2_2021',
                                                 'test_rmse_2021', 'test_r2_2022','test_rmse_2022'])
df_test_list.to_csv('CSV_file')

In [None]:
# Determine the random state (train/test split seed) for support vector regression.
# Every seed in 0..44 gets its own stratified 70/30 split of the 2021 data, a freshly
# fitted scaler and a 10-fold grid search; the scores of each seed are written to a CSV.
df_2021 = pd.read_csv('CSV_2021')

# Column 0 is the target and columns 1.. are kept as X. These arrays do not depend on
# the seed, so they are built once instead of on every iteration.
X_2021, y_2021 = df_2021.iloc[:,1:].values, df_2021.iloc[:,0].values

# NOTE(review): X_2022_st and y_2022 are not defined in any cell of this notebook, so
# this cell fails on a fresh kernel unless they are created elsewhere. Because the
# scaler is re-fitted for every seed, a fixed X_2022_st cannot have been transformed
# with this iteration's scaler -- ideally transform the raw 2022 features inside the loop.
test_list = []
for i in range(0, 45):
    # train_test set splitting (stratified on df column 2)
    X_train_2021, X_test_2021, y_train_2021, y_test_2021 = train_test_split(X_2021, y_2021,
                                                                            test_size=0.3,
                                                                            random_state=i,
                                                                            stratify=df_2021.iloc[:,2])
    # Standardization: fit on the training part only; the first X column is dropped
    scaler_st = StandardScaler()
    X_train_2021_st = scaler_st.fit_transform(X_train_2021[:,1:])
    X_test_2021_st = scaler_st.transform(X_test_2021[:,1:])

    # Support vector regression
    svr = SVR()
    svr_para = {'kernel':['linear','poly','rbf'], 'C':[0.01,0.1,1,10,100],
                'gamma':['scale','auto'], 'epsilon':[0.1,0.5,0.9]}
    svr_gs = GridSearchCV(svr, svr_para, cv = 10, scoring='r2', n_jobs=8)
    svr_gs.fit(X_train_2021_st, y_train_2021)
    svr_train_r2 = svr_gs.score(X_train_2021_st, y_train_2021)
    svr_test_r2_2021 = svr_gs.score(X_test_2021_st, y_test_2021)
    # RMSE = sqrt(MSE); the `squared=False` keyword was removed in scikit-learn 1.6,
    # taking the square root explicitly works on every version.
    svr_test_rmse_2021 = np.sqrt(mean_squared_error(y_test_2021,
                                                    svr_gs.predict(X_test_2021_st)))
    svr_test_r2_2022 = svr_gs.score(X_2022_st, y_2022)
    svr_test_rmse_2022 = np.sqrt(mean_squared_error(y_2022,
                                                    svr_gs.predict(X_2022_st)))
    output = [i, svr_train_r2, svr_test_r2_2021, svr_test_rmse_2021, svr_test_r2_2022,
              svr_test_rmse_2022]
    test_list.append(output)

# One row per seed.
# NOTE(review): the MLP and random forest seed-search cells write to the same
# 'CSV_file' path; use distinct file names to keep all three result tables.
df_test_list = pd.DataFrame(test_list, columns =['random state','train_r2_2021','test_r2_2021',
                                                 'test_rmse_2021', 'test_r2_2022','test_rmse_2022'])
df_test_list.to_csv('CSV_file')

In [None]:
# run various algorithms with the selected random state and decide the model with the best performance
df_2021 = pd.read_csv('CSV_2021')

# train_test set splitting: column 0 is the target, columns 1.. are kept as X,
# and the 70/30 split is stratified on df column 2
X_2021, y_2021 = df_2021.iloc[:,1:].values, df_2021.iloc[:,0].values
X_train_2021, X_test_2021, y_train_2021, y_test_2021 = train_test_split(X_2021, y_2021,
                                                                        test_size=0.3,
                                                                        random_state=37, # change accordingly
                                                                        stratify=df_2021.iloc[:,2])
# Sanity check of the split: mean and standard deviation of the target in the
# training and test parts. A bare expression in the middle of a cell is silently
# discarded, so the values are printed explicitly.
print(np.mean(y_train_2021), np.mean(y_test_2021), np.std(y_train_2021), np.std(y_test_2021))

# Standardization: the scaler is fitted on the training part only and then applied
# to the test part; the first X column is dropped from the feature matrix
scaler_st = StandardScaler()
X_train_2021_st = scaler_st.fit_transform(X_train_2021[:,1:])
X_test_2021_st = scaler_st.transform(X_test_2021[:,1:])

In [None]:
# Multi-layer Perceptron regressor
# Tunes an MLP with a 10-fold grid search (scored by R^2) on the standardized 2021
# training set, then reports R^2 / RMSE on the 2021 test set and on the 2022 data.
# NOTE(review): X_2022_st and y_2022 are not defined in any cell of this notebook;
# they must exist in the kernel (2022 features presumably transformed with
# scaler_st -- confirm) before this cell runs.
# NOTE(review): the seed-search cell uses hidden layers (24,)/(24,12) and
# max_iter=10000, this cell (22,)/(22,11) and max_iter=20000 -- confirm the
# difference is intentional.
mlp = MLPRegressor(random_state=0, max_iter=20000)
mlp_para = {'hidden_layer_sizes': [(22,),(22,11)],
            'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'],
            'learning_rate': ['constant','adaptive'],
            'alpha': [0.0001,0.001,0.01]}
mlp_gs = GridSearchCV(mlp, mlp_para, cv = 10, scoring='r2', n_jobs=8)
mlp_gs.fit(X_train_2021_st, y_train_2021)
mlp_train_r2 = mlp_gs.score(X_train_2021_st, y_train_2021)
mlp_test_r2_2021 = mlp_gs.score(X_test_2021_st, y_test_2021)
# RMSE = sqrt(MSE); the `squared=False` keyword was removed in scikit-learn 1.6,
# taking the square root explicitly works on every version.
mlp_test_rmse_2021 = np.sqrt(mean_squared_error(y_test_2021,
                                                mlp_gs.predict(X_test_2021_st)))
mlp_test_r2_2022 = mlp_gs.score(X_2022_st, y_2022)
mlp_test_rmse_2022 = np.sqrt(mean_squared_error(y_2022,
                                                mlp_gs.predict(X_2022_st)))
print(mlp_train_r2, mlp_test_r2_2021, mlp_test_rmse_2021, mlp_test_r2_2022,
      mlp_test_rmse_2022)

In [None]:
# Random forest regression
# Tunes `max_features` of a 500-tree, depth-2 forest with a 10-fold grid search
# (scored by R^2) on the standardized 2021 training set, then reports R^2 / RMSE on
# the 2021 test set and on the 2022 data.
# NOTE(review): X_2022_st and y_2022 are not defined in any cell of this notebook;
# they must exist in the kernel before this cell runs.
rf = RandomForestRegressor(max_depth=2, n_estimators=500, random_state=0,
                           n_jobs=8)
# 'auto' was removed in scikit-learn 1.3; for regressors it meant "use all
# features", which is spelled 1.0 and is accepted by old and new versions alike.
rf_para = {'max_features':[1.0, 'sqrt', 'log2']}
rf_gs = GridSearchCV(rf, rf_para, cv = 10, scoring='r2', n_jobs=8)
rf_gs.fit(X_train_2021_st, y_train_2021)
rf_train_r2 = rf_gs.score(X_train_2021_st, y_train_2021)
rf_test_r2_2021 = rf_gs.score(X_test_2021_st, y_test_2021)
# RMSE = sqrt(MSE); the `squared=False` keyword was removed in scikit-learn 1.6,
# taking the square root explicitly works on every version.
rf_test_rmse_2021 = np.sqrt(mean_squared_error(y_test_2021,
                                               rf_gs.predict(X_test_2021_st)))
rf_test_r2_2022 = rf_gs.score(X_2022_st, y_2022)
rf_test_rmse_2022 = np.sqrt(mean_squared_error(y_2022,
                                               rf_gs.predict(X_2022_st)))
print(rf_train_r2, rf_test_r2_2021, rf_test_rmse_2021, rf_test_r2_2022,
      rf_test_rmse_2022)

In [None]:
# Support vector regression
# Tunes kernel, C, gamma and epsilon with a 10-fold grid search (scored by R^2) on
# the standardized 2021 training set, then reports R^2 / RMSE on the 2021 test set
# and on the 2022 data.
# NOTE(review): X_2022_st and y_2022 are not defined in any cell of this notebook;
# they must exist in the kernel before this cell runs.
svr = SVR()
svr_para = {'kernel':['linear','poly','rbf'], 'C':[0.01,0.1,1,10,100],
            'gamma':['scale','auto'], 'epsilon':[0.1,0.5,0.9]}
svr_gs = GridSearchCV(svr, svr_para, cv = 10, scoring='r2', n_jobs=8)
svr_gs.fit(X_train_2021_st, y_train_2021)
svr_train_r2 = svr_gs.score(X_train_2021_st, y_train_2021)
svr_test_r2_2021 = svr_gs.score(X_test_2021_st, y_test_2021)
# RMSE = sqrt(MSE); the `squared=False` keyword was removed in scikit-learn 1.6,
# taking the square root explicitly works on every version.
svr_test_rmse_2021 = np.sqrt(mean_squared_error(y_test_2021,
                                                svr_gs.predict(X_test_2021_st)))
svr_test_r2_2022 = svr_gs.score(X_2022_st, y_2022)
svr_test_rmse_2022 = np.sqrt(mean_squared_error(y_2022,
                                                svr_gs.predict(X_2022_st)))
print(svr_train_r2, svr_test_r2_2021, svr_test_rmse_2021, svr_test_r2_2022,
      svr_test_rmse_2022)

In [None]:
# save trained model
# Serializes the fitted grid-search object (mlp_gs). The `with` block guarantees the
# file is flushed and closed even if pickling raises.
# NOTE: only unpickle this file from a trusted location -- pickle.load can execute
# arbitrary code.
with open('file_name', 'wb') as model_file:
    pickle.dump(mlp_gs, model_file)

In [None]:
# produce regression graph
# Measured (x) versus predicted (y) values of the tuned MLP for the training and the
# test set, with the 1:1 line (red, dotted) and a least-squares line (blue).
graph_x1 = y_train_2021
graph_y1 = mlp_gs.predict(X_train_2021_st)
graph_x2 = y_test_2021
graph_y2 = mlp_gs.predict(X_test_2021_st)
# The fit previously referenced graph_x / graph_y, which were never defined.
# NOTE(review): the line is fitted here on training and test points together --
# switch to graph_x2 / graph_y2 if only the test set was intended.
graph_x = np.concatenate([np.ravel(graph_x1), np.ravel(graph_x2)])
graph_y = np.concatenate([np.ravel(graph_y1), np.ravel(graph_y2)])

pyplot.figure(dpi=600)
pyplot.plot([200,1400], [200,1400], 'r:')
scatter1 = pyplot.scatter(graph_x1, graph_y1, s=5)
scatter2 = pyplot.scatter(graph_x2, graph_y2, s=5)
lgnd = pyplot.legend((scatter1,scatter2), ('training set','test set',), loc='lower right')

# numpy.polynomial's polyfit returns coefficients from lowest to highest degree,
# i.e. (intercept, slope) for a first-degree fit.
b, m = polyfit(graph_x, graph_y, 1)
# A straight line only needs its two end points.
line_x = np.array([graph_x.min(), graph_x.max()])
pyplot.plot(line_x, m*line_x + b, 'b-')

# point size in legend; `legendHandles` was renamed to `legend_handles` in
# Matplotlib 3.7 and removed in 3.9, so support both spellings
legend_handles = getattr(lgnd, 'legend_handles', None)
if legend_handles is None:
    legend_handles = lgnd.legendHandles
for handle in legend_handles:
    handle.set_sizes([5])

pyplot.ylabel('predicted Ѱstem (kPa)')
pyplot.xlabel('measured Ѱstem (kPa)')
pyplot.axis([200,1400, 200,1400])
pyplot.savefig('plot_file',dpi=600)
pyplot.show()