In [None]:
import pandas as pd
import numpy as np
import scipy as sc
from matplotlib import pyplot
from numpy.polynomial.polynomial import polyfit
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pickle
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor

In [None]:
# Load the two CSV parts that hold the simple ratio indices for each sample,
# then stack them row-wise into a single frame.
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('PhDdata/SI/CSV_file1', 'PhDdata/SI/CSV_file2'))
df = pd.concat([df1, df2], sort=False, axis=0)

In [None]:
# Transpose so that every original column (except the first, 'No') becomes a row,
# and label the new columns with the values of the original 'No' column.
df_T = df.iloc[:,1:].transpose()
df_T.columns = list(df['No'])
# Running row number inserted as the third column; its length is taken from the
# frame itself instead of a hard-coded 85 so the cell still works if the number
# of rows changes.
df_T.insert(loc=2, column='No', value=range(len(df_T)))

In [None]:
# Replace every missing value with the mean of its column and put the result
# back into a DataFrame that carries the column labels of df_T.
# NOTE(review): the imputer is fitted on all rows of df_T, i.e. before the
# train/test split performed in a later cell - confirm this is intended.
imputer = SimpleImputer(strategy='mean')
df_im = pd.DataFrame(imputer.fit_transform(df_T), columns=df_T.columns)

In [None]:
# Train/test split (stratified with day of the year and the vineyards).
# Column 0 of df_im is used as the target, column 1 as the stratification
# labels, and columns 2 onwards as X (so X[:, 0] is the inserted 'No' column).
y = df_im.iloc[:, 0].values
X = df_im.iloc[:, 2:].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=df_im.iloc[:, 1],
)
# Display the first X column of the held-out rows.
X_test[:, 0]

In [None]:
# Standardization: the scaler is fitted on the training columns 1.. only and
# then applied to the test columns; column 0 is carried over unscaled and put
# back in front of the scaled columns.
scaler_st = StandardScaler()
X_train_st = np.column_stack((X_train[:, 0], scaler_st.fit_transform(X_train[:, 1:])))
X_test_st = np.column_stack((X_test[:, 0], scaler_st.transform(X_test[:, 1:])))

In [None]:
# make training dataset for Spearman correlation
# Layout: the columns of X_train become rows (transpose) and y_train is appended
# as the last row.
# The row vector is sized from the data (-1) instead of a hard-coded 59, so the
# cell keeps working when the number of training samples changes.
y_train_T = y_train.reshape(1, -1)
df_train = np.concatenate((X_train.T, y_train_T), axis=0)
df_train = pd.DataFrame(df_train)
# The table is written as two files, split at row 1021701.
df_train.iloc[:1021701].to_csv('CSV_file_train1', header=False, index=False)
df_train.iloc[1021701:].to_csv('CSV_file_train2', header=False, index=False)

In [None]:
# Spearman correlation (threshold is 0.6)
# Reads the columns 'No', 'rho', 't' and 'p' from the two result files, trims
# each to a fixed number of rows and stacks them.
# NOTE(review): these file names are also written (without a header) by the
# previous cell; presumably the correlation results are added by a step outside
# this notebook - confirm.
df_sp1 = pd.read_csv('CSV_file_train1', usecols = ['No','rho','t','p'], lineterminator='\n')
df_sp1 = df_sp1.iloc[:1021700,:]
df_sp2 = pd.read_csv('CSV_file_train2', usecols = ['No','rho','t','p'], lineterminator='\n')
df_sp2 = df_sp2.iloc[:979300,:]
df_sp = pd.concat([df_sp1, df_sp2], axis=0, sort=False)
# Keep rows with |rho| > 0.6: negative correlations first, then positive ones.
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
feature_pear = pd.concat([df_sp[df_sp['rho'] < -0.6], df_sp[df_sp['rho'] > 0.6]])

In [None]:
# Choose the parameter ranges carefully to prevent overfitting, then reuse
# these ranges for modeling.
# Partial least squares regression (default scale=True is kept for X and y).
plsr_full_plsr = PLSRegression()
para_full_plsr = {'n_components': list(range(1, 11))}
gs_full_plsr = GridSearchCV(
    estimator=plsr_full_plsr,
    param_grid=para_full_plsr,
    scoring='r2',
    cv=10,
)
# Column 0 of X_train is excluded from the predictors.
gs_full_plsr.fit(X_train[:, 1:], y_train)
gs_full_plsr.best_estimator_

In [None]:
# Refit PLSR with the chosen number of components and report
# (train R^2, test R^2, test RMSE).
full_plsr_tune = PLSRegression(n_components=8) # change according to the best set of hyperparameter
full_plsr_tune.fit(X_train[:,1:], y_train)
r2_full_plsr_train = full_plsr_tune.score(X_train[:, 1:], y_train)
r2_full_plsr = full_plsr_tune.score(X_test[:,1:], y_test)
# RMSE as the square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rmse_full_plsr = np.sqrt(mean_squared_error(y_test, full_plsr_tune.predict(X_test[:,1:])))
r2_full_plsr_train, r2_full_plsr, rmse_full_plsr

In [None]:
# calculate variable importance in projection (VIP)
def _calculate_vips(model):
    """Return the VIP score of every predictor of a fitted PLS model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``x_scores_`` (samples x components),
        ``x_weights_`` (predictors x components) and ``y_loadings_``
        (targets x components).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``p`` (number of rows of ``x_weights_``) with one
        VIP value per predictor.
    """
    t = np.asarray(model.x_scores_)
    w = np.asarray(model.x_weights_)
    q = np.asarray(model.y_loadings_)
    p, _ = w.shape
    # Per-component contribution: diagonal of T'T Q'Q, one value per component.
    s = np.diag(t.T @ t @ q.T @ q)
    total_s = np.sum(s)
    # Squared weights after scaling each component column to unit length.
    # The column norms are computed once here instead of once per predictor,
    # which removes the quadratic cost of the original per-row loop.
    w_unit_sq = (w / np.linalg.norm(w, axis=0)) ** 2
    # Weighted sum over components for all predictors at once.
    return np.sqrt(p * (w_unit_sq @ s) / total_s)
# VIP scores of the tuned PLSR model.
VIP = _calculate_vips(model=full_plsr_tune)

In [None]:
# Save the VIP scores next to their variable names (df_im columns from index 3
# onwards), written as two files split at row 1021701.
VIP_pd = pd.DataFrame({'VIP': VIP, 'variable': df_im.columns[3:]})
VIP_pd.iloc[:1021701].to_csv('CSV_file_VIP1', header=False, index=False)
VIP_pd.iloc[1021701:].to_csv('CSV_file_VIP2', header=False, index=False)

In [None]:
# make regression graph
# x holds the PLSR predictions for the test rows, y holds the observed targets.
graph_x = full_plsr_tune.predict(X_test[:,1:])
graph_y = y_test
pyplot.figure(dpi=600)
# 1:1 reference line across the plotted range.
pyplot.plot([300,1400], [300,1400], 'r:')
pyplot.scatter(graph_x, graph_y)
# numpy.polynomial's polyfit returns coefficients lowest degree first:
# b is the intercept, m the slope.
b, m = polyfit(graph_x.ravel(), graph_y, 1)
pyplot.plot(graph_x, m*graph_x + b, 'b-')
# Axis labels follow the plotted data (predictions on x, observations on y);
# they were previously swapped.
pyplot.xlabel('predicted Ѱstem (kPa)')
pyplot.ylabel('observed Ѱstem (kPa)')
pyplot.axis([300, 1400, 300, 1400])
# Save before show() so the written figure is the one that was drawn.
pyplot.savefig('plot_file',dpi=600)
pyplot.show()

In [None]:
# Random forest
# Grid search over max_features only; the trees are kept as depth-1 stumps.
rf_full_rf = RandomForestRegressor(max_depth=1, n_estimators=500, random_state=0, n_jobs=-1)
# 1.0 (use all features) replaces the former 'auto' option, which meant the
# same thing for regressors and is no longer accepted by recent scikit-learn.
para_full_rf = {'max_features':[1.0, 'sqrt', 'log2']}
gs_full_rf = GridSearchCV(rf_full_rf, para_full_rf, cv = 10, scoring='r2')
# Column 0 of X_train is excluded from the predictors.
gs_full_rf.fit(X_train[:,1:], y_train)
gs_full_rf.best_estimator_

In [None]:
# Refit the random forest with the chosen settings and report
# (train R^2, test R^2, test RMSE).
full_rf_tune = RandomForestRegressor(max_depth=1, max_features='sqrt', n_estimators=500, n_jobs=-1, random_state=0) # change according to the best set of hyperparameter
full_rf_tune.fit(X_train[:,1:], y_train)
r2_full_rf_train = full_rf_tune.score(X_train[:,1:], y_train)
r2_full_rf = full_rf_tune.score(X_test[:, 1:], y_test)
# RMSE as the square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rmse_full_rf = np.sqrt(mean_squared_error(y_test, full_rf_tune.predict(X_test[:, 1:])))
r2_full_rf_train, r2_full_rf, rmse_full_rf

In [None]:
# Feature importances paired with the names of the selected variables,
# written to CSV.
# NOTE(review): `rfe_rf_tune` and `rfe_rf` are not defined in the cells visible
# here - confirm they are created by an RFE step that runs before this cell.
rf_fi_pd = pd.DataFrame({
    'FI': rfe_rf_tune.feature_importances_,
    'variable': df_im.iloc[:, 3:].columns[rfe_rf.get_support()],
})
rf_fi_pd.to_csv('CSV_file_FI')

In [None]:
# Support vector regression: grid search on the standardized training data
# (column 0 of X_train_st is excluded from the predictors).
svr_full_svr = SVR()
para_full_svr = {
    'C': [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'epsilon': [0.1, 0.3, 0.5, 0.7, 0.9],
}
gs_full_svr = GridSearchCV(
    estimator=svr_full_svr,
    param_grid=para_full_svr,
    scoring='r2',
    cv=10,
)
gs_full_svr.fit(X_train_st[:, 1:], y_train)
gs_full_svr.best_estimator_

In [None]:
# Refit SVR with the chosen settings on the standardized data and report
# (train R^2, test R^2, test RMSE).
full_svr_tune = SVR(C=50, kernel='sigmoid') # change according to the best set of hyperparameter
full_svr_tune.fit(X_train_st[:,1:], y_train)
r2_full_svr_train = full_svr_tune.score(X_train_st[:,1:], y_train)
r2_full_svr = full_svr_tune.score(X_test_st[:, 1:], y_test)
# RMSE as the square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rmse_full_svr = np.sqrt(mean_squared_error(y_test, full_svr_tune.predict(X_test_st[:, 1:])))
r2_full_svr_train, r2_full_svr, rmse_full_svr

In [None]:
# Coefficient weights paired with the names of the selected variables,
# written to CSV.
# NOTE(review): `rfe_svr_tune` and `rfe_svr` are not defined in the cells
# visible here - confirm they are created by an RFE step that runs before this
# cell.
svr_coef_pd = pd.DataFrame({
    'coef': rfe_svr_tune.coef_.ravel(),
    'variable': df_im.iloc[:, 3:].columns[rfe_svr.get_support()],
})
svr_coef_pd.to_csv('CSV_file_CW')

In [None]:
# save the trained model
# The context manager closes the file handle even if pickling fails; the
# previous bare open() call never closed it.
# NOTE(review): `gs_pear_rf` is not defined in the cells visible here - confirm
# it exists before this cell runs.
with open('file_name', 'wb') as model_file:
    pickle.dump(gs_pear_rf, model_file)