# MLP - multilayer perceptron regression

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor

from pathlib import Path

In [11]:
path_data = Path(r'../../../../Input_data/')

## Importing the dataset

In [12]:
dataset = pd.read_csv(path_data / 'ML_SSM_dataset_v1_20220317/ML_training&testing_v01shuffled_20220317.csv')
dataset.head()

Unnamed: 0,Year,DOY,EVI,Evapo,LST_Daily,LST_Diff,NDVI,TI,T_air,API,Clay,Elevation,lat,lon,OMC,Porosity,Sand,Silt,Preci,Soil Moisture
0,2015,222,0.120117,-0.00228,29.12,31.14,0.18374,11.589293,23.996123,8.695007,19.700001,1752.627563,40.078119,-112.361396,3.56868,0.490566,37.700001,42.700001,0.000853,0.092
1,2010,177,0.515586,-0.004072,19.55,8.68,0.685505,14.058996,21.684099,14.624732,22.799999,153.939468,48.726702,3.203102,6.65464,0.509434,10.4,66.800003,0.000849,0.0015
2,2012,79,0.23831,-0.002077,8.04,10.5,0.41058,14.444198,8.660008,4.510628,23.799999,73.877228,43.717169,3.857831,6.9822,0.490566,29.799999,46.400002,0.321031,0.123435
3,2013,95,0.188224,-0.002522,12.32,29.5,0.3527,15.731341,7.477071,13.977669,31.6,213.627564,40.052801,-88.372904,6.39604,0.471698,8.8,59.599998,0.000435,0.39413
4,2007,299,0.43549,-0.001802,8.66,7.38,0.882154,12.428805,7.207212,52.144912,40.299999,192.723587,45.249999,-123.28,16.34352,0.539623,15.8,43.799999,0.00172,0.3875


In [13]:
dataset.shape

(469434, 20)

## Construction of the matrix of characteristics (X) and the dependent variable vector (y)

In [14]:
X = dataset.iloc[:, :-1].values #all columns, but the last
y = dataset.iloc[:, -1].values #only the last column

## Division of the dataset into the Training Set and the Test Set
- Obs: "random_state = 0" to ensure the training and testing sets are the same each time we train the model. 

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Very important: Feature scaling of X (z-transformation => mean = 0, std=1)

### Obs: Fit_transform only applied to X_train to prevent data leaking

In [16]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_test.shape

(117359, 19)

In [17]:
np.concatenate((X_train, X_test)).std(), np.concatenate((X_train, X_test)).mean()

(1.0007581037794264, -2.6934007929017374e-05)

# Loading (or training) the estimator

training takes long

In [18]:
with open('../models/grid_search_30_30.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [19]:
regressor = loaded_model.best_estimator_
regressor

MLPRegressor(activation='tanh', alpha=1e-06, hidden_layer_sizes=(30, 30),
             learning_rate='adaptive', max_iter=1000, random_state=0,
             tol=1e-07)

In [26]:
# %%time

# regressor = MLPRegressor(random_state=0,
#                          hidden_layer_sizes=(30, 30), 
#                          activation = 'tanh', 
#                          solver = 'adam', 
#                          learning_rate = 'adaptive', 
#                          max_iter=1000, 
#                          early_stopping=False, 
#                          alpha=1e-6,
#                          tol=1e-7
#                         ) 
# regressor.fit(X_train, y_train)

CPU times: user 5min 37s, sys: 0 ns, total: 5min 37s
Wall time: 5min 37s


MLPRegressor(activation='tanh', alpha=1e-06, hidden_layer_sizes=(30, 30),
             learning_rate='adaptive', max_iter=1000, random_state=0,
             tol=1e-07)

# CV

In [20]:
res = []

for train_index, test_index in KFold(n_splits=5).split(X_train):
    # print(train_index.size)
    X_val = X_train[train_index]
    y_val = y_train[train_index]

    y_val_pred = regressor.predict(X_val)
    res_i = dict(
        r2score= r2_score(y_val, y_val_pred),
        MAE=mean_absolute_error(y_val, y_val_pred),
        MSE=mean_squared_error(y_val, y_val_pred),
        RMSE=np.sqrt(mean_squared_error(y_val, y_val_pred)),
        r=np.corrcoef(y_val, y_val_pred)[0, 1]
    )
    res.append(res_i)
pd.DataFrame(res).agg(['mean', 'std'])

Unnamed: 0,r2score,MAE,MSE,RMSE,r
mean,0.763818,0.040205,0.002929,0.05412,0.874126
std,0.000416,3e-05,3e-06,2.9e-05,0.000235
