# SDGRegressor

SGD runs repeatedly on each record of the training data and updates the parameters according to the gradient of the error with respect to each individual data point.

In all cases, the StandardScaler tool automates the scaling of features (by mean and variance).
 
Parameters generally need to be set (or tweaked, the default ones are not always the most appropiate) to get good convergence.

## Importing the Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

from multiprocessing import Pool
from pathlib import Path

## Importing the dataset

In [2]:
dataset = pd.read_csv('../../../Input_data/ML_SSM_dataset_v1_20220317/ML_training&testing_v01shuffled_20220317.csv')
dataset.head()

Unnamed: 0,Year,DOY,EVI,Evapo,LST_Daily,LST_Diff,NDVI,TI,T_air,API,Clay,Elevation,lat,lon,OMC,Porosity,Sand,Silt,Preci,Soil Moisture
0,2015,222,0.120117,-0.00228,29.12,31.14,0.18374,11.589293,23.996123,8.695007,19.700001,1752.627563,40.078119,-112.361396,3.56868,0.490566,37.700001,42.700001,0.000853,0.092
1,2010,177,0.515586,-0.004072,19.55,8.68,0.685505,14.058996,21.684099,14.624732,22.799999,153.939468,48.726702,3.203102,6.65464,0.509434,10.4,66.800003,0.000849,0.0015
2,2012,79,0.23831,-0.002077,8.04,10.5,0.41058,14.444198,8.660008,4.510628,23.799999,73.877228,43.717169,3.857831,6.9822,0.490566,29.799999,46.400002,0.321031,0.123435
3,2013,95,0.188224,-0.002522,12.32,29.5,0.3527,15.731341,7.477071,13.977669,31.6,213.627564,40.052801,-88.372904,6.39604,0.471698,8.8,59.599998,0.000435,0.39413
4,2007,299,0.43549,-0.001802,8.66,7.38,0.882154,12.428805,7.207212,52.144912,40.299999,192.723587,45.249999,-123.28,16.34352,0.539623,15.8,43.799999,0.00172,0.3875


In [3]:
dataset.shape

(469434, 20)

## Construction of the matrix of characteristics (X) and the dependent variable vector (y)

In [6]:
X = dataset.iloc[:, :-1].values #all columns, but the last
y = dataset.iloc[:, -1].values #only the last column

## Division of the dataset into the Training Set and the Test Set
- Obs: "random_state = 0" to ensure the training and testing sets are the same each time we train the model. 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Very important: Feature scaling of X (z-transformation => mean = 0, std=1)

### Obs: Fit_transform only applied to X_train to prevent data leaking

In [8]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_test.shape

(117359, 19)

In [9]:
np.concatenate((X_train, X_test)).std(), np.concatenate((X_train, X_test)).mean()

(1.0007581037794264, -2.6934007929017374e-05)

# Grid Search to find the best model and the best parameters

In [12]:
parameters = {
    'loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],  # 'squared_error' not supported
    'penalty': ['l2', 'l1', 'elasticnet'],  # l2
    'learning_rate': ['invscaling', 'constant', 'optimal', 'adaptive'],
    'average': [False, True],
    'warm_start': [False, True],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e-5],
    # 'max_iter': [1000, 500, 100, 5000]  # 1000
}

In [13]:
%%time
regressor = SGDRegressor(random_state=0)

grid_search = GridSearchCV(estimator = regressor,
                           param_grid = parameters,
                           scoring = 'r2', 
                           verbose=True,
                           cv = 2,
                           n_jobs = 1)  # or -1 to use all CPUs

grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 2 folds for each of 720 candidates, totalling 1440 fits




Best Accuracy: 41.43 %
Best Parameters: {'alpha': 0.01, 'average': True, 'learning_rate': 'constant', 'loss': 'epsilon_insensitive', 'penalty': 'l2', 'warm_start': False}
CPU times: user 2h 17min 30s, sys: 1h 38min 8s, total: 3h 55min 39s
Wall time: 1h 8min 51s


# Saving the resulting model

In [None]:
# with open('model/grid_search_sgd.pkl', 'wb') as f:
#     pickle.dump(grid_search, f)