In [1]:
# Import libraries
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
# load train data
train_data = pd.read_csv('../features/train_final.csv')
seed = 28

In [3]:
# split data in train & validation
X = train_data.drop(columns='Item_Outlet_Sales') # Drop feature to predict
x_train, x_val, y_train, y_val = train_test_split(X,
                                          train_data['Item_Outlet_Sales'],
                                          test_size = 0.3,
                                          random_state=seed)

In [4]:
print(f' x_train dimension {x_train.shape}')
print(f' y_train dimension {y_train.shape}')

 x_train dimension (5966, 10)
 y_train dimension (5966,)


In [5]:
def objective(trial)->float:
    """
    This function is mean to optimize hyperparameters
    of a LinearRegression.
    """
    # Choosing regresor
    regressor_n = trial.suggest_categorical('regressor', ['SVR', 'RandomForestRegressor', 'LinearRegression'])

    # tunning hyperparameter for each regressor
    if regressor_n == 'LinearRegression':
        # tunning hyperparameter 
        fit_intercept = trial.suggest_categorical('fit_intercept',[True, False])
        positive = trial.suggest_categorical('positive', [True, False])
        # initialize LinearRegression
        regressor = LinearRegression(
            fit_intercept = fit_intercept,
            positive = positive
        )
    elif regressor_n == 'SVR':
        # tunning hyperparameter
        kernel = trial.suggest_categorical('kernel',['linear', 'poly', 'rbf', 'sigmoid'])
        degree = trial.suggest_int('degree',3,5)
        gamma = trial.suggest_categorical('gamma',['scale','auto'])
        tol = trial.suggest_float('tol',1e-3,1e5)
        max_iter = trial.suggest_int('max_iter',1,1000)
        # initialize SVR
        regressor = SVR(
            kernel = kernel,
            degree= degree,
            gamma= gamma,
            tol= tol,
            max_iter= max_iter
        )
    else:
        # tunning hyperparameter
        n_estimators = trial.suggest_int('n_estimators',10,100)
        criterion = trial.suggest_categorical('criterion',['squared_error', 'absolute_error', 
                                                           'friedman_mse', 'poisson'])
        max_depth = trial.suggest_int('max_depth',2,100)
        # initialize RandomForestRegressor
        regressor = RandomForestRegressor(
            n_estimators= n_estimators,
            criterion= criterion,
            max_depth= max_depth
        )
    
    regressor.fit(x_train, y_train)
    score = cross_val_score(regressor, x_train, y_train, cv=3)
    accuracy = score.mean()

    return accuracy

**Analisis**

En esta optimazación de hiperparametros se probaran 3 modelos distintos para resolver el problema de regresión que se plantea en la notebook "notebook_analysis_train".

Estos 3 modelos serán:
* Random Forest para regresion
* SVM para regresion 
* Regresion lineal

Para cada uno a traves del framework de **optuna** se le probaran distintos hiperparametros y se obtendrá el modelo que mejor se ajusta a nuestro datos para resolver el problema de regresión.

Adicionalmente, se usa RandomSampler para la seleccion aleatoria de hiperparametros en el rango dado.

In [6]:
# choosing sampler
sampler = optuna.samplers.RandomSampler(seed=28)
# Starting study
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective,n_trials=100)

[I 2023-08-06 16:08:35,887] A new study created in memory with name: no-name-10e207e4-4893-41b2-8d7d-5448aa64d9aa
[I 2023-08-06 16:08:35,897] Trial 0 finished with value: -6.1441028995994005 and parameters: {'regressor': 'SVR', 'kernel': 'poly', 'degree': 5, 'gamma': 'auto', 'tol': 19270.098190035853, 'max_iter': 971}. Best is trial 0 with value: -6.1441028995994005.
[I 2023-08-06 16:08:35,977] Trial 1 finished with value: 0.5243846465710112 and parameters: {'regressor': 'LinearRegression', 'fit_intercept': True, 'positive': False}. Best is trial 1 with value: 0.5243846465710112.
[I 2023-08-06 16:08:36,103] Trial 2 finished with value: -6.1441028995994005 and parameters: {'regressor': 'SVR', 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'auto', 'tol': 14938.373900653904, 'max_iter': 726}. Best is trial 1 with value: 0.5243846465710112.
[I 2023-08-06 16:08:36,130] Trial 3 finished with value: -6.1441028995994005 and parameters: {'regressor': 'SVR', 'kernel': 'linear', 'degree': 5, 'gamma':

In [7]:
# best results
print(f'Mejor resultado: score {study.best_trial.value}, params {study.best_trial.params}')

Mejor resultado: score 0.5499689067477762, params {'regressor': 'RandomForestRegressor', 'n_estimators': 26, 'criterion': 'poisson', 'max_depth': 8}


In [8]:
# Checking RMSE and R2 for the best model
regressor = study.best_params.__getitem__('regressor')

if regressor == 'RandomForestRegressor':
    final_model = RandomForestRegressor(
        n_estimators= study.best_params.__getitem__('n_estimators'),
        criterion= study.best_params.__getitem__('criterion'),
        max_depth= study.best_params.__getitem__('max_depth')
    )
elif regressor == 'SVR':
    final_model = SVR(
            kernel = study.best_params.__getitem__('kernel'),
            degree= study.best_params.__getitem__('degree'),
            gamma= study.best_params.__getitem__('gamma'),
            tol= study.best_params.__getitem__('tol'),
            max_iter= study.best_params.__getitem__('max_iter')
        )
else:
    final_model = LinearRegression(
            fit_intercept = study.best_params.__getitem__('fit_intercept'),
            positive = study.best_params.__getitem__('positive')
        )
    pass

# train model
final_model.fit(x_train,y_train)

# predict for validation data
pred = final_model.predict(x_val)

# calculating RMSE and R2
mse_train = metrics.mean_squared_error(y_train, final_model.predict(x_train))
R2_train = final_model.score(x_train, y_train)
print('Métricas del Modelo:')
print('ENTRENAMIENTO: RMSE: {:.2f} - R2: {:.4f}'.format(mse_train**0.5, R2_train))

mse_val = metrics.mean_squared_error(y_val, pred)
R2_val = final_model.score(x_val, y_val)
print('VALIDACIÓN: RMSE: {:.2f} - R2: {:.4f}'.format(mse_val**0.5, R2_val))

Métricas del Modelo:
ENTRENAMIENTO: RMSE: 1049.54 - R2: 0.6201
VALIDACIÓN: RMSE: 1104.59 - R2: 0.5848


In [13]:
# grafic all trials and tests for the best model
optuna.visualization.plot_parallel_coordinate(study, params= study.best_params)