# Models

### Objective: test multiple regression models to find the best one for the task

In [None]:
# Import necessary libraries for models
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Initialize grid search to find best parameters for each model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

# Time-ordered cross-validation splitter shared by every grid search below
N_CV_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Initiate linear regression model with default settings.
# `fit_intercept` is the only hyperparameter tuned later by the grid search.
linear_reg_model = LinearRegression()

In [None]:
# Pipeline: standardise the features, then fit the linear regression
linear_reg_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', linear_reg_model),
    ]
)

In [None]:
# Parameter grid - Linear Regression
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid_linear_reg = dict(
    model__fit_intercept=[True, False],
)

In [None]:
# GridSearch - Linear Regression
# Candidates are scored with negated MAE on the time-series folds (higher is better).
grid_search_linear_reg = GridSearchCV(
    estimator=linear_reg_pipeline,
    param_grid=param_grid_linear_reg,
    cv=tscv,
    scoring='neg_mean_absolute_error',
)

In [None]:
# Run the linear-regression grid search on the training split.
# NOTE(review): X_train / y_train are not defined in this section; presumably created earlier in the notebook.
grid_search_linear_reg.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination with the best cross-validated score
print(f"Best parameters: {grid_search_linear_reg.best_params_}")

Best parameters: {'model__fit_intercept': True}


### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Initiate Random Forest Regressor model.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the grid-search result is reproducible across runs.
random_forest_model = RandomForestRegressor(random_state=42)

In [None]:
# Pipeline: standardise the features, then fit the random forest
random_forest_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', random_forest_model),
    ]
)

In [None]:
# Parameter grid - Random Forest Regressor
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid_random_forest = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    # 'auto' is no longer accepted by scikit-learn and made every fit that used
    # it fail (scored as NaN). 1.0 means "consider all features", which is what
    # 'auto' meant for regressors.
    'model__max_features': [1.0, 'sqrt', 'log2']
}

In [None]:
# GridSearch - Random Forest Regressor
# Candidates are scored with negated MAE on the time-series folds (higher is better).
grid_search_random_forest = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=param_grid_random_forest,
    cv=tscv,
    scoring='neg_mean_absolute_error',
)

In [None]:
# Run the random-forest grid search on the training split.
# NOTE(review): X_train / y_train are not defined in this section; presumably created earlier in the notebook.
grid_search_random_forest.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in

In [None]:
# Report the hyperparameter combination with the best cross-validated score
print(f'Best parameters: {grid_search_random_forest.best_params_}')

Best parameters: {'model__max_depth': 7, 'model__max_features': 'log2', 'model__n_estimators': 100}


### Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Initiate Gradient Boosting Regressor model.
# A fixed random_state pins the estimator's internal randomness so that
# repeated runs of the grid search select the same model.
gradient_boosting_reg_model = GradientBoostingRegressor(random_state=42)

In [None]:
# Pipeline: standardise the features, then fit the gradient boosting regressor
gradient_boosting_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', gradient_boosting_reg_model),
    ]
)

In [None]:
# Parameter grid - Gradient Boosting Regressor
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid_gradient_boosting = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[3, 5, 7],
)

In [None]:
# GridSearch - Gradient Boosting Regressor
# Candidates are scored with negated MAE on the time-series folds (higher is better).
grid_search_gradient_boosting = GridSearchCV(
    estimator=gradient_boosting_pipeline,
    param_grid=param_grid_gradient_boosting,
    cv=tscv,
    scoring='neg_mean_absolute_error',
)

In [None]:
# Run the gradient-boosting grid search on the training split.
# NOTE(review): X_train / y_train are not defined in this section; presumably created earlier in the notebook.
grid_search_gradient_boosting.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination with the best cross-validated score
print(f"Best parameters: {grid_search_gradient_boosting.best_params_}")

Best parameters: {'model__max_depth': 7, 'model__n_estimators': 100}


### ADA Boosting Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Initiate AdaBoost Regressor model.
# A fixed random_state pins the estimator's internal randomness so that
# repeated runs of the grid search select the same model.
ada_boost_reg_model = AdaBoostRegressor(random_state=42)

In [None]:
# Pipeline: standardise the features, then fit the AdaBoost regressor
ada_boost_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', ada_boost_reg_model),
    ]
)

In [None]:
# Parameter grid - AdaBoost Regressor
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid_ada_boost = dict(
    model__n_estimators=[50, 100, 200],
    model__learning_rate=[0.01, 0.1, 0.3],
)

In [None]:
# GridSearch - AdaBoost Regressor
# Candidates are scored with negated MAE on the time-series folds (higher is better).
grid_search_ada_boost = GridSearchCV(
    estimator=ada_boost_pipeline,
    param_grid=param_grid_ada_boost,
    cv=tscv,
    scoring='neg_mean_absolute_error',
)

In [None]:
# Run the AdaBoost grid search on the training split.
# NOTE(review): X_train / y_train are not defined in this section; presumably created earlier in the notebook.
grid_search_ada_boost.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination with the best cross-validated score
print(f'Best parameters: {grid_search_ada_boost.best_params_}')

Best parameters: {'model__learning_rate': 0.01, 'model__n_estimators': 200}


### XGBoost

In [None]:
import xgboost as xg

In [None]:
# Initiate XGBoost regressor with default settings.
# n_estimators, max_depth and learning_rate are tuned later by the grid search.
xgboost_model = xg.XGBRegressor()

In [None]:
# Pipeline: standardise the features, then fit the XGBoost regressor
xgboost_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', xgboost_model),
    ]
)

In [None]:
# Parameter grid - XGBoost Regressor
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid_xgboost = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[3, 5, 7],
    model__learning_rate=[0.01, 0.1, 0.3],
)

In [None]:
# GridSearch - XGBoost Regressor
# Candidates are scored with negated MAE on the time-series folds (higher is better).
grid_search_xgboost = GridSearchCV(
    estimator=xgboost_pipeline,
    param_grid=param_grid_xgboost,
    cv=tscv,
    scoring='neg_mean_absolute_error',
)

In [None]:
# Run the XGBoost grid search on the training split.
# NOTE(review): X_train / y_train are not defined in this section; presumably created earlier in the notebook.
grid_search_xgboost.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination with the best cross-validated score
print(f'Best parameters: {grid_search_xgboost.best_params_}')

Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 100}


In [None]:
# Predict on the test set with every tuned grid search, in this order:
# Linear Regression, Random Forest, AdaBoost, Gradient Boosting, XGBoost.
(
    linear_reg_y_pred,
    random_forest_y_pred,
    ada_boost_y_pred,
    gradient_boosting_y_pred,
    xgboost_y_pred,
) = (
    fitted_search.predict(X_test)
    for fitted_search in (
        grid_search_linear_reg,
        grid_search_random_forest,
        grid_search_ada_boost,
        grid_search_gradient_boosting,
        grid_search_xgboost,
    )
)

In [None]:
# Linear Regression Prediction
# Quick look at the predicted test-set values (long arrays are abbreviated when printed)
print(linear_reg_y_pred)

[7281.48618013 6097.39526737 6921.54538929 ... 5549.11736664 8118.00444455
 5546.18415409]


In [None]:
# Random Forest Regressor Prediction
# Quick look at the predicted test-set values (long arrays are abbreviated when printed)
print(random_forest_y_pred)

[6748.39907008 5906.59812239 6439.40259209 ... 5317.85853401 7770.01359181
 5585.21472191]


In [None]:
# AdaBoost Regressor Prediction
# Quick look at the predicted test-set values (long arrays are abbreviated when printed)
print(ada_boost_y_pred)

[6349.21134077 4705.61683212 5990.94021507 ... 4035.79775211 7677.98451458
 5999.020178  ]


In [None]:
# Gradient Boosting Regressor Prediction
# Quick look at the predicted test-set values (long arrays are abbreviated when printed)
print(gradient_boosting_y_pred)

[7359.95685    5640.06292006 6384.6619029  ... 5000.26215587 8838.29841911
 6035.39717949]


In [None]:
# XGBoost Regressor Prediction
# Quick look at the predicted test-set values (long arrays are abbreviated when printed)
print(xgboost_y_pred)

[7356.176  5667.1245 6442.698  ... 4994.507  8839.336  5992.971 ]


In [None]:
#import shap
#explainer = shap.Explainer(model.predict, X_test)
#shap_values = explainer.shap_values(sales_data)

#shap.summary_plots(shap_values)

In [None]:
# Import the regression metrics used to evaluate the models
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error as RMSE

### Metrics

Generate Mean Squared Error (MSE), Mean Absolute Error (MAE), R2, and Root Mean Squared Error (RMSE)

In [None]:
# Linear Regression - evaluate the test-set predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE, MAE and RMSE
# give the same value either way, but R2 does not: with the arguments swapped
# it is normalised by the variance of the predictions instead of the targets.

# MSE
mse = MSE(y_test, linear_reg_y_pred)

# MAE
mae = MAE(y_test, linear_reg_y_pred)

# R2
r2 = r2_score(y_test, linear_reg_y_pred)

# Root mean squared error
rmse = RMSE(y_test, linear_reg_y_pred)
r2_mean = rmse  # legacy name for the RMSE value, kept in case other cells reference it

# Print results
print('Metrics for Linear Regression')
print(f"Mean squared error: {mse}\n")
print(f"Mean absolute error: {mae}\n")
print(f"R2 Score: {r2}\n")
print(f"RMSE: {rmse}")

Metrics for Linear Regression
Mean squared error: 2319971.894525394

Mean absolute error: 1024.778607525187

R2 Score: 0.7876186795643908

RSME: 1523.1453950708035


In [None]:
# Random Forest Regression - evaluate the test-set predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE, MAE and RMSE
# give the same value either way, but R2 does not: with the arguments swapped
# it is normalised by the variance of the predictions instead of the targets.

# MSE
mse = MSE(y_test, random_forest_y_pred)

# MAE
mae = MAE(y_test, random_forest_y_pred)

# R2
r2 = r2_score(y_test, random_forest_y_pred)

# Root mean squared error
rmse = RMSE(y_test, random_forest_y_pred)
r2_mean = rmse  # legacy name for the RMSE value, kept in case other cells reference it

# Print results
print('Metrics for Random Forest Regression')
print(f"Mean squared error: {mse}\n")
print(f"Mean absolute error: {mae}\n")
print(f"R2 Score: {r2}\n")
print(f"RMSE: {rmse}")

Metrics for Random Forest Regression
Mean squared error: 2199547.638886837

Mean absolute error: 997.5548505146603

R2 Score: 0.7594491425400594

RSME: 1483.0871986794427


In [None]:
# Gradient Boosting Regressor - evaluate the test-set predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE, MAE and RMSE
# give the same value either way, but R2 does not: with the arguments swapped
# it is normalised by the variance of the predictions instead of the targets.

# MSE
mse = MSE(y_test, gradient_boosting_y_pred)

# MAE
mae = MAE(y_test, gradient_boosting_y_pred)

# R2
r2 = r2_score(y_test, gradient_boosting_y_pred)

# Root mean squared error
rmse = RMSE(y_test, gradient_boosting_y_pred)
r2_mean = rmse  # legacy name for the RMSE value, kept in case other cells reference it

# Print results
print('Metrics for Gradient Boosting Regressor')
print(f"Mean squared error: {mse}\n")
print(f"Mean absolute error: {mae}\n")
print(f"R2 Score: {r2}\n")
print(f"RMSE: {rmse}")

Metrics for Gradient Boosting Regressor
Mean squared error: 1987352.1907760624

Mean absolute error: 958.4258070662625

R2 Score: 0.8299268946349337

RSME: 1409.7347944830128


In [None]:
# AdaBoost Regressor - evaluate the test-set predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE, MAE and RMSE
# give the same value either way, but R2 does not: with the arguments swapped
# it is normalised by the variance of the predictions instead of the targets.

# MSE
mse = MSE(y_test, ada_boost_y_pred)

# MAE
mae = MAE(y_test, ada_boost_y_pred)

# R2
r2 = r2_score(y_test, ada_boost_y_pred)

# Root mean squared error
rmse = RMSE(y_test, ada_boost_y_pred)
r2_mean = rmse  # legacy name for the RMSE value, kept in case other cells reference it

# Print results
print('Metrics for ADA Boosting Regressor')
print(f"Mean squared error: {mse}\n")
print(f"Mean absolute error: {mae}\n")
print(f"R2 Score: {r2}\n")
print(f"RMSE: {rmse}")

Metrics for ADA Boosting Regressor
Mean squared error: 2526022.6321251835

Mean absolute error: 1082.2726897889668

R2 Score: 0.7478779449168003

RSME: 1589.3466054090227


In [None]:
# XGBoost Regressor - evaluate the test-set predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE, MAE and RMSE
# give the same value either way, but R2 does not: with the arguments swapped
# it is normalised by the variance of the predictions instead of the targets.

# MSE
mse = MSE(y_test, xgboost_y_pred)

# MAE
mae = MAE(y_test, xgboost_y_pred)

# R2
r2 = r2_score(y_test, xgboost_y_pred)

# Root mean squared error
rmse = RMSE(y_test, xgboost_y_pred)
r2_mean = rmse  # legacy name for the RMSE value, kept in case other cells reference it

# Print results
print('Metrics for XGBoost Regressor')
print(f"Mean squared error: {mse}\n")
print(f"Mean absolute error: {mae}\n")
print(f"R2 Score: {r2}\n")
print(f"RMSE: {rmse}")

Metrics for XGBoost Regressor
Mean squared error: 1983626.25

Mean absolute error: 959.3358154296875

R2 Score: 0.8279377818107605

RSME: 1408.4127197265625


In [None]:
# Save all models to test on real data
import joblib

In [None]:
# Model with best parameters - Linear Regressor
# Build the tuned (unfitted) estimator from the grid-search result instead of
# hand-copied values, so it always matches what the search selected.
# Keys look like 'model__fit_intercept'; strip the pipeline step prefix.
best_linear_reg_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search_linear_reg.best_params_.items()
}
final_linear_regression_model = LinearRegression(**best_linear_reg_params)

# Linear Regression Model
# Save the fitted grid search (scaler + model), not the unfitted estimator
# above: an unfitted model cannot predict after it is loaded. The tuned
# pipeline is available on the loaded object as `.best_estimator_`.
linear_regression_model = joblib.dump(grid_search_linear_reg, 'linear_regression_model.pkl')
print(f"Linear model {linear_regression_model} saved succesfully!")

Linear model ['linear_regression_model.pkl'] saved succesfully!


In [None]:
# Model with best parameters - Random Forest Regressor
# Build the tuned (unfitted) estimator from the grid-search result instead of
# hand-copied values, so it always matches what the search selected.
# Keys look like 'model__max_depth'; strip the pipeline step prefix.
best_random_forest_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search_random_forest.best_params_.items()
}
final_random_forest_regression_model = RandomForestRegressor(**best_random_forest_params)

# Random Forest Model
# Save the fitted grid search (scaler + model), not the unfitted estimator
# above: an unfitted model cannot predict after it is loaded. The tuned
# pipeline is available on the loaded object as `.best_estimator_`.
# joblib.dump returns the list of written file names; keep it under its own
# name so `random_forest_model` still refers to the estimator used by the pipeline.
random_forest_model_files = joblib.dump(grid_search_random_forest, 'random_forest_model.pkl')
print(f"Random Forest model {random_forest_model_files} saved succesfully!")

Random Forest model ['random_forest_model.pkl'] saved succesfully!


In [None]:
# Model with best parameters - Gradient Boosting Regressor
# Build the tuned (unfitted) estimator from the grid-search result instead of
# hand-copied values, so it always matches what the search selected.
# Keys look like 'model__max_depth'; strip the pipeline step prefix.
best_gradient_boosting_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search_gradient_boosting.best_params_.items()
}
final_gradient_boosting_model = GradientBoostingRegressor(**best_gradient_boosting_params)

# Gradient Boosting Regressor
# Save the fitted grid search (scaler + model); the tuned pipeline is
# available on the loaded object as `.best_estimator_`.
gradient_boosting_model = joblib.dump(grid_search_gradient_boosting, 'gradient_boosting_model.pkl')
print(f"Gradient Boosting model {gradient_boosting_model} saved succesfully!")

Gradient Boosting model ['gradient_boosting_model.pkl'] saved succesfully!


In [None]:
# Model with best parameters - AdaBoost Regressor
# Build the tuned (unfitted) estimator from the grid-search result instead of
# hand-copied values, so it always matches what the search selected.
# Keys look like 'model__learning_rate'; strip the pipeline step prefix.
best_ada_boost_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search_ada_boost.best_params_.items()
}
final_ada_boost_model = AdaBoostRegressor(**best_ada_boost_params)

# AdaBoost Regressor
# Save the fitted grid search (scaler + model); the tuned pipeline is
# available on the loaded object as `.best_estimator_`.
ada_boost_model = joblib.dump(grid_search_ada_boost, 'ada_boost_model.pkl')
print(f"ADA Boost model {ada_boost_model} saved succesfully!")

ADA Boost model ['ada_boost_model.pkl'] saved succesfully!


In [None]:
# Model with best parameters - XGBoost Regressor
# Build the tuned (unfitted) estimator from the grid-search result instead of
# hand-copied values, so it always matches what the search selected.
# Keys look like 'model__learning_rate'; strip the pipeline step prefix.
best_xgboost_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search_xgboost.best_params_.items()
}
final_xgboost_regressor_model = xg.XGBRegressor(**best_xgboost_params)

# XGBoost Regressor
# Save the fitted grid search (scaler + model); the tuned pipeline is
# available on the loaded object as `.best_estimator_`.
# joblib.dump returns the list of written file names; keep it under its own
# name so `xgboost_model` still refers to the estimator used by the pipeline.
xgboost_model_files = joblib.dump(grid_search_xgboost, 'xgboost_model.pkl')
print(f"XGBoost model {xgboost_model_files} saved succesfully!")

XGBoost model ['xgboost_model.pkl'] saved succesfully!
