# Regression

**ML MODELS**

1. Linear
2. Lasso
3. Ridge
4. ElasticNet
5. SVR
6. Decision Tree
7. Random Forest
8. AdaBoost
9. XGBoost
10. CATBoost
11. Polynomial (degree 2)

USING

- Robust Scaler
- One Hot Encoder
- Ordinal Encoder (`month` must first be converted to `str`, because `OrdinalEncoder` does not accept a custom, unsorted category order for numeric values)
- ~~Label Encoder~~
- Column Transformer
- Pipeline
- GridSearchCV
- metrics ($R^2$)

NOTE

- With `sklearn.pipeline.Pipeline` we can chain all scalers and encoders, followed by the model, into a single estimator.


[CATBOOST](https://catboost.ai)

- CatBoost builds upon the theory of decision trees and gradient boosting.
- CatBoost grows [oblivious trees](https://en.wikipedia.org/wiki/Oblivious_data_structure#Oblivious_Tree), which means that the trees are grown under the rule that all nodes at the same level test the same predictor with the same condition; hence the index of a leaf can be calculated with bitwise operations.
- The oblivious tree procedure allows for a simple fitting scheme and efficiency on CPUs, while the tree structure operates as a regularization to find an optimal solution and avoid overfitting.

References

- https://dataaspirant.com/catboost-algorithm/#t-1609567161983
- https://towardsdatascience.com/catboost-regression-in-6-minutes-3487f3e5b329
- https://www.geeksforgeeks.org/catboost-ml/


# Importing Libraries


## General Libraries


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Set seaborn's global plotting theme: "darkgrid" style with fonts scaled by 1.4.
sns.set_theme(style="darkgrid", font_scale=1.4)


## Miscellaneous Libraries


In [2]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error


## Model Libraries


In [3]:
from sklearn.linear_model import (
    LinearRegression,
    ElasticNet,
    Ridge,
    Lasso,
)  # // can use CV of regularization model but not using due to GridSearchCV approach
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


  from pandas import MultiIndex, Int64Index


# Importing Dataset


In [4]:
# Read the train and test CSV files (paths are relative to the notebook's
# working directory) into two DataFrames in a single pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./../data/final_data/regression_train.csv",
        "./../data/final_data/regression_test.csv",
    )
)


In [5]:
# Every column except the last is a feature; the last column is the target.
# The training split is converted to NumPy arrays, while the test split is
# kept as pandas objects.
X_train, y_train = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]


# Model Evaluator


In [6]:
# Positional column selections shared by both transformers. The constant names
# follow the labels of the transformers that consume them.
NUMERIC_COLUMNS = [2, 3, 4, 5, 6, 7, 8]
DAY_COLUMN = [0]
MONTH_COLUMN = [1]
BINARY_COLUMNS = [-2, -1]
# NOTE(review): this is not calendar order ("Sep" comes first) — confirm the
# ranking is intentional before changing it.
MONTH_ORDER = ["Sep", "Jun", "Jul", "Aug"]


def _dense_one_hot(**kwargs):
    """Return a ``OneHotEncoder`` that produces dense arrays.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new spelling is tried first and the old
    one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False, **kwargs)
    except TypeError:  # scikit-learn < 1.2 does not know ``sparse_output``
        return OneHotEncoder(sparse=False, **kwargs)


def _categorical_steps():
    """Return fresh encoder steps for the day, month and binary columns."""
    return [
        ("ohe_day", _dense_one_hot(drop="first"), DAY_COLUMN),
        ("ord_month", OrdinalEncoder(categories=[MONTH_ORDER]), MONTH_COLUMN),
        ("ohe_binary", _dense_one_hot(drop="if_binary"), BINARY_COLUMNS),
    ]


# Default preprocessing: robust-scale the numeric columns, encode the
# categorical ones, and pass any remaining column through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ("robust_scaler", RobustScaler(), NUMERIC_COLUMNS),
        *_categorical_steps(),
    ],
    remainder="passthrough",
)

# Degree-2 polynomial preprocessing. ColumnTransformer applies its entries side
# by side (not one after another), so scaling and expansion are chained in a
# nested Pipeline: the polynomial terms are built from the scaled values and
# each numeric column is emitted once. ``include_bias=False`` drops the
# constant column.
polyTransformer = ColumnTransformer(
    transformers=[
        (
            "scaled_polynomial",
            Pipeline(
                steps=[
                    ("robust_scaler", RobustScaler()),
                    ("polynomial", PolynomialFeatures(degree=2, include_bias=False)),
                ]
            ),
            NUMERIC_COLUMNS,
        ),
        *_categorical_steps(),
    ],
    remainder="passthrough",
)


class ModelEvaluate:
    """Grid-search one regressor behind the shared preprocessing and record its best score.

    Typical use: construct, ``initialize_gscv(params)``, ``initialize_pipeline()``,
    then ``fit_pipeline(X, y)``. Every fitted model adds one row to a class-level
    report that ``get_model_report()`` returns as a DataFrame.

    NOTE(review): the pipeline is ``[transformers, GridSearchCV]``, so the
    transformers are fitted on the whole ``X`` before the search splits it into
    folds; the validation folds therefore influence the preprocessing statistics.
    """

    # Class-level registries, shared by all instances (name-mangled on purpose).
    __model_names = []
    __model_report = []

    def __init__(self, model_obj, model_name: str, poly: bool = False):
        """Register ``model_name`` and remember the estimator to tune.

        Args:
            model_obj: Unfitted estimator handed to ``GridSearchCV``.
            model_name: Unique label used in the report.
            poly: When true, use ``polyTransformer`` instead of ``transformer``.

        Raises:
            ValueError: If ``model_name`` was already registered.
        """
        self.__unique_model_name(model_name)
        self.model_obj = model_obj
        self.model_name = model_name
        self.transformers = polyTransformer if poly else transformer
        self.gscv = None
        self.pipe = None

    def initialize_gscv(self, params, *, scoring="r2", cv=5) -> None:
        """Create the ``GridSearchCV`` for ``model_obj`` over ``params``.

        R2 is the default metric because it is easy to compare across models.
        """
        # refit=False: the best parameters are not refitted on the full data, so
        # the search object cannot predict or score; only best_score_ and
        # best_params_ are read afterwards.
        self.gscv = GridSearchCV(
            self.model_obj, param_grid=params, scoring=scoring, cv=cv, refit=False, verbose=1
        )

    def initialize_pipeline(self) -> None:
        """Chain the transformers and the grid search into ``self.pipe``."""
        self.pipe = Pipeline(steps=[("Transformers", self.transformers), ("gscv", self.gscv)])

    def fit_pipeline(self, X, y) -> dict:
        """Fit the pipeline, record the best score in the report and return ``get_best()``.

        Fitting the same model again replaces its report row instead of adding
        a duplicate one.
        """
        self.pipe.fit(X, y)
        entry = {"Model Name": self.model_name, "Best Score (R2)": self.pipe.named_steps["gscv"].best_score_}
        report = ModelEvaluate.__model_report
        for position, row in enumerate(report):
            if row["Model Name"] == self.model_name:
                report[position] = entry
                break
        else:
            report.append(entry)
        return self.get_best()

    def get_best(self) -> dict:
        """Return the best score and parameter set found by the fitted search."""
        search = self.pipe.named_steps["gscv"]
        return {"score": search.best_score_, "param": search.best_params_}

    def __unique_model_name(self, model_name: str) -> None:
        """Register ``model_name``; raise ``ValueError`` if it is already taken."""
        if model_name in self.__model_names:
            raise ValueError(f"{model_name} already exists. model_name should be unique.")
        self.__model_names.append(model_name)

    @staticmethod
    def get_model_names():
        """Return the list of registered model names."""
        return ModelEvaluate.__model_names

    @staticmethod
    def get_model_report():
        """Return the report as a DataFrame, or the string "No Model Evaluated" when it is empty."""
        if ModelEvaluate.__model_report:
            return pd.DataFrame(ModelEvaluate.__model_report)
        return "No Model Evaluated"


## Linear Regression


In [7]:
# Plain linear regression is searched with an empty grid, i.e. a single
# candidate whose cross-validated score is recorded.
linear_param = dict()
linear_regressor = ModelEvaluate(model_obj=LinearRegression(), model_name="linear regression")
linear_regressor.initialize_gscv(params=linear_param)
linear_regressor.initialize_pipeline()
linear_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


{'score': 0.5577496554388768, 'param': {}}

## Lasso Regression


In [8]:
# Lasso search space: regularisation strengths from 0.001 to 2, two seeds.
lasso_param = dict(
    alpha=[0.001, 0.01, 0.03, 0.04, 0.05, 0.08, 0.1, 0.5, 0.8, 1, 1.2, 1.5, 1.8, 2],
    random_state=[48, 64],
)
lasso_regressor = ModelEvaluate(model_obj=Lasso(), model_name="lasso regression")
lasso_regressor.initialize_gscv(params=lasso_param)
lasso_regressor.initialize_pipeline()
lasso_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 28 candidates, totalling 140 fits


{'score': 0.6570463759254869, 'param': {'alpha': 0.03, 'random_state': 48}}

## Ridge Regression


In [9]:
# Ridge search space: alpha from 0.1 upwards in steps of 3.5 (below 50), two seeds.
ridge_param = dict(
    alpha=np.arange(0.1, 50, 3.5),
    random_state=[48, 64],
)
ridge_regressor = ModelEvaluate(model_obj=Ridge(), model_name="ridge regression")
ridge_regressor.initialize_gscv(params=ridge_param)
ridge_regressor.initialize_pipeline()
ridge_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


{'score': 0.6506118685847746, 'param': {'alpha': 17.6, 'random_state': 48}}

## ElasticNet Regression


In [10]:
# ElasticNet reuses every alpha tried for Lasso and Ridge and adds a sweep
# over the L1/L2 mixing ratio.
elasticNet_param = dict(
    alpha=list(lasso_param["alpha"]) + list(ridge_param["alpha"]),
    l1_ratio=np.arange(0.3, 0.8, 0.1),
    random_state=[48, 64],
)
elasticNet_regressor = ModelEvaluate(model_obj=ElasticNet(), model_name="elasticNet regression")
elasticNet_regressor.initialize_gscv(params=elasticNet_param)
elasticNet_regressor.initialize_pipeline()
elasticNet_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 290 candidates, totalling 1450 fits


{'score': 0.6581995444629497,
 'param': {'alpha': 0.05, 'l1_ratio': 0.5, 'random_state': 48}}

## Support Vector Regression


In [11]:
# SVR search space: kernel, gamma heuristic, penalty C and epsilon-tube width.
svr_param = dict(
    kernel=["linear", "rbf"],
    gamma=["scale", "auto"],
    C=[20, 18, 15, 13, 12, 10, 5, 3, 2, 1.5, 1.0, 0.5, 0.1, 0.05, 0.01],
    epsilon=np.arange(0.1, 2, 0.08),
)
svr_regressor = ModelEvaluate(model_obj=SVR(), model_name="svr regression")
svr_regressor.initialize_gscv(params=svr_param)
svr_regressor.initialize_pipeline()
svr_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


## Decision Tree Regression


In [None]:
# Decision-tree search space: split criterion, depth limit, leaf size,
# number of features considered per split, and two seeds.
decision_tree_param = dict(
    criterion=["squared_error", "absolute_error", "poisson"],
    max_depth=[None, 2, 3, 4],
    min_samples_leaf=[3, 5, 8, 10],
    max_features=[None, "log2", "sqrt"],
    random_state=[48, 64],
)
decision_tree_regressor = ModelEvaluate(
    model_obj=DecisionTreeRegressor(), model_name="decision tree regression"
)
decision_tree_regressor.initialize_gscv(params=decision_tree_param)
decision_tree_regressor.initialize_pipeline()
decision_tree_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 288 candidates, totalling 1440 fits




{'score': 0.5889177566737598,
 'param': {'criterion': 'mae',
  'max_depth': 3,
  'max_features': None,
  'min_samples_leaf': 5,
  'random_state': 64}}

## Random Forest Regression


In [None]:
# Random-forest search space: forest size (50, 100, 150), split criterion,
# depth limit, leaf size, features per split, and two seeds.
random_forest_param = dict(
    n_estimators=np.arange(50, 200, 50),
    criterion=["squared_error", "absolute_error"],
    max_depth=[None, 2, 3, 4, 5],
    min_samples_leaf=[3, 5, 8, 10],
    max_features=[None, "log2", "sqrt"],
    random_state=[48, 64],
)
random_forest_regressor = ModelEvaluate(
    model_obj=RandomForestRegressor(), model_name="random forest regression"
)
random_forest_regressor.initialize_gscv(params=random_forest_param)
random_forest_regressor.initialize_pipeline()
random_forest_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 720 candidates, totalling 3600 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


## AdaBoost Regression


In [None]:
# AdaBoost search space: 50 to 300 estimators in steps of 50, learning rate,
# loss function, and two seeds.
ada_boost_param = dict(
    n_estimators=np.arange(50, 301, 50),
    learning_rate=[0.1, 0.5, 1, 1.5, 2, 3, 5],
    loss=["linear", "square", "exponential"],
    random_state=[48, 64],
)
ada_boost_regressor = ModelEvaluate(model_obj=AdaBoostRegressor(), model_name="ada boost regression")
ada_boost_regressor.initialize_gscv(params=ada_boost_param)
ada_boost_regressor.initialize_pipeline()
ada_boost_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 252 candidates, totalling 1260 fits


{'score': 0.6429626376673637,
 'param': {'learning_rate': 3,
  'loss': 'exponential',
  'n_estimators': 250,
  'random_state': 64}}

## XGBoost Regression


In [None]:
# XGBoost search space: 50 to 300 boosting rounds in steps of 50, tree depth,
# learning rate, and two seeds.
xg_boost_param = dict(
    n_estimators=np.arange(50, 301, 50),
    max_depth=[2, 3, 5],
    learning_rate=[0.1, 0.5, 1, 1.2],
    random_state=[48, 64],
)
xg_boost_regressor = ModelEvaluate(model_obj=XGBRegressor(), model_name="xg boost regression")
xg_boost_regressor.initialize_gscv(params=xg_boost_param)
xg_boost_regressor.initialize_pipeline()
xg_boost_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 144 candidates, totalling 720 fits


{'score': 0.6706380471202776,
 'param': {'learning_rate': 0.1,
  'max_depth': 2,
  'n_estimators': 100,
  'random_state': 48}}

## CATBoost Regression


In [None]:
# CatBoost search space: iteration count, tree depth, learning rate and two
# seeds. ``verbose`` has the single value False, so it is applied to every
# candidate rather than searched.
cat_boost_param = dict(
    iterations=[50, 100, 200, 500],
    depth=[2, 3, 5, 6],
    learning_rate=[0.1, 0.3, 0.5, 0.8, 1],
    random_seed=[48, 64],
    verbose=[False],
)
cat_boost_regressor = ModelEvaluate(model_obj=CatBoostRegressor(), model_name="cat boost regression")
cat_boost_regressor.initialize_gscv(params=cat_boost_param)
cat_boost_regressor.initialize_pipeline()
cat_boost_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 160 candidates, totalling 800 fits


{'score': 0.6884498864738677,
 'param': {'depth': 6,
  'iterations': 100,
  'learning_rate': 0.1,
  'random_seed': 64,
  'verbose': False}}

## Polynomial Regression


In [None]:
# Polynomial regression is a linear regression fitted on the output of
# polyTransformer (selected by poly=True, degree 2); the grid is empty.
polynomial_param = dict()

polynomial_regressor = ModelEvaluate(
    model_obj=LinearRegression(),
    model_name="polynomial regression",
    poly=True,
)
polynomial_regressor.initialize_gscv(params=polynomial_param)
polynomial_regressor.initialize_pipeline()
polynomial_regressor.fit_pipeline(X=X_train, y=y_train)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


{'score': 0.3920886507364452, 'param': {}}

# Model Report


In [None]:
# Collect the best score of every model fitted so far; the bare name on the
# last line makes the notebook display the result.
model_report = ModelEvaluate.get_model_report()
model_report


Unnamed: 0,Model Name,Best Score (R2)
0,linear regression,0.55775
1,lasso regression,0.657046
2,ridge regression,0.650612
3,elasticNet regression,0.6582
4,svr regression,0.679936
5,decision tree regression,0.588918
6,random forest regression,0.669899
7,ada boost regression,0.642963
8,xg boost regression,0.670638
9,cat boost regression,0.68845


In [None]:
# Rank the models from best to worst by the report's second column.
model_report.sort_values(model_report.columns[1], ascending=False)


Unnamed: 0,Model Name,Best Score (R2)
9,cat boost regression,0.68845
4,svr regression,0.679936
8,xg boost regression,0.670638
6,random forest regression,0.669899
3,elasticNet regression,0.6582
1,lasso regression,0.657046
2,ridge regression,0.650612
7,ada boost regression,0.642963
5,decision tree regression,0.588918
0,linear regression,0.55775


# Top Model (CATBoost)


In [None]:
# Show the parameter combination the grid search selected for CatBoost.
cat_boost_regressor.get_best()["param"]


{'depth': 6,
 'iterations': 100,
 'learning_rate': 0.1,
 'random_seed': 64,
 'verbose': False}

In [None]:
# Final model: the shared preprocessing followed by a CatBoost regressor with
# fixed hyper-parameters, fitted on the whole training set (fit returns the
# pipeline itself).
final_pipe = Pipeline(
    [
        ("transformers", transformer),
        (
            "cat_boost",
            CatBoostRegressor(
                iterations=100,
                depth=6,
                learning_rate=0.1,
                random_seed=64,
                verbose=False,
            ),
        ),
    ]
).fit(X_train, y_train)
# Score on the data the model was trained on, so this figure is optimistic.
final_pipe.score(X_train, y_train)


0.9235706305972957

In [None]:
# Evaluate on the held-out test set: (R2, RMSE).
y_pred = final_pipe.predict(X_test.values)
# RMSE is computed as sqrt(MSE) instead of passing ``squared=False``, which is
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on every version.
r2_score(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred))


(0.7064468650271253, 2.0076757905292975)

# Serializing Model


In [None]:
import pickle
import os

# Create the output directory if needed. ``exist_ok=True`` replaces the
# separate isdir check, so there is no window in which the directory can
# appear between the check and the creation.
os.makedirs("./../model", exist_ok=True)

# Persist the fitted pipeline (preprocessing + model) as one artefact.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle this
# file from a trusted location.
with open("./../model/regression_pipeline.pkl", "wb") as f:
    pickle.dump(final_pipe, f)
