# <u>Data Science Essentials</u>

## <u>Topic</u>: Hyperparameter Tuning using Optuna

## <u>Category</u>: Model Training

### <u>Created By</u>: Mohammed Misbahullah Sheriff
- [LinkedIn](https://www.linkedin.com/in/mohammed-misbahullah-sheriff/)
- [GitHub](https://github.com/MisbahullahSheriff)

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd

import optuna

from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder
)

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.model_selection import (
    cross_val_score,
    train_test_split
)

from sklearn.metrics import r2_score

## Loading the Data

In [None]:
# Location of the car listings CSV.
# NOTE(review): "/content" looks like a Google Colab path — confirm before running elsewhere.
path = "/content/car-details.csv"

# Read the CSV into a DataFrame and report its (rows, columns) shape
df = pd.read_csv(path)
print("Data Shape:", df.shape)
# Display the first rows as the cell output
df.head()

Data Shape: (6926, 16)


Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti Swift Dzire VDI,Maruti,Swift,Dzire VDI,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda Rapid 1.5 TDI Ambition,Skoda,Rapid,1.5 TDI Ambition,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda City 2017-2020 EXi,Honda,City,2017-2020 EXi,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai i20 Sportz Diesel,Hyundai,i20,Sportz Diesel,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti Swift VXI BSIII,Maruti,Swift,VXI BSIII,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [None]:
# Target: the selling price (copied so it is independent of `df`)
y = df["selling_price"].copy()

# Features: everything except the target and the "name" / "edition" columns
X = df.drop(columns=["name", "edition", "selling_price"])

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Shapes of the training and test partitions
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(5540, 13) (5540,)
(1386, 13) (1386,)


In [None]:
# Names of the numeric feature columns; these are the columns handed to the
# "num" branch of the ColumnTransformer
num_cols = X_train.select_dtypes(include="number").columns
num_cols

Index(['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp',
       'torque_nm', 'seats'],
      dtype='object')

In [None]:
# Names of the non-numeric feature columns; these are the columns handed to
# the "obj" branch of the ColumnTransformer
obj_cols = X_train.select_dtypes(exclude="number").columns
obj_cols

Index(['company', 'model', 'owner', 'fuel', 'seller_type', 'transmission'], dtype='object')

## Demo

In [None]:
def objective(trial):
  """

  Description:
  ------------
  Objective function for the Optuna study. Each call samples one
  configuration of the preprocessing steps and of the random forest,
  assembles the corresponding pipeline and evaluates it with 5-fold
  cross-validation on the training data

  Parameters:
  -----------
  trial: optuna.trial
         Instance of an optuna trial used to sample the hyperparameters

  Returns:
  --------
  score: float
         Mean R2-score over the 5 cross-validation folds

  Notes:
  ------
  Reads `X_train`, `y_train`, `num_cols` and `obj_cols` from the
  notebook's global scope

  """
  # ---- sampled choices for the numeric columns ----
  num_imputing_strategy = trial.suggest_categorical("num_imputer",
                                                    ["mean", "median", "constant"])
  # a fill value is only supplied when the "constant" strategy is sampled
  num_fill_value = -1 if num_imputing_strategy == "constant" else None

  num_add_indicator = trial.suggest_categorical("num_indicator", [True, False])

  num_scaling_strategy = trial.suggest_categorical("scaler",
                                                   ["std", "norm", None])

  # ---- sampled choices for the non-numeric columns ----
  obj_imputing_strategy = trial.suggest_categorical("obj_imputer",
                                                    ["most_frequent", "constant"])
  # a fill value is only supplied when the "constant" strategy is sampled
  obj_fill_value = "other" if obj_imputing_strategy == "constant" else None

  obj_add_indicator = trial.suggest_categorical("obj_indicator", [True, False])

  obj_encoding_strategy = trial.suggest_categorical("encoder",
                                                    ["ordinal", "one-hot"])

  # ---- numeric branch: imputer followed by an optional scaler ----
  num_imputer = SimpleImputer(strategy=num_imputing_strategy,
                              fill_value=num_fill_value,
                              add_indicator=num_add_indicator)

  if num_scaling_strategy == "std":
    scaler = StandardScaler()
  elif num_scaling_strategy == "norm":
    scaler = MinMaxScaler()
  else:
    # no scaling sampled: keep the step but let the data pass through unchanged
    scaler = "passthrough"

  # ---- non-numeric branch: imputer followed by an encoder ----
  obj_imputer = SimpleImputer(strategy=obj_imputing_strategy,
                              fill_value=obj_fill_value,
                              add_indicator=obj_add_indicator)

  if obj_encoding_strategy == "ordinal":
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                             unknown_value=-1)
  else:
    # "one-hot" is the only other value in the search space
    encoder = OneHotEncoder(handle_unknown="ignore")

  num_pipe = Pipeline(steps=[("imputer", num_imputer),
                             ("scaler", scaler)])

  obj_pipe = Pipeline(steps=[("imputer", obj_imputer),
                             ("encoder", encoder)])

  preprocessor = ColumnTransformer(transformers=[
      ("num", num_pipe, num_cols),
      ("obj", obj_pipe, obj_cols)
  ])

  # ---- sampled choices for the random forest ----
  rf_num_estimators = trial.suggest_int("rf_num_estimators", 10, 121)
  rf_max_depth = trial.suggest_int("rf_max_depth", 1, 7)
  rf_criterion = trial.suggest_categorical("rf_criterion",
                                           ["squared_error",
                                            "absolute_error",
                                            "friedman_mse",
                                            "poisson"])
  rf_max_features = trial.suggest_categorical("rf_max_features",
                                              ["sqrt",
                                               "log2",
                                               "none"])
  # the string "none" stands in for max_features=None in the search space
  max_features = None if rf_max_features == "none" else rf_max_features

  model = Pipeline(steps=[("pre", preprocessor),
                          ("reg", RandomForestRegressor(n_estimators=rf_num_estimators,
                                                        criterion=rf_criterion,
                                                        max_depth=rf_max_depth,
                                                        max_features=max_features,
                                                        random_state=42))])

  # the preprocessing is part of the pipeline, so it is re-fitted inside each fold
  scores = cross_val_score(model,
                           X_train,
                           y_train,
                           cv=5,
                           scoring="r2")

  return scores.mean()

In [None]:
# Seed the sampler so that re-running the notebook explores the same sequence
# of trials; the Final Model section hard-codes the best parameters of a run,
# which is only meaningful if the search can be reproduced.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=42))

# Run 20 trials of the objective; higher mean R2-score is better
study.optimize(objective, n_trials=20)

[I 2024-01-15 05:26:41,057] A new study created in memory with name: no-name-41859aed-36dc-41a4-91d3-96507d4d2f52
[I 2024-01-15 05:26:41,622] Trial 0 finished with value: 0.1843341889524104 and parameters: {'num_imputer': 'mean', 'num_indicator': False, 'scaler': 'norm', 'obj_imputer': 'most_frequent', 'obj_indicator': False, 'encoder': 'one-hot', 'rf_num_estimators': 35, 'rf_max_depth': 1, 'rf_criterion': 'squared_error', 'rf_max_features': 'sqrt'}. Best is trial 0 with value: 0.1843341889524104.
[I 2024-01-15 05:26:43,605] Trial 1 finished with value: 0.38915391118640247 and parameters: {'num_imputer': 'mean', 'num_indicator': False, 'scaler': None, 'obj_imputer': 'most_frequent', 'obj_indicator': True, 'encoder': 'one-hot', 'rf_num_estimators': 73, 'rf_max_depth': 1, 'rf_criterion': 'poisson', 'rf_max_features': 'none'}. Best is trial 1 with value: 0.38915391118640247.
[I 2024-01-15 05:26:44,836] Trial 2 finished with value: 0.7351381608267389 and parameters: {'num_imputer': 'median

In [None]:
# Best objective value found by the study (the objective returns the mean cross-validated R2-score)
study.best_value

0.8772612816309966

In [None]:
# Hyperparameters of the best trial, keyed by the names used in the trial.suggest_* calls
study.best_params

{'num_imputer': 'median',
 'num_indicator': True,
 'scaler': 'std',
 'obj_imputer': 'most_frequent',
 'obj_indicator': False,
 'encoder': 'ordinal',
 'rf_num_estimators': 72,
 'rf_max_depth': 7,
 'rf_criterion': 'poisson',
 'rf_max_features': 'none'}

In [None]:
# Trial history as a table: trial numbers shifted to start at 1,
# rows ordered from the highest to the lowest R2-score

(
    study
      .trials_dataframe()
      .rename(columns={"number": "trial", "value": "r2_score"})
      .assign(trial=lambda trials_: trials_["trial"] + 1)
      .sort_values("r2_score", ascending=False)
)

Unnamed: 0,trial,r2_score,datetime_start,datetime_complete,duration,params_encoder,params_num_imputer,params_num_indicator,params_obj_imputer,params_obj_indicator,params_rf_criterion,params_rf_max_depth,params_rf_max_features,params_rf_num_estimators,params_scaler,state
13,14,0.877261,2024-01-15 05:29:08.896924,2024-01-15 05:29:12.831304,0 days 00:00:03.934380,ordinal,median,True,most_frequent,False,poisson,7,none,72,std,COMPLETE
19,20,0.876909,2024-01-15 05:29:28.875856,2024-01-15 05:29:34.036879,0 days 00:00:05.161023,ordinal,median,True,constant,False,poisson,7,none,82,std,COMPLETE
15,16,0.876351,2024-01-15 05:29:17.394863,2024-01-15 05:29:20.916238,0 days 00:00:03.521375,ordinal,median,True,constant,False,poisson,7,none,64,std,COMPLETE
16,17,0.875486,2024-01-15 05:29:20.918910,2024-01-15 05:29:23.686307,0 days 00:00:02.767397,ordinal,median,True,constant,False,poisson,7,none,49,std,COMPLETE
14,15,0.87453,2024-01-15 05:29:12.834154,2024-01-15 05:29:17.392663,0 days 00:00:04.558509,ordinal,median,True,constant,False,friedman_mse,7,none,66,std,COMPLETE
18,19,0.861565,2024-01-15 05:29:25.517527,2024-01-15 05:29:28.869451,0 days 00:00:03.351924,ordinal,mean,True,constant,False,poisson,6,none,57,std,COMPLETE
17,18,0.858638,2024-01-15 05:29:23.688313,2024-01-15 05:29:25.515227,0 days 00:00:01.826914,ordinal,median,True,constant,False,poisson,7,log2,78,std,COMPLETE
8,9,0.8011,2024-01-15 05:28:13.979208,2024-01-15 05:28:21.158921,0 days 00:00:07.179713,one-hot,median,True,most_frequent,False,friedman_mse,4,none,109,std,COMPLETE
12,13,0.799555,2024-01-15 05:29:05.403143,2024-01-15 05:29:08.894447,0 days 00:00:03.491304,ordinal,constant,True,most_frequent,False,poisson,4,none,97,std,COMPLETE
5,6,0.79951,2024-01-15 05:28:09.327213,2024-01-15 05:28:12.609812,0 days 00:00:03.282599,ordinal,constant,False,most_frequent,True,poisson,4,none,95,norm,COMPLETE


## Final Model

In [None]:
# Numeric columns: median imputation with missing-value indicators, then standardisation
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
    ("scaler", StandardScaler()),
])

# Non-numeric columns: most-frequent imputation, then ordinal encoding
# (categories unseen during fit are encoded as -1)
obj_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent", add_indicator=False)),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value",
                               unknown_value=-1)),
])

# Route each group of columns to its own preprocessing pipeline
preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("obj", obj_pipe, obj_cols),
])

# Random forest configured with fixed hyperparameter values
model = Pipeline([
    ("pre", preprocessor),
    ("reg", RandomForestRegressor(
        n_estimators=72,
        criterion="poisson",
        max_depth=7,
        max_features=None,
        random_state=42,
    )),
])

# Fit on the training split only, then predict on both splits
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Compare train and test R2 to see how well the model generalises
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"{'Train R2-score':15}: {train_r2}")
print(f"{'Test R2-score':15}: {test_r2}")

Train R2-score : 0.9467171943166416
Test R2-score  : 0.9158974709528042
