# Testing the best model: Hyperopt tuning of XGBRegressor with MLflow tracking

In [1]:
import os 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error,r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from urllib.parse import urlparse
import mlflow 
import mlflow.sklearn
import numpy as np
from SurgeSense.constants import * 
from SurgeSense.utils.common import read_yaml, create_directories

In [2]:
# Show the kernel's current working directory before changing it.
%pwd 

'd:\\pythonProjects\\SurgeSense\\research'

In [3]:
import os 
os.chdir('../')
%pwd

'd:\\pythonProjects\\SurgeSense'

In [4]:
from dataclasses import dataclass

@dataclass(frozen=True)
class HyperOptParamsXGBoost:
    """Immutable settings consumed by the XGBoost Hyperopt training step.

    The three range fields are indexed with ``[0]`` (lower bound) and ``[1]``
    (upper bound) when the Hyperopt search space is built.
    """
    # Not read by the code in this notebook; presumably the artifact folder - TODO confirm.
    root_dir: Path
    # CSV file loaded as the training set.
    train_data_path: Path 
    # CSV file loaded as the evaluation set.
    test_data_path: Path
    # Not read by the code in this notebook; presumably the saved model's file name - TODO confirm.
    model_name: str
    # [low, high] integer bounds for the number of boosting rounds.
    n_estimators: list 
    # [low, high] integer bounds for the tree depth.
    max_depth: list 
    # [low, high] bounds for the learning rate (was wrongly annotated as str).
    learning_rate: list 
    # Name of the column split off the features and used as the regression target.
    target_column: str

In [5]:
# configuration 
class HyperOptParamsConfigManagerXGBoost:
    """Reads the project YAML files and assembles the XGBoost Hyperopt settings."""

    def __init__(
            self,
            config_file_path=CONFIG_FILE_PATH,
            params_file_path=PARAMS_FILE_PATH,
            schema_file_path=SCHEMA_FILE_PATH,
            ):
        """Load the config, params and schema YAML files via ``read_yaml``.

        Args:
            config_file_path: YAML holding the ``model_trainer`` section.
            params_file_path: YAML holding ``Hyperopt_params.XGBoostRegressor``.
            schema_file_path: YAML holding the ``TARGET_COLUMN`` section.
        """
        # read_yaml results are accessed with attribute syntax below, so they
        # are assumed to support dotted access - confirm against read_yaml.
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

    def get_hyperopt_config(self) -> HyperOptParamsXGBoost:
        """Collect paths, search ranges and the target column into one object.

        Returns:
            HyperOptParamsXGBoost populated from the three loaded YAML files.
        """
        trainer_section = self.config.model_trainer
        search_ranges = self.params.Hyperopt_params.XGBoostRegressor
        target_section = self.schema.TARGET_COLUMN

        field_values = {
            'root_dir': trainer_section.root_dir,
            'train_data_path': trainer_section.train_data_path,
            'test_data_path': trainer_section.test_data_path,
            'model_name': trainer_section.model_name,
            'n_estimators': search_ranges.n_estimators,
            'max_depth': search_ranges.max_depth,
            'learning_rate': search_ranges.learning_rate,
            'target_column': target_section.name,
        }
        return HyperOptParamsXGBoost(**field_values)

In [6]:
# components
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials
import dagshub
from functools import partial 
import mlflow
import pandas as pd 
import os 
from SurgeSense import logger
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import joblib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd 
import numpy as np

class hyperOptTraining:
    """Tune an XGBRegressor pipeline with Hyperopt and track each trial in MLflow.

    Typical use: call ``train`` to run the search, then pass its two return
    values to ``register_best_model`` to log and register the lowest-loss
    pipeline.
    """

    def __init__(self, config: HyperOptParamsXGBoost = HyperOptParamsXGBoost):
        # NOTE(review): the default is the HyperOptParamsXGBoost class object,
        # not an instance; train() needs a populated instance. The default is
        # kept unchanged so existing callers are unaffected.
        self.config = config

    def create_pipeline(self):
        """Build an unfitted preprocessing + XGBRegressor pipeline.

        Numerical columns are median-imputed and standardized; categorical
        columns are imputed with the most frequent value and one-hot encoded
        (categories unseen during fit are ignored at transform time).

        Returns:
            sklearn ``Pipeline`` with the steps ``preprocessor`` and ``model``.
        """
        # Column lists are hard-coded; the input frames must contain all of them.
        categorical_columns = ['cab_type', 'destination', 'source', 'name']
        numerical_columns = ['distance', 'surge_multiplier', 'temp', 'clouds', 'pressure',
                             'rain', 'humidity', 'wind', 'day', 'hour', 'month']

        # Step names are part of the parameter keys logged to MLflow, so they
        # are left exactly as they were.
        numerical_preprocessor = Pipeline(
            steps=[
                ('imputation_menu', SimpleImputer(missing_values=np.nan, strategy='median')),
                ('scalar', StandardScaler())
            ]
        )

        categorical_preprocessor = Pipeline(
            steps=[
                ('imputation_constant', SimpleImputer(strategy='most_frequent')),
                ('encode', OneHotEncoder(handle_unknown='ignore'))
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ('categorical_columns', categorical_preprocessor, categorical_columns),
                ('numerical_columns', numerical_preprocessor, numerical_columns)
            ]
        )

        pipe = Pipeline(
            steps=[
                ('preprocessor', preprocessor),
                ('model', XGBRegressor())
            ]
        )

        return pipe

    def evaluation_metrics(self, actual, pred):
        """Return ``(rmse, mae, r2)`` for the given true and predicted targets."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    def objective(self, params, xtrain, ytrain, xtest, ytest):
        """Hyperopt objective: fit one candidate pipeline and report its RMSE.

        Each call opens its own MLflow run and logs the pipeline parameters
        and the evaluation metrics to it.

        Args:
            params: Pipeline parameters sampled by Hyperopt, keyed with the
                ``model__<name>`` convention used by ``Pipeline.set_params``.
            xtrain, ytrain: Features and target used for fitting.
            xtest, ytest: Features and target used for scoring.

        Returns:
            dict with ``loss`` (RMSE on the test split), ``status`` and the
            fitted pipeline under ``model``.
        """
        with mlflow.start_run():
            mlflow.set_tag('model', 'XGBoostRegressor')

            pipe = self.create_pipeline()
            model = pipe.set_params(**params)
            model.fit(xtrain, ytrain)
            ypred = model.predict(xtest)
            mlflow.log_params(model.get_params())
            rmse, mae, r2 = self.evaluation_metrics(ytest, ypred)
            # The MAE value used to be logged under the misleading key 'mse'.
            mlflow.log_metrics({'rmse': rmse, 'mae': mae, 'r2': r2})
        return {'loss': rmse, 'status': STATUS_OK, 'model': model}

    def train(self, max_evals=10, sample_size=1000):
        """Run the Hyperopt search over the XGBoost hyperparameters.

        Loads the train/test CSV files named in the config, separates the
        target column, points MLflow at the DagsHub repository and minimizes
        the test RMSE with TPE.

        Args:
            max_evals: Number of Hyperopt trials to evaluate (default 10).
            sample_size: Only the first ``sample_size`` rows of each split are
                used, which keeps the search fast (default 1000). Pass
                ``None`` to use every row.

        Returns:
            Tuple ``(best_results, trials)``: the best parameter values found
            by ``fmin`` and the ``Trials`` object holding every evaluation.
        """
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        xtrain = train_data.drop([self.config.target_column], axis=1)
        xtest = test_data.drop([self.config.target_column], axis=1)
        ytrain = train_data[[self.config.target_column]]
        ytest = test_data[[self.config.target_column]]

        # Every range in the config is a [low, high] pair.
        search_space = {
            'model__n_estimators': hp.uniformint('n_estimators', self.config.n_estimators[0], self.config.n_estimators[1]),
            'model__max_depth': hp.uniformint('max_depth', self.config.max_depth[0], self.config.max_depth[1]),
            'model__learning_rate': hp.uniform('learning_rate', self.config.learning_rate[0], self.config.learning_rate[1])
        }
        dagshub.init(repo_owner='Immortal-Pi', repo_name='SurgeSense', mlflow=True)
        experiment_name = 'hyperopt_test_xgboostregressor_algorithm'
        existing_experiment = mlflow.get_experiment_by_name(experiment_name)

        # Reuse the experiment when it already exists; creating it twice fails.
        if existing_experiment is None:
            experiment_id = mlflow.create_experiment(name=experiment_name, artifact_location='hyperopt-test')
        else:
            experiment_id = existing_experiment.experiment_id
        mlflow.set_experiment(experiment_id=experiment_id)

        trials = Trials()
        best_results = fmin(
            fn=partial(
                self.objective,
                # Slicing with [:None] keeps the full frame.
                xtrain=xtrain[:sample_size],
                ytrain=ytrain[:sample_size],
                xtest=xtest[:sample_size],
                ytest=ytest[:sample_size]
            ),
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials
        )
        return best_results, trials

    def register_best_model(self, best_results, trials):
        """Log the lowest-loss pipeline to MLflow and register it as ``best_model``.

        Args:
            best_results: Return value of ``fmin``; accepted for interface
                compatibility but not read here.
            trials: ``Trials`` object from ``train``; each result must carry
                the ``loss`` and ``model`` entries produced by ``objective``.
        """
        best_index = np.argmin([trial['result']['loss'] for trial in trials.trials])
        best_model = trials.trials[best_index]['result']['model']

        # One name for both the logging call and the registry URI. They used to
        # differ ('model' vs 'best_model'), so the registered version pointed
        # at an artifact path that was never written.
        artifact_path = 'model'
        with mlflow.start_run() as run:
            mlflow.sklearn.log_model(best_model, artifact_path=artifact_path)
            mlflow.log_params(trials.trials[best_index]['misc']['vals'])
            model_uri = f'runs:/{run.info.run_id}/{artifact_path}'
            mlflow.register_model(model_uri=model_uri, name='best_model')


In [7]:
# pipeline 
# Full flow: load the configuration, run the Hyperopt search, register the winner.
# Variable names (including the misspelled `trails`) are kept because they are
# notebook globals that other cells may rely on.
try: 
    config=HyperOptParamsConfigManagerXGBoost()
    hyperopt_config=config.get_hyperopt_config()
    hyperopt_config_training=hyperOptTraining(config=hyperopt_config)
    best_results,trails=hyperopt_config_training.train()
    hyperopt_config_training.register_best_model(best_results,trails)
except Exception as e:
    # Record the failure in the project log, then re-raise with a bare `raise`
    # so the original traceback is shown unchanged.
    logger.exception(e)
    raise

[2025-03-24 12:03:16,532: INFO :common : yaml file: config\config.yaml loaded successfully]
[2025-03-24 12:03:16,536: INFO :common : yaml file: params.yaml loaded successfully]
[2025-03-24 12:03:16,545: INFO :common : yaml file: schema.yaml loaded successfully]
[2025-03-24 12:03:19,054: INFO :_client : HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-03-24 12:03:19,061: INFO :helpers : Accessing as Immortal-Pi]
[2025-03-24 12:03:19,218: INFO :_client : HTTP Request: GET https://dagshub.com/api/v1/repos/Immortal-Pi/SurgeSense "HTTP/1.1 200 OK"]
[2025-03-24 12:03:19,391: INFO :_client : HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-03-24 12:03:19,397: INFO :helpers : Initialized MLflow to track repo "Immortal-Pi/SurgeSense"]


[2025-03-24 12:03:19,399: INFO :helpers : Repository Immortal-Pi/SurgeSense initialized!]
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?][2025-03-24 12:03:19,811: INFO :tpe : build_posterior_wrapper took 0.001009 seconds]
[2025-03-24 12:03:19,812: INFO :tpe : TPE using 0 trials]
🏃 View run grandiose-goose-220 at: https://dagshub.com/Immortal-Pi/SurgeSense.mlflow/#/experiments/2/runs/28323b746b7047b1b6c4e8caa9669671

🧪 View experiment at: https://dagshub.com/Immortal-Pi/SurgeSense.mlflow/#/experiments/2

 10%|█         | 1/10 [00:01<00:15,  1.77s/trial, best loss: 3.310175529965831][2025-03-24 12:03:21,579: INFO :tpe : build_posterior_wrapper took 0.001998 seconds]
[2025-03-24 12:03:21,579: INFO :tpe : TPE using 1/1 trials with best loss 3.310176]
🏃 View run silent-jay-646 at: https://dagshub.com/Immortal-Pi/SurgeSense.mlflow/#/experiments/2/runs/afe50acf7bb64a9ab1c5b9ac5df81b59

🧪 View experiment at: https://dagshub.com/Immortal-Pi/SurgeSense.mlflow/#/experiments/2

 20%|██     

Registered model 'best_model' already exists. Creating a new version of this model...
2025/03/24 12:05:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best_model, version 8
Created version '8' of model 'best_model'.


🏃 View run likeable-shoat-199 at: https://dagshub.com/Immortal-Pi/SurgeSense.mlflow/#/experiments/2/runs/7891c68275b94edfb9512de635cba2ea
🧪 View experiment at: https://dagshub.com/Immortal-Pi/SurgeSense.mlflow/#/experiments/2
