# In this project I worked on

- Running hyperparameter tuning while training a model
- Logging every hyperparameter and metric in the MLflow UI
- Comparing the results of the different runs in the MLflow UI
- Choosing the best model and registering it in the MLflow Model Registry


In [26]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
from urllib.parse import urlparse
from mlflow.models import infer_signature


In [27]:
# Load the California housing dataset; as_frame=True requests pandas objects
# and return_X_y=True returns the (features, target) pair directly.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across re-runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Input/output schema for the logged model, inferred from the training
# features (input) and the training targets (output).
signature = infer_signature(model_input=X_train, model_output=y_train)

In [28]:
# Hyperparameter tuning using grid-search cross-validation
def hyperparameter_tuning(X_train, y_train, param_grid, random_state=42):
    """Grid-search a RandomForestRegressor and return the fitted search object.

    Parameters
    ----------
    X_train : training features passed to ``GridSearchCV.fit``.
    y_train : training targets passed to ``GridSearchCV.fit``.
    param_grid : dict mapping RandomForestRegressor parameter names to the
        list of values to try.
    random_state : seed forwarded to the RandomForestRegressor so repeated
        searches on the same data give the same result. Pass ``None`` to get
        the previous unseeded behaviour.

    Returns
    -------
    The fitted ``GridSearchCV`` instance. Callers read ``best_params_`` and
    ``best_estimator_`` from it.
    """
    # Seed the forest: an unseeded estimator makes the selected parameters and
    # downstream metrics vary from run to run, so logged runs are not comparable.
    rf = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,                               # 5-fold cross-validation per candidate
        n_jobs=-1,                          # use all available cores
        verbose=2,
        scoring='neg_mean_squared_error',   # negated MSE, so higher is better
    )
    grid_search.fit(X_train, y_train)
    return grid_search

In [37]:
# Perform hyperparameter tuning for the random forest regressor and record
# the outcome (parameters, metric and model) in a single MLflow run.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Select the tracking server BEFORE opening the run. If this is done after
# the run, a fresh kernel sends the parameters and metric to the default
# tracking location while the model goes to the server.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
track_uri_type = urlparse(mlflow.get_tracking_uri()).scheme

with mlflow.start_run():
    gridsearch = hyperparameter_tuning(X_train, y_train, param_grid)
    mlflow.log_params(gridsearch.best_params_)

    # Get the best model
    best_model = gridsearch.best_estimator_

    # Evaluate the best model on the held-out test split
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # Log the winning hyperparameters under explicit "best ..." keys as well.
    # These duplicate log_params above; the key names are kept unchanged so
    # runs logged earlier with these keys stay comparable in the UI.
    mlflow.log_param('best n_estimators', gridsearch.best_params_['n_estimators'])
    mlflow.log_param('best max depth', gridsearch.best_params_['max_depth'])
    mlflow.log_param('best min_samples_split', gridsearch.best_params_['min_samples_split'])
    mlflow.log_param('best min_samples_leaf', gridsearch.best_params_['min_samples_leaf'])

    # Log the evaluation metric
    mlflow.log_metric('mse', mse)

    # Log the model INSIDE the run so it is attached to the same run as its
    # parameters and metric. Calling log_model with no active run makes
    # MLflow open a new run implicitly and leave it open.
    if track_uri_type != 'file':
        # Non-file tracking URI: also register the model. The signature is
        # passed here too so the registered version carries its schema.
        mlflow.sklearn.log_model(
            best_model,
            "Best Model",
            registered_model_name="Best Random Forest Model",
            signature=signature,
        )
    else:
        # File-based tracking URI: log the model without registering it.
        mlflow.sklearn.log_model(best_model, "Best Model", signature=signature)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
<class 'dict'>
🏃 View run aged-pig-726 at: http://127.0.0.1:5000/#/experiments/0/runs/453d6e990d6f40fb8f5625a54badf4ba
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Registered model 'Best Random Forest Model' already exists. Creating a new version of this model...
2024/12/29 12:50:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Random Forest Model, version 2
Created version '2' of model 'Best Random Forest Model'.


In [32]:
# Close any MLflow run that is still active in this kernel, e.g. one opened
# implicitly by a logging call made outside a `with mlflow.start_run()` block.
# NOTE(review): expected to be a no-op when no run is active; confirm against
# the installed MLflow version.
mlflow.end_run()