In [105]:
import mlflow
import mlflow.sklearn #(for MLflow Models and auto-logging)
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import urllib.request

In [106]:
csv_url = 'https://raw.githubusercontent.com/MahreenAthar/MLOps_Project/main/heart_failure_clinical_records_dataset.csv'

In [107]:
urllib.request.urlretrieve(csv_url, 'heart_failure_clinical_records_dataset.csv')

('heart_failure_clinical_records_dataset.csv',
 <http.client.HTTPMessage at 0x1acae9294c0>)

In [108]:
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [109]:
# Fill missing values with zeros
df_filled = df.fillna(0)

In [110]:
# Split the dataset into X and y
X = df_filled.drop('DEATH_EVENT', axis=1)
y = df_filled['DEATH_EVENT']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [111]:
#Set the MLflow tracking URI to point to your local instance of the MLflow server
mlflow.set_tracking_uri("http://localhost:5000")

In [112]:
#Get the current tracking uri
mlflow.get_tracking_uri()

'http://localhost:5000'

In [114]:
#create a new experiment
mlflow.create_experiment("mlops_project_heart")

'158188459553538204'

In [115]:
#Create a new MLflow run (Parent run)
mlflow.start_run(
    run_name="project_heart",
    experiment_id='158188459553538204',
    description="Random Forest Regressor for heart failure dataset of mlops project")

<ActiveRun: >

In [116]:
#Enable auto-logging
mlflow.sklearn.autolog()

In [117]:
# Define hyperparameters to search
n_estimators_list = [50, 100, 200]       #number of trees
max_depth_list = [3, 5, 10]              #number of splits

In [118]:
# Create nested runs for each combination of hyperparameters
for n_estimators in n_estimators_list:
    for max_depth in max_depth_list:
        # Start nested MLflow run
        with mlflow.start_run(nested=True) as run:      #child of existing parent run
            # Train RandomForestRegressor with current hyperparameters
            model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
            model.fit(X_train, y_train)
            
            #Evaluate the performance of the trained RandomForestRegressor on the test dataset
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
                        
            #Log evaluation metric
            mlflow.log_metrics({'mse': mse})
                        
            # Log model and hyperparameters
            mlflow.sklearn.log_model(model, "heart_rfr_model")
            mlflow.log_params({"n_estimators": n_estimators, "max_depth": max_depth})



In [119]:
# Search for the best run
best_run = mlflow.search_runs(experiment_ids='158188459553538204', order_by=[f"metrics.mse DESC"], max_results=1)

# Get the run ID for the best run
run_id = best_run.iloc[0]["run_id"]
print(run_id)

a74189c590a744e98783c00d68f0cb27


In [120]:
# Register best model with MLflow Model Registry
mlflow.register_model(f"runs:/{run_id}/heart_rfr_model", "project_heart_model")

Registered model 'project_heart_model' already exists. Creating a new version of this model...
2023/06/11 19:22:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: project_heart_model, version 2
Created version '2' of model 'project_heart_model'.


<ModelVersion: creation_timestamp=1686493342035, current_stage='None', description='', last_updated_timestamp=1686493342035, name='project_heart_model', run_id='a74189c590a744e98783c00d68f0cb27', run_link='', source='mlflow-artifacts:/158188459553538204/a74189c590a744e98783c00d68f0cb27/artifacts/heart_rfr_model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [121]:
mlflow.end_run()