<h1>Part 4 - Experiment Tracking</h1>

# Experiment Tracking and Model Management with MLFlow

In [19]:

import mlflow
import pickle

# Show which tracking backend MLflow is currently pointed at.
tracking_uri = mlflow.get_tracking_uri()
print(f"tracking URI: '{tracking_uri}'")

tracking URI: 'file:///g:/My%20Drive/MSc%20Computer%20Science/MLOps/project/project-titanic/mlruns'


After this initialization, we can create a client to connect to the API and see which experiments are present.

By referring to MLflow's [documentation](https://mlflow.org/docs/latest/python_api/mlflow.client.html), create a client and display a list of the available experiments using the `search_experiments` function. This function could prove useful later to programmatically explore experiments (rather than in the UI).

In [14]:
from mlflow.tracking import MlflowClient
# Build a tracking client (using the class imported above) and list the
# experiments it can see.
client = MlflowClient()
experiments = client.search_experiments()
print(experiments)

[<Experiment: artifact_location='file:///g:/My%20Drive/MSc%20Computer%20Science/MLOps/project/project-titanic/mlruns/0', creation_time=1736026881822, experiment_id='0', last_update_time=1736026881822, lifecycle_stage='active', name='Default', tags={}>]


We see that there is a default experiment for which the runs are stored locally in the mlruns folder.

# Use Case

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from typing import List
from scipy.sparse import csr_matrix

## 1 - Load data

In [2]:
# Folder that holds the input CSV, relative to the working directory.
DATA_FOLDER = "data"

# Read the dataset into a DataFrame.
train_df = pd.read_csv(f"{DATA_FOLDER}/train_and_test2.csv")

## 2 - Prepare the data

### 2-1 Clean Data

In [3]:
# Columns kept for the rest of the notebook; '2urvived' is the column later
# read as the target by extract_x_y.
selected_cols = ['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked', '2urvived']
train_df = train_df.loc[:, selected_cols]

In [4]:
# Remove every row that contains at least one missing value, then summarise
# the remaining frame.
train_df = train_df.dropna(axis=0, how="any")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       1307 non-null   float64
 1   Fare      1307 non-null   float64
 2   Sex       1307 non-null   int64  
 3   sibsp     1307 non-null   int64  
 4   Parch     1307 non-null   int64  
 5   Pclass    1307 non-null   int64  
 6   Embarked  1307 non-null   float64
 7   2urvived  1307 non-null   int64  
dtypes: float64(3), int64(5)
memory usage: 91.9 KB


### 2-2 Prepare features

In [5]:
# Hold out 20% of the rows for testing (fixed seed) and re-wrap each split in
# a fresh DataFrame object.
train_df, test_df = (
    pd.DataFrame(part)
    for part in train_test_split(train_df, test_size=0.2, random_state=42)
)

#### 2-2-1 Categorical features

In [6]:
CATEGORICAL_COLS = ['Pclass', 'Sex', 'Embarked']


def encode_categorical_cols(df, categorical_cols=None):
    """Label-encode the categorical columns of a DataFrame.

    A separate ``LabelEncoder`` is fitted for each column so the same mapping
    can be re-applied to other data later.

    Args:
        df: DataFrame containing the columns to encode. It is not modified.
        categorical_cols: Names of the columns to encode. Defaults to the
            module-level ``CATEGORICAL_COLS``.

    Returns:
        A tuple ``(encoded_df, label_encoders)`` where ``encoded_df`` is a copy
        of ``df`` with the selected columns replaced by their integer codes and
        ``label_encoders`` maps each column name to its fitted encoder.
    """
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_COLS

    # Work on a copy: assigning into the caller's frame would make the cell
    # non-idempotent (a re-run would encode already-encoded values) and can
    # trigger SettingWithCopyWarning when ``df`` is a slice of another frame.
    df = df.copy()

    label_encoders = {}
    for col in categorical_cols:
        encoder = LabelEncoder()
        # Missing values get their own "Unknown" category before encoding.
        df[col] = encoder.fit_transform(df[col].fillna("Unknown"))
        label_encoders[col] = encoder

    return df, label_encoders


train_df, encoders = encode_categorical_cols(train_df)

In [7]:
def extract_x_y(df, categorical_cols=None, dv=None, with_target=True, target_col="2urvived"):
    """Build the feature matrix (and optionally the target vector) from ``df``.

    Each row's categorical columns are turned into a dict and passed through a
    ``DictVectorizer``.

    Args:
        df: DataFrame holding the feature columns and, when ``with_target`` is
            true, the target column.
        categorical_cols: Feature column names. Defaults to
            ``['Pclass', 'Sex', 'Embarked']``.
        dv: An already-fitted vectorizer to reuse. When ``None`` and
            ``with_target`` is true, a new ``DictVectorizer`` is fitted on
            ``df``.
        with_target: Whether to also return the target values.
        target_col: Name of the target column.

    Returns:
        A tuple ``(x, y, dv)``: the transformed features, the target values
        (``None`` when ``with_target`` is false) and the vectorizer used.

    Raises:
        ValueError: If ``with_target`` is false and no ``dv`` is supplied.
            Without a target this is a prediction-time call, which must reuse
            the vectorizer fitted during training.
    """
    if categorical_cols is None:
        categorical_cols = ['Pclass', 'Sex', 'Embarked']

    # Previously this combination fell through to ``None.transform(...)`` and
    # failed with an opaque AttributeError.
    if dv is None and not with_target:
        raise ValueError(
            "extract_x_y needs a fitted `dv` when called with with_target=False"
        )

    dicts = df[categorical_cols].to_dict(orient="records")

    if dv is None:
        dv = DictVectorizer()
        dv.fit(dicts)

    y = df[target_col].values if with_target else None
    x = dv.transform(dicts)
    return x, y, dv

# No vectorizer is passed in, so one is fitted on the training rows and
# returned alongside the training features and labels.
X_train, y_train, dv = extract_x_y(train_df, dv=None, with_target=True)

## 3 - Train model

In [8]:
# Random Forest
def train_model(
    X_train: csr_matrix,
    y_train: np.ndarray,
    n_estimators: int = 100,
    max_depth: int = 10,
    random_state: int = 42,
):
    """Fit a ``RandomForestClassifier`` on the training data.

    The hyperparameters are keyword arguments (defaults match the original
    hard-coded values) so different configurations can be trained without
    redefining this function.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        n_estimators: Passed to ``RandomForestClassifier``.
        max_depth: Passed to ``RandomForestClassifier``.
        random_state: Passed to ``RandomForestClassifier``.

    Returns:
        The fitted classifier.
    """
    model_rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model_rf.fit(X_train, y_train)
    return model_rf


model = train_model(X_train, y_train)

## 4 - Evaluate model

We evaluate the model on train and test data

### 4-1 On train data

In [9]:
def predict_survival(input_data: csr_matrix, model: RandomForestClassifier):
    """Return ``model``'s predictions for ``input_data``."""
    predictions = model.predict(input_data)
    return predictions


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the accuracy score of ``y_pred`` measured against ``y_true``."""
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy


# Score the model on the same rows it was fitted on.
prediction = predict_survival(input_data=X_train, model=model)
train_acc = evaluate_model(y_true=y_train, y_pred=prediction)
train_acc

0.7961722488038278

### 4-2 On test data

In [10]:
def transform_categorical_cols(df, encoders):
    """Apply already-fitted encoders to the matching columns of ``df``.

    Args:
        df: DataFrame containing every column named in ``encoders``. It is not
            modified.
        encoders: Mapping of column name to a fitted encoder exposing
            ``transform``.

    Returns:
        A copy of ``df`` in which each listed column has been replaced by the
        output of its encoder.
    """
    # Work on a copy so the caller's frame keeps its original values; mutating
    # it in place makes a cell re-run encode already-encoded data.
    df = df.copy()
    for col, encoder in encoders.items():
        # Missing values are mapped to the "Unknown" label before encoding.
        df[col] = encoder.transform(df[col].fillna("Unknown"))
    return df

# Encode the held-out rows with the encoders fitted on the training split,
# then vectorise them with the already-fitted DictVectorizer.
test_df = transform_categorical_cols(df=test_df, encoders=encoders)
X_test, y_test, _ = extract_x_y(test_df, dv=dv, with_target=True)

In [11]:
# Score the model on the held-out rows.
y_pred_test = predict_survival(input_data=X_test, model=model)
test_acc = evaluate_model(y_true=y_test, y_pred=y_pred_test)
test_acc

0.7786259541984732

## 5 - Log Model Parameters to MLflow

Now that all our development functions are built and tested, let's create a training pipeline and log the training parameters, logs and model to MLflow.

Create a training flow, log all the important parameters, metrics and model. Try to find what could be important and needs to be logged.

In [23]:
# Set the experiment name
mlflow.set_experiment("titanic-1")
model_name = 'Random Forest V1'

# Hyperparameters for this run, kept in one place so that what is logged is
# exactly what the model is trained with.
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}

# Start a run. The context manager closes the run when the block exits, so no
# explicit mlflow.end_run() call is needed.
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("developer", "LFBV")
    mlflow.set_tag("project", "Titanic Survival Prediction")

    # Log the model hyperparameters
    mlflow.log_params(rf_params)

    # Load and clean data
    DATA_FOLDER = "data"
    train_df = pd.read_csv(DATA_FOLDER + '/train_and_test2.csv')
    selected_cols = ['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked', '2urvived']
    train_df = train_df[selected_cols]
    train_df = train_df.dropna()

    # Train-test split
    train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
    train_df = pd.DataFrame(train_df)
    test_df = pd.DataFrame(test_df)

    # Encode categorical columns - get encoders
    CATEGORICAL_COLS = ['Pclass', 'Sex', 'Embarked']
    train_df, encoders = encode_categorical_cols(train_df)

    # Extract X, y and DV
    X_train, y_train, dv = extract_x_y(train_df)

    # Train model.
    # NOTE: this rebinds the notebook-level `train_model`, as the original
    # cell did, so that state after this cell is unchanged.
    def train_model(X_train: csr_matrix, y_train: np.ndarray):
        """Fit a random forest using this run's `rf_params`."""
        model_rf = RandomForestClassifier(**rf_params)
        model_rf.fit(X_train, y_train)
        return model_rf

    model = train_model(X_train, y_train)

    # Evaluate model on the training set
    prediction = predict_survival(X_train, model)
    train_acc = evaluate_model(y_train, prediction)

    # Transform categorical columns of the test data using the fitted encoders
    test_df = transform_categorical_cols(test_df, encoders)
    X_test, y_test, _ = extract_x_y(test_df, dv=dv)

    # Evaluate model on test set
    y_pred_test = predict_survival(X_test, model)
    test_acc = evaluate_model(y_test, y_pred_test)

    # Log the model. Keep the returned ModelInfo: its `model_uri` is the
    # location the model was actually stored under.
    model_info = mlflow.sklearn.log_model(model, model_name)

    # Accuracies are results of the run, so they are logged as metrics
    # (params are meant for inputs such as hyperparameters).
    mlflow.log_metric("train_accuracy", train_acc)
    mlflow.log_metric("test_accuracy", test_acc)

    # Save encoders and dv to disk so they can be attached to the run
    with open("encoders.pkl", "wb") as f:
        pickle.dump(encoders, f)
    with open("dv.pkl", "wb") as f:
        pickle.dump(dv, f)

    # Log encoders and dv as artifacts
    mlflow.log_artifact("encoders.pkl")
    mlflow.log_artifact("dv.pkl")

    # Register the model in the MLflow model registry. The URI comes from
    # log_model; the previous hard-coded "runs:/<run_id>/models" did not match
    # the artifact path the model was logged under.
    result = mlflow.register_model(model_info.model_uri, "titanic-random-forest-v1")

2025/01/04 23:12:45 INFO mlflow.tracking.fluent: Experiment with name 'titanic-1' does not exist. Creating a new experiment.
Successfully registered model 'titanic-random-forest-v1'.
Created version '1' of model 'titanic-random-forest-v1'.


In [24]:
# Select the experiment explicitly so this cell does not depend on an earlier
# cell having been run in the same kernel session.
mlflow.set_experiment("titanic-1")
model_name = 'Random Forest V2'

# Hyperparameters for this run, kept in one place so that what is logged is
# exactly what the model is trained with.
rf_params = {"n_estimators": 1000, "max_depth": 100, "random_state": 42}

# Start a run. The context manager closes the run when the block exits, so no
# explicit mlflow.end_run() call is needed.
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("developer", "LFBV")
    mlflow.set_tag("project", "Titanic Survival Prediction")

    # Log the model hyperparameters
    mlflow.log_params(rf_params)

    # Load and clean data
    DATA_FOLDER = "data"
    train_df = pd.read_csv(DATA_FOLDER + '/train_and_test2.csv')
    selected_cols = ['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked', '2urvived']
    train_df = train_df[selected_cols]
    train_df = train_df.dropna()

    # Train-test split
    train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
    train_df = pd.DataFrame(train_df)
    test_df = pd.DataFrame(test_df)

    # Encode categorical columns - get encoders
    CATEGORICAL_COLS = ['Pclass', 'Sex', 'Embarked']
    train_df, encoders = encode_categorical_cols(train_df)

    # Extract X, y and DV
    X_train, y_train, dv = extract_x_y(train_df)

    # Train model.
    # NOTE: this rebinds the notebook-level `train_model`, as the original
    # cell did, so that state after this cell is unchanged.
    def train_model(X_train: csr_matrix, y_train: np.ndarray):
        """Fit a random forest using this run's `rf_params`."""
        model_rf = RandomForestClassifier(**rf_params)
        model_rf.fit(X_train, y_train)
        return model_rf

    model = train_model(X_train, y_train)

    # Evaluate model on the training set
    prediction = predict_survival(X_train, model)
    train_acc = evaluate_model(y_train, prediction)

    # Transform categorical columns of the test data using the fitted encoders
    test_df = transform_categorical_cols(test_df, encoders)
    X_test, y_test, _ = extract_x_y(test_df, dv=dv)

    # Evaluate model on test set
    y_pred_test = predict_survival(X_test, model)
    test_acc = evaluate_model(y_test, y_pred_test)

    # Log the model. Keep the returned ModelInfo: its `model_uri` is the
    # location the model was actually stored under.
    model_info = mlflow.sklearn.log_model(model, model_name)

    # Accuracies are results of the run, so they are logged as metrics
    # (params are meant for inputs such as hyperparameters).
    mlflow.log_metric("train_accuracy", train_acc)
    mlflow.log_metric("test_accuracy", test_acc)

    # Save encoders and dv to disk so they can be attached to the run
    with open("encoders.pkl", "wb") as f:
        pickle.dump(encoders, f)
    with open("dv.pkl", "wb") as f:
        pickle.dump(dv, f)

    # Log encoders and dv as artifacts
    mlflow.log_artifact("encoders.pkl")
    mlflow.log_artifact("dv.pkl")

    # Register the model in the MLflow model registry. The URI comes from
    # log_model; the previous hard-coded "runs:/<run_id>/models" did not match
    # the artifact path the model was logged under.
    result = mlflow.register_model(model_info.model_uri, "titanic-random-forest-v2")

Successfully registered model 'titanic-random-forest-v2'.
Created version '1' of model 'titanic-random-forest-v2'.
