This notebook runs the hyperparameter-search experiments for each training dataset and tracks them with MLflow. If you only want the best models found, refer to the `3_model_training` notebook instead.

In [178]:
import os

import mlflow
from sklearn.model_selection import train_test_split
from mlflow.models import infer_signature
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, log_loss
import pandas as pd
import numpy as np
import xgboost as xgb

# Make the project's ../src folder importable; this must run before the
# `data` imports below. `os.sys` is the standard `sys` module reached through `os`.
os.sys.path.append(os.path.abspath('../src'))
from data import loader
from data import preprocessor

# Map identifier used to build the merged CARLA/SUMO data paths further down.
mapname = 'Town01'

In [179]:
def create_and_train_model(model, args):
    """
    Instantiate, fit and evaluate one classifier on the notebook's current split.

    Reads the notebook-level globals ``X_train``/``y_train`` (data used for
    fitting) and ``X_val``/``y_val`` (data used for scoring); they must be
    assigned before this function is called.

    Args:
        model: Estimator class (not an instance). It is called as
            ``model(**args)`` and the result must provide ``fit``, ``predict``
            and ``predict_proba``.
        args: Mapping of constructor keyword arguments. It is copied, so the
            caller's mapping is never modified.

    Returns:
        dict with the keys "f1_score", "accuracy", "recall", "precision",
        "log_loss" (all computed on the validation split; f1/recall/precision
        use weighted averaging) and "model" (the fitted estimator).
    """
    import inspect

    kwargs = dict(args)

    # scikit-learn's SVC only exposes predict_proba when it is constructed with
    # probability=True. Enable it for any estimator that accepts that option so
    # the log-loss computation below does not fail; an explicit value supplied
    # by the caller is left untouched.
    try:
        accepted = inspect.signature(model).parameters
    except (TypeError, ValueError):
        accepted = {}
    if "probability" in accepted:
        kwargs.setdefault("probability", True)

    result = {}
    # Train model with current hyperparameters
    md = model(**kwargs)
    md.fit(X_train, y_train)
    # Predict on the validation set
    y_pred = md.predict(X_val)
    y_pred_proba = md.predict_proba(X_val)
    # Collect validation metrics
    result["f1_score"] = f1_score(y_val, y_pred, average='weighted')
    result["accuracy"] = accuracy_score(y_val, y_pred)
    result["recall"] = recall_score(y_val, y_pred, average='weighted')
    result["precision"] = precision_score(y_val, y_pred, average='weighted')
    result["log_loss"] = log_loss(y_val, y_pred_proba)
    result["model"] = md

    return result

In [180]:
def objective(params):
    """
    Hyperopt objective: evaluate one hyperparameter combination.

    Opens a nested MLflow run, records the sampled hyperparameters, trains the
    estimator class held in the notebook-level ``model`` variable through
    ``create_and_train_model``, then records the validation metrics and the
    fitted model (using the notebook-level ``signature``).

    Args:
        params: Hyperparameter values sampled by Hyperopt for this trial.

    Returns:
        dict with "loss" (the validation log loss, which Hyperopt minimizes)
        and "status" (``STATUS_OK``).
    """
    reported_metrics = ("accuracy", "recall", "precision", "f1_score", "log_loss")

    with mlflow.start_run(nested=True):
        # Record what is being tried before the (possibly slow) training step
        mlflow.log_params(params)

        outcome = create_and_train_model(model, args=params)

        # Everything except the fitted estimator goes to MLflow as a metric
        mlflow.log_metrics({key: outcome[key] for key in reported_metrics})

        # Store the fitted estimator as an artifact of this trial
        fitted = outcome["model"]
        mlflow.sklearn.log_model(fitted, name=model.__name__, signature=signature)

        trial_loss = outcome["log_loss"]
        return {"loss": trial_loss, "status": STATUS_OK}

In [181]:
def run_experiments(run_name, max_evals, search_space, data_name):
    """
    Run one TPE hyperparameter sweep inside a parent MLflow run.

    The estimator class is taken from the notebook-level ``model`` variable, and
    every trial is evaluated by ``objective`` (which opens a nested run). The
    parent run is created in whichever MLflow experiment is currently active.

    Args:
        run_name: Name given to the parent MLflow run.
        max_evals: Number of Hyperopt trials to evaluate.
        search_space: Hyperopt search space passed to ``fmin``.
        data_name: Label of the training dataset, logged as the "dataset" param.

    Returns:
        The lowest validation log loss found across all trials.
    """
    from hyperopt import space_eval

    print(
        f"This will run {max_evals} trials to find optimal hyperparameters...")

    with mlflow.start_run(run_name=run_name):
        # Log experiment metadata
        mlflow.log_params(
            {
                "optimization_method": "Tree-structured Parzen Estimator (TPE)",
                "max_evaluations": max_evals,
                "objective_metric": "log_loss",
                "dataset": data_name,
                "model_type": model.__name__,
            }
        )

        # Run optimization
        trials = Trials()
        best_point = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            verbose=True,
        )
        # fmin reports hp.choice dimensions as indices into their option lists;
        # space_eval maps the result back to the actual hyperparameter values so
        # that what gets printed and logged can be used to rebuild the model.
        best_params = space_eval(search_space, best_point)
        print(best_params)

        # Find and log best results
        best_trial = min(trials.results, key=lambda x: x["loss"])
        best_log_loss = best_trial["loss"]

        # Log optimization results
        mlflow.log_params(best_params)
        mlflow.log_metrics(
            {
                "best_log_loss": best_log_loss,
                "total_trials": len(trials.trials),
                "optimization_completed": 1,
            }
        )

    return best_log_loss


In [182]:
# Use one tracking server for both the explicit client (experiment creation)
# and the fluent API (mlflow.set_experiment / mlflow.start_run). Without
# set_tracking_uri the fluent calls fall back to MLflow's default location
# unless MLFLOW_TRACKING_URI happens to be set in the environment.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8080")
mlflow.set_tracking_uri(tracking_uri)
client = mlflow.tracking.MlflowClient(tracking_uri)

## Preparing the Data

In [183]:
data_path = '../data'


def _read_merged(source, variant):
    """Read `<source>_<variant>.csv` for the selected map from the merged data folder."""
    return pd.read_csv(f'{data_path}/merged/{mapname}/{source}/{source}_{variant}.csv')


# Real-world UAH splits
uah_training = pd.read_csv(f'{data_path}/base/training_set_uah.csv')
uah_validation = pd.read_csv(f'{data_path}/base/validation_set_uah.csv')

# Simulator-only data (fixed vs. LLM variants)
carla_fixed = _read_merged('carla', 'fixed')
carla_llm = _read_merged('carla', 'llm')

sumo_fixed = _read_merged('sumo', 'fixed')
sumo_llm = _read_merged('sumo', 'llm')

# Simulator data merged with UAH
carla_uah_fixed = _read_merged('carla_uah', 'fixed')
carla_uah_llm = _read_merged('carla_uah', 'llm')

sumo_uah_fixed = _read_merged('sumo_uah', 'fixed')
sumo_uah_llm = _read_merged('sumo_uah', 'llm')

Applying sliding windows to UAH, SUMO and CARLA full data

In [184]:
# Settings forwarded to preprocessor.sliding_windows for every dataset
window_size, step_size = 10, 5

In [185]:
def _windowed(frame):
    """Apply the notebook-wide sliding-window settings to one data frame."""
    return preprocessor.sliding_windows(frame, window_size=window_size, step_size=step_size)


# UAH (training split and the validation split used to score every model)
X_uah, y_uah = _windowed(uah_training)
X_val, y_val = _windowed(uah_validation)

# SUMO
X_sumo_fixed, y_sumo_fixed = _windowed(sumo_fixed)
X_sumo_llm, y_sumo_llm = _windowed(sumo_llm)

# CARLA
X_carla_fixed, y_carla_fixed = _windowed(carla_fixed)
X_carla_llm, y_carla_llm = _windowed(carla_llm)

Defining the search space for each model

In [186]:
# Number of Hyperopt trials per sweep (passed to run_experiments as max_evals).
max_evals = 10

# Random forest: small discrete grid; random_state is a fixed value, not searched.
search_space_rf = {
    "n_estimators": hp.choice("n_estimators", [50, 100]),
    "max_depth": hp.choice("max_depth", [None, 10, 20]),
    "random_state": 42,
}
# SVC: hp.loguniform(label, low, high) draws exp(uniform(low, high)), so C and
# gamma range over roughly [0.05, 20.1] (exp(-3) .. exp(3)).
search_space_svc = {
    "C": hp.loguniform("C", -3, 3),  # Regularization parameter
    "kernel": hp.choice("kernel", ["linear", "rbf", "poly"]),  # Kernel type
    "gamma": hp.loguniform("gamma", -3, 3),  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    "degree": hp.choice("degree", [2, 3, 4]),  # Degree of the polynomial kernel function ('poly')
    "random_state": 42
}
# XGBoost: learning_rate is drawn log-uniformly from roughly [0.05, 1.0]
# (exp(-3) .. exp(0)); subsample and colsample_bytree uniformly from [0.5, 1.0].
search_space_xgb = {
    "n_estimators": hp.choice("n_estimators", [50, 100]),
    "max_depth": hp.choice("max_depth", [None, 10, 20]),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "random_state": 42,
}

## Real Only

In [187]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the UAH driveset."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
experiment_name = "Driver_Behavior_Models_UAH"
try:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
except mlflow.exceptions.RestException as e:
    # Only "already exists" is expected when the notebook is re-run; any other
    # server error is a real failure and must not be reported as "exists".
    if e.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Experiment already exists: {e}")
    # Keep the variable pointing at this experiment's id, as a successful
    # create_experiment call would.
    existing_experiment = client.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        driver_behavior_experiment = existing_experiment.experiment_id

Experiment already exists: RESOURCE_ALREADY_EXISTS: Experiment 'Driver_Behavior_Models_UAH' already exists.


In [188]:
# Select the UAH windows as the training set for the sweeps in this section
data_name = 'UAH'
X_train, y_train = X_uah, y_uah
signature = infer_signature(X_train, y_train)
mlflow.set_experiment('Driver_Behavior_Models_UAH')

In [None]:
# One sweep per model family; `model` is read as a global by run_experiments/objective
for sweep_name, model, space in (
    ('rf-sweep', RandomForestClassifier, search_space_rf),
    ('svc-sweep', SVC, search_space_svc),
    ('xgb-sweep', xgb.XGBClassifier, search_space_xgb),
):
    run_experiments(sweep_name, max_evals=max_evals, search_space=space, data_name='UAH')

This will run 10 trials to find optimal hyperparameters...
🏃 View run secretive-fish-366 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/3cd9daedce484383860a6d3ad35a822a

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294

🏃 View run aged-skunk-358 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/8857907b698d4e71aa2b931d4cb6243c

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294   

🏃 View run brawny-cod-980 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/ac7e83c8046c4a77bac29883c9617293

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294   

🏃 View run fortunate-stag-608 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/3262b7932e6142beb08142bf834c2147

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294   

🏃 View run brawny-penguin-771 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/4ebb3c679ae44ff5bcd145

## SUMO Only

### Fixed

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the SUMO Fixed dataset."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
experiment_name = "Driver_Behavior_Models_SUMO_Fixed"
try:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
except mlflow.exceptions.RestException as e:
    # Only "already exists" is expected when the notebook is re-run; any other
    # server error is a real failure and must not be reported as "exists".
    if e.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Experiment already exists: {e}")
    # Keep the variable pointing at this experiment's id, as a successful
    # create_experiment call would.
    existing_experiment = client.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
X_train, y_train = X_sumo_fixed, y_sumo_fixed
# Infer the model signature from the data the models in this section are
# actually trained on, rather than from the UAH arrays.
signature = infer_signature(X_train, y_train)
mlflow.set_experiment('Driver_Behavior_Models_SUMO_Fixed')
data_name = 'SUMO_Fixed'

In [None]:
# One sweep per model family; `model` is read as a global by run_experiments/objective
for sweep_name, model, space in (
    ('rf-sweep', RandomForestClassifier, search_space_rf),
    ('svc-sweep', SVC, search_space_svc),
    ('xgb-sweep', xgb.XGBClassifier, search_space_xgb),
):
    run_experiments(sweep_name, max_evals=max_evals, search_space=space, data_name=data_name)

### LLM

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the SUMO LLM dataset."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
experiment_name = "Driver_Behavior_Models_SUMO_LLM"
try:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
except mlflow.exceptions.RestException as e:
    # Only "already exists" is expected when the notebook is re-run; any other
    # server error is a real failure and must not be reported as "exists".
    if e.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Experiment already exists: {e}")
    # Keep the variable pointing at this experiment's id, as a successful
    # create_experiment call would.
    existing_experiment = client.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
X_train, y_train = X_sumo_llm, y_sumo_llm
# Refresh the model signature for this training set; otherwise whatever
# signature an earlier cell left behind would be logged with these models.
signature = infer_signature(X_train, y_train)
mlflow.set_experiment('Driver_Behavior_Models_SUMO_LLM')
data_name = 'SUMO_LLM'

In [None]:
# One sweep per model family; `model` is read as a global by run_experiments/objective
for sweep_name, model, space in (
    ('rf-sweep', RandomForestClassifier, search_space_rf),
    ('svc-sweep', SVC, search_space_svc),
    ('xgb-sweep', xgb.XGBClassifier, search_space_xgb),
):
    run_experiments(sweep_name, max_evals=max_evals, search_space=space, data_name=data_name)

## Carla Only

### Fixed

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the CARLA Fixed dataset."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
experiment_name = "Driver_Behavior_Models_CARLA_Fixed"
try:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
except mlflow.exceptions.RestException as e:
    # Only "already exists" is expected when the notebook is re-run; any other
    # server error is a real failure and must not be reported as "exists".
    if e.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Experiment already exists: {e}")
    # Keep the variable pointing at this experiment's id, as a successful
    # create_experiment call would.
    existing_experiment = client.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
X_train, y_train = X_carla_fixed, y_carla_fixed
# Refresh the model signature for this training set; otherwise whatever
# signature an earlier cell left behind would be logged with these models.
signature = infer_signature(X_train, y_train)
mlflow.set_experiment('Driver_Behavior_Models_CARLA_Fixed')
data_name = 'CARLA_Fixed'

In [None]:
# One sweep per model family; `model` is read as a global by run_experiments/objective
for sweep_name, model, space in (
    ('rf-sweep', RandomForestClassifier, search_space_rf),
    ('svc-sweep', SVC, search_space_svc),
    ('xgb-sweep', xgb.XGBClassifier, search_space_xgb),
):
    run_experiments(sweep_name, max_evals=max_evals, search_space=space, data_name=data_name)

### LLM

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the CARLA LLM dataset."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
experiment_name = "Driver_Behavior_Models_CARLA_LLM"
try:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
except mlflow.exceptions.RestException as e:
    # Only "already exists" is expected when the notebook is re-run; any other
    # server error is a real failure and must not be reported as "exists".
    if e.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Experiment already exists: {e}")
    # Keep the variable pointing at this experiment's id, as a successful
    # create_experiment call would.
    existing_experiment = client.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
X_train, y_train = X_carla_llm, y_carla_llm
# Refresh the model signature for this training set; otherwise whatever
# signature an earlier cell left behind would be logged with these models.
signature = infer_signature(X_train, y_train)
mlflow.set_experiment('Driver_Behavior_Models_CARLA_LLM')
data_name = 'CARLA_LLM'

In [None]:
# One sweep per model family; `model` is read as a global by run_experiments/objective
for sweep_name, model, space in (
    ('rf-sweep', RandomForestClassifier, search_space_rf),
    ('svc-sweep', SVC, search_space_svc),
    ('xgb-sweep', xgb.XGBClassifier, search_space_xgb),
):
    run_experiments(sweep_name, max_evals=max_evals, search_space=space, data_name=data_name)

## Real + SUMO

### Fixed

In [176]:
# Model signature (input/output schema) attached to every model logged by objective().
# NOTE(review): this is inferred from the SUMO-only windows although the section
# is "Real + SUMO" — confirm this is the intended training data for the signature.
signature = infer_signature(X_sumo_fixed, y_sumo_fixed)

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the UAH driveset supplemented by SUMO with fixed parameters."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name. Guarded like the other
# experiment-creation cells so that re-running the notebook does not fail
# once the experiment exists.
experiment_name = "Driver_Behavior_Models_UAH_SUMO_fixed"
try:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
except mlflow.exceptions.RestException as e:
    # Only "already exists" is expected when the notebook is re-run; any other
    # server error is a real failure and must not be reported as "exists".
    if e.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Experiment already exists: {e}")
    # Keep the variable pointing at this experiment's id, as a successful
    # create_experiment call would.
    existing_experiment = client.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
# Select the training data and MLflow experiment for this section explicitly.
# Without this the sweep would silently reuse X_train / the active experiment
# left over from the previous section and label the result as 'UAH'.
# Assumes sumo_uah_fixed (the merged UAH + SUMO fixed file) has the same layout
# as the other frames passed to sliding_windows — TODO confirm.
X_train, y_train = preprocessor.sliding_windows(
    sumo_uah_fixed, window_size=window_size, step_size=step_size)
signature = infer_signature(X_train, y_train)
mlflow.set_experiment('Driver_Behavior_Models_UAH_SUMO_fixed')
data_name = 'UAH_SUMO_Fixed'

model = RandomForestClassifier
run_experiments('rf-sweep', max_evals=5, search_space=search_space_rf, data_name=data_name)

## Real + SUMO (LLM)