This notebook is used to organize the experiments. If you just want the best models found, you can refer to the `3_model_training` notebook.

In [None]:
import os
import sys

import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK, Trials
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Make the project's ``src`` directory importable so the local ``data`` package resolves.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:  # avoid piling up duplicate entries when the cell is re-run
    sys.path.append(src_dir)
from data import loader
from data import preprocessor

# Map whose synthetic data is loaded further down.
mapname = 'Town01'

In [108]:
def create_and_train_model(model, args):
    """
    Instantiate an estimator class, fit it, and score it on the validation split.

    The data is read from the notebook-level variables ``X_train``,
    ``y_train``, ``X_val`` and ``y_val``, which must be defined before this
    function is called.

    Args:
        model: Estimator class (not an instance); it is called as
            ``model(**args)``. The resulting estimator must provide ``fit``,
            ``predict``, ``predict_proba`` and a ``classes_`` attribute.
        args: Dict of keyword arguments forwarded to the estimator constructor.

    Returns:
        dict: Validation metrics under ``"f1_score"``, ``"accuracy"``,
        ``"recall"``, ``"precision"`` (the averaged ones use weighted
        averaging) and ``"log_loss"``, plus the fitted estimator under
        ``"model"``.
    """
    # Build and fit the estimator with the hyperparameters under test
    estimator = model(**args)
    estimator.fit(X_train, y_train)

    # Hard predictions and class probabilities on the validation set
    y_pred = estimator.predict(X_val)
    y_pred_proba = estimator.predict_proba(X_val)

    return {
        "f1_score": f1_score(y_val, y_pred, average='weighted'),
        "accuracy": accuracy_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred, average='weighted'),
        "precision": precision_score(y_val, y_pred, average='weighted'),
        # Pass the estimator's own class list so the probability columns stay
        # aligned even if a class happens to be absent from y_val.
        "log_loss": log_loss(y_val, y_pred_proba, labels=estimator.classes_),
        "model": estimator,
    }

In [124]:
def objective(params):
    """
    Hyperopt objective: evaluate one hyperparameter candidate.

    Opens a nested MLflow run, records the candidate's parameters, metrics
    and fitted model, and reports the validation log loss as the value to
    minimize. The estimator class is taken from the notebook-level ``model``
    variable and the model signature from ``signature``.

    Args:
        params: Hyperparameters sampled by Hyperopt for this trial.

    Returns:
        dict: ``{"loss": <validation log loss>, "status": STATUS_OK}``.
    """
    metric_names = ("accuracy", "recall", "precision", "f1_score", "log_loss")

    with mlflow.start_run(nested=True):
        # Record the candidate before training it
        mlflow.log_params(params)

        outcome = create_and_train_model(model, args=params)

        # Record the validation metrics of this candidate
        mlflow.log_metrics({metric: outcome[metric] for metric in metric_names})

        # Store the fitted estimator as an artifact of the nested run
        mlflow.sklearn.log_model(
            outcome["model"], name=model.__name__, signature=signature)

        # Hyperopt minimizes the "loss" entry
        trial_result = {"loss": outcome["log_loss"], "status": STATUS_OK}

    return trial_result

In [132]:
def run_experiments(run_name, max_evals, search_space, data_name):
    """
    Run a TPE hyperparameter sweep inside a parent MLflow run.

    Each trial is evaluated by ``objective`` (which logs a nested run). The
    estimator class is read from the notebook-level ``model`` variable, and
    the runs are recorded in whichever MLflow experiment is currently active,
    so select the experiment before calling this function.

    Args:
        run_name: Name given to the parent MLflow run.
        max_evals: Number of Hyperopt trials to run.
        search_space: Hyperopt search space matching ``model``'s constructor.
        data_name: Dataset label logged as the ``dataset`` parameter.

    Returns:
        The lowest validation log loss found across all trials.
    """
    print(
        f"This will run {max_evals} trials to find optimal hyperparameters...")

    with mlflow.start_run(run_name=run_name):
        # Log experiment metadata
        mlflow.log_params(
            {
                "optimization_method": "Tree-structured Parzen Estimator (TPE)",
                "max_evaluations": max_evals,
                "objective_metric": "log_loss",
                "dataset": data_name,
                "model_type": model.__name__,
            }
        )

        # Run optimization
        trials = Trials()
        best_assignment = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            verbose=True,
        )
        # fmin reports hp.choice parameters as the *index* of the selected
        # option; space_eval maps the assignment back to the real values.
        best_params = space_eval(search_space, best_assignment)
        print(best_params)

        # Find and log best results
        best_trial = min(trials.results, key=lambda x: x["loss"])
        best_log_loss = best_trial["loss"]

        # Log optimization results
        mlflow.log_params(best_params)
        mlflow.log_metrics(
            {
                "best_log_loss": best_log_loss,
                "total_trials": len(trials.trials),
                "optimization_completed": 1,
            }
        )

    return best_log_loss


In [115]:
# Address of the MLflow tracking server used throughout this notebook.
TRACKING_URI = "http://127.0.0.1:8080"

# Point the fluent API (mlflow.set_experiment / mlflow.start_run) at the same
# server as the client, so experiments created through `client` are the ones
# the runs are logged to.
mlflow.set_tracking_uri(TRACKING_URI)
client = mlflow.tracking.MlflowClient(TRACKING_URI)

## Preparing the Data

In [131]:
# Drivers whose recordings are loaded from the UAH data
drivers = ['D1', 'D2', 'D3', 'D4', 'D5']

# UAH recordings
uah_root = os.path.abspath('../data/base/UAH-DRIVESET-v1')
uah_data = loader.read_data(drivers, uah_root)

# Synthetic recordings for the configured map, unpacked as (carla, sumo)
synthetic_root = os.path.abspath(f'../data/synthetic/{mapname}')
carla_data, sumo_data = loader.load_synthetic_data(synthetic_root)

Stacking the data from UAH, SUMO and CARLA

In [None]:
# UAH: window each behaviour class separately, then stack features and labels
X_normal, y_normal = preprocessor.sliding_windows(uah_data['acc']['normal'], 'normal', window_size=10, step_size=5)
X_aggressive, y_aggressive = preprocessor.sliding_windows(uah_data['acc']['aggressive'], 'aggressive', window_size=10, step_size=5)
X_uah, y_uah = np.vstack([X_normal, X_aggressive]), np.concatenate([y_normal, y_aggressive])

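Next, build the same sliding windows for the synthetic SUMO and CARLA data. A portion of the UAH-DriveSet is set aside for validation later, in the *Real Only* section.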

In [133]:
# SUMO
sumo_window_cfg = {"window_size": 10, "step_size": 5}

# 'fixed' traffic: window each behaviour class, then stack features and labels
sumo_fixed_traffic = sumo_data['fixed']['traffic']
X_normal, y_normal = preprocessor.sliding_windows(
    sumo_fixed_traffic['normal'], 'normal', **sumo_window_cfg)
X_aggressive, y_aggressive = preprocessor.sliding_windows(
    sumo_fixed_traffic['aggressive'], 'aggressive', **sumo_window_cfg)
X_sumo_fixed = np.vstack([X_normal, X_aggressive])
y_sumo_fixed = np.concatenate([y_normal, y_aggressive])

# 'llm' traffic: same procedure
sumo_llm_traffic = sumo_data['llm']['traffic']
X_normal, y_normal = preprocessor.sliding_windows(
    sumo_llm_traffic['normal'], 'normal', **sumo_window_cfg)
X_aggressive, y_aggressive = preprocessor.sliding_windows(
    sumo_llm_traffic['aggressive'], 'aggressive', **sumo_window_cfg)
X_sumo_llm = np.vstack([X_normal, X_aggressive])
y_sumo_llm = np.concatenate([y_normal, y_aggressive])

In [136]:
# CARLA
carla_window_cfg = {"window_size": 10, "step_size": 5}

# 'fixed' traffic: window each behaviour class, then stack features and labels
carla_fixed_traffic = carla_data['fixed']['traffic']
X_normal, y_normal = preprocessor.sliding_windows(
    carla_fixed_traffic['normal'], 'normal', **carla_window_cfg)
X_aggressive, y_aggressive = preprocessor.sliding_windows(
    carla_fixed_traffic['aggressive'], 'aggressive', **carla_window_cfg)
X_carla_fixed = np.vstack([X_normal, X_aggressive])
y_carla_fixed = np.concatenate([y_normal, y_aggressive])

# 'llm' traffic: same procedure
carla_llm_traffic = carla_data['llm']['traffic']
X_normal, y_normal = preprocessor.sliding_windows(
    carla_llm_traffic['normal'], 'normal', **carla_window_cfg)
X_aggressive, y_aggressive = preprocessor.sliding_windows(
    carla_llm_traffic['aggressive'], 'aggressive', **carla_window_cfg)
X_carla_llm = np.vstack([X_normal, X_aggressive])
y_carla_llm = np.concatenate([y_normal, y_aggressive])

Defining the search space for each model

In [None]:
# Hyperopt search spaces, one per estimator. Keys are the constructor keyword
# arguments of the estimator; plain values (e.g. random_state) are passed
# through unchanged on every trial.

# RandomForestClassifier
search_space_rf = {
    "n_estimators": hp.choice("n_estimators", [50, 100]),
    "max_depth": hp.choice("max_depth", [None, 10, 20]),
    "random_state": 42,
}

# SVC
search_space_svc = {
    "C": hp.loguniform("C", -3, 3),  # Regularization parameter
    "kernel": hp.choice("kernel", ["linear", "rbf", "poly"]),  # Kernel type
    "gamma": hp.loguniform("gamma", -3, 3),  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    "degree": hp.choice("degree", [2, 3, 4]),  # Degree of the polynomial kernel function ('poly')
    # The training helper calls predict_proba to compute the log loss, which
    # SVC only provides when probability estimates are enabled (slower to fit).
    "probability": True,
    "random_state": 42
}

# xgb.XGBClassifier
search_space_xgb = {
    "n_estimators": hp.choice("n_estimators", [50, 100]),
    "max_depth": hp.choice("max_depth", [None, 10, 20]),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "random_state": 42,
}

## Real Only

In [None]:
# Hold out 20% of the UAH windows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X_uah,
    y_uah,
    test_size=0.2,
    random_state=42,
)

# Model signature (inputs -> labels) attached to every model logged to MLflow
signature = infer_signature(X_train, y_train)

In [127]:
# Provide an Experiment description that will appear in the UI
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the UAH driveset."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Experiment names must be unique on the tracking server
experiment_name = "Driver_Behavior_Models_UAH"

# create_experiment raises if the name is already taken, which would break
# this cell on every re-run; reuse the existing experiment's ID instead.
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
# Make the UAH experiment the active one for all runs started below
mlflow.set_experiment(experiment_name='Driver_Behavior_Models_UAH')

# Label for the dataset used in this section
data_name = 'UAH-DRIVESET'

<Experiment: artifact_location='mlflow-artifacts:/817421822837973294', creation_time=1753199157593, experiment_id='817421822837973294', last_update_time=1753199157593, lifecycle_stage='active', name='Driver_Behavior_Models_UAH', tags={'mlflow.note.content': 'Experiment to train models on the UAH driveset.',
 'project_name': 'driver-behavior-prediction'}>

### Random Forest

In [None]:
# `model` is read as a notebook-level variable by `objective` and `run_experiments`
model = RandomForestClassifier

run_experiments(
    run_name='rf-sweep',
    max_evals=5,
    search_space=search_space_rf,
    data_name='UAH',
)

This will run 5 trials to find optimal hyperparameters...
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

🏃 View run awesome-lamb-730 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/45ed210f121d4c0e8cced9cd16489ae9

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294

🏃 View run dazzling-mink-677 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/2de27a0405484e5eb44c4a8645a93c0e

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294   

🏃 View run loud-chimp-335 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/c5866df714604558bc26c4421785d124

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294   

🏃 View run inquisitive-loon-404 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/f36b3141d6ab40ada1a8a25c318e202d

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/817421822837973294   

🏃 View run beautiful-fox-651 at: http://127.0.0.1:8080/#/experiments/817421822837973294/runs/0842c87e13c547f69b84e5dcde5a8a72

🧪 View experiment at: http://127.0.0.1:8080/#

### Support Vector Classifier

In [None]:
# `model` is read as a notebook-level variable by `objective` and `run_experiments`
model = SVC
# Use the SVC search space: the Random Forest space contains n_estimators /
# max_depth, which SVC's constructor does not accept.
run_experiments('svc-sweep', max_evals=5, search_space=search_space_svc, data_name='UAH')

### XGBoost

In [None]:
# `model` is read as a notebook-level variable by `objective` and `run_experiments`
model = xgb.XGBClassifier

run_experiments(
    run_name='xgb-sweep',
    max_evals=5,
    search_space=search_space_xgb,
    data_name='UAH',
)

## SUMO Only

## Real + SUMO (fixed)

In [None]:


# Stack the real (UAH) windows with the fixed-parameter SUMO windows.
# NOTE(review): assumes both sources yield the same number of feature columns
# and the same label encoding — confirm against preprocessor.sliding_windows.
X = np.vstack([X_uah, X_sumo_fixed])
y = np.concatenate([y_uah, y_sumo_fixed])

# NOTE(review): this split validates on a mix of real and synthetic windows;
# confirm whether validation should be restricted to UAH data instead.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
signature = infer_signature(X_train, y_train)

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Experiment to train models on the UAH driveset supplemented by SUMO with fixed parameters."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "driver-behavior-prediction",
    "mlflow.note.content": experiment_description,
}

# Experiment names must be unique on the tracking server
experiment_name = "Driver_Behavior_Models_UAH_SUMO_fixed"

# create_experiment raises if the name is already taken, which would break
# this cell on every re-run; reuse the existing experiment's ID instead.
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    driver_behavior_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    driver_behavior_experiment = existing_experiment.experiment_id

In [None]:
# Select the experiment for this data mix; otherwise the runs would be logged
# to whichever experiment was activated last (the UAH-only one).
mlflow.set_experiment('Driver_Behavior_Models_UAH_SUMO_fixed')

# `model` is read as a notebook-level variable by `objective` and `run_experiments`
model = RandomForestClassifier
run_experiments('rf-sweep', max_evals=5, search_space=search_space_rf, data_name='UAH+SUMO-fixed')

## Real + SUMO (LLM)