This notebook is only for training the models with the best parameters found in `3_model_training`. If you don't need the trained models, you can skip to `4_evaluation`.

In [1]:
import os

import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

os.sys.path.append(os.path.abspath('../src'))
from data import preprocessor

mapname = 'Town01'

In [2]:
def create_and_train_model(model, args, data=None):
    """
    Instantiate an estimator, fit it, and score it on a validation set.

    Args:
        model: Estimator class (or any callable) invoked as ``model(**args)``;
            the returned object must expose ``fit`` and ``predict``.
        args: Dict of keyword arguments forwarded to ``model``.
        data: Optional ``(X_train, y_train, X_val, y_val)`` tuple. When omitted,
            the notebook-level globals of the same names are used, which is the
            original behaviour of this function.

    Returns:
        dict: ``f1_score``, ``recall`` and ``precision`` (all weighted
        averages) and ``accuracy`` measured on the validation set, plus the
        fitted estimator under ``model``.

    Note:
        Nothing here varies the random seed, so an estimator that is
        deterministic for fixed ``args`` yields the same scores on every call.
    """
    if data is None:
        # Backward-compatible fallback: use whatever the most recently executed
        # preparation cell left in the notebook globals.
        data = (X_train, y_train, X_val, y_val)
    train_X, train_y, val_X, val_y = data

    # Train model with the given hyperparameters
    md = model(**args)
    md.fit(train_X, train_y)
    # Predict on the validation set
    y_pred = md.predict(val_X)

    return {
        "f1_score": f1_score(val_y, y_pred, average='weighted'),
        "accuracy": accuracy_score(val_y, y_pred),
        "recall": recall_score(val_y, y_pred, average='weighted'),
        "precision": precision_score(val_y, y_pred, average='weighted'),
        "model": md,
    }

In [3]:
def add_to_dict(dict, key, model, results, params):
    """
    Append one trial's metrics to the accumulator entry stored under ``key``.

    The entry is created on first use and records ``model`` and ``params`` at
    that moment; later calls with the same ``key`` only extend the metric lists.

    Args:
        dict: Accumulator mapping ``key`` -> entry; modified in place.
        key: Name of the experiment the trial belongs to.
        model: Label of the model, stored when the entry is created.
        results: Mapping with ``f1_score``, ``accuracy``, ``recall`` and
            ``precision`` for this trial.
        params: Hyperparameters, stored when the entry is created.

    Returns:
        None
    """
    entry = dict.setdefault(key, {
        'model': model,
        'params': params,
        'f1_scores': [],
        'accuracy': [],
        'recall': [],
        'precision': [],
    })

    # (list name in the entry, metric name in ``results``)
    for list_name, metric_name in (
        ('f1_scores', 'f1_score'),
        ('accuracy', 'accuracy'),
        ('recall', 'recall'),
        ('precision', 'precision'),
    ):
        entry[list_name].append(results[metric_name])

In [4]:
def save_results_to_csv(dict, filename, metrics_path = '../results/metrics/'):
    """Write per-key metric summaries to CSV files and print them.

    Two files are written into ``metrics_path`` (created if it does not exist):
    ``{filename}_results.csv`` with the mean and standard deviation of every
    metric per key, and ``{filename}_means.csv`` with the means only, sorted by
    mean F1 score in descending order. The means table and a per-key
    "mean ± std" summary are also printed.

    Args:
        dict: Mapping ``key`` -> entry with ``model``, ``params`` and the metric
            lists ``f1_scores``, ``accuracy``, ``recall`` and ``precision``.
        filename: Prefix of the two output files.
        metrics_path: Directory the files are written to.
    Returns:
        None
    """
    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError when the CSV is opened.
    if metrics_path:
        os.makedirs(metrics_path, exist_ok=True)

    # (metric name used in the outputs, name of the list inside each entry)
    metric_lists = [
        ('f1_score', 'f1_scores'),
        ('accuracy', 'accuracy'),
        ('recall', 'recall'),
        ('precision', 'precision'),
    ]

    # Compute every (mean, std) pair once and reuse it for both files and the printout.
    summary = {}
    for key, value in dict.items():
        summary[key] = {
            name: (np.mean(value[list_name]), np.std(value[list_name]))
            for name, list_name in metric_lists
        }

    fieldnames = ['key', 'model', 'f1_score_mean', 'f1_score_std', 'accuracy_mean', 'accuracy_std', 'recall_mean', 'recall_std', 'precision_mean', 'precision_std', 'params']
    with open(os.path.join(metrics_path, f'{filename}_results.csv'), 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for key, value in dict.items():
            row = {'key': key, 'model': value['model'], 'params': value['params']}
            for name, (mean, std) in summary[key].items():
                row[f'{name}_mean'] = mean
                row[f'{name}_std'] = std
            writer.writerow(row)

    dict_means = {}
    for key, value in dict.items():
        means = {name: mean for name, (mean, _) in summary[key].items()}
        means['model'] = value['model']
        means['params'] = value['params']
        dict_means[key] = means

    dict_means_df = pd.DataFrame.from_dict(dict_means, orient='index')
    # An empty results dict produces a frame without an 'f1_score' column;
    # sorting it would raise KeyError.
    if not dict_means_df.empty:
        dict_means_df = dict_means_df.sort_values(by='f1_score', ascending=False)
    dict_means_df.index.name = 'key'
    dict_means_df.to_csv(os.path.join(metrics_path, f'{filename}_means.csv'))
    print(dict_means_df, end='\n\n')

    for key, value in dict.items():
        stats = summary[key]
        print(f"{key}:")
        print(f"  Model: {value['model']}")
        print(f"  F1 Score: {stats['f1_score'][0]:.2f} ± {stats['f1_score'][1]:.2f}")
        print(f"  Accuracy: {stats['accuracy'][0]:.2f} ± {stats['accuracy'][1]:.2f}")
        print(f"  Recall: {stats['recall'][0]:.2f} ± {stats['recall'][1]:.2f}")
        print(f"  Precision: {stats['precision'][0]:.2f} ± {stats['precision'][1]:.2f}\n")


## Preparing the Data

In [5]:
# Root of the data directory, relative to this notebook.
data_path = '../data'
# Real (UAH) training and validation splits.
uah_training = pd.read_csv(f'{data_path}/base/training_set_uah.csv')
uah_validation = pd.read_csv(f'{data_path}/base/validation_set_uah.csv')

# Synthetic-only CARLA sets for the selected map; the 'origin' column is dropped on load.
carla_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/carla/carla_fixed.csv').drop(columns=['origin'])
carla_llm = pd.read_csv(f'{data_path}/merged/{mapname}/carla/carla_llm.csv').drop(columns=['origin'])

# Synthetic-only SUMO sets for the selected map; the 'origin' column is dropped on load.
sumo_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/sumo/sumo_fixed.csv').drop(columns=['origin'])
sumo_llm = pd.read_csv(f'{data_path}/merged/{mapname}/sumo/sumo_llm.csv').drop(columns=['origin'])

# Merged CARLA + UAH sets, loaded with all columns kept.
# NOTE(review): presumably 'origin' is still present here and is needed by
# preprocessor.fill_synthetic_data later on; confirm against the preprocessor.
carla_uah_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/carla_uah/carla_uah_fixed.csv')
carla_uah_llm = pd.read_csv(f'{data_path}/merged/{mapname}/carla_uah/carla_uah_llm.csv')

# Merged SUMO + UAH sets, loaded with all columns kept (same caveat as above).
sumo_uah_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/sumo_uah/sumo_uah_fixed.csv')
sumo_uah_llm = pd.read_csv(f'{data_path}/merged/{mapname}/sumo_uah/sumo_uah_llm.csv')

In [6]:
sumo_columns_to_keep = sumo_fixed.columns.tolist()
carla_columns_to_keep = carla_fixed.columns.tolist()
print("SUMO features:", sumo_columns_to_keep)
print("CARLA features:", carla_columns_to_keep)

SUMO features: ['acc', 'angle', 'speed', 'label']
CARLA features: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'label', 'acc']


In [7]:
# Sliding-window configuration passed to preprocessor.sliding_windows.
window_size = 10
step_size = 5
# Label name -> integer code mapping passed to preprocessor.one_hot_encode.
one_hot_keys = {
    'normal': 0,
    'aggressive': 1
}

# Train on Real test on Synthetic

By training a model on real data and testing it on synthetic data, we check how faithful the synthetic data is to real-world data.

In [8]:
TRTS = {}
n_trials = 8

## SUMO

In [9]:
# Real (UAH) training windows restricted to the SUMO feature set. The window
# configuration comes from the shared config cell rather than literals, so the
# section cannot drift from the rest of the notebook.
X_train, y_train = preprocessor.sliding_windows(uah_training[sumo_columns_to_keep], window_size=window_size, step_size=step_size)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys)

### Fixed

In [10]:
# Synthetic SUMO (fixed) windows used as the test set; window configuration
# comes from the shared config cell instead of repeated literals.
X_val, y_val = preprocessor.sliding_windows(sumo_fixed, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [11]:
# Best model: XGBClassifier_{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': None, 'n_estimators': 200, 'subsample': 0.7}
params = {
    'colsample_bytree': 0.5,
    'learning_rate': 0.1,
    'max_depth': None,
    'n_estimators': 200,
    'subsample': 0.7,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(TRTS, 'sumo_fixed', 'XGBClassifier', results, params)

### LLM

In [12]:
# Synthetic SUMO (LLM) windows used as the test set; window configuration
# comes from the shared config cell instead of repeated literals.
X_val, y_val = preprocessor.sliding_windows(sumo_llm, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [13]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': None, 'n_estimators': 1000, 'subsample': 0.7}
params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': None,
    'n_estimators': 1000,
    'subsample': 0.7,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(TRTS, 'sumo_llm', 'XGBClassifier', results, params)

## CARLA

In [14]:
# Real (UAH) training windows restricted to the CARLA feature set; window
# configuration comes from the shared config cell instead of repeated literals.
X_train, y_train = preprocessor.sliding_windows(uah_training[carla_columns_to_keep], window_size=window_size, step_size=step_size)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys)

### Fixed

In [15]:
# Synthetic CARLA (fixed) windows used as the test set; window configuration
# comes from the shared config cell instead of repeated literals.
X_val, y_val = preprocessor.sliding_windows(carla_fixed, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [16]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}
params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.001,
    'max_depth': 5,
    'n_estimators': 200,
    'subsample': 0.7,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(TRTS, 'carla_fixed', 'XGBClassifier', results, params)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### LLM

In [17]:
# Synthetic CARLA (LLM) windows used as the test set; window configuration
# comes from the shared config cell instead of repeated literals.
X_val, y_val = preprocessor.sliding_windows(carla_llm, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [18]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 700, 'subsample': 0.5}
params = {
    'colsample_bytree': 1.0,    
    'learning_rate': 0.3,
    'max_depth': 5,
    'n_estimators': 700,
    'subsample': 0.5,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(TRTS, 'carla_llm', 'XGBClassifier', results, params)

## Results table

In [19]:
save_results_to_csv(TRTS, 'TRTS')

             f1_score  accuracy    recall  precision          model  \
key                                                                   
sumo_llm     0.542695  0.550501  0.550501   0.542056  XGBClassifier   
sumo_fixed   0.502740  0.505795  0.505795   0.502700  XGBClassifier   
carla_llm    0.499644  0.499753  0.499753   0.499766  XGBClassifier   
carla_fixed  0.433191  0.586130  0.586130   0.343549  XGBClassifier   

                                                        params  
key                                                             
sumo_llm     {'colsample_bytree': 1.0, 'learning_rate': 0.1...  
sumo_fixed   {'colsample_bytree': 0.5, 'learning_rate': 0.1...  
carla_llm    {'colsample_bytree': 1.0, 'learning_rate': 0.3...  
carla_fixed  {'colsample_bytree': 1.0, 'learning_rate': 0.0...  

sumo_fixed:
  Model: XGBClassifier
  F1 Score: 0.50 ± 0.00
  Accuracy: 0.51 ± 0.00
  Recall: 0.51 ± 0.00
  Precision: 0.50 ± 0.00

sumo_llm:
  Model: XGBClassifier
  F1 Score: 0.54 ±

# Discriminative Score

We seek to understand how well a classifier can separate the real and the synthetic data to determine how indistinguishable they are.

If the classifiers score poorly (close to chance), the real and synthetic data are hard to tell apart; scores close to 1.0 mean they are easily distinguished.

In [20]:
DS = {}
n_trials = 5

In [21]:
# Real (UAH) rows for the discriminator, once per synthetic feature set, with
# the driving-style 'label' column removed.
X_uah_disc_sumo = uah_training[sumo_columns_to_keep].drop(columns=['label'])
X_uah_disc_carla = uah_training[carla_columns_to_keep].drop(columns=['label'])
# Both frames are column selections of uah_training, so one target vector
# serves both.
y_uah_disc = np.zeros(len(X_uah_disc_carla)) # 0 for real data and 1 for synthetic data

## SUMO

In [22]:
X_sumo_disc_fixed, y_sumo_disc_fixed = sumo_fixed.drop(columns=['label']), np.ones(len(sumo_fixed))
X_sumo_disc_llm, y_sumo_disc_llm = sumo_llm.drop(columns=['label']), np.ones(len(sumo_llm))

### Fixed

In [23]:
# Stack synthetic (target 1) and real (target 0) rows. np.concatenate is used
# because np.concat only exists in NumPy >= 2.0.
X = pd.concat([X_sumo_disc_fixed, X_uah_disc_sumo], axis=0, ignore_index=True)
y = np.concatenate([y_sumo_disc_fixed, y_uah_disc], axis=0)
# ignore_index=True already yields a fresh RangeIndex, so no reset_index is needed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [24]:
# Repeat the fit n_trials times; RandomForestClassifier() is created without a
# fixed random_state, so each trial is an independent forest.
for i in range(n_trials):
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_val)
    # Compute the weighted F1 once and reuse it for the log line and the record.
    f1 = f1_score(y_val, y_pred_rf, average='weighted')
    print(f"F1 Score: {f1}")

    # params is a one-element set used as a marker, not a hyperparameter dict.
    add_to_dict(DS, 'sumo_fixed', 'RFClassifier', {
        'f1_score': f1,
        'accuracy': accuracy_score(y_val, y_pred_rf),
        'recall': recall_score(y_val, y_pred_rf, average='weighted'),
        'precision': precision_score(y_val, y_pred_rf, average='weighted'),
        'model': rf
    }, params={'default'})

F1 Score: 1.0
F1 Score: 1.0
F1 Score: 1.0
F1 Score: 1.0
F1 Score: 1.0


### LLM

In [25]:
# Stack synthetic (target 1) and real (target 0) rows. np.concatenate is used
# because np.concat only exists in NumPy >= 2.0.
X = pd.concat([X_sumo_disc_llm, X_uah_disc_sumo], axis=0, ignore_index=True)
y = np.concatenate([y_sumo_disc_llm, y_uah_disc], axis=0)
# ignore_index=True already yields a fresh RangeIndex, so no reset_index is needed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [26]:
# Repeat the fit n_trials times; RandomForestClassifier() is created without a
# fixed random_state. Metrics are computed on the held-out 20% split.
for i in range(n_trials):
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_val)

    # params is a one-element set used as a marker, not a hyperparameter dict.
    add_to_dict(DS, 'sumo_llm', 'RFClassifier', {
        'f1_score': f1_score(y_val, y_pred_rf, average='weighted'),
        'accuracy': accuracy_score(y_val, y_pred_rf),
        'recall': recall_score(y_val, y_pred_rf, average='weighted'),
        'precision': precision_score(y_val, y_pred_rf, average='weighted'),
        'model': rf
    }, params={'default'})

## CARLA

In [27]:
X_carla_disc_fixed, y_carla_disc_fixed = carla_fixed.drop(columns=['label']), np.ones(len(carla_fixed))
X_carla_disc_llm, y_carla_disc_llm = carla_llm.drop(columns=['label']), np.ones(len(carla_llm))

### Fixed

In [28]:
# Stack synthetic (target 1) and real (target 0) rows. np.concatenate is used
# because np.concat only exists in NumPy >= 2.0.
X = pd.concat([X_carla_disc_fixed, X_uah_disc_carla], axis=0, ignore_index=True)
y = np.concatenate([y_carla_disc_fixed, y_uah_disc], axis=0)
# ignore_index=True already yields a fresh RangeIndex, so no reset_index is needed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [29]:
# Repeat the fit n_trials times; RandomForestClassifier() is created without a
# fixed random_state. Metrics are computed on the held-out 20% split.
for i in range(n_trials):
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_val)

    # params is a one-element set used as a marker, not a hyperparameter dict.
    add_to_dict(DS, 'carla_fixed', 'RFClassifier', {
        'f1_score': f1_score(y_val, y_pred_rf, average='weighted'),
        'accuracy': accuracy_score(y_val, y_pred_rf),
        'recall': recall_score(y_val, y_pred_rf, average='weighted'),
        'precision': precision_score(y_val, y_pred_rf, average='weighted'),
        'model': rf
    }, params={'default'})

### LLM

In [30]:
# Stack synthetic (target 1) and real (target 0) rows. np.concatenate is used
# because np.concat only exists in NumPy >= 2.0.
X = pd.concat([X_carla_disc_llm, X_uah_disc_carla], axis=0, ignore_index=True)
y = np.concatenate([y_carla_disc_llm, y_uah_disc], axis=0)
# ignore_index=True already yields a fresh RangeIndex, so no reset_index is needed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [31]:
# Repeat the fit n_trials times; RandomForestClassifier() is created without a
# fixed random_state. Metrics are computed on the held-out 20% split.
for i in range(n_trials):
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_val)

    # params is a one-element set used as a marker, not a hyperparameter dict.
    add_to_dict(DS, 'carla_llm', 'RFClassifier', {
        'f1_score': f1_score(y_val, y_pred_rf, average='weighted'),
        'accuracy': accuracy_score(y_val, y_pred_rf),
        'recall': recall_score(y_val, y_pred_rf, average='weighted'),
        'precision': precision_score(y_val, y_pred_rf, average='weighted'),
        'model': rf
    }, params={'default'})

## Results Table

In [32]:
save_results_to_csv(DS, 'DS')

             f1_score  accuracy  recall  precision         model     params
key                                                                        
sumo_fixed        1.0       1.0     1.0        1.0  RFClassifier  {default}
sumo_llm          1.0       1.0     1.0        1.0  RFClassifier  {default}
carla_fixed       1.0       1.0     1.0        1.0  RFClassifier  {default}
carla_llm         1.0       1.0     1.0        1.0  RFClassifier  {default}

sumo_fixed:
  Model: RFClassifier
  F1 Score: 1.00 ± 0.00
  Accuracy: 1.00 ± 0.00
  Recall: 1.00 ± 0.00
  Precision: 1.00 ± 0.00

sumo_llm:
  Model: RFClassifier
  F1 Score: 1.00 ± 0.00
  Accuracy: 1.00 ± 0.00
  Recall: 1.00 ± 0.00
  Precision: 1.00 ± 0.00

carla_fixed:
  Model: RFClassifier
  F1 Score: 1.00 ± 0.00
  Accuracy: 1.00 ± 0.00
  Recall: 1.00 ± 0.00
  Precision: 1.00 ± 0.00

carla_llm:
  Model: RFClassifier
  F1 Score: 1.00 ± 0.00
  Accuracy: 1.00 ± 0.00
  Recall: 1.00 ± 0.00
  Precision: 1.00 ± 0.00



# Predictive Scores

Here we train on different combinations of real and synthetic data and test on real data only in order to verify the quality of the data for real-world applications.

In [33]:
PS = {}
n_trials = 8

## Real Only

In [34]:
# Feature set for the real-only baseline, plus the 'label' column.
columns_to_keep = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'speed', 'label']

# Window the real training and validation splits with the shared configuration.
X_train, y_train = preprocessor.sliding_windows(uah_training[columns_to_keep], window_size=window_size, step_size=step_size)
X_val, y_val = preprocessor.sliding_windows(uah_validation[columns_to_keep], window_size=window_size, step_size=step_size)

# Fit the scaler on the training windows only and reuse it for validation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Map label names to integer codes via one_hot_keys.
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

In [35]:
# Best model: SVC_{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
params = {
    'C': 1,
    'gamma': 0.01,
    'kernel': 'rbf',
}
for i in range(n_trials):
    results = create_and_train_model(SVC, params)
    add_to_dict(PS, 'uah', 'SVC', results, params)

## SUMO Only

In [36]:
# SUMO
X_sumo_fixed, y_sumo_fixed = preprocessor.sliding_windows(sumo_fixed, window_size=window_size, step_size=step_size)
X_sumo_llm, y_sumo_llm = preprocessor.sliding_windows(sumo_llm, window_size=window_size, step_size=step_size)

In [37]:
# Getting the validation for SUMO variables
print("Columns to keep for validation:", sumo_columns_to_keep)

X_val_base, y_val = preprocessor.sliding_windows(uah_validation[sumo_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc', 'angle', 'speed', 'label']


### Fixed

In [38]:
X_train, y_train = X_sumo_fixed, y_sumo_fixed

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [39]:
# Best model: RandomForestClassifier_{'max_depth': 50, 'n_estimators': 100}
params = {
    'max_depth': 50,
    'n_estimators': 100,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'sumo_fixed', 'RFClassifier', results, params)

### LLM

In [40]:
X_train, y_train = X_sumo_llm, y_sumo_llm

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [41]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.5}
params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.2,
    'max_depth': 5,
    'n_estimators': 500,
    'subsample': 0.5,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'sumo_llm', 'XGBClassifier', results, params)

## Carla Only

In [42]:
X_carla_fixed, y_carla_fixed = preprocessor.sliding_windows(carla_fixed, window_size=window_size, step_size=step_size)
X_carla_llm, y_carla_llm = preprocessor.sliding_windows(carla_llm, window_size=window_size, step_size=step_size)

In [43]:
# Getting the validation for CARLA variables
print("Columns to keep for validation:", carla_columns_to_keep)
X_val_base, y_val = preprocessor.sliding_windows(uah_validation[carla_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'label', 'acc']


### Fixed

In [44]:
X_train, y_train = X_carla_fixed, y_carla_fixed

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [45]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 20}
params = {
    'max_depth': 10,
    'n_estimators': 20,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'carla_fixed', 'RFClassifier', results, params)

### LLM

In [46]:
X_train, y_train = X_carla_llm, y_carla_llm

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [47]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 20}
params = {
    'max_depth': 10,
    'n_estimators': 20,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'carla_llm', 'RFClassifier', results, params)

## Real + SUMO

In [48]:
sumo_uah_fixed_20 = preprocessor.fill_synthetic_data(sumo_uah_fixed, 0.2)
sumo_uah_llm_20 = preprocessor.fill_synthetic_data(sumo_uah_llm, 0.2)

sumo_uah_fixed_60 = preprocessor.fill_synthetic_data(sumo_uah_fixed, 0.6)
sumo_uah_llm_60 = preprocessor.fill_synthetic_data(sumo_uah_llm, 0.6)

sumo_uah_fixed_100 = preprocessor.fill_synthetic_data(sumo_uah_fixed, 1)
sumo_uah_llm_100 = preprocessor.fill_synthetic_data(sumo_uah_llm, 1)


In [49]:
X_sumo_uah_fixed_20, y_sumo_uah_fixed_20 = preprocessor.sliding_windows(sumo_uah_fixed_20, window_size=window_size, step_size=step_size)
X_sumo_uah_llm_20, y_sumo_uah_llm_20 = preprocessor.sliding_windows(sumo_uah_llm_20, window_size=window_size, step_size=step_size)

X_sumo_uah_fixed_60, y_sumo_uah_fixed_60 = preprocessor.sliding_windows(sumo_uah_fixed_60, window_size=window_size, step_size=step_size)
X_sumo_uah_llm_60, y_sumo_uah_llm_60 = preprocessor.sliding_windows(sumo_uah_llm_60, window_size=window_size, step_size=step_size)

X_sumo_uah_fixed_100, y_sumo_uah_fixed_100 = preprocessor.sliding_windows(sumo_uah_fixed_100, window_size=window_size, step_size=step_size)
X_sumo_uah_llm_100, y_sumo_uah_llm_100 = preprocessor.sliding_windows(sumo_uah_llm_100, window_size=window_size, step_size=step_size)

In [50]:
print("Columns to keep for validation:", sumo_columns_to_keep)

X_val_base, y_val = preprocessor.sliding_windows(uah_validation[sumo_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc', 'angle', 'speed', 'label']


## Real + SUMO (20%)

### Fixed

In [51]:
X_train, y_train = X_sumo_uah_fixed_20, y_sumo_uah_fixed_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [52]:
# Best model: XGBClassifier_{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 700, 'subsample': 0.5}
params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.01,
    'max_depth': None,
    'n_estimators': 700,
    'subsample': 0.5,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'sumo_uah_fixed_20', 'XGBClassifier', results, params)

### LLM

In [53]:
X_train, y_train = X_sumo_uah_llm_20, y_sumo_uah_llm_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [54]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 100}
params = {
    'max_depth': 10,
    'n_estimators': 100,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'sumo_uah_llm_20', 'RFClassifier', results, params)

## Real + SUMO (60%)

### Fixed

In [55]:
X_train, y_train = X_sumo_uah_fixed_60, y_sumo_uah_fixed_60

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [56]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 100}
params = {
    'max_depth': 10,
    'n_estimators': 100,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'sumo_uah_fixed_60', 'RFClassifier', results, params)

### LLM

In [57]:
X_train, y_train = X_sumo_uah_llm_60, y_sumo_uah_llm_60

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [58]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 1000, 'subsample': 0.7}
params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.01,
    'max_depth': None,
    'n_estimators': 1000,
    'subsample': 0.7,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'sumo_uah_llm_60', 'XGBClassifier', results, params)

## Real + SUMO (100%)

### Fixed

In [59]:
X_train, y_train = X_sumo_uah_fixed_100, y_sumo_uah_fixed_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [60]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.5}
params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.01,
    'max_depth': 10,
    'n_estimators': 500,
    'subsample': 0.5,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'sumo_uah_fixed_100', 'XGBClassifier', results, params)

### LLM

In [61]:
X_train, y_train = X_sumo_uah_llm_100, y_sumo_uah_llm_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [62]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 100}
params = {
    'max_depth': 10,
    'n_estimators': 100,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    # Use the same 'RFClassifier' label as every other random-forest entry so
    # the results table does not list one model under two names.
    add_to_dict(PS, 'sumo_uah_llm_100', 'RFClassifier', results, params)

## Real + CARLA

In [63]:
carla_uah_fixed_20 = preprocessor.fill_synthetic_data(carla_uah_fixed, 0.2)
carla_uah_llm_20 = preprocessor.fill_synthetic_data(carla_uah_llm, 0.2)

carla_uah_fixed_60 = preprocessor.fill_synthetic_data(carla_uah_fixed, 0.6)
carla_uah_llm_60 = preprocessor.fill_synthetic_data(carla_uah_llm, 0.6)

carla_uah_fixed_100 = preprocessor.fill_synthetic_data(carla_uah_fixed, 1)
carla_uah_llm_100 = preprocessor.fill_synthetic_data(carla_uah_llm, 1)


In [64]:
X_carla_uah_fixed_20, y_carla_uah_fixed_20 = preprocessor.sliding_windows(carla_uah_fixed_20, window_size=window_size, step_size=step_size)
X_carla_uah_llm_20, y_carla_uah_llm_20 = preprocessor.sliding_windows(carla_uah_llm_20, window_size=window_size, step_size=step_size)

X_carla_uah_fixed_60, y_carla_uah_fixed_60 = preprocessor.sliding_windows(carla_uah_fixed_60, window_size=window_size, step_size=step_size)
X_carla_uah_llm_60, y_carla_uah_llm_60 = preprocessor.sliding_windows(carla_uah_llm_60, window_size=window_size, step_size=step_size)

X_carla_uah_fixed_100, y_carla_uah_fixed_100 = preprocessor.sliding_windows(carla_uah_fixed_100, window_size=window_size, step_size=step_size)
X_carla_uah_llm_100, y_carla_uah_llm_100 = preprocessor.sliding_windows(carla_uah_llm_100, window_size=window_size, step_size=step_size)

In [65]:
print("Columns to keep for validation:", carla_columns_to_keep)

X_val_base, y_val = preprocessor.sliding_windows(uah_validation[carla_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'label', 'acc']


## Real + CARLA (20%)

### Fixed

In [66]:
X_train, y_train = X_carla_uah_fixed_20, y_carla_uah_fixed_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [67]:
# Best model: XGBClassifier_{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 500, 'subsample': 0.5}
params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.01,
    'max_depth': None,
    'n_estimators': 500,
    'subsample': 0.5,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'carla_uah_fixed_20', 'XGBClassifier', results, params)

### LLM

In [68]:
X_train, y_train = X_carla_uah_llm_20, y_carla_uah_llm_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [69]:
# Best model: XGBClassifier_{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.5}
params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.01,
    'max_depth': 5,
    'n_estimators': 500,
    'subsample': 0.5,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'carla_uah_llm_20', 'XGBClassifier', results, params)

## Real + CARLA (60%)

### Fixed

In [70]:
X_train, y_train = X_carla_uah_fixed_60, y_carla_uah_fixed_60

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [71]:
# Best model: RandomForestClassifier_{'max_depth': 50, 'n_estimators': 20}
params = {
    'max_depth': 50,
    'n_estimators': 20,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'carla_uah_fixed_60', 'RFClassifier', results, params)

### LLM

In [72]:
X_train, y_train = X_carla_uah_llm_60, y_carla_uah_llm_60

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [73]:
# Best model: XGBClassifier_{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 1.0}
params = {
    'colsample_bytree': 0.5,
    'learning_rate': 0.01,
    'max_depth': 5,
    'n_estimators': 500,
    'subsample': 1.0,
}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, params)
    add_to_dict(PS, 'carla_uah_llm_60', 'XGBClassifier', results, params)

## Real + CARLA (100%)

### Fixed

In [74]:
X_train, y_train = X_carla_uah_fixed_100, y_carla_uah_fixed_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [75]:
# Best model: RandomForestClassifier_{'max_depth': 20, 'n_estimators': 100}
params = {
    'max_depth': 20,
    'n_estimators': 100,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    add_to_dict(PS, 'carla_uah_fixed_100', 'RFClassifier', results, params)

### LLM

In [76]:
X_train, y_train = X_carla_uah_llm_100, y_carla_uah_llm_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [77]:
# Best model: RandomForestClassifier_{'max_depth': None, 'n_estimators': 500}
params = {
    'max_depth': None,
    'n_estimators': 500,
}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, params)
    # Use the same 'RFClassifier' label as every other random-forest entry so
    # the results table does not list one model under two names.
    add_to_dict(PS, 'carla_uah_llm_100', 'RFClassifier', results, params)

In [78]:
# Per-experiment mean of each metric across trials, keyed like PS.
# NOTE(review): PS_summary is not referenced in the visible part of the
# notebook, and save_results_to_csv computes the same means itself; confirm
# whether a later cell still needs it.
PS_summary = {}
for key, value in PS.items():
    PS_summary[key] = {
        'model': value['model'],
        
        'f1_score': np.mean(value['f1_scores']),
        'accuracy': np.mean(value['accuracy']),
        'recall': np.mean(value['recall']),
        'precision': np.mean(value['precision']),
    }

## Results Table

In [79]:
save_results_to_csv(PS, 'PS')

                     f1_score  accuracy    recall  precision  \
key                                                            
uah                  0.788758  0.789402  0.789402   0.792941   
sumo_uah_fixed_20    0.737091  0.737092  0.737092   0.737096   
sumo_uah_llm_60      0.734102  0.734110  0.734110   0.734142   
sumo_uah_fixed_100   0.732889  0.732891  0.732891   0.732896   
sumo_uah_llm_20      0.731311  0.731739  0.731739   0.733201   
sumo_uah_llm_100     0.730478  0.730926  0.730926   0.732446   
sumo_uah_fixed_60    0.729843  0.730248  0.730248   0.731612   
carla_uah_fixed_20   0.662020  0.668383  0.668383   0.682016   
carla_uah_llm_20     0.658785  0.668790  0.668790   0.691110   
carla_uah_llm_60     0.653733  0.664589  0.664589   0.688077   
carla_uah_fixed_100  0.637813  0.638569  0.638569   0.639780   
carla_uah_llm_100    0.628213  0.628252  0.628252   0.628306   
carla_uah_fixed_60   0.606611  0.607298  0.607298   0.608150   
carla_llm            0.441108  0.478994 