This notebook is only for training the models with the best parameters found in `3_model_training`. If you don't need the trained models, you can skip to `4_evaluation`.

In [84]:
import os

import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

os.sys.path.append(os.path.abspath('../src'))
from data import preprocessor

mapname = 'Town01'

In [2]:
def create_and_train_model(model, args, X_train=None, y_train=None, X_val=None, y_val=None):
    """
    Instantiate an estimator, fit it, and score it on a validation split.

    Args:
        model: Estimator class (or any callable) invoked as ``model(**args)``;
            the object it returns must provide ``fit`` and ``predict``.
        args: Keyword arguments forwarded to ``model``.
        X_train, y_train, X_val, y_val: Optional data splits. Any split left
            as ``None`` is read from the notebook-level variable of the same
            name, which is how the two-argument calls in this notebook supply
            their data.

    Returns:
        dict with the keys "f1_score", "accuracy", "recall", "precision" and
        "model" (the fitted estimator). F1, recall and precision use weighted
        averaging.

    Raises:
        KeyError: If a split is neither passed in nor defined at notebook level.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working,
    # while new callers can pass their data explicitly and avoid hidden state.
    notebook_scope = globals()
    if X_train is None:
        X_train = notebook_scope["X_train"]
    if y_train is None:
        y_train = notebook_scope["y_train"]
    if X_val is None:
        X_val = notebook_scope["X_val"]
    if y_val is None:
        y_val = notebook_scope["y_val"]

    # Train model with current hyperparameters
    estimator = model(**args)
    estimator.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = estimator.predict(X_val)

    # Log training results
    return {
        "f1_score": f1_score(y_val, y_pred, average='weighted'),
        "accuracy": accuracy_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred, average='weighted'),
        "precision": precision_score(y_val, y_pred, average='weighted'),
        "model": estimator,
    }

In [3]:
def add_to_dict(dict, key, model, results):
    """
    Record one trial's metrics under ``key``, creating the entry on first use.

    Args:
        dict: Mapping from run key to its accumulated metrics; updated in place.
        key: Identifier of the run whose metric lists are extended.
        model: Label stored for the run when its entry is first created; it is
            not overwritten by later calls with the same key.
        results: Mapping providing the 'f1_score', 'accuracy', 'recall' and
            'precision' values of this trial.

    Returns:
        None
    """
    entry = dict.setdefault(key, {
        'model': model,
        'f1_scores': [],
        'accuracy': [],
        'recall': [],
        'precision': [],
    })

    # (list name inside the entry, metric name inside ``results``); only F1 differs.
    for bucket, metric in (
        ('f1_scores', 'f1_score'),
        ('accuracy', 'accuracy'),
        ('recall', 'recall'),
        ('precision', 'precision'),
    ):
        entry[bucket].append(results[metric])

## Preparing the Data

In [4]:
data_path = '../data'
uah_training = pd.read_csv(f'{data_path}/base/training_set_uah.csv')
uah_validation = pd.read_csv(f'{data_path}/base/validation_set_uah.csv')

carla_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/carla/carla_fixed.csv').drop(columns=['origin'])
carla_llm = pd.read_csv(f'{data_path}/merged/{mapname}/carla/carla_llm.csv').drop(columns=['origin'])

sumo_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/sumo/sumo_fixed.csv').drop(columns=['origin'])
sumo_llm = pd.read_csv(f'{data_path}/merged/{mapname}/sumo/sumo_llm.csv').drop(columns=['origin'])

carla_uah_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/carla_uah/carla_uah_fixed.csv')
carla_uah_llm = pd.read_csv(f'{data_path}/merged/{mapname}/carla_uah/carla_uah_llm.csv')

sumo_uah_fixed = pd.read_csv(f'{data_path}/merged/{mapname}/sumo_uah/sumo_uah_fixed.csv')
sumo_uah_llm = pd.read_csv(f'{data_path}/merged/{mapname}/sumo_uah/sumo_uah_llm.csv')

In [5]:
sumo_columns_to_keep = sumo_fixed.columns.tolist()
carla_columns_to_keep = carla_fixed.columns.tolist()

In [6]:
window_size = 10
step_size = 5
one_hot_keys = {
    'normal': 0,
    'aggressive': 1
}

# Train on Real test on Synthetic

By training a model on real data and testing it on synthetic data, we verify the synthetic data’s fidelity to the real world.

In [7]:
TRTS = {}
n_trials = 8

## SUMO

In [8]:
# Window the UAH training data (SUMO feature columns) with the shared
# window_size/step_size settings instead of repeating the literals 10 and 5.
X_train, y_train = preprocessor.sliding_windows(uah_training[sumo_columns_to_keep], window_size=window_size, step_size=step_size)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys)

### Fixed

In [9]:
# Window the synthetic SUMO (fixed) data with the shared window_size/step_size
# settings so it always matches the windowing of the training split.
X_val, y_val = preprocessor.sliding_windows(sumo_fixed, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [10]:
# Best model: XGBClassifier_{'learning_rate': 0.001, 'max_depth': 10, 'n_estimators': 1000}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'learning_rate': 0.001,
        'max_depth': 10,
        'n_estimators': 1000,
    })
    add_to_dict(TRTS, 'sumo_fixed', 'XGBClassifier', results)

### LLM

In [11]:
# Window the synthetic SUMO (LLM) data with the shared window_size/step_size
# settings so it always matches the windowing of the training split.
X_val, y_val = preprocessor.sliding_windows(sumo_llm, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [12]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 50}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': 10,
        'n_estimators': 50,
    })
    add_to_dict(TRTS, 'sumo_llm', 'RFClassifier', results)

## CARLA

In [13]:
# Window the UAH training data (CARLA feature columns) with the shared
# window_size/step_size settings instead of repeating the literals 10 and 5.
X_train, y_train = preprocessor.sliding_windows(uah_training[carla_columns_to_keep], window_size=window_size, step_size=step_size)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys)

### Fixed

In [14]:
# Window the synthetic CARLA (fixed) data with the shared window_size/step_size
# settings so it always matches the windowing of the training split.
X_val, y_val = preprocessor.sliding_windows(carla_fixed, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [15]:
# Best model: XGBClassifier_{'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 700}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'learning_rate': 0.2,
        'max_depth': 5,
        'n_estimators': 700,
    })
    add_to_dict(TRTS, 'carla_fixed', 'XGBClassifier', results)

### LLM

In [16]:
# Window the synthetic CARLA (LLM) data with the shared window_size/step_size
# settings so it always matches the windowing of the training split.
X_val, y_val = preprocessor.sliding_windows(carla_llm, window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys)

In [17]:
# Best model: XGBClassifier_{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 500}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'learning_rate': 0.3,
        'max_depth': 5,
        'n_estimators': 500,
    })
    add_to_dict(TRTS, 'carla_llm', 'XGBClassifier', results)

## Results table

In [107]:
# open(..., 'w') does not create missing folders, so make sure the output
# directory exists before writing any result file.
os.makedirs('../results/metrics', exist_ok=True)

# One CSV row per TRTS run: mean and standard deviation of each metric across trials.
with open('../results/metrics/TRTS_results.csv', 'w', newline='') as csvfile:
    fieldnames = ['key', 'model', 'f1_score_mean', 'f1_score_std', 'accuracy_mean', 'accuracy_std', 'recall_mean', 'recall_std', 'precision_mean', 'precision_std']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key, value in TRTS.items():
        writer.writerow({
            'key': key,
            'model': value['model'],
            'f1_score_mean': np.mean(value['f1_scores']),
            'f1_score_std': np.std(value['f1_scores']),
            'accuracy_mean': np.mean(value['accuracy']),
            'accuracy_std': np.std(value['accuracy']),
            'recall_mean': np.mean(value['recall']),
            'recall_std': np.std(value['recall']),
            'precision_mean': np.mean(value['precision']),
            'precision_std': np.std(value['precision']),
        })

# Means only, keyed by run, for the ranked summary table.
TRTS_means = {}
for key, value in TRTS.items():
    TRTS_means[key] = {
        'model': value['model'],
        'f1_score': np.mean(value['f1_scores']),
        'accuracy': np.mean(value['accuracy']),
        'recall': np.mean(value['recall']),
        'precision': np.mean(value['precision']),
    }

# Rank runs by mean F1 (best first), save the table, and show it.
TRTS_means_df = pd.DataFrame.from_dict(TRTS_means, orient='index').sort_values(by='f1_score', ascending=False)
TRTS_means_df.index.name = 'key'
TRTS_means_df.to_csv('../results/metrics/TRTS_means.csv')
print(TRTS_means_df, end='\n\n')

# Human-readable "mean ± std" report for every run, in insertion order.
for key, value in TRTS.items():
    print(f"{key}:")
    print(f"  Model: {value['model']}")
    print(f"  F1 Score: {np.mean(value['f1_scores']):.2f} ± {np.std(value['f1_scores']):.2f}")
    print(f"  Accuracy: {np.mean(value['accuracy']):.2f} ± {np.std(value['accuracy']):.2f}")
    print(f"  Recall: {np.mean(value['recall']):.2f} ± {np.std(value['recall']):.2f}")
    print(f"  Precision: {np.mean(value['precision']):.2f} ± {np.std(value['precision']):.2f}\n")


                     model  f1_score  accuracy    recall  precision
key                                                                
carla_fixed  XGBClassifier  0.526135  0.526416  0.526416   0.525869
carla_llm    XGBClassifier  0.519439  0.520105  0.520105   0.528748
sumo_llm      RFClassifier  0.481403  0.521977  0.521977   0.509650
sumo_fixed   XGBClassifier  0.472546  0.496421  0.496421   0.499590

sumo_fixed:
  Model: XGBClassifier
  F1 Score: 0.47 ± 0.00
  Accuracy: 0.50 ± 0.00
  Recall: 0.50 ± 0.00
  Precision: 0.50 ± 0.00

sumo_llm:
  Model: RFClassifier
  F1 Score: 0.48 ± 0.00
  Accuracy: 0.52 ± 0.00
  Recall: 0.52 ± 0.00
  Precision: 0.51 ± 0.00

carla_fixed:
  Model: XGBClassifier
  F1 Score: 0.53 ± 0.00
  Accuracy: 0.53 ± 0.00
  Recall: 0.53 ± 0.00
  Precision: 0.53 ± 0.00

carla_llm:
  Model: XGBClassifier
  F1 Score: 0.52 ± 0.00
  Accuracy: 0.52 ± 0.00
  Recall: 0.52 ± 0.00
  Precision: 0.53 ± 0.00



# Discriminative Score

We seek to understand how well a classifier can separate the real and the synthetic data to determine how indistinguishable they are.

If the classifiers do not get good scores, that means the data is hardly distinguishable.

In [19]:
X_uah_disc_sumo = uah_training[sumo_columns_to_keep].drop(columns=['label'])
X_uah_disc_carla = uah_training[carla_columns_to_keep].drop(columns=['label'])
y_uah_disc = np.zeros(len(X_uah_disc_carla)) # 0 for real data and 1 for synthetic data

## SUMO

In [20]:
X_sumo_disc_fixed, y_sumo_disc_fixed = sumo_fixed.drop(columns=['label']), np.ones(len(sumo_fixed))
X_sumo_disc_llm, y_sumo_disc_llm = sumo_llm.drop(columns=['label']), np.ones(len(sumo_llm))

### Fixed

In [21]:
# Stack the synthetic rows (label 1) on top of the real rows (label 0).
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
X = pd.concat([X_sumo_disc_fixed, X_uah_disc_sumo], axis=0, ignore_index=True)
# np.concatenate exists in every NumPy release; the np.concat alias requires NumPy >= 2.0.
y = np.concatenate([y_sumo_disc_fixed, y_uah_disc], axis=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [22]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 1.0
Accuracy: 1.0


In [23]:
svc = SVC(kernel='rbf')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 0.9973274557248342
Accuracy: 0.9973290383195759


### LLM

In [24]:
# Stack the synthetic rows (label 1) on top of the real rows (label 0).
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
X = pd.concat([X_sumo_disc_llm, X_uah_disc_sumo], axis=0, ignore_index=True)
# np.concatenate exists in every NumPy release; the np.concat alias requires NumPy >= 2.0.
y = np.concatenate([y_sumo_disc_llm, y_uah_disc], axis=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [25]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 1.0
Accuracy: 1.0


In [26]:
svc = SVC(kernel='rbf')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 0.9970845693018687
Accuracy: 0.9970870416738974


## CARLA

In [27]:
X_carla_disc_fixed, y_carla_disc_fixed = carla_fixed.drop(columns=['label']), np.ones(len(carla_fixed))
X_carla_disc_llm, y_carla_disc_llm = carla_llm.drop(columns=['label']), np.ones(len(carla_llm))

### Fixed

In [28]:
# Stack the synthetic rows (label 1) on top of the real rows (label 0).
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
X = pd.concat([X_carla_disc_fixed, X_uah_disc_carla], axis=0, ignore_index=True)
# np.concatenate exists in every NumPy release; the np.concat alias requires NumPy >= 2.0.
y = np.concatenate([y_carla_disc_fixed, y_uah_disc], axis=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [29]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 1.0
Accuracy: 1.0


In [30]:
svc = SVC(kernel='rbf')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 1.0
Accuracy: 1.0


### LLM

In [31]:
# Stack the synthetic rows (label 1) on top of the real rows (label 0).
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
X = pd.concat([X_carla_disc_llm, X_uah_disc_carla], axis=0, ignore_index=True)
# np.concatenate exists in every NumPy release; the np.concat alias requires NumPy >= 2.0.
y = np.concatenate([y_carla_disc_llm, y_uah_disc], axis=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [32]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 1.0
Accuracy: 1.0


In [33]:
svc = SVC(kernel='rbf')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted')}")
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

F1 Score: 1.0
Accuracy: 1.0


# Predictive Scores

Here we train on different combinations of real and synthetic data and test on real data only in order to verify the quality of the data for real-world applications.

In [34]:
PS = {}
n_trials = 8

## Real Only

In [35]:
columns_to_keep = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'speed', 'label']

X_train, y_train = preprocessor.sliding_windows(uah_training[columns_to_keep], window_size=window_size, step_size=step_size)
X_val, y_val = preprocessor.sliding_windows(uah_validation[columns_to_keep], window_size=window_size, step_size=step_size)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

In [36]:
# Best model: SVC_{'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 0.01,
        'gamma': 0.01,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'uah', 'SVC', results)

## SUMO Only

In [37]:
# SUMO
X_sumo_fixed, y_sumo_fixed = preprocessor.sliding_windows(sumo_fixed, window_size=window_size, step_size=step_size)
X_sumo_llm, y_sumo_llm = preprocessor.sliding_windows(sumo_llm, window_size=window_size, step_size=step_size)

In [38]:
# Getting the validation for SUMO variables
print("Columns to keep for validation:", sumo_columns_to_keep)

X_val_base, y_val = preprocessor.sliding_windows(uah_validation[sumo_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc', 'angle', 'speed', 'gyro_z', 'label']


### Fixed

In [39]:
X_train, y_train = X_sumo_fixed, y_sumo_fixed

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [40]:
# Best model: RandomForestClassifier_{'max_depth': 20, 'n_estimators': 20}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': 20,
        'n_estimators': 20,
    })
    add_to_dict(PS, 'sumo_fixed', 'RFClassifier', results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### LLM

In [41]:
X_train, y_train = X_sumo_llm, y_sumo_llm

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [42]:
# Best model: XGBClassifier_{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 200}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'learning_rate': 0.5,
        'max_depth': 5,
        'n_estimators': 200,
    })
    add_to_dict(PS, 'sumo_llm', 'XGBClassifier', results)

## Carla Only

In [43]:
X_carla_fixed, y_carla_fixed = preprocessor.sliding_windows(carla_fixed, window_size=window_size, step_size=step_size)
X_carla_llm, y_carla_llm = preprocessor.sliding_windows(carla_llm, window_size=window_size, step_size=step_size)

In [44]:
# Getting the validation windows for the CARLA variables: the UAH validation set
# is restricted to the CARLA column list before windowing.
print("Columns to keep for validation:", carla_columns_to_keep)
X_val_base, y_val = preprocessor.sliding_windows(uah_validation[carla_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'label', 'acc']


### Fixed

In [45]:
X_train, y_train = X_carla_fixed, y_carla_fixed

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [46]:
# Best model: RandomForestClassifier_{'max_depth': 10, 'n_estimators': 20}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': 10,
        'n_estimators': 20,
    })
    add_to_dict(PS, 'carla_fixed', 'RFClassifier', results)

### LLM

In [47]:
X_train, y_train = X_carla_llm, y_carla_llm

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [48]:
# Best model: RandomForestClassifier_{'max_depth': None, 'n_estimators': 500}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': None,
        'n_estimators': 500,
    })
    add_to_dict(PS, 'carla_llm', 'RFClassifier', results)

## Real + SUMO

In [49]:
sumo_uah_fixed_20 = preprocessor.fill_synthetic_data(sumo_uah_fixed, 0.2)
sumo_uah_llm_20 = preprocessor.fill_synthetic_data(sumo_uah_llm, 0.2)

sumo_uah_fixed_100 = preprocessor.fill_synthetic_data(sumo_uah_fixed, 1)
sumo_uah_llm_100 = preprocessor.fill_synthetic_data(sumo_uah_llm, 1)

sumo_uah_fixed_200 = preprocessor.fill_synthetic_data(sumo_uah_fixed, 2)
sumo_uah_llm_200 = preprocessor.fill_synthetic_data(sumo_uah_llm, 2)

In [50]:
X_sumo_uah_fixed_20, y_sumo_uah_fixed_20 = preprocessor.sliding_windows(sumo_uah_fixed_20, window_size=window_size, step_size=step_size)
X_sumo_uah_llm_20, y_sumo_uah_llm_20 = preprocessor.sliding_windows(sumo_uah_llm_20, window_size=window_size, step_size=step_size)

X_sumo_uah_fixed_100, y_sumo_uah_fixed_100 = preprocessor.sliding_windows(sumo_uah_fixed_100, window_size=window_size, step_size=step_size)
X_sumo_uah_llm_100, y_sumo_uah_llm_100 = preprocessor.sliding_windows(sumo_uah_llm_100, window_size=window_size, step_size=step_size)

X_sumo_uah_fixed_200, y_sumo_uah_fixed_200 = preprocessor.sliding_windows(sumo_uah_fixed_200, window_size=window_size, step_size=step_size)
X_sumo_uah_llm_200, y_sumo_uah_llm_200 = preprocessor.sliding_windows(sumo_uah_llm_200, window_size=window_size, step_size=step_size)

In [51]:
print("Columns to keep for validation:", sumo_columns_to_keep)

X_val_base, y_val = preprocessor.sliding_windows(uah_validation[sumo_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc', 'angle', 'speed', 'gyro_z', 'label']


## Real + SUMO (20%)

### Fixed

In [52]:
X_train, y_train = X_sumo_uah_fixed_20, y_sumo_uah_fixed_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [53]:
# Best model: SVC_{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 0.1,
        'gamma': 0.01,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'sumo_uah_fixed_20', 'SVC', results)

### LLM

In [54]:
X_train, y_train = X_sumo_uah_llm_20, y_sumo_uah_llm_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [55]:
# Best model: SVC_{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 0.1,
        'gamma': 0.1,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'sumo_uah_llm_20', 'SVC', results)

## Real + SUMO (100%)

### Fixed

In [56]:
X_train, y_train = X_sumo_uah_fixed_100, y_sumo_uah_fixed_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [57]:
# Best model: SVC_{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 1,
        'gamma': 0.001,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'sumo_uah_fixed_100', 'SVC', results)

### LLM

In [58]:
X_train, y_train = X_sumo_uah_llm_100, y_sumo_uah_llm_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [59]:
# Best model: SVC_{'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 0.01,
        'gamma': 0.1,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'sumo_uah_llm_100', 'SVC', results)

## Real + SUMO (200%)

### Fixed

In [60]:
X_train, y_train = X_sumo_uah_fixed_200, y_sumo_uah_fixed_200

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [61]:
# Best model: SVC_{'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 0.01,
        'gamma': 0.01,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'sumo_uah_fixed_200', 'SVC', results)

### LLM

In [62]:
X_train, y_train = X_sumo_uah_llm_200, y_sumo_uah_llm_200

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [63]:
# Best model: SVC_{'C': 0.5, 'gamma': 0.001, 'kernel': 'rbf'}
for i in range(n_trials):
    results = create_and_train_model(SVC, {
        'C': 0.5,
        'gamma': 0.001,
        'kernel': 'rbf',
    })
    add_to_dict(PS, 'sumo_uah_llm_200', 'SVC', results)

## Real + CARLA

In [64]:
# Build the CARLA + UAH training mixes. The second argument is passed straight to
# preprocessor.fill_synthetic_data (presumably the synthetic-to-real ratio — TODO confirm).
carla_uah_fixed_20 = preprocessor.fill_synthetic_data(carla_uah_fixed, 0.2)
carla_uah_llm_20 = preprocessor.fill_synthetic_data(carla_uah_llm, 0.2)

# NOTE(review): these are named *_100 but are built with 0.5, whereas the SUMO
# counterparts (sumo_uah_*_100) are built with 1 — confirm that 0.5 is intentional.
carla_uah_fixed_100 = preprocessor.fill_synthetic_data(carla_uah_fixed, 0.5)
carla_uah_llm_100 = preprocessor.fill_synthetic_data(carla_uah_llm, 0.5)

# NOTE(review): these are named *_200 but are built with 0.7, whereas the SUMO
# counterparts (sumo_uah_*_200) are built with 2 — confirm that 0.7 is intentional.
carla_uah_fixed_200 = preprocessor.fill_synthetic_data(carla_uah_fixed, 0.7)
carla_uah_llm_200 = preprocessor.fill_synthetic_data(carla_uah_llm, 0.7)

In [65]:
X_carla_uah_fixed_20, y_carla_uah_fixed_20 = preprocessor.sliding_windows(carla_uah_fixed_20, window_size=window_size, step_size=step_size)
X_carla_uah_llm_20, y_carla_uah_llm_20 = preprocessor.sliding_windows(carla_uah_llm_20, window_size=window_size, step_size=step_size)

X_carla_uah_fixed_100, y_carla_uah_fixed_100 = preprocessor.sliding_windows(carla_uah_fixed_100, window_size=window_size, step_size=step_size)
X_carla_uah_llm_100, y_carla_uah_llm_100 = preprocessor.sliding_windows(carla_uah_llm_100, window_size=window_size, step_size=step_size)

X_carla_uah_fixed_200, y_carla_uah_fixed_200 = preprocessor.sliding_windows(carla_uah_fixed_200, window_size=window_size, step_size=step_size)
X_carla_uah_llm_200, y_carla_uah_llm_200 = preprocessor.sliding_windows(carla_uah_llm_200, window_size=window_size, step_size=step_size)

In [66]:
print("Columns to keep for validation:", carla_columns_to_keep)

X_val_base, y_val = preprocessor.sliding_windows(uah_validation[carla_columns_to_keep], window_size=window_size, step_size=step_size)
y_val = preprocessor.one_hot_encode(y_val, one_hot_keys=one_hot_keys)

Columns to keep for validation: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'angle', 'label', 'acc']


## Real + CARLA (20%)

### Fixed

In [67]:
X_train, y_train = X_carla_uah_fixed_20, y_carla_uah_fixed_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [68]:
# Best model: RandomForestClassifier_{'max_depth': 20, 'n_estimators': 20}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': 20,
        'n_estimators': 20,
    })
    add_to_dict(PS, 'carla_uah_fixed_20', 'RFClassifier', results)

### LLM

In [69]:
X_train, y_train = X_carla_uah_llm_20, y_carla_uah_llm_20

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

# TODO: TRAIN MODEL

In [70]:
# Best model: RandomForestClassifier_{'max_depth': 20, 'n_estimators': 50}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': 20,
        'n_estimators': 50,
    })
    add_to_dict(PS, 'carla_uah_llm_20', 'RFClassifier', results)

## Real + CARLA (100%)

### Fixed

In [71]:
X_train, y_train = X_carla_uah_fixed_100, y_carla_uah_fixed_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [72]:
# Best model: XGBClassifier_{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'colsample_bytree': 0.7,
        'learning_rate': 0.1,
        'max_depth': 5,
        'n_estimators': 200,
        'subsample': 0.7,
    })
    add_to_dict(PS, 'carla_uah_fixed_100', 'XGBClassifier', results)

### LLM

In [73]:
X_train, y_train = X_carla_uah_llm_100, y_carla_uah_llm_100

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

# TODO: TRAIN MODEL

In [74]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.7}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'colsample_bytree': 1.0,
        'learning_rate': 0.3,
        'max_depth': 5,
        'n_estimators': 500,
        'subsample': 0.7,
    })
    add_to_dict(PS, 'carla_uah_llm_100', 'XGBClassifier', results)

## Real + CARLA (200%)

### Fixed

In [75]:
X_train, y_train = X_carla_uah_fixed_200, y_carla_uah_fixed_200

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [76]:
# Best model: XGBClassifier_{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
for i in range(n_trials):
    results = create_and_train_model(xgb.XGBClassifier, {
        'colsample_bytree': 1.0,
        'learning_rate': 0.1,
        'max_depth': 5,
        'n_estimators': 200,
        'subsample': 1.0,
    })
    add_to_dict(PS, 'carla_uah_fixed_200', 'XGBClassifier', results)

### LLM

In [77]:
X_train, y_train = X_carla_uah_llm_200, y_carla_uah_llm_200

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val_base)
y_train = preprocessor.one_hot_encode(y_train, one_hot_keys=one_hot_keys)

In [78]:
# Best model: RandomForestClassifier_{'max_depth': 50, 'n_estimators': 20}
for i in range(n_trials):
    results = create_and_train_model(RandomForestClassifier, {
        'max_depth': 50,
        'n_estimators': 20,
    })
    # Use the same 'RFClassifier' label as every other random-forest run so the
    # 'model' column of the results tables stays consistent.
    add_to_dict(PS, 'carla_uah_llm_200', 'RFClassifier', results)

In [None]:
# Per-run model label plus the mean of each metric across trials.
# NOTE(review): PS_summary is not referenced by any later cell visible here, and this
# cell has no execution count — confirm whether it is still needed.
PS_summary = {}
for key, value in PS.items():
    PS_summary[key] = {
        'model': value['model'],
        
        'f1_score': np.mean(value['f1_scores']),
        'accuracy': np.mean(value['accuracy']),
        'recall': np.mean(value['recall']),
        'precision': np.mean(value['precision']),
    }

## Results Table

In [106]:
# open(..., 'w') does not create missing folders, so make sure the output
# directory exists before writing any result file.
os.makedirs('../results/metrics', exist_ok=True)

# One CSV row per predictive-score run: mean and standard deviation of each metric across trials.
with open('../results/metrics/PS_results.csv', 'w', newline='') as csvfile:
    fieldnames = ['key', 'model', 'f1_score_mean', 'f1_score_std', 'accuracy_mean', 'accuracy_std', 'recall_mean', 'recall_std', 'precision_mean', 'precision_std']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for key, value in PS.items():
        writer.writerow({
            'key': key,
            'model': value['model'],
            'f1_score_mean': np.mean(value['f1_scores']),
            'f1_score_std': np.std(value['f1_scores']),
            'accuracy_mean': np.mean(value['accuracy']),
            'accuracy_std': np.std(value['accuracy']),
            'recall_mean': np.mean(value['recall']),
            'recall_std': np.std(value['recall']),
            'precision_mean': np.mean(value['precision']),
            'precision_std': np.std(value['precision']),
        })

# Means only, keyed by run, for the ranked summary table.
PS_means = {}
for key, value in PS.items():
    PS_means[key] = {
        'f1_score': np.mean(value['f1_scores']),
        'accuracy': np.mean(value['accuracy']),
        'recall': np.mean(value['recall']),
        'precision': np.mean(value['precision']),
    }
# Rank runs by mean F1 (best first), save the table, and show it.
PS_means_df = pd.DataFrame.from_dict(PS_means, orient='index').sort_values(by='f1_score', ascending=False)
PS_means_df.index.name = 'key'
PS_means_df.to_csv('../results/metrics/PS_means.csv')
print(PS_means_df, end='\n\n')

# Human-readable "mean ± std" report for every run, in insertion order.
for key, value in PS.items():
    print(f"{key}:")
    print(f"  Model: {value['model']}")
    print(f"  F1 Score: {np.mean(value['f1_scores']):.2f} ± {np.std(value['f1_scores']):.2f}")
    print(f"  Accuracy: {np.mean(value['accuracy']):.2f} ± {np.std(value['accuracy']):.2f}")
    print(f"  Recall: {np.mean(value['recall']):.2f} ± {np.std(value['recall']):.2f}")
    print(f"  Precision: {np.mean(value['precision']):.2f} ± {np.std(value['precision']):.2f}\n")


                     f1_score  accuracy    recall  precision
key                                                         
uah                  0.750379  0.755116  0.755116   0.775980
sumo_uah_fixed_100   0.715867  0.719203  0.719203   0.729937
sumo_uah_llm_100     0.710140  0.712427  0.712427   0.719298
sumo_uah_fixed_20    0.708782  0.712427  0.712427   0.723554
sumo_uah_llm_200     0.708712  0.713782  0.713782   0.729698
sumo_uah_llm_20      0.700365  0.700366  0.700366   0.700370
sumo_uah_fixed_200   0.699262  0.707955  0.707955   0.735030
carla_uah_fixed_200  0.661489  0.666486  0.666486   0.676863
carla_uah_fixed_100  0.651891  0.655238  0.655238   0.661392
carla_uah_llm_20     0.637119  0.638095  0.638095   0.639684
carla_uah_llm_100    0.630743  0.632199  0.632199   0.634287
carla_uah_fixed_20   0.621431  0.622391  0.622391   0.623691
carla_uah_llm_200    0.614830  0.615852  0.615852   0.617238
sumo_llm             0.574871  0.593305  0.593305   0.612761
carla_fixed          0.5