In [89]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

final_results_path = "results/final"

In [90]:
model_order = ["mlp", "cnn", "lstm"]
backend_order = ["jax", "jax-keras", "tf-keras", "torch-keras", "torch"]
precision_order = ["fp32", "fp16", "bf16", "mixed_fp16", "mixed_bf16"]

Before all, a dataframe with all experiments' configuration and CSV paths is defined, for easier data manipulation.

In [91]:
# Dictionary to store the data
rows = []
error_rows = []

# Iterate through the seed folders
for seed in os.listdir(final_results_path):
    seed_path = os.path.join(final_results_path, seed)
    
    # Check that it is a directory
    if os.path.isdir(seed_path):

        # Iterate through the model folders inside each seed
        for folder in os.listdir(seed_path):
            folder_path = os.path.join(seed_path, folder)

            if not os.path.isdir(folder_path):
                continue

            # Extract info from folder name
            # Precision can contain _, so a workaround is needed
            timestamp, backend, model_type, model_complexity, right = folder.split("_", 4)
            precision, _ = right.rsplit("_", 1) # precision, seed

            experiment_info = {
                "timestamp": timestamp,
                "backend": backend,
                "model_type": model_type,
                "model_complexity": model_complexity,
                "precision": precision,
                "seed": seed,
            }

            if "error.txt" in os.listdir(folder_path):
                error_rows.append({
                    **experiment_info,
                    "path": os.path.join(folder_path, "error.txt")
                })
                continue

            # Add csv files' info
            for file in os.listdir(folder_path):
                if file.endswith(".csv"):
                    rows.append({
                        **experiment_info,
                        "filename": os.path.splitext(file)[0],  # Name without extension
                        "path": os.path.join(folder_path, file)
                    })


index_df = pd.DataFrame(rows)
error_df = pd.DataFrame(error_rows)

In [92]:
index_df.head()

Unnamed: 0,timestamp,backend,model_type,model_complexity,precision,seed,filename,path
0,20250724-142036,torch,mlp,simple,fp32,42,train_samples,results/final/42/20250724-142036_torch_mlp_sim...
1,20250724-142036,torch,mlp,simple,fp32,42,test_samples,results/final/42/20250724-142036_torch_mlp_sim...
2,20250724-142036,torch,mlp,simple,fp32,42,global_metrics,results/final/42/20250724-142036_torch_mlp_sim...
3,20250724-142036,torch,mlp,simple,fp32,42,train,results/final/42/20250724-142036_torch_mlp_sim...
4,20250724-142036,torch,mlp,simple,fp32,42,test,results/final/42/20250724-142036_torch_mlp_sim...


In [93]:
error_df.head()

Unnamed: 0,timestamp,backend,model_type,model_complexity,precision,seed,path
0,20250728-145122,torch-keras,lstm,simple,fp16,42,results/final/42/20250728-145122_torch-keras_l...
1,20250729-134409,jax,lstm,simple,bf16,42,results/final/42/20250729-134409_jax_lstm_simp...
2,20250726-152014,tf-keras,lstm,simple,bf16,42,results/final/42/20250726-152014_tf-keras_lstm...
3,20250728-145247,torch-keras,lstm,simple,bf16,42,results/final/42/20250728-145247_torch-keras_l...
4,20250729-134358,jax,lstm,simple,fp16,42,results/final/42/20250729-134358_jax_lstm_simp...


## Statistical analysis ##

In [94]:
experiment_config = ["backend", "model_type", "model_complexity", "precision", "seed"]
rows = []

for keys, subset_df in index_df.groupby(experiment_config):
    backend, model_type, model_complexity, precision, seed = keys

    row = {
        "backend": backend,
        "model_type": model_type,
        "model_complexity": model_complexity,
        "precision": precision,
        "seed": int(seed),
    }



    # --- Global metrics ---
    global_metrics_row = subset_df[subset_df["filename"] == "global_metrics"]
    global_metrics_df = pd.read_csv(global_metrics_row.iloc[0]["path"])
    global_metrics = global_metrics_df[["training_time", "testing_time"]].iloc[0]
    row.update(global_metrics.to_dict())



    # --- Train ---
    train_row = subset_df[subset_df["filename"] == "train"]
    train_df = pd.read_csv(train_row.iloc[0]["path"])

    # Metric can be accuracy or MAE
    try:
        metric_mean = train_df["accuracy"].mean()
        val_metric_mean = train_df["val_accuracy"].mean()
    except KeyError:
        metric_mean = train_df["mae"].mean()
        val_metric_mean = train_df["val_mae"].mean()

    row.update({
        "train_loss_mean": train_df["loss"].mean(),
        "train_metric_mean": metric_mean,
        "train_val_loss_mean": train_df["val_loss"].mean(),
        "train_val_metric_mean": val_metric_mean,
        "train_epoch_time_mean": train_df["epoch_time"].mean()
    })



    # --- Train samples ---
    train_samples_row = subset_df[subset_df["filename"] == "train_samples"]
    train_samples_df = pd.read_csv(train_samples_row.iloc[0]["path"])
    row.update({
        "train_gpu_utilization_mean": train_samples_df["gpu_2_utilization"].mean(),
        "train_gpu_memory_mean": train_samples_df["gpu_2_memory_used"].mean(),
        "train_gpu_power_mean": train_samples_df["gpu_2_power"].mean(),
    })



    # --- Test ---
    test_row = subset_df[subset_df["filename"] == "test"]
    test_df = pd.read_csv(test_row.iloc[0]["path"])

    try:
        metric_mean = train_df["accuracy"].mean()
    except KeyError:
        metric_mean = train_df["mae"].mean()

    row.update({
        "test_loss_mean": test_df["loss"].mean(),
        "test_metric_mean": metric_mean
    })



    # --- Test samples ---
    test_samples_row = subset_df[subset_df["filename"] == "test_samples"]
    test_samples_df = pd.read_csv(test_samples_row.iloc[0]["path"])
    row.update({
        "test_gpu_utilization_mean": test_samples_df["gpu_2_utilization"].mean(),
        "test_gpu_memory_mean": test_samples_df["gpu_2_memory_used"].mean(),
        "test_gpu_power_mean": test_samples_df["gpu_2_power"].mean(),
    })

    rows.append(row)

metrics_df = pd.DataFrame(rows)

In [95]:
metrics_df.head()

Unnamed: 0,backend,model_type,model_complexity,precision,seed,training_time,testing_time,train_loss_mean,train_metric_mean,train_val_loss_mean,train_val_metric_mean,train_epoch_time_mean,train_gpu_utilization_mean,train_gpu_memory_mean,train_gpu_power_mean,test_loss_mean,test_metric_mean,test_gpu_utilization_mean,test_gpu_memory_mean,test_gpu_power_mean
0,jax,cnn,complex,bf16,42,593.557829,3.350694,0.617344,0.785566,1.944375,0.503899,5.900471,69.073254,62093.040992,132.206995,1.25,0.785566,0.0,62093.1875,70.851
1,jax,cnn,complex,bf16,43,588.882573,3.386329,0.618457,0.784676,1.949922,0.497729,5.857713,69.197935,62093.039479,135.142317,1.21875,0.784676,0.0,62093.1875,72.408
2,jax,cnn,complex,bf16,44,642.610249,3.358645,0.653438,0.773325,2.051875,0.479636,6.393019,62.96395,62093.052704,130.33731,1.304688,0.773325,0.0,62093.1875,72.019
3,jax,cnn,complex,fp16,42,610.762836,3.359147,,0.099999,,0.100078,6.076235,66.251656,62098.978891,123.973142,,0.099999,0.0,62099.1875,70.059
4,jax,cnn,complex,fp16,43,606.74308,3.369644,,0.1,,0.100018,6.03571,66.123539,62098.977149,124.766868,,0.1,0.0,62099.1875,69.939


In [96]:
metrics = metrics_df.columns[5:]
seed_variability = []

for metric in metrics:
    for (backend, model_type, model_complexity, precision), df in metrics_df.groupby(experiment_config[:-1]):
        values = df.sort_values("seed")[metric].values
        
        mean = np.mean(values)
        std = np.std(values)
        cv = std / mean if mean != 0 and pd.notna(mean) else 0
        consistent = cv < 0.05  # 5% maximum acceptable variation

        seed_variability.append({
            "backend": backend,
            "model_type": model_type,
            "model_complexity": model_complexity,
            "precision": precision,
            "metric": metric,
            "mean": mean,
            "std": std,
            "cv": cv,
            "consistent": consistent
        })

variability_df = pd.DataFrame(seed_variability)

In [97]:
variability_df.head()

Unnamed: 0,backend,model_type,model_complexity,precision,metric,mean,std,cv,consistent
0,jax,cnn,complex,bf16,training_time,608.350217,24.300574,0.039945,True
1,jax,cnn,complex,fp16,training_time,619.264023,14.955202,0.02415,True
2,jax,cnn,complex,fp32,training_time,599.312418,3.170055,0.005289,True
3,jax,cnn,complex,mixed_bf16,training_time,669.838652,14.836218,0.022149,True
4,jax,cnn,complex,mixed_fp16,training_time,648.072338,12.861021,0.019845,True


In [98]:
num_metrics = len(variability_df)
num_consistent = variability_df["consistent"].sum()
percent = (num_consistent / num_metrics) * 100
print(f"{percent:.2f}% of metrics are consistent between seeds ({num_consistent} out of {num_metrics})")

85.44% of metrics are consistent between seeds (1743 out of 2040)


In [99]:
variability_df[~variability_df["consistent"]].head(10)

Unnamed: 0,backend,model_type,model_complexity,precision,metric,mean,std,cv,consistent
31,jax-keras,cnn,simple,bf16,training_time,215.868621,11.346106,0.05256,False
32,jax-keras,cnn,simple,fp16,training_time,199.535093,15.189703,0.076125,False
34,jax-keras,cnn,simple,mixed_bf16,training_time,205.620268,13.309526,0.064729,False
35,jax-keras,cnn,simple,mixed_fp16,training_time,234.699164,24.4175,0.104037,False
42,jax-keras,mlp,complex,bf16,training_time,171.462941,10.986529,0.064075,False
47,jax-keras,mlp,simple,bf16,training_time,132.751263,8.171771,0.061557,False
49,jax-keras,mlp,simple,fp32,training_time,136.450019,12.353826,0.090537,False
58,tf-keras,cnn,simple,fp16,training_time,141.424864,15.303753,0.108211,False
59,tf-keras,cnn,simple,fp32,training_time,148.756444,10.426279,0.07009,False
60,tf-keras,cnn,simple,mixed_bf16,training_time,156.755038,14.368854,0.091664,False


In [100]:
(
    variability_df[~variability_df["consistent"]]
    .groupby(["backend", "model_type", "model_complexity"])
    .size()
    .reset_index(name="n_inconsistent")
    .sort_values("n_inconsistent", ascending=False)
)

Unnamed: 0,backend,model_type,model_complexity,n_inconsistent
20,torch,lstm,simple,20
26,torch-keras,lstm,simple,19
9,jax-keras,mlp,complex,19
14,tf-keras,lstm,simple,17
10,jax-keras,mlp,simple,15
25,torch-keras,lstm,complex,15
6,jax-keras,cnn,simple,14
27,torch-keras,mlp,complex,13
16,tf-keras,mlp,simple,13
5,jax-keras,cnn,complex,13


In [101]:
(
    variability_df[~variability_df["consistent"]]
    .groupby(["backend"])
    .size()
    .reset_index(name="n_inconsistent")
    .sort_values("n_inconsistent", ascending=False)
)

Unnamed: 0,backend,n_inconsistent
1,jax-keras,81
2,tf-keras,73
4,torch-keras,66
3,torch,55
0,jax,22


In [102]:
(
    variability_df
    .groupby(["backend", "model_type", "model_complexity"])["consistent"]
    .mean()
    .reset_index(name="%_consistent")
    .sort_values("%_consistent")
)

Unnamed: 0,backend,model_type,model_complexity,%_consistent
27,torch-keras,lstm,simple,0.577778
26,torch-keras,lstm,complex,0.666667
15,tf-keras,lstm,simple,0.716667
21,torch,lstm,simple,0.733333
10,jax-keras,mlp,complex,0.746667
8,jax-keras,lstm,complex,0.777778
9,jax-keras,lstm,simple,0.777778
11,jax-keras,mlp,simple,0.8
7,jax-keras,cnn,simple,0.813333
17,tf-keras,mlp,simple,0.826667


In [103]:
(
    variability_df
    .groupby(["backend", "model_type", "model_complexity", "precision"])["consistent"]
    .mean()
    .reset_index(name="%_consistent")
    .sort_values("%_consistent")
    .head(20)
)

Unnamed: 0,backend,model_type,model_complexity,precision,%_consistent
97,torch,lstm,simple,fp32,0.466667
127,torch-keras,mlp,complex,fp16,0.466667
43,jax-keras,mlp,complex,fp16,0.533333
123,torch-keras,lstm,simple,fp32,0.533333
52,tf-keras,cnn,complex,bf16,0.6
26,jax-keras,cnn,complex,bf16,0.6
42,jax-keras,mlp,complex,bf16,0.6
47,jax-keras,mlp,simple,bf16,0.6
124,torch-keras,lstm,simple,mixed_bf16,0.6
95,torch,lstm,simple,bf16,0.6


In [104]:
(
    variability_df
    .groupby(["precision"])["consistent"]
    .mean()
    .reset_index(name="%_consistent")
    .sort_values("%_consistent")
    .head(20)
)

Unnamed: 0,precision,%_consistent
0,bf16,0.793939
4,mixed_fp16,0.86
2,fp32,0.862222
3,mixed_bf16,0.864444
1,fp16,0.880556


In [None]:
summary_df = variability_df.copy()

# column with format "mean ± std"
summary_df["value"] = summary_df.apply(
    lambda row: f"{row['mean']:.3f} ± {row['std']:.3f}", axis=1
)

pivot_table = summary_df.pivot_table(
    index=["model_type", "backend"],
    columns="metric",
    values="value",
    aggfunc="first"
).reset_index()

#pivot_table.to_latex("summary_table.tex", index=False, escape=False)

# Results analysis

## Execution time

In [108]:
time_df = metrics_df[experiment_config + ["training_time", "testing_time"]].copy()

In [109]:
time_df.head()

Unnamed: 0,backend,model_type,model_complexity,precision,seed,training_time,testing_time
0,jax,cnn,complex,bf16,42,593.557829,3.350694
1,jax,cnn,complex,bf16,43,588.882573,3.386329
2,jax,cnn,complex,bf16,44,642.610249,3.358645
3,jax,cnn,complex,fp16,42,610.762836,3.359147
4,jax,cnn,complex,fp16,43,606.74308,3.369644
