In [18]:
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from collections import defaultdict

In [19]:
# Configuration - You can modify these variables
data_dir = r"C:\Users\weiha\Desktop\benchmark_result"  # Base directory containing results
pipeline_list = ["supervised"]  # List of pipelines to analyze

task_name_list =["MotionSourceRecognitionUpdated","BreathingDetection_Subset","Localization","FallDetection","ProximityRecognition","HumanActivityRecognition","HumanIdentification"]
# task_name_list = ["BreathingDetection_Subset"]  # List of tasks to analyze
# task_name_list =["ProximityRecognition","HumanActivityRecognition","HumanIdentification"]

model_name_list = ["mlp", "lstm", "resnet18", "transformer", "vit", "patchtst", "timesformer1d"] # List of models to analyze
# model_name_list = ["transformer"]

# Initialize an empty list to store results
results_data = []

# Iterate through all combinations
for pipeline in pipeline_list:
    for task_name in task_name_list:
        for model_name in model_name_list:
            # Find all experiment folders for the current combination
            exp_pattern = os.path.join(data_dir, pipeline, task_name, model_name, "params_*")
            exp_folders = glob(exp_pattern)
            
            print(f"Found {len(exp_folders)} experiments for {pipeline}/{task_name}/{model_name}")
            
            # Process each experiment
            for exp_folder in tqdm(exp_folders, desc=f"Processing {pipeline}/{task_name}/{model_name}"):
                # Extract experiment ID
                exp_id = os.path.basename(exp_folder)
                
                # Define paths for config and results files
                config_filename = f"{model_name}_{task_name}_config.json"
                results_filename = f"{model_name}_{task_name}_results.json"

                if pipeline == 'multitask':
                    results_filename = f"{model_name}_{task_name}_test_results.json"
                
                config_path = os.path.join(exp_folder, config_filename)
                results_path = os.path.join(exp_folder, results_filename)
                
                # Skip if either file doesn't exist
                if not os.path.exists(config_path) or not os.path.exists(results_path):
                    print(f"Missing files for {exp_folder}, skipping")
                    continue
                
                # Read config file
                try:
                    with open(config_path, 'r') as f:
                        config_data = json.load(f)
                    
                    # Extract required fields from config
                    learning_rate = config_data.get('learning_rate')
                    weight_decay = config_data.get('weight_decay')
                    seed = config_data.get('seed')
                    batch_size = config_data.get('batch_size')
                    
                    if pipeline == 'multitask':
                        seed =2
                    # Skip if seed is not available (old version)
                    if seed is None:
                        print(f"Seed not available in {exp_folder}, skipping")
                        continue
                        
                except Exception as e:
                    print(f"Error reading config file {config_path}: {e}")
                    continue
                
                # Read results file
                try:
                    with open(results_path, 'r') as f:
                        results_data_json = json.load(f)
                except Exception as e:
                    print(f"Error reading results file {results_path}: {e}")
                    continue
                
                # Create a dictionary for the current experiment
                exp_result = {
                    'pipeline': pipeline,
                    'task_name': task_name,
                    'model_name': model_name,
                    'experiment_id': exp_id,
                    'learning_rate': learning_rate,
                    'weight_decay': weight_decay,
                    'batch_size':batch_size,
                    'seed': seed,
                }
                
                # Extract test results for all test sets
                for test_name, test_results in results_data_json.items():
                    if isinstance(test_results, dict):
                        for metric_name, metric_value in test_results.items():
                            column_name = f"{test_name}_{metric_name}"
                            exp_result[column_name] = metric_value
                
                # Add to results list
                results_data.append(exp_result)


results_df = pd.DataFrame(results_data)


for task in task_name_list:
    for model in model_name_list:
        task_model_df = results_df[(results_df['task_name'] == task) & (results_df['model_name'] == model)]
        if not task_model_df.empty:
            print(f"\n{task} - {model} (count: {len(task_model_df)})")
            
            # Find metrics columns
            metric_columns = [col for col in task_model_df.columns if any(col.endswith(m) for m in ['_loss', '_accuracy', '_f1'])]
            
            if metric_columns:
                avg_metrics = task_model_df[metric_columns].mean()
                for metric, value in avg_metrics.items():
                    print(f"  {metric}: {value:.4f}")

if pipeline_list == ["supervised"]:
    results_df = results_df[results_df["batch_size"] == 128].reset_index(drop=True)



Found 4 experiments for supervised/MotionSourceRecognitionUpdated/mlp


Processing supervised/MotionSourceRecognitionUpdated/mlp: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 485.45it/s]


Found 4 experiments for supervised/MotionSourceRecognitionUpdated/lstm


Processing supervised/MotionSourceRecognitionUpdated/lstm: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 663.21it/s]


Found 4 experiments for supervised/MotionSourceRecognitionUpdated/resnet18


Processing supervised/MotionSourceRecognitionUpdated/resnet18: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 851.38it/s]


Found 4 experiments for supervised/MotionSourceRecognitionUpdated/transformer


Processing supervised/MotionSourceRecognitionUpdated/transformer: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 832.24it/s]


Found 4 experiments for supervised/MotionSourceRecognitionUpdated/vit


Processing supervised/MotionSourceRecognitionUpdated/vit: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 945.57it/s]


Found 4 experiments for supervised/MotionSourceRecognitionUpdated/patchtst


Processing supervised/MotionSourceRecognitionUpdated/patchtst: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 736.62it/s]


Found 4 experiments for supervised/MotionSourceRecognitionUpdated/timesformer1d


Processing supervised/MotionSourceRecognitionUpdated/timesformer1d: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 707.36it/s]


Found 5 experiments for supervised/BreathingDetection_Subset/mlp


Processing supervised/BreathingDetection_Subset/mlp: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 47.62it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\mlp\params_e82c47fd, skipping
Found 5 experiments for supervised/BreathingDetection_Subset/lstm


Processing supervised/BreathingDetection_Subset/lstm:   0%|                                                                                                                                  | 0/5 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\lstm\params_103a2ab2, skipping


Processing supervised/BreathingDetection_Subset/lstm: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 52.72it/s]


Found 5 experiments for supervised/BreathingDetection_Subset/resnet18


Processing supervised/BreathingDetection_Subset/resnet18: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 67.53it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\resnet18\params_f3993c61, skipping
Found 5 experiments for supervised/BreathingDetection_Subset/transformer


Processing supervised/BreathingDetection_Subset/transformer: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 60.94it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\transformer\params_da0536fb, skipping
Found 5 experiments for supervised/BreathingDetection_Subset/vit


Processing supervised/BreathingDetection_Subset/vit:   0%|                                                                                                                                   | 0/5 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\vit\params_7d6cf50b, skipping


Processing supervised/BreathingDetection_Subset/vit: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 58.94it/s]


Found 5 experiments for supervised/BreathingDetection_Subset/patchtst


Processing supervised/BreathingDetection_Subset/patchtst:   0%|                                                                                                                              | 0/5 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\patchtst\params_2ac42674, skipping


Processing supervised/BreathingDetection_Subset/patchtst: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 56.09it/s]


Found 5 experiments for supervised/BreathingDetection_Subset/timesformer1d


Processing supervised/BreathingDetection_Subset/timesformer1d: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 57.61it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\BreathingDetection_Subset\timesformer1d\params_eaf7d314, skipping
Found 7 experiments for supervised/Localization/mlp


Processing supervised/Localization/mlp:   0%|                                                                                                                                                | 0/7 [00:00<?, ?it/s]

Seed not available in C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\mlp\params_5f8b7681, skipping


Processing supervised/Localization/mlp:  86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 6/7 [00:00<00:00, 57.91it/s]

Seed not available in C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\mlp\params_c21f4b60, skipping


Processing supervised/Localization/mlp: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 54.74it/s]


Found 7 experiments for supervised/Localization/lstm


Processing supervised/Localization/lstm:   0%|                                                                                                                                               | 0/7 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\lstm\params_1b0e1c8b, skipping
Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\lstm\params_58042d4f, skipping


Processing supervised/Localization/lstm: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 75.91it/s]


Found 7 experiments for supervised/Localization/resnet18


Processing supervised/Localization/resnet18:   0%|                                                                                                                                           | 0/7 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\resnet18\params_0637f687, skipping
Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\resnet18\params_134c9969, skipping


Processing supervised/Localization/resnet18: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 61.43it/s]


Found 7 experiments for supervised/Localization/transformer


Processing supervised/Localization/transformer:   0%|                                                                                                                                        | 0/7 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\transformer\params_42f9b535, skipping
Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\transformer\params_9761304b, skipping


Processing supervised/Localization/transformer: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 77.80it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\transformer\params_d4cd0e38, skipping
Found 7 experiments for supervised/Localization/vit


Processing supervised/Localization/vit: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 81.40it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\vit\params_04696cd8, skipping
Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\vit\params_c90efad6, skipping
Found 7 experiments for supervised/Localization/patchtst


Processing supervised/Localization/patchtst:   0%|                                                                                                                                           | 0/7 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\patchtst\params_451e1069, skipping


Processing supervised/Localization/patchtst: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 74.35it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\patchtst\params_c52077c2, skipping
Found 7 experiments for supervised/Localization/timesformer1d


Processing supervised/Localization/timesformer1d: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 81.31it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\timesformer1d\params_8fae46c8, skipping
Missing files for C:\Users\weiha\Desktop\benchmark_result\supervised\Localization\timesformer1d\params_ad406c14, skipping
Found 4 experiments for supervised/FallDetection/mlp


Processing supervised/FallDetection/mlp: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 64.79it/s]


Found 4 experiments for supervised/FallDetection/lstm


Processing supervised/FallDetection/lstm: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 58.10it/s]


Found 4 experiments for supervised/FallDetection/resnet18


Processing supervised/FallDetection/resnet18: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 64.44it/s]


Found 4 experiments for supervised/FallDetection/transformer


Processing supervised/FallDetection/transformer: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.96it/s]


Found 4 experiments for supervised/FallDetection/vit


Processing supervised/FallDetection/vit: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 66.09it/s]


Found 4 experiments for supervised/FallDetection/patchtst


Processing supervised/FallDetection/patchtst: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 46.84it/s]


Found 4 experiments for supervised/FallDetection/timesformer1d


Processing supervised/FallDetection/timesformer1d: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 47.56it/s]


Found 4 experiments for supervised/ProximityRecognition/mlp


Processing supervised/ProximityRecognition/mlp: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 59.29it/s]


Found 4 experiments for supervised/ProximityRecognition/lstm


Processing supervised/ProximityRecognition/lstm: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 54.45it/s]


Found 4 experiments for supervised/ProximityRecognition/resnet18


Processing supervised/ProximityRecognition/resnet18: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 52.41it/s]


Found 4 experiments for supervised/ProximityRecognition/transformer


Processing supervised/ProximityRecognition/transformer: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 57.09it/s]


Found 4 experiments for supervised/ProximityRecognition/vit


Processing supervised/ProximityRecognition/vit: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.18it/s]


Found 4 experiments for supervised/ProximityRecognition/patchtst


Processing supervised/ProximityRecognition/patchtst: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.91it/s]


Found 4 experiments for supervised/ProximityRecognition/timesformer1d


Processing supervised/ProximityRecognition/timesformer1d: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 47.09it/s]


Found 4 experiments for supervised/HumanActivityRecognition/mlp


Processing supervised/HumanActivityRecognition/mlp: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 56.62it/s]


Found 4 experiments for supervised/HumanActivityRecognition/lstm


Processing supervised/HumanActivityRecognition/lstm: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 49.89it/s]


Found 4 experiments for supervised/HumanActivityRecognition/resnet18


Processing supervised/HumanActivityRecognition/resnet18: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 54.48it/s]


Found 4 experiments for supervised/HumanActivityRecognition/transformer


Processing supervised/HumanActivityRecognition/transformer: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.12it/s]


Found 4 experiments for supervised/HumanActivityRecognition/vit


Processing supervised/HumanActivityRecognition/vit: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.35it/s]


Found 4 experiments for supervised/HumanActivityRecognition/patchtst


Processing supervised/HumanActivityRecognition/patchtst: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 49.01it/s]


Found 4 experiments for supervised/HumanActivityRecognition/timesformer1d


Processing supervised/HumanActivityRecognition/timesformer1d: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 49.85it/s]


Found 4 experiments for supervised/HumanIdentification/mlp


Processing supervised/HumanIdentification/mlp: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.17it/s]


Found 4 experiments for supervised/HumanIdentification/lstm


Processing supervised/HumanIdentification/lstm: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.25it/s]


Found 4 experiments for supervised/HumanIdentification/resnet18


Processing supervised/HumanIdentification/resnet18: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 58.85it/s]


Found 4 experiments for supervised/HumanIdentification/transformer


Processing supervised/HumanIdentification/transformer: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 63.71it/s]


Found 4 experiments for supervised/HumanIdentification/vit


Processing supervised/HumanIdentification/vit: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 56.01it/s]


Found 4 experiments for supervised/HumanIdentification/patchtst


Processing supervised/HumanIdentification/patchtst: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 49.15it/s]


Found 4 experiments for supervised/HumanIdentification/timesformer1d


Processing supervised/HumanIdentification/timesformer1d: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.32it/s]



MotionSourceRecognitionUpdated - mlp (count: 4)
  test_hard_loss: 0.0501
  test_hard_accuracy: 0.9819
  test_loss: 0.0337
  test_accuracy: 0.9884
  test_easy_loss: 0.0409
  test_easy_accuracy: 0.9821
  test_medium_loss: 0.0274
  test_medium_accuracy: 0.9913
  test_medium_id_loss: nan
  test_medium_id_accuracy: nan
  test_hard_id_loss: nan
  test_hard_id_accuracy: nan
  test_easy_id_loss: nan
  test_easy_id_accuracy: nan
  test_cross_device_loss: nan
  test_cross_device_accuracy: nan
  test_cross_env_loss: nan
  test_cross_env_accuracy: nan
  test_cross_user_loss: nan
  test_cross_user_accuracy: nan

MotionSourceRecognitionUpdated - lstm (count: 4)
  test_hard_loss: 0.1016
  test_hard_accuracy: 0.9694
  test_loss: 0.0670
  test_accuracy: 0.9818
  test_easy_loss: 0.1188
  test_easy_accuracy: 0.9665
  test_medium_loss: 0.0488
  test_medium_accuracy: 0.9879
  test_medium_id_loss: nan
  test_medium_id_accuracy: nan
  test_hard_id_loss: nan
  test_hard_id_accuracy: nan
  test_easy_id_loss: 

In [20]:
# results_df[["task_name","model_name","test_accuracy", "test_f1_score"]]
# results_df[results_df['task_name'] == 'BreathingDetection_Subset']
results_df

Unnamed: 0,pipeline,task_name,model_name,experiment_id,learning_rate,weight_decay,batch_size,seed,test_hard_loss,test_hard_accuracy,...,test_easy_id_f1_score,test_cross_device_loss,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_loss,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_loss,test_cross_user_accuracy,test_cross_user_f1_score
0,supervised,MotionSourceRecognitionUpdated,mlp,params_3eba2706,0.001,0.00001,128,11,0.057538,0.980539,...,,,,,,,,,,
1,supervised,MotionSourceRecognitionUpdated,mlp,params_a56bbd55,0.001,0.00001,128,3,0.042137,0.984032,...,,,,,,,,,,
2,supervised,MotionSourceRecognitionUpdated,mlp,params_d4700539,0.001,0.00001,128,99,0.053438,0.977545,...,,,,,,,,,,
3,supervised,MotionSourceRecognitionUpdated,mlp,params_fc13ed90,0.001,0.00001,128,42,0.047192,0.985529,...,,,,,,,,,,
4,supervised,MotionSourceRecognitionUpdated,lstm,params_58ba7f06,0.001,0.00001,128,99,0.099868,0.970559,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,supervised,HumanIdentification,patchtst,params_899208c2,0.001,0.00001,128,3,,,...,,5.127985,0.265797,0.256079,,,,,,
192,supervised,HumanIdentification,timesformer1d,params_609f17ff,0.001,0.00001,128,3,,,...,,5.871156,0.338361,0.337961,,,,,,
193,supervised,HumanIdentification,timesformer1d,params_87273be9,0.001,0.00001,128,42,,,...,,4.506577,0.313901,0.310916,,,,,,
194,supervised,HumanIdentification,timesformer1d,params_bd5d911b,0.001,0.00001,128,99,,,...,,4.745084,0.215654,0.214990,,,,,,


In [21]:
def merge_id_columns(df, drop_id_cols=True):
    merge_targets = ['easy', 'medium', 'hard']
    metrics = ['loss', 'accuracy', 'f1_score']
    id_cols_to_drop = []

    for target in merge_targets:
        for metric in metrics:
            base_col = f'test_{target}_{metric}'
            id_col = f'test_{target}_id_{metric}'

            if base_col in df.columns and id_col in df.columns:
                df[base_col] = df[base_col].combine_first(df[id_col])
                if drop_id_cols:
                    id_cols_to_drop.append(id_col)

    if drop_id_cols:
        df.drop(columns=id_cols_to_drop, inplace=True)

    return df

In [22]:
results_df = merge_id_columns(results_df)

In [23]:
# Save results to CSV
output_path = "all_results_summary.csv"
results_df.to_csv(output_path, index=False)
print(f"\nResults saved to {output_path}")


Results saved to all_results_summary.csv


In [24]:
results_df

Unnamed: 0,pipeline,task_name,model_name,experiment_id,learning_rate,weight_decay,batch_size,seed,test_hard_loss,test_hard_accuracy,...,test_medium_f1_score,test_cross_device_loss,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_loss,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_loss,test_cross_user_accuracy,test_cross_user_f1_score
0,supervised,MotionSourceRecognitionUpdated,mlp,params_3eba2706,0.001,0.00001,128,11,0.057538,0.980539,...,0.992819,,,,,,,,,
1,supervised,MotionSourceRecognitionUpdated,mlp,params_a56bbd55,0.001,0.00001,128,3,0.042137,0.984032,...,0.990606,,,,,,,,,
2,supervised,MotionSourceRecognitionUpdated,mlp,params_d4700539,0.001,0.00001,128,99,0.053438,0.977545,...,0.990437,,,,,,,,,
3,supervised,MotionSourceRecognitionUpdated,mlp,params_fc13ed90,0.001,0.00001,128,42,0.047192,0.985529,...,0.991393,,,,,,,,,
4,supervised,MotionSourceRecognitionUpdated,lstm,params_58ba7f06,0.001,0.00001,128,99,0.099868,0.970559,...,0.986901,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,supervised,HumanIdentification,patchtst,params_899208c2,0.001,0.00001,128,3,,,...,,5.127985,0.265797,0.256079,,,,,,
192,supervised,HumanIdentification,timesformer1d,params_609f17ff,0.001,0.00001,128,3,,,...,,5.871156,0.338361,0.337961,,,,,,
193,supervised,HumanIdentification,timesformer1d,params_87273be9,0.001,0.00001,128,42,,,...,,4.506577,0.313901,0.310916,,,,,,
194,supervised,HumanIdentification,timesformer1d,params_bd5d911b,0.001,0.00001,128,99,,,...,,4.745084,0.215654,0.214990,,,,,,


## Generate Error Bar


In [25]:
def generate_errorbar_summary_with_latex(df: pd.DataFrame,
                                        task_list,
                                        metric_columns,
                                        learning_rate=0.001,
                                        weight_decay=0.00001,
                                        model_order=None,
                                        precision=4):

    # Filter by learning rate and weight decay
    # filtered_df = df[(df["learning_rate"] == learning_rate) & 
    #                  (df["weight_decay"] == weight_decay)]

    filtered_df = df

    # Calculate mean ± std per model per task
    summary = []
    for task in task_list:
        task_df = filtered_df[filtered_df["task_name"] == task]
        for model in sorted(task_df["model_name"].unique()):
            row = {"task": task, "model": model}
            model_df = task_df[task_df["model_name"] == model]
            for col in metric_columns:
                if col in model_df.columns:
                    values = model_df[col].dropna()
                    if len(values) > 0:
                        mean = values.mean()
                        std = values.std(ddof=1)
                        row[col] = f"{mean:.{precision}f}±{std:.{precision}f}"
                    else:
                        row[col] = "--"
                else:
                    row[col] = "--"
            summary.append(row)
    summary_df = pd.DataFrame(summary)

    # Prepare LaTeX data
    latex_data = defaultdict(lambda: defaultdict(lambda: {"accuracy": "--", "f1_score": "--"}))
    for _, row in summary_df.iterrows():
        model = row["model"]
        task = row["task"]
        latex_data[model][task]["accuracy"] = row.get(metric_columns[0], "--")
        latex_data[model][task]["f1_score"] = row.get(metric_columns[1], "--")

    if model_order is None:
        model_order = sorted(summary_df["model"].unique())

    # Generate LaTeX rows
    latex_rows = []
    for model in model_order:
        acc_f1_values = []
        for task in task_list:
            acc = latex_data[model][task]["accuracy"]
            f1 = latex_data[model][task]["f1_score"]
            acc_f1_values.extend([acc, f1])
        row_line = f"{model.upper()} & " + " & ".join(acc_f1_values) + r" \\"
        latex_rows.append(row_line)

    # LaTeX header and footer
    latex_table_header = r"""
\begin{table}[t]
\centering
\caption{Standard supervised evaluation on four core tasks. We report Accuracy and F1-score for each model.}
\label{tab:supervised-results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{
    >{\centering\arraybackslash}p{2.5cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}
}
     \toprule
    Model & \multicolumn{2}{c|}{\textbf{Motion Source Recognition}} & \multicolumn{2}{c|}{\textbf{Fall Detection}} & \multicolumn{2}{c|}{\textbf{Breathing Detection}} & \multicolumn{2}{c}{\textbf{Localization}} \\
    \midrule
"""
    latex_table_footer = r"""
\hline
\end{tabular}%
}
\end{table}
"""
    full_latex_code = latex_table_header + "\n" + "\n".join(latex_rows) + latex_table_footer
    return summary_df, full_latex_code





In [26]:
results_df.columns

Index(['pipeline', 'task_name', 'model_name', 'experiment_id', 'learning_rate',
       'weight_decay', 'batch_size', 'seed', 'test_hard_loss',
       'test_hard_accuracy', 'test_hard_f1_score', 'test_loss',
       'test_accuracy', 'test_f1_score', 'test_easy_loss',
       'test_easy_accuracy', 'test_easy_f1_score', 'test_medium_loss',
       'test_medium_accuracy', 'test_medium_f1_score',
       'test_cross_device_loss', 'test_cross_device_accuracy',
       'test_cross_device_f1_score', 'test_cross_env_loss',
       'test_cross_env_accuracy', 'test_cross_env_f1_score',
       'test_cross_user_loss', 'test_cross_user_accuracy',
       'test_cross_user_f1_score'],
      dtype='object')

In [30]:
# task_list =["MotionSourceRecognition","BreathingDetection_Subset","Localization","FallDetection","ProximityRecognition","HumanActivityRecognition","HumanIdentification"]

task_list = ["MotionSourceRecognitionUpdated", "FallDetection", "BreathingDetection_Subset", "Localization"]
# task_list = ["ProximityRecognition","HumanActivityRecognition","HumanIdentification"]


# metric_columns =['test_cross_device_accuracy', 'test_cross_device_f1_score','test_cross_env_accuracy',
#        'test_cross_env_f1_score','test_cross_user_accuracy', 'test_cross_user_f1_score']
metric_columns =['test_accuracy', 'test_f1_score']
# metric_columns = ['test_easy_accuracy', 'test_easy_f1_score', 'test_medium_accuracy', 'test_medium_f1_score', 'test_hard_accuracy', 'test_hard_f1_score']
model_order=["mlp", "lstm", "transformer", "resnet18", "patchtst", "timesformer1d"]

summary_df, latex_code = generate_errorbar_summary_with_latex(df=results_df, 
                                                              task_list=task_list,
                                                              metric_columns=metric_columns,
                                                              model_order=model_order,
                                                             precision=4)



# 查看结果表格
display(summary_df)

# 打印 LaTeX 表格代码
print(latex_code)


Unnamed: 0,task,model,test_accuracy,test_f1_score
0,MotionSourceRecognitionUpdated,lstm,0.9818±0.0030,0.9819±0.0030
1,MotionSourceRecognitionUpdated,mlp,0.9884±0.0009,0.9884±0.0009
2,MotionSourceRecognitionUpdated,patchtst,0.9829±0.0034,0.9830±0.0034
3,MotionSourceRecognitionUpdated,resnet18,0.9969±0.0006,0.9969±0.0006
4,MotionSourceRecognitionUpdated,timesformer1d,0.9818±0.0011,0.9818±0.0011
5,MotionSourceRecognitionUpdated,transformer,0.9852±0.0025,0.9852±0.0025
6,MotionSourceRecognitionUpdated,vit,0.9893±0.0018,0.9893±0.0018
7,FallDetection,lstm,0.9493±0.0051,0.9492±0.0050
8,FallDetection,mlp,0.9216±0.0091,0.9217±0.0092
9,FallDetection,patchtst,0.9403±0.0074,0.9403±0.0073



\begin{table}[t]
\centering
\caption{Standard supervised evaluation on four core tasks. We report Accuracy and F1-score for each model.}
\label{tab:supervised-results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{
    >{\centering\arraybackslash}p{2.5cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}
}
     \toprule
    Model & \multicolumn{2}{c|}{\textbf{Motion Source Recognition}} & \multicolumn{2}{c|}{\textbf{Fall Detection}} & \multicolumn{2}{c|}{\textbf{Breathing Detection}} & \multicolumn{2}{c}{\textbf{Localization}} \\
    \midrule

MLP & 0.9884±0.0009 & 0.9884±0.0009 & 0.9216±0.0091 & 0.9217±0.0092 & 0.9759±0.0008 & 0.9759±0.0008 & 0.8714±0.0080 & 0.8690±0.0083 \\
LSTM & 0.9818±0.0030 & 0.9819±

In [31]:
summary_df.to_csv('4_task_acc_f1_result_with_error_bar.csv')