In [18]:
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from collections import defaultdict

In [32]:
# Configuration - You can modify these variables
data_dir = r"C:\Users\weiha\Desktop\benchmark_result"  # Base directory containing results
pipeline_list = ["multitask"]  # List of pipelines to analyze

# task_name_list =["MotionSourceRecognitionUpdated","BreathingDetection_Subset","Localization","FallDetection","ProximityRecognition","HumanActivityRecognition","HumanIdentification"]
# task_name_list = ["BreathingDetection_Subset"]  # List of tasks to analyze
task_name_list =["ProximityRecognition","HumanActivityRecognition","HumanIdentification"]

model_name_list = ["mlp", "lstm", "resnet18", "transformer", "vit", "patchtst", "timesformer1d"] # List of models to analyze
# model_name_list = ["transformer"]

# Initialize an empty list to store results
results_data = []

# Iterate through all combinations
for pipeline in pipeline_list:
    for task_name in task_name_list:
        for model_name in model_name_list:
            # Find all experiment folders for the current combination
            exp_pattern = os.path.join(data_dir, pipeline, task_name, model_name, "params_*")
            exp_folders = glob(exp_pattern)
            
            print(f"Found {len(exp_folders)} experiments for {pipeline}/{task_name}/{model_name}")
            
            # Process each experiment
            for exp_folder in tqdm(exp_folders, desc=f"Processing {pipeline}/{task_name}/{model_name}"):
                # Extract experiment ID
                exp_id = os.path.basename(exp_folder)
                
                # Define paths for config and results files
                config_filename = f"{model_name}_{task_name}_config.json"
                results_filename = f"{model_name}_{task_name}_results.json"

                if pipeline == 'multitask':
                    results_filename = f"{model_name}_{task_name}_test_results.json"
                
                config_path = os.path.join(exp_folder, config_filename)
                results_path = os.path.join(exp_folder, results_filename)
                
                # Skip if either file doesn't exist
                if not os.path.exists(config_path) or not os.path.exists(results_path):
                    print(f"Missing files for {exp_folder}, skipping")
                    continue
                
                # Read config file
                try:
                    with open(config_path, 'r') as f:
                        config_data = json.load(f)
                    
                    # Extract required fields from config
                    learning_rate = config_data.get('learning_rate')
                    weight_decay = config_data.get('weight_decay')
                    seed = config_data.get('seed')
                    batch_size = config_data.get('batch_size')
                    
                    if pipeline == 'multitask':
                        seed =2
                    # Skip if seed is not available (old version)
                    if seed is None:
                        print(f"Seed not available in {exp_folder}, skipping")
                        continue
                        
                except Exception as e:
                    print(f"Error reading config file {config_path}: {e}")
                    continue
                
                # Read results file
                try:
                    with open(results_path, 'r') as f:
                        results_data_json = json.load(f)
                except Exception as e:
                    print(f"Error reading results file {results_path}: {e}")
                    continue
                
                # Create a dictionary for the current experiment
                exp_result = {
                    'pipeline': pipeline,
                    'task_name': task_name,
                    'model_name': model_name,
                    'experiment_id': exp_id,
                    'learning_rate': learning_rate,
                    'weight_decay': weight_decay,
                    'batch_size':batch_size,
                    'seed': seed,
                }
                
                # Extract test results for all test sets
                for test_name, test_results in results_data_json.items():
                    if isinstance(test_results, dict):
                        for metric_name, metric_value in test_results.items():
                            column_name = f"{test_name}_{metric_name}"
                            exp_result[column_name] = metric_value
                
                # Add to results list
                results_data.append(exp_result)


results_df = pd.DataFrame(results_data)


for task in task_name_list:
    for model in model_name_list:
        task_model_df = results_df[(results_df['task_name'] == task) & (results_df['model_name'] == model)]
        if not task_model_df.empty:
            print(f"\n{task} - {model} (count: {len(task_model_df)})")
            
            # Find metrics columns
            metric_columns = [col for col in task_model_df.columns if any(col.endswith(m) for m in ['_loss', '_accuracy', '_f1'])]
            
            if metric_columns:
                avg_metrics = task_model_df[metric_columns].mean()
                for metric, value in avg_metrics.items():
                    print(f"  {metric}: {value:.4f}")

if pipeline_list == ["supervised"]:
    results_df = results_df[results_df["batch_size"] == 128].reset_index(drop=True)



Found 0 experiments for multitask/ProximityRecognition/mlp


Processing multitask/ProximityRecognition/mlp: 0it [00:00, ?it/s]


Found 0 experiments for multitask/ProximityRecognition/lstm


Processing multitask/ProximityRecognition/lstm: 0it [00:00, ?it/s]


Found 0 experiments for multitask/ProximityRecognition/resnet18


Processing multitask/ProximityRecognition/resnet18: 0it [00:00, ?it/s]


Found 6 experiments for multitask/ProximityRecognition/transformer


Processing multitask/ProximityRecognition/transformer: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 52.66it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\multitask\ProximityRecognition\transformer\params_1747274891_d093ee76, skipping
Found 0 experiments for multitask/ProximityRecognition/vit


Processing multitask/ProximityRecognition/vit: 0it [00:00, ?it/s]


Found 0 experiments for multitask/ProximityRecognition/patchtst


Processing multitask/ProximityRecognition/patchtst: 0it [00:00, ?it/s]


Found 0 experiments for multitask/ProximityRecognition/timesformer1d


Processing multitask/ProximityRecognition/timesformer1d: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanActivityRecognition/mlp


Processing multitask/HumanActivityRecognition/mlp: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanActivityRecognition/lstm


Processing multitask/HumanActivityRecognition/lstm: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanActivityRecognition/resnet18


Processing multitask/HumanActivityRecognition/resnet18: 0it [00:00, ?it/s]


Found 6 experiments for multitask/HumanActivityRecognition/transformer


Processing multitask/HumanActivityRecognition/transformer:   0%|                                                                                                                             | 0/6 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\multitask\HumanActivityRecognition\transformer\params_1747274891_d093ee76, skipping


Processing multitask/HumanActivityRecognition/transformer: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 51.47it/s]


Found 0 experiments for multitask/HumanActivityRecognition/vit


Processing multitask/HumanActivityRecognition/vit: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanActivityRecognition/patchtst


Processing multitask/HumanActivityRecognition/patchtst: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanActivityRecognition/timesformer1d


Processing multitask/HumanActivityRecognition/timesformer1d: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanIdentification/mlp


Processing multitask/HumanIdentification/mlp: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanIdentification/lstm


Processing multitask/HumanIdentification/lstm: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanIdentification/resnet18


Processing multitask/HumanIdentification/resnet18: 0it [00:00, ?it/s]


Found 6 experiments for multitask/HumanIdentification/transformer


Processing multitask/HumanIdentification/transformer:   0%|                                                                                                                                  | 0/6 [00:00<?, ?it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\multitask\HumanIdentification\transformer\params_1747274891_d093ee76, skipping


Processing multitask/HumanIdentification/transformer: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 47.09it/s]


Found 0 experiments for multitask/HumanIdentification/vit


Processing multitask/HumanIdentification/vit: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanIdentification/patchtst


Processing multitask/HumanIdentification/patchtst: 0it [00:00, ?it/s]


Found 0 experiments for multitask/HumanIdentification/timesformer1d


Processing multitask/HumanIdentification/timesformer1d: 0it [00:00, ?it/s]


ProximityRecognition - transformer (count: 5)
  test_cross_device_loss: 4.5575
  test_cross_device_accuracy: 0.2964
  test_cross_env_loss: 5.0010
  test_cross_env_accuracy: 0.2832
  test_cross_user_loss: 5.7481
  test_cross_user_accuracy: 0.2987
  test_loss: 0.4352
  test_accuracy: 0.8702

HumanActivityRecognition - transformer (count: 5)
  test_cross_device_loss: 4.2895
  test_cross_device_accuracy: 0.2520
  test_cross_env_loss: 4.3878
  test_cross_env_accuracy: 0.2869
  test_cross_user_loss: 3.1760
  test_cross_user_accuracy: 0.4835
  test_loss: 0.4553
  test_accuracy: 0.8744

HumanIdentification - transformer (count: 5)
  test_cross_device_loss: 5.5854
  test_cross_device_accuracy: 0.2529
  test_cross_env_loss: 6.4829
  test_cross_env_accuracy: 0.1580
  test_cross_user_loss: 10.9707
  test_cross_user_accuracy: 0.0010
  test_loss: 0.0148
  test_accuracy: 0.9955





In [33]:
# results_df[["task_name","model_name","test_accuracy", "test_f1_score"]]
# results_df[results_df['task_name'] == 'BreathingDetection_Subset']
results_df

Unnamed: 0,pipeline,task_name,model_name,experiment_id,learning_rate,weight_decay,batch_size,seed,test_cross_device_loss,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_loss,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_loss,test_cross_user_accuracy,test_cross_user_f1_score,test_loss,test_accuracy,test_f1_score
0,multitask,ProximityRecognition,transformer,params_1746938874_6f98a5d7,,,32,2,4.406655,0.332503,0.312785,5.805759,0.249557,0.151432,6.483927,0.287538,0.208563,0.598675,0.849826,0.871676
1,multitask,ProximityRecognition,transformer,params_1747239399_044a3d71,,,128,2,3.563103,0.349776,0.31579,4.764651,0.307474,0.265957,5.107967,0.29727,0.061693,0.40642,0.856017,0.855847
2,multitask,ProximityRecognition,transformer,params_1747283048_f22d33ac,,,128,2,5.143761,0.262943,0.171483,4.901661,0.307474,0.389565,5.305052,0.31225,0.142129,0.36794,0.886574,0.885068
3,multitask,ProximityRecognition,transformer,params_1747320859_2e1a599d,,,128,2,5.379067,0.260497,0.180313,5.375434,0.273822,0.268422,6.262068,0.305925,0.07122,0.359153,0.894676,0.896886
4,multitask,ProximityRecognition,transformer,params_1747338740_ccefd6aa,,,128,2,4.295095,0.276396,0.269233,4.157582,0.277719,0.229779,5.581658,0.29028,0.071313,0.443779,0.864005,0.863287
5,multitask,HumanActivityRecognition,transformer,params_1746938874_6f98a5d7,,,32,2,3.869454,0.382182,0.356173,4.321249,0.383237,0.20388,3.100078,0.552683,0.416498,0.532842,0.873457,0.836388
6,multitask,HumanActivityRecognition,transformer,params_1747239399_044a3d71,,,128,2,4.64793,0.203308,0.190914,4.607148,0.210424,0.213728,3.157194,0.438411,0.303885,0.417876,0.879776,0.869593
7,multitask,HumanActivityRecognition,transformer,params_1747283048_f22d33ac,,,128,2,4.602675,0.251714,0.186073,4.529624,0.304417,0.174532,3.647447,0.47813,0.313716,0.410617,0.888529,0.873951
8,multitask,HumanActivityRecognition,transformer,params_1747320859_2e1a599d,,,128,2,4.402168,0.166599,0.173851,4.449487,0.231449,0.133192,3.028693,0.46355,0.355744,0.454639,0.868824,0.859856
9,multitask,HumanActivityRecognition,transformer,params_1747338740_ccefd6aa,,,128,2,3.925396,0.256152,0.19181,4.031529,0.30477,0.152211,2.946623,0.484666,0.352165,0.460466,0.861471,0.842702


In [21]:
def merge_id_columns(df, drop_id_cols=True):
    merge_targets = ['easy', 'medium', 'hard']
    metrics = ['loss', 'accuracy', 'f1_score']
    id_cols_to_drop = []

    for target in merge_targets:
        for metric in metrics:
            base_col = f'test_{target}_{metric}'
            id_col = f'test_{target}_id_{metric}'

            if base_col in df.columns and id_col in df.columns:
                df[base_col] = df[base_col].combine_first(df[id_col])
                if drop_id_cols:
                    id_cols_to_drop.append(id_col)

    if drop_id_cols:
        df.drop(columns=id_cols_to_drop, inplace=True)

    return df

In [22]:
results_df = merge_id_columns(results_df)

In [23]:
# Save results to CSV
output_path = "all_results_summary.csv"
results_df.to_csv(output_path, index=False)
print(f"\nResults saved to {output_path}")


Results saved to all_results_summary.csv


In [24]:
results_df

Unnamed: 0,pipeline,task_name,model_name,experiment_id,learning_rate,weight_decay,batch_size,seed,test_hard_loss,test_hard_accuracy,...,test_medium_f1_score,test_cross_device_loss,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_loss,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_loss,test_cross_user_accuracy,test_cross_user_f1_score
0,supervised,MotionSourceRecognitionUpdated,mlp,params_3eba2706,0.001,0.00001,128,11,0.057538,0.980539,...,0.992819,,,,,,,,,
1,supervised,MotionSourceRecognitionUpdated,mlp,params_a56bbd55,0.001,0.00001,128,3,0.042137,0.984032,...,0.990606,,,,,,,,,
2,supervised,MotionSourceRecognitionUpdated,mlp,params_d4700539,0.001,0.00001,128,99,0.053438,0.977545,...,0.990437,,,,,,,,,
3,supervised,MotionSourceRecognitionUpdated,mlp,params_fc13ed90,0.001,0.00001,128,42,0.047192,0.985529,...,0.991393,,,,,,,,,
4,supervised,MotionSourceRecognitionUpdated,lstm,params_58ba7f06,0.001,0.00001,128,99,0.099868,0.970559,...,0.986901,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,supervised,HumanIdentification,patchtst,params_899208c2,0.001,0.00001,128,3,,,...,,5.127985,0.265797,0.256079,,,,,,
192,supervised,HumanIdentification,timesformer1d,params_609f17ff,0.001,0.00001,128,3,,,...,,5.871156,0.338361,0.337961,,,,,,
193,supervised,HumanIdentification,timesformer1d,params_87273be9,0.001,0.00001,128,42,,,...,,4.506577,0.313901,0.310916,,,,,,
194,supervised,HumanIdentification,timesformer1d,params_bd5d911b,0.001,0.00001,128,99,,,...,,4.745084,0.215654,0.214990,,,,,,


## Generate Error Bar


In [34]:
def generate_errorbar_summary_with_latex(df: pd.DataFrame,
                                        task_list,
                                        metric_columns,
                                        learning_rate=0.001,
                                        weight_decay=0.00001,
                                        model_order=None,
                                        precision=4):

    # Filter by learning rate and weight decay
    # filtered_df = df[(df["learning_rate"] == learning_rate) & 
    #                  (df["weight_decay"] == weight_decay)]

    filtered_df = df

    # Calculate mean ± std per model per task
    summary = []
    for task in task_list:
        task_df = filtered_df[filtered_df["task_name"] == task]
        for model in sorted(task_df["model_name"].unique()):
            row = {"task": task, "model": model}
            model_df = task_df[task_df["model_name"] == model]
            for col in metric_columns:
                if col in model_df.columns:
                    values = model_df[col].dropna()
                    if len(values) > 0:
                        mean = values.mean()
                        std = values.std(ddof=1)
                        row[col] = f"{mean:.{precision}f}±{std:.{precision}f}"
                    else:
                        row[col] = "--"
                else:
                    row[col] = "--"
            summary.append(row)
    summary_df = pd.DataFrame(summary)

    # Prepare LaTeX data
    latex_data = defaultdict(lambda: defaultdict(lambda: {"accuracy": "--", "f1_score": "--"}))
    for _, row in summary_df.iterrows():
        model = row["model"]
        task = row["task"]
        latex_data[model][task]["accuracy"] = row.get(metric_columns[0], "--")
        latex_data[model][task]["f1_score"] = row.get(metric_columns[1], "--")

    if model_order is None:
        model_order = sorted(summary_df["model"].unique())

    # Generate LaTeX rows
    latex_rows = []
    for model in model_order:
        acc_f1_values = []
        for task in task_list:
            acc = latex_data[model][task]["accuracy"]
            f1 = latex_data[model][task]["f1_score"]
            acc_f1_values.extend([acc, f1])
        row_line = f"{model.upper()} & " + " & ".join(acc_f1_values) + r" \\"
        latex_rows.append(row_line)

    # LaTeX header and footer
    latex_table_header = r"""
\begin{table}[t]
\centering
\caption{Standard supervised evaluation on four core tasks. We report Accuracy and F1-score for each model.}
\label{tab:supervised-results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{
    >{\centering\arraybackslash}p{2.5cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}
}
     \toprule
    Model & \multicolumn{2}{c|}{\textbf{Motion Source Recognition}} & \multicolumn{2}{c|}{\textbf{Fall Detection}} & \multicolumn{2}{c|}{\textbf{Breathing Detection}} & \multicolumn{2}{c}{\textbf{Localization}} \\
    \midrule
"""
    latex_table_footer = r"""
\hline
\end{tabular}%
}
\end{table}
"""
    full_latex_code = latex_table_header + "\n" + "\n".join(latex_rows) + latex_table_footer
    return summary_df, full_latex_code





In [35]:
results_df.columns

Index(['pipeline', 'task_name', 'model_name', 'experiment_id', 'learning_rate',
       'weight_decay', 'batch_size', 'seed', 'test_cross_device_loss',
       'test_cross_device_accuracy', 'test_cross_device_f1_score',
       'test_cross_env_loss', 'test_cross_env_accuracy',
       'test_cross_env_f1_score', 'test_cross_user_loss',
       'test_cross_user_accuracy', 'test_cross_user_f1_score', 'test_loss',
       'test_accuracy', 'test_f1_score'],
      dtype='object')

In [38]:
# task_list =["MotionSourceRecognition","BreathingDetection_Subset","Localization","FallDetection","ProximityRecognition","HumanActivityRecognition","HumanIdentification"]

# task_list = ["MotionSourceRecognitionUpdated", "FallDetection", "BreathingDetection_Subset", "Localization"]
task_list = ["ProximityRecognition","HumanActivityRecognition","HumanIdentification"]


metric_columns =['test_cross_device_accuracy', 'test_cross_device_f1_score','test_cross_env_accuracy',
       'test_cross_env_f1_score','test_cross_user_accuracy', 'test_cross_user_f1_score']
# metric_columns =['test_accuracy', 'test_f1_score']
# metric_columns = ['test_easy_accuracy', 'test_easy_f1_score', 'test_medium_accuracy', 'test_medium_f1_score', 'test_hard_accuracy', 'test_hard_f1_score']
model_order=["mlp", "lstm", "transformer", "resnet18", "patchtst", "timesformer1d"]

summary_df, latex_code = generate_errorbar_summary_with_latex(df=results_df, 
                                                              task_list=task_list,
                                                              metric_columns=metric_columns,
                                                              model_order=model_order,
                                                             precision=4)



# 查看结果表格
display(summary_df)

# 打印 LaTeX 表格代码
print(latex_code)


Unnamed: 0,task,model,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_accuracy,test_cross_user_f1_score
0,ProximityRecognition,transformer,0.2964±0.0417,0.2499±0.0701,0.2832±0.0246,0.2610±0.0860,0.2987±0.0104,0.1110±0.0634
1,HumanActivityRecognition,transformer,0.2520±0.0816,0.2198±0.0766,0.2869±0.0686,0.1755±0.0339,0.4835±0.0426,0.3484±0.0444
2,HumanIdentification,transformer,0.2529±0.0549,0.2444±0.1276,0.1580±0.1021,0.0000±0.0000,0.0010±0.0022,0.0000±0.0000



\begin{table}[t]
\centering
\caption{Standard supervised evaluation on four core tasks. We report Accuracy and F1-score for each model.}
\label{tab:supervised-results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{
    >{\centering\arraybackslash}p{2.5cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}
}
     \toprule
    Model & \multicolumn{2}{c|}{\textbf{Motion Source Recognition}} & \multicolumn{2}{c|}{\textbf{Fall Detection}} & \multicolumn{2}{c|}{\textbf{Breathing Detection}} & \multicolumn{2}{c}{\textbf{Localization}} \\
    \midrule

MLP & -- & -- & -- & -- & -- & -- \\
LSTM & -- & -- & -- & -- & -- & -- \\
TRANSFORMER & 0.2964±0.0417 & 0.2499±0.0701 & 0.2520±0.0816 & 0.2198±0.0766 & 0.2529±0.054

In [39]:
summary_df.to_csv('multitask_pipeline_3_task_cross_result_with_error_bar.csv')