In [2]:
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from collections import defaultdict

In [8]:
# Configuration - You can modify these variables
data_dir = r"C:\Users\weiha\Desktop\benchmark_result"  # Base directory containing results
pipeline_list = ["multitask"]  # List of pipelines to analyze

# task_name_list =["MotionSourceRecognition","BreathingDetection_Subset","Localization","FallDetection","ProximityRecognition","HumanActivityRecognition","HumanIdentification"]
# task_name_list = ["BreathingDetection_Subset"]  # List of tasks to analyze
task_name_list =["ProximityRecognition","HumanActivityRecognition","HumanIdentification"]

model_name_list = ["mlp", "lstm", "resnet18", "transformer", "vit", "patchtst", "timesformer1d"] # List of models to analyze
model_name_list = ["transformer"]

# Initialize an empty list to store results
results_data = []

# Iterate through all combinations
for pipeline in pipeline_list:
    for task_name in task_name_list:
        for model_name in model_name_list:
            # Find all experiment folders for the current combination
            exp_pattern = os.path.join(data_dir, pipeline, task_name, model_name, "params_*")
            exp_folders = glob(exp_pattern)
            
            print(f"Found {len(exp_folders)} experiments for {pipeline}/{task_name}/{model_name}")
            
            # Process each experiment
            for exp_folder in tqdm(exp_folders, desc=f"Processing {pipeline}/{task_name}/{model_name}"):
                # Extract experiment ID
                exp_id = os.path.basename(exp_folder)
                
                # Define paths for config and results files
                config_filename = f"{model_name}_{task_name}_config.json"
                results_filename = f"{model_name}_{task_name}_results.json"

                if pipeline == 'multitask':
                    results_filename = f"{model_name}_{task_name}_test_results.json"
                
                config_path = os.path.join(exp_folder, config_filename)
                results_path = os.path.join(exp_folder, results_filename)
                
                # Skip if either file doesn't exist
                if not os.path.exists(config_path) or not os.path.exists(results_path):
                    print(f"Missing files for {exp_folder}, skipping")
                    continue
                
                # Read config file
                try:
                    with open(config_path, 'r') as f:
                        config_data = json.load(f)
                    
                    # Extract required fields from config
                    learning_rate = config_data.get('learning_rate')
                    weight_decay = config_data.get('weight_decay')
                    seed = config_data.get('seed')
                    batch_size = config_data.get('batch_size')
                    
                    if pipeline == 'multitask':
                        seed =2
                    # Skip if seed is not available (old version)
                    if seed is None:
                        print(f"Seed not available in {exp_folder}, skipping")
                        continue
                        
                except Exception as e:
                    print(f"Error reading config file {config_path}: {e}")
                    continue
                
                # Read results file
                try:
                    with open(results_path, 'r') as f:
                        results_data_json = json.load(f)
                except Exception as e:
                    print(f"Error reading results file {results_path}: {e}")
                    continue
                
                # Create a dictionary for the current experiment
                exp_result = {
                    'pipeline': pipeline,
                    'task_name': task_name,
                    'model_name': model_name,
                    'experiment_id': exp_id,
                    'learning_rate': learning_rate,
                    'weight_decay': weight_decay,
                    'batch_size':batch_size,
                    'seed': seed,
                }
                
                # Extract test results for all test sets
                for test_name, test_results in results_data_json.items():
                    if isinstance(test_results, dict):
                        for metric_name, metric_value in test_results.items():
                            column_name = f"{test_name}_{metric_name}"
                            exp_result[column_name] = metric_value
                
                # Add to results list
                results_data.append(exp_result)


results_df = pd.DataFrame(results_data)


for task in task_name_list:
    for model in model_name_list:
        task_model_df = results_df[(results_df['task_name'] == task) & (results_df['model_name'] == model)]
        if not task_model_df.empty:
            print(f"\n{task} - {model} (count: {len(task_model_df)})")
            
            # Find metrics columns
            metric_columns = [col for col in task_model_df.columns if any(col.endswith(m) for m in ['_loss', '_accuracy', '_f1'])]
            
            if metric_columns:
                avg_metrics = task_model_df[metric_columns].mean()
                for metric, value in avg_metrics.items():
                    print(f"  {metric}: {value:.4f}")

if pipeline_list == ["supervised"]:
    results_df = results_df[results_df["batch_size"] == 128].reset_index(drop=True)



Found 4 experiments for multitask/ProximityRecognition/transformer


Processing multitask/ProximityRecognition/transformer: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2863.98it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\multitask\ProximityRecognition\transformer\params_1747274891_d093ee76, skipping
Found 4 experiments for multitask/HumanActivityRecognition/transformer


Processing multitask/HumanActivityRecognition/transformer: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 3633.00it/s]


Missing files for C:\Users\weiha\Desktop\benchmark_result\multitask\HumanActivityRecognition\transformer\params_1747274891_d093ee76, skipping
Found 4 experiments for multitask/HumanIdentification/transformer


Processing multitask/HumanIdentification/transformer: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2404.65it/s]

Missing files for C:\Users\weiha\Desktop\benchmark_result\multitask\HumanIdentification\transformer\params_1747274891_d093ee76, skipping

ProximityRecognition - transformer (count: 3)
  test_cross_device_loss: 4.3712
  test_cross_device_accuracy: 0.3151
  test_cross_env_loss: 5.1574
  test_cross_env_accuracy: 0.2882
  test_cross_user_loss: 5.6323
  test_cross_user_accuracy: 0.2990
  test_loss: 0.4577
  test_accuracy: 0.8641

HumanActivityRecognition - transformer (count: 3)
  test_cross_device_loss: 4.3734
  test_cross_device_accuracy: 0.2791
  test_cross_env_loss: 4.4860
  test_cross_env_accuracy: 0.2994
  test_cross_user_loss: 3.3016
  test_cross_user_accuracy: 0.4897
  test_loss: 0.4538
  test_accuracy: 0.8806

HumanIdentification - transformer (count: 3)
  test_cross_device_loss: 5.7879
  test_cross_device_accuracy: 0.2340
  test_cross_env_loss: 5.5419
  test_cross_env_accuracy: 0.2123
  test_cross_user_loss: 10.9805
  test_cross_user_accuracy: 0.0017
  test_loss: 0.0166
  test_acc




In [9]:
# results_df[["task_name","model_name","test_accuracy", "test_f1_score"]]
# results_df[results_df['task_name'] == 'BreathingDetection_Subset']
results_df

Unnamed: 0,pipeline,task_name,model_name,experiment_id,learning_rate,weight_decay,batch_size,seed,test_cross_device_loss,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_loss,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_loss,test_cross_user_accuracy,test_cross_user_f1_score,test_loss,test_accuracy,test_f1_score
0,multitask,ProximityRecognition,transformer,params_1746938874_6f98a5d7,,,32,2,4.406655,0.332503,0.312785,5.805759,0.249557,0.151432,6.483927,0.287538,0.208563,0.598675,0.849826,0.871676
1,multitask,ProximityRecognition,transformer,params_1747239399_044a3d71,,,128,2,3.563103,0.349776,0.31579,4.764651,0.307474,0.265957,5.107967,0.29727,0.061693,0.40642,0.856017,0.855847
2,multitask,ProximityRecognition,transformer,params_1747283048_f22d33ac,,,128,2,5.143761,0.262943,0.171483,4.901661,0.307474,0.389565,5.305052,0.31225,0.142129,0.36794,0.886574,0.885068
3,multitask,HumanActivityRecognition,transformer,params_1746938874_6f98a5d7,,,32,2,3.869454,0.382182,0.356173,4.321249,0.383237,0.20388,3.100078,0.552683,0.416498,0.532842,0.873457,0.836388
4,multitask,HumanActivityRecognition,transformer,params_1747239399_044a3d71,,,128,2,4.64793,0.203308,0.190914,4.607148,0.210424,0.213728,3.157194,0.438411,0.303885,0.417876,0.879776,0.869593
5,multitask,HumanActivityRecognition,transformer,params_1747283048_f22d33ac,,,128,2,4.602675,0.251714,0.186073,4.529624,0.304417,0.174532,3.647447,0.47813,0.313716,0.410617,0.888529,0.873951
6,multitask,HumanIdentification,transformer,params_1746938874_6f98a5d7,,,32,2,6.802142,0.230429,0.16502,6.264702,0.165978,0.0,12.560709,0.0,0.0,0.019491,0.994792,1.0
7,multitask,HumanIdentification,transformer,params_1747239399_044a3d71,,,128,2,5.052409,0.241337,0.349354,5.483082,0.154091,0.0,9.826658,0.004993,0.0,0.014378,0.995796,0.99616
8,multitask,HumanIdentification,transformer,params_1747283048_f22d33ac,,,128,2,5.509236,0.23033,0.257354,4.877793,0.316684,0.0,10.554127,0.0,0.0,0.016078,0.995949,0.994785


In [101]:
def merge_id_columns(df, drop_id_cols=True):
    merge_targets = ['easy', 'medium', 'hard']
    metrics = ['loss', 'accuracy', 'f1_score']
    id_cols_to_drop = []

    for target in merge_targets:
        for metric in metrics:
            base_col = f'test_{target}_{metric}'
            id_col = f'test_{target}_id_{metric}'

            if base_col in df.columns and id_col in df.columns:
                df[base_col] = df[base_col].combine_first(df[id_col])
                if drop_id_cols:
                    id_cols_to_drop.append(id_col)

    if drop_id_cols:
        df.drop(columns=id_cols_to_drop, inplace=True)

    return df

In [102]:
results_df = merge_id_columns(results_df)

In [98]:
# Save results to CSV
output_path = "all_results_summary.csv"
results_df.to_csv(output_path, index=False)
print(f"\nResults saved to {output_path}")


Results saved to all_results_summary.csv


In [103]:
results_df

Unnamed: 0,pipeline,task_name,model_name,experiment_id,learning_rate,weight_decay,batch_size,seed,test_hard_loss,test_hard_accuracy,...,test_easy_f1_score,test_cross_device_loss,test_cross_device_accuracy,test_cross_device_f1_score,test_cross_env_loss,test_cross_env_accuracy,test_cross_env_f1_score,test_cross_user_loss,test_cross_user_accuracy,test_cross_user_f1_score
0,supervised,MotionSourceRecognition,mlp,params_54de1ed4,0.001,0.00001,128,11,0.054798,0.979658,...,0.983136,,,,,,,,,
1,supervised,MotionSourceRecognition,mlp,params_a5f8d464,0.001,0.00001,128,3,0.060461,0.975590,...,0.991959,,,,,,,,,
2,supervised,MotionSourceRecognition,mlp,params_bc0ca9cb,0.001,0.00001,128,42,0.045080,0.982099,...,0.986605,,,,,,,,,
3,supervised,MotionSourceRecognition,lstm,params_3f6489b2,0.001,0.00001,128,42,0.078542,0.980472,...,0.984542,,,,,,,,,
4,supervised,MotionSourceRecognition,lstm,params_77e9c041,0.001,0.00001,128,11,0.110225,0.974776,...,0.984534,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,supervised,HumanIdentification,patchtst,params_899208c2,0.001,0.00001,128,3,,,...,,5.127985,0.265797,0.256079,,,,,,
185,supervised,HumanIdentification,timesformer1d,params_609f17ff,0.001,0.00001,128,3,,,...,,5.871156,0.338361,0.337961,,,,,,
186,supervised,HumanIdentification,timesformer1d,params_87273be9,0.001,0.00001,128,42,,,...,,4.506577,0.313901,0.310916,,,,,,
187,supervised,HumanIdentification,timesformer1d,params_bd5d911b,0.001,0.00001,128,99,,,...,,4.745084,0.215654,0.214990,,,,,,


## Generate Error Bar


In [12]:
def generate_errorbar_summary_with_latex(df: pd.DataFrame,
                                        task_list,
                                        metric_columns,
                                        learning_rate=0.001,
                                        weight_decay=0.00001,
                                        model_order=None,
                                        precision=4):

    # Filter by learning rate and weight decay
    # filtered_df = df[(df["learning_rate"] == learning_rate) & 
    #                  (df["weight_decay"] == weight_decay)]

    filtered_df = df

    # Calculate mean ± std per model per task
    summary = []
    for task in task_list:
        task_df = filtered_df[filtered_df["task_name"] == task]
        for model in sorted(task_df["model_name"].unique()):
            row = {"task": task, "model": model}
            model_df = task_df[task_df["model_name"] == model]
            for col in metric_columns:
                if col in model_df.columns:
                    values = model_df[col].dropna()
                    if len(values) > 0:
                        mean = values.mean()
                        std = values.std(ddof=1)
                        row[col] = f"{mean:.{precision}f}±{std:.{precision}f}"
                    else:
                        row[col] = "--"
                else:
                    row[col] = "--"
            summary.append(row)
    summary_df = pd.DataFrame(summary)

    # Prepare LaTeX data
    latex_data = defaultdict(lambda: defaultdict(lambda: {"accuracy": "--", "f1_score": "--"}))
    for _, row in summary_df.iterrows():
        model = row["model"]
        task = row["task"]
        latex_data[model][task]["accuracy"] = row.get(metric_columns[0], "--")
        latex_data[model][task]["f1_score"] = row.get(metric_columns[1], "--")

    if model_order is None:
        model_order = sorted(summary_df["model"].unique())

    # Generate LaTeX rows
    latex_rows = []
    for model in model_order:
        acc_f1_values = []
        for task in task_list:
            acc = latex_data[model][task]["accuracy"]
            f1 = latex_data[model][task]["f1_score"]
            acc_f1_values.extend([acc, f1])
        row_line = f"{model.upper()} & " + " & ".join(acc_f1_values) + r" \\"
        latex_rows.append(row_line)

    # LaTeX header and footer
    latex_table_header = r"""
\begin{table}[t]
\centering
\caption{Standard supervised evaluation on four core tasks. We report Accuracy and F1-score for each model.}
\label{tab:supervised-results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{
    >{\centering\arraybackslash}p{2.5cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}
}
     \toprule
    Model & \multicolumn{2}{c|}{\textbf{Motion Source Recognition}} & \multicolumn{2}{c|}{\textbf{Fall Detection}} & \multicolumn{2}{c|}{\textbf{Breathing Detection}} & \multicolumn{2}{c}{\textbf{Localization}} \\
    \midrule
"""
    latex_table_footer = r"""
\hline
\end{tabular}%
}
\end{table}
"""
    full_latex_code = latex_table_header + "\n" + "\n".join(latex_rows) + latex_table_footer
    return summary_df, full_latex_code





In [13]:
results_df.columns

Index(['pipeline', 'task_name', 'model_name', 'experiment_id', 'learning_rate',
       'weight_decay', 'batch_size', 'seed', 'test_cross_device_loss',
       'test_cross_device_accuracy', 'test_cross_device_f1_score',
       'test_cross_env_loss', 'test_cross_env_accuracy',
       'test_cross_env_f1_score', 'test_cross_user_loss',
       'test_cross_user_accuracy', 'test_cross_user_f1_score', 'test_loss',
       'test_accuracy', 'test_f1_score'],
      dtype='object')

In [16]:
# task_list =["MotionSourceRecognition","BreathingDetection_Subset","Localization","FallDetection","ProximityRecognition","HumanActivityRecognition","HumanIdentification"]

# task_list = ["MotionSourceRecognition", "FallDetection", "BreathingDetection_Subset", "Localization"]
task_list = ["ProximityRecognition","HumanActivityRecognition","HumanIdentification"]


# metric_columns =['test_cross_device_accuracy', 'test_cross_device_f1_score','test_cross_env_accuracy',
#        'test_cross_env_f1_score','test_cross_user_accuracy', 'test_cross_user_f1_score']
metric_columns =['test_accuracy', 'test_f1_score']
# metric_columns = ['test_easy_accuracy', 'test_easy_f1_score', 'test_medium_accuracy', 'test_medium_f1_score', 'test_hard_accuracy', 'test_hard_f1_score']
model_order=["mlp", "lstm", "transformer", "resnet18", "patchtst", "timesformer1d"]

summary_df, latex_code = generate_errorbar_summary_with_latex(df=results_df, 
                                                              task_list=task_list,
                                                              metric_columns=metric_columns,
                                                              model_order=model_order,
                                                             precision=4)



# 查看结果表格
display(summary_df)

# 打印 LaTeX 表格代码
print(latex_code)


Unnamed: 0,task,model,test_accuracy,test_f1_score
0,ProximityRecognition,transformer,0.8641±0.0197,0.8709±0.0146
1,HumanActivityRecognition,transformer,0.8806±0.0076,0.8600±0.0205
2,HumanIdentification,transformer,0.9955±0.0006,0.9970±0.0027



\begin{table}[t]
\centering
\caption{Standard supervised evaluation on four core tasks. We report Accuracy and F1-score for each model.}
\label{tab:supervised-results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{
    >{\centering\arraybackslash}p{2.5cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}|
    >{\centering\arraybackslash}p{1.8cm}
}
     \toprule
    Model & \multicolumn{2}{c|}{\textbf{Motion Source Recognition}} & \multicolumn{2}{c|}{\textbf{Fall Detection}} & \multicolumn{2}{c|}{\textbf{Breathing Detection}} & \multicolumn{2}{c}{\textbf{Localization}} \\
    \midrule

MLP & -- & -- & -- & -- & -- & -- \\
LSTM & -- & -- & -- & -- & -- & -- \\
TRANSFORMER & 0.8641±0.0197 & 0.8709±0.0146 & 0.8806±0.0076 & 0.8600±0.0205 & 0.9955±0.000

In [17]:
summary_df.to_csv('multitask_pipeline_3_mutitask_acc_f1_result_with_error_bar.csv')