# LLaRA evaluation

## Load all evaluation results

In [1]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

# Evaluation level names in order; a record's level is reported as "L<k>",
# where k is the 1-based position of its level name in this list.
levels = ['placement_generalization',
          'combinatorial_generalization',
          'novel_object_generalization',
          'novel_task_generalization']

# Result files: JSON files in ../results whose names start with a literal '['
# (an unclosed '[' in a glob pattern is matched literally).
files = glob('../results/[*.json')
result = []

for f in files:
    # Model name from the path: strip the '.json' suffix, remove any
    # parenthesised part, then keep whatever follows the last ']'.
    model_name = f[:-5]
    model_name = re.sub(r'\(.*\)', '', model_name).split(']')[-1]

    # Use a context manager so the file handle is closed right after parsing.
    with open(f, 'r') as fp:
        js = json.load(fp)

    # about prompt mode
    pm = ''
    pid = -1
    prop = []
    model_path = None  # read from the 'global' entry; not used further in this cell
    for i, j in js.items():
        if i == 'global':
            # Run-wide settings shared by every record of this file.
            pm = j.get('prompt_mode', 'N/A')
            pid = j.get('prompt_id', -2)
            model_path = j.get('model_path', None)
        else:
            # Drop the two history fields independently: a missing
            # 'lm_prompt_hist' must not prevent 'lm_answer_hist' from being
            # removed (and then leaking into the DataFrame as a column).
            j.pop('lm_prompt_hist', None)
            j.pop('lm_answer_hist', None)
            prop.append(j)

    for i in prop:
        # Replace the level name by its short label (L1..L4); an unknown level
        # name raises ValueError here.
        i['level'] = f"L{levels.index(i['level']) + 1}"
        if pid < 0:
            # these methods use random user_prompt for action generation (default setting reported in paper)
            i['method'] = model_name
        elif pid < 15:
            # these methods use a fixed user_prompt for action generation
            i['method'] = model_name + f'_prompt{pid:03d}'
        else:
            # these methods omit the user_prompt for action generation
            i['method'] = model_name + '_no_prompt'
        i['prompt_mode'] = pm
    result.extend(prop)


# One row per evaluation record; missing values are shown as empty strings.
df = pd.DataFrame(result).fillna('')
display(df.head())

Unnamed: 0,tid,level,task,seed,prompt,step,success,failure,method,prompt_mode
0,placement_generalization/sweep_without_exceedi...,L1,sweep_without_exceeding,200000,Sweep two {swept_obj} into {bounds} without ex...,2,True,False,D-inBC-AuxB-VIMA-80k_prompt002,hso
1,placement_generalization/sweep_without_exceedi...,L1,sweep_without_exceeding,200001,Sweep any {swept_obj} into {bounds} without ex...,1,True,False,D-inBC-AuxB-VIMA-80k_prompt002,hso
2,placement_generalization/sweep_without_exceedi...,L1,sweep_without_exceeding,200002,Sweep all {swept_obj} into {bounds} without ex...,2,True,False,D-inBC-AuxB-VIMA-80k_prompt002,hso
3,placement_generalization/sweep_without_exceedi...,L1,sweep_without_exceeding,200003,Sweep two {swept_obj} into {bounds} without ex...,2,True,False,D-inBC-AuxB-VIMA-80k_prompt002,hso
4,placement_generalization/sweep_without_exceedi...,L1,sweep_without_exceeding,200004,Sweep two {swept_obj} into {bounds} without ex...,2,False,True,D-inBC-AuxB-VIMA-80k_prompt002,hso


## Show the success rate

In [2]:
def show_results(data, data_total):
    """Display a per-method table of "<count> / <total> (<percent>%)" cells.

    Rows are the (method, prompt_mode) pairs and columns are the levels found
    in ``data_total``. Each cell compares the number of rows of ``data`` with
    the number of rows of ``data_total`` for that method / prompt mode / level.

    Args:
        data: DataFrame with 'method', 'prompt_mode' and 'level' columns whose
            rows are counted for the numerator.
        data_total: DataFrame with the same columns whose rows are counted for
            the denominator.

    Returns:
        None. The formatted table is rendered with ``display``.
    """
    group_keys = ['method', 'prompt_mode', 'level']
    # Row counts per (method, prompt_mode), with one column per level.
    hits = data.groupby(group_keys).size().unstack(fill_value=0)
    total = data_total.groupby(group_keys).size().unstack(fill_value=0)

    def format_cell(row_key, level):
        """Return the text for one cell of the table."""
        denominator = total.loc[row_key, level]
        if not denominator > 0:
            # Nothing was evaluated for this combination.
            return 'N/A'
        try:
            numerator = hits.loc[row_key, level]
        except KeyError:
            # The row or level does not occur in `data` at all.
            return f"0 / {denominator} (0.0%)"
        return f"{numerator} / {denominator} ({numerator / denominator * 100:.1f}%)"

    # Start from a string copy of the totals so the index and columns are kept.
    result = total.copy().astype(str)
    for level in total.columns:
        for row_key in total.index:
            result.loc[row_key, level] = format_cell(row_key, level)

    display(result)
    
# Caveat printed ahead of the table so it is shown together with the numbers.
print('Please note that results for L4 are not valid because there is no rotation data when the end effector is a spatula.')
# Numerator: rows whose 'success' flag is set; denominator: every row.
show_results(df.loc[df['success']], df)


Please note that results for L4 are not valid because there is no rotation data when the end effector is a spatula.


Unnamed: 0_level_0,level,L1,L2,L3,L4
method,prompt_mode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D-inBC-AuxB-VIMA-80k,hso,234 / 260 (90.0%),229 / 260 (88.1%),190 / 240 (79.2%),27 / 80 (33.8%)
D-inBC-AuxB-VIMA-80k_no_prompt,hso,231 / 260 (88.8%),227 / 260 (87.3%),187 / 240 (77.9%),28 / 80 (35.0%)
D-inBC-AuxB-VIMA-80k_prompt000,hso,229 / 260 (88.1%),227 / 260 (87.3%),189 / 240 (78.8%),28 / 80 (35.0%)
D-inBC-AuxB-VIMA-80k_prompt001,hso,235 / 260 (90.4%),230 / 260 (88.5%),191 / 240 (79.6%),31 / 80 (38.8%)
D-inBC-AuxB-VIMA-80k_prompt002,hso,232 / 260 (89.2%),232 / 260 (89.2%),190 / 240 (79.2%),29 / 80 (36.2%)
D-inBC-AuxB-VIMA-80k_prompt003,hso,231 / 260 (88.8%),229 / 260 (88.1%),190 / 240 (79.2%),28 / 80 (35.0%)
D-inBC-AuxB-VIMA-80k_prompt004,hso,235 / 260 (90.4%),229 / 260 (88.1%),191 / 240 (79.6%),27 / 80 (33.8%)
D-inBC-AuxB-VIMA-80k_prompt005,hso,232 / 260 (89.2%),232 / 260 (89.2%),189 / 240 (78.8%),27 / 80 (33.8%)
D-inBC-AuxB-VIMA-80k_prompt006,hso,230 / 260 (88.5%),226 / 260 (86.9%),191 / 240 (79.6%),29 / 80 (36.2%)
D-inBC-AuxB-VIMA-80k_prompt007,hso,231 / 260 (88.8%),228 / 260 (87.7%),192 / 240 (80.0%),29 / 80 (36.2%)
