# LLaRA evaluation

## Load all evaluation results

In [1]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

# Generalization levels in evaluation order; position k in this list is
# reported as "L{k+1}" in the result tables.
levels = ['placement_generalization',
 'combinatorial_generalization',
 'novel_object_generalization',
 'novel_task_generalization']


def strip_histories(episode):
    """Remove the LM prompt/answer histories from one episode record, in place.

    Each key is removed independently, so a record that only carries
    ``lm_answer_hist`` is still cleaned (deleting both keys inside a single
    ``try`` block would stop at the first missing key).

    Args:
        episode: one non-'global' value from a result JSON. Non-dict values
            are left untouched.

    Returns:
        The same object that was passed in.
    """
    if isinstance(episode, dict):
        episode.pop('lm_prompt_hist', None)
        episode.pop('lm_answer_hist', None)
    return episode


def method_label(model_name, pid):
    """Build the value of the 'method' column from a model name and prompt id.

    Args:
        model_name: name parsed from the result file name.
        pid: ``prompt_id`` read from the file's 'global' entry (negative
            when the entry or the key is missing).

    Returns:
        ``model_name`` unchanged for a negative id, ``model_name`` with a
        ``_promptNNN`` suffix for ids below 15, and ``model_name`` with a
        ``_no_prompt`` suffix otherwise.
    """
    if pid < 0:
        # these methods use random user_prompt for action generation (default setting reported in paper)
        return model_name
    if pid < 15:
        # these methods use a fixed user_prompt for action generation
        return model_name + f'_prompt{pid:03d}'
    # these methods omit the user_prompt for action generation
    return model_name + '_no_prompt'


# Result files are expected to start with '[' (the unclosed bracket in the
# pattern is matched literally). Sorting keeps the row order of `df`
# reproducible, since glob() returns paths in arbitrary order.
files = sorted(glob('./results/[*.json'))
result = []

for f in files:
    # Model name = path without '.json', with any parenthesised part removed,
    # keeping only what follows the last ']'.
    model_name = f[:-5]
    model_name = re.sub(r'\(.*\)', '', model_name).split(']')[-1]

    # Context manager so the file handle is closed right after parsing.
    with open(f, 'r') as fh:
        js = json.load(fh)

    # about prompt mode
    pm = ''
    pid = -1
    prop = []
    model_path = None  # read from the 'global' entry; not used further in this cell
    for key, value in js.items():
        if key == 'global':
            pm = value.get('prompt_mode', 'N/A')
            pid = value.get('prompt_id', -2)
            model_path = value.get('model_path', None)
        else:
            prop.append(strip_histories(value))

    # Label the episodes only after the whole file has been scanned, so the
    # position of the 'global' entry inside the JSON does not matter.
    for episode in prop:
        episode['level'] = f"L{levels.index(episode['level']) + 1}"
        episode['method'] = method_label(model_name, pid)
        episode['prompt_mode'] = pm
    result.extend(prop)


# One row per episode; missing fields become empty strings.
df = pd.DataFrame(result).fillna('')
# display(df.head())

## Show the success rate

In [2]:
def show_results(data, data_total):
    """Display a per-method success table as "successes / total (rate%)".

    Rows are (method, prompt_mode) pairs and columns are levels, both taken
    from ``data_total``.

    Args:
        data: frame holding only the successful episodes.
        data_total: frame holding every episode.

    Both frames need the columns 'method', 'prompt_mode' and 'level'.
    Nothing is returned; the table is rendered with ``display``.
    """
    keys = ['method', 'prompt_mode', 'level']
    success_counts = data.groupby(keys).size().unstack(fill_value=0)
    episode_counts = data_total.groupby(keys).size().unstack(fill_value=0)

    # Same shape and labels as the totals table, but holding text cells.
    table = episode_counts.copy().astype(str)
    for row_key in episode_counts.index:
        for level in episode_counts.columns:
            n_total = episode_counts.loc[row_key, level]
            if n_total > 0:
                try:
                    n_success = success_counts.loc[row_key, level]
                    cell = f"{n_success} / {n_total} ({n_success / n_total * 100:.1f}%)"
                except KeyError:
                    # No successful episode at all for this row or level.
                    cell = f"0 / {n_total} (0.0%)"
            else:
                # This method was never evaluated on this level.
                cell = 'N/A'
            table.loc[row_key, level] = cell

    display(table)
    
# Caveat printed above the table, then the table itself: successful episodes
# (rows where 'success' is truthy) counted against all episodes.
print('Please note that results for L4 are not valid because there is no rotation data when the end effector is a spatula.')
show_results(data=df[df['success']], data_total=df)


Please note that results for L4 are not valid because there is no rotation data when the end effector is a spatula.


Unnamed: 0_level_0,level,L1,L2,L3
method,prompt_mode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Manishs_egoexo4d-inst-yolo_vllama3-s3init-Proj-LLM,hs,8 / 260 (3.1%),9 / 260 (3.5%),8 / 240 (3.3%)
Manishs_egoexo4d-inst-yolo_vllama3-s3init-Proj-LLM,hso,0 / 260 (0.0%),0 / 260 (0.0%),3 / 240 (1.2%)
TESTING,hso,9 / 17 (52.9%),2 / 4 (50.0%),4 / 4 (100.0%)
videollama3-LLMProj_3epoch_DinBC-8k,hso,173 / 260 (66.5%),159 / 260 (61.2%),156 / 240 (65.0%)
videollama3-LLMProj_3epoch_inBC-8k,hs,123 / 260 (47.3%),124 / 260 (47.7%),119 / 240 (49.6%)
videollama3-LLMProj_DinBC-8k,ho,134 / 260 (51.5%),124 / 260 (47.7%),111 / 240 (46.2%)
videollama3-LLMProj_DinBC-8k,hs,90 / 260 (34.6%),85 / 260 (32.7%),86 / 240 (35.8%)
videollama3-LLMProj_DinBC-8k,hso,173 / 260 (66.5%),165 / 260 (63.5%),159 / 240 (66.2%)
videollama3-LLMProj_DinBC-8k-VIDEO,ho,31 / 260 (11.9%),26 / 260 (10.0%),33 / 240 (13.8%)
videollama3-LLMProj_DinBC-8k-VIDEO,hso,13 / 260 (5.0%),14 / 260 (5.4%),18 / 240 (7.5%)


In [28]:
import random, json, cv2, re, os
import matplotlib.pyplot as plt

# Locations and sample count for the instruction-set preview.
DATA_ROOT = '/projects/vidlab_data/data'
OUTPUT_DIR = './_exp_frames'
NUM_PREVIEW_SAMPLES = 20

# Context manager so the JSON file handle is closed after loading.
with open(os.path.join(DATA_ROOT, 'EgoExo_cooking_pick_place_instruction_set.json')) as fh:
    data = json.load(fh)

# Unseeded shuffle: every run previews a different random subset.
random.shuffle(data)

data = data[:NUM_PREVIEW_SAMPLES]

# cv2.imwrite does not create missing directories, so make sure it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for i, sample in enumerate(data):
    image_path = os.path.join(DATA_ROOT, sample['image'][0])
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        print(f'Skipping sample {i}: could not read image {image_path}')
        continue
    h, w = image.shape[:2]

    # First conversation turn: the prompt, from which the <task> text is taken.
    question = sample['conversations'][0]['value']
    task_str = re.findall(r"<task>(.*?)</task>", question)

    # Second conversation turn: the answer, holding "<b>(x, y)</b>" points.
    answer = sample['conversations'][1]['value']
    matches = re.findall(r"<b>\(([\d.]+), ([\d.]+)\)</b>", answer)
    matches = [tuple(map(float, match)) for match in matches]

    for j, (x, y) in enumerate(matches):
        # Coordinates are scaled by the image size, so they are presumably
        # normalised to [0, 1] -- TODO confirm against the dataset.
        x = int(x * w)
        y = int(y * h)
        # BGR colour: j=0 -> (0, 255, 0), j=1 -> (255, 0, 0); for j >= 2 the
        # channel values fall outside the 0-255 range.
        cv2.circle(image, (x, y), 5, (255*j, 255-(j*255), 0), -1)

    # Write each task string in black, then again in white shifted by one
    # pixel, so the text stays legible on any background.
    for j, task in enumerate(task_str):
        cv2.putText(image, task, (10, 30 + j*30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
        cv2.putText(image, task, (11, 31 + j*30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

    out_path = os.path.join(OUTPUT_DIR, f'instruction_{i}.jpg')
    if not cv2.imwrite(out_path, image):
        print(f'Failed to write {out_path}')