## LLM model hosting
We use vLLM to serve the Llama 3.1 70B Instruct model for LLM-as-a-judge evaluation.

In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

from mmassist.datasets.generate.llm_utils import LLMGenerator

# Identifier of the judge model handed to LLMGenerator.build.
model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
# Alternative FP8 checkpoint (original note: for H100 or higher):
# model_id = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8" # for H100 or higher
# Forwarded as `number_gpus` to LLMGenerator.build.
# NOTE(review): presumably the number of GPUs the model is sharded over — confirm in llm_utils.
number_gpus = 4
# `llm` is the generator object used by the evaluation cells below.
llm = LLMGenerator.build(model_id=model_id, number_gpus=number_gpus)

## LLM-Based Evaluation

In [2]:
import json, os, copy, random
from mmassist.configs.arguments import DATA_ROOT_DIR

# Load the generated validation dialogs from the processed "wtag" data folder.
file = f"{DATA_ROOT_DIR}/processed_data/wtag/generated_dialogs/val.json"
# Read through a context manager so the file handle is closed as soon as the
# JSON is parsed (the previous `json.load(open(...))` left it open).
with open(file, "r") as f:
    generated_dialogs = json.load(f)
# NOTE(review): this prints the entire file content; consider printing only
# `len(generated_dialogs)` or a single entry to keep the notebook readable.
print(generated_dialogs)

In [3]:
import os, json
from mmassist.configs.arguments import DATA_ROOT_DIR

# Selector interpolated into both the experiment folder and the dataset name.
I = 1

# Evaluation run to inspect.
runner = "stream"
run_name = "notalk0.5-maxlen_4k"
exp_dir = f"../../ProAssist-Model-L4096-I{I}"
dataset_name = f"ego4d-dialog_val_L0_I{I}"

# Layout: <exp_dir>/eval/<dataset_name>/<runner>/<run_name>/results
result_dir = os.path.join(
    exp_dir, "eval", dataset_name, runner, run_name, "results"
)

# Sanity check: how many entries the results folder contains.
print(len(os.listdir(result_dir)))
# results = json.load(open(os.path.join(result_dir, "all_results.json")))
# metrics = json.load(open(os.path.join(result_dir, "metrics.json")))
# prediction = json.load(open(os.path.join(result_dir,"results", "3.json")))

96


In [4]:
import re, json

# System prompt for the LLM judge. It asks the judge to compare a generated
# dialogue against a gold-standard reference and to rate four aspects
# (correctness, promptness, efficiency, overall) on a 1-5 scale.
# NOTE(review): the text is sent to the model verbatim, including the typos
# ("Keep you analysis", "redundancy information"). Rewording it may shift the
# scores, so treat any edit as a new evaluation setting.
EVALUATION_SYS_PROMPT = """You are an expert in evaluating the quality of user-assistant dialogues. Your task is to evaluate dialog responses generated by an assistant model that helps users with their tasks. You should evaluate the dialogs by comparing them to reference gold-standard dialogues from professional assistants.

Requirement:
1. Read dialogues carefully and compare them line by line. Keep you analysis concise and to the point.
2. Evaluate the following aspects: 
- Correctness: does each generated instruction/feedback make sense (correct or relevant) or not, based on the context and the gold-standard reference?
- Promptness: does the assistant provide guidance at the right time, or does it talk too early or too late?
- Efficiency: does the assistant provide the necessary information in a concise and efficient manner, without too much repetition or redundancy information?
- Overall: the overall helpfulness and quality of the assistant's responses.
3. For each aspect, give a score from 1 to 5 based on the following criteria:
- 1=very poor: most of utterances are incorrect, irrelevant, mistimed, inefficient etc
- 2=poor: bad utterances that are incorrect, irrelevant, mistimed are more than good ones
- 3=average: the number of good and bad utterances are roughly the same
- 4=good: more good utterances than bad ones
- 5=excellent: most of utterances are correct, relevant, timely, efficient etc
"""

# User-message template for one evaluation request. `{reference_dialog}` and
# `{generated_dialog}` are filled in with `str.format`; the doubled braces on
# the last line render as a literal JSON object showing the expected answer
# format (free-text analysis, a `---` separator, then the four scores).
DIALOG_EVALUATION_PROMPT_TEMPLATE = """Gold-standard reference dialogue:
{reference_dialog}

Generated dialogue by the model:
{generated_dialog}

Format your answer as follows:
<your step-by-step comparison and concise analysis>
---
{{"correctness": x, "promptness": y, "efficiency": z, "overall": w}}
"""


# Score names collected from the judge's answer; they match the JSON keys
# requested in the last line of DIALOG_EVALUATION_PROMPT_TEMPLATE.
score_fields = ["correctness", "promptness", "efficiency", "overall"]

def parse_scores(text: str) -> dict | None:
    scores = {}
    if "---" in text:
        text = text.split("---")[1]
    text = text.strip("\n")
    try:
        scores = json.loads(text)
    except:
        print("*"*50)
        print(text)
        print("*"*50)
        return None
    return scores

In [8]:
def _format_dialog(turns, assistant_key):
    """Render prediction turns as a timestamped transcript.

    For each turn, every ``("user", text)`` entry of ``turn["text_inputs"]``
    becomes a ``[<t>s] User: ...`` line. It is followed by a
    ``[<t>s] Assistant: ...`` line when ``turn[assistant_key]`` is truthy.

    Args:
        turns: The ``"predictions"`` list of one result file.
        assistant_key: ``"ref"`` for the reference utterances or ``"gen"``
            for the model-generated ones.

    Returns:
        The transcript as a single string, one utterance per line.
    """
    lines = []
    for turn in turns:
        timestamp = turn["timestamp_in_stream"]
        for t in turn["text_inputs"]:
            if t[0] == "user":
                lines.append(f"[{timestamp}s] User: {t[1]}\n")
        if turn[assistant_key]:
            lines.append(f"[{timestamp}s] Assistant: {turn[assistant_key]}\n")
    return "".join(lines)


# Build one judge prompt per result file, keeping the transcripts alongside
# so that scores can be attached to them later.
all_predictions = []
all_prompts = []
for file in sorted(os.listdir(result_dir)):
    with open(os.path.join(result_dir, file), "r") as f:
        prediction = json.load(f)

    # Both transcripts share the same user turns and differ only in which
    # assistant utterance ("ref" vs "gen") is shown.
    gen_and_ref = prediction["predictions"]
    ref_dialog = _format_dialog(gen_and_ref, "ref")
    gen_dialog = _format_dialog(gen_and_ref, "gen")

    prompt = DIALOG_EVALUATION_PROMPT_TEMPLATE.format(
        reference_dialog=ref_dialog, generated_dialog=gen_dialog
    )
    inputs = [("system", EVALUATION_SYS_PROMPT), ("user", prompt)]

    all_predictions.append({
        "file": file,
        "ref": ref_dialog,
        "gen": gen_dialog,
    })
    all_prompts.append(inputs)

In [9]:
from tqdm import tqdm

# Judge sampling setup: completions requested per prompt, and prompts per call.
num_repeat = 3
batch_size = 8

# Send the prompts to the judge slice by slice, appending each call's results
# to `outputs`.
outputs = []
for start in tqdm(range(0, len(all_prompts), batch_size)):
    stop = start + batch_size
    outputs += llm.batch_generate(all_prompts[start:stop], n=num_repeat)

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [04:54<00:00, 24.51s/it]


In [12]:
# Spot-check: show the generated-dialogue transcript of the first prediction.
print(all_predictions[0]['gen'])

[1.5s] User: I want to prepare a multi-course meal with grilled fish, meat, and rice.
[1.5s] Assistant: Great goal! To start, let's gather all the ingredients we need. Please collect the fish, meat, rice, and any seasonings or spices you'd like to use.
[139.5s] Assistant: Now that we have all the ingredients, let's start preparing the rice. Please wash the rice thoroughly to remove any impurities.
[157.0s] Assistant: Next, add water to the rice. Make sure the water level is at the recommended ratio. Then, put the rice in the rice cooker and turn it on.
[188.5s] Assistant: While the rice is cooking, let's prepare the grill. Please preheat it to the right temperature for grilling the fish and meat.
[203.5s] Assistant: Now, let's move on to preparing the fish. Please unwrap the fish and wash it to remove any impurities.
[228.5s] Assistant: Next, add some oil to the fish to prevent it from sticking to the grill. Then, add your desired seasonings or spices to the fish. You can use salt, pep

In [13]:
import re

def parse_scores(text: str) -> dict | None:
    scores = {}
    if "---" in text:
        text = text.split("---")[1]
    try:
        text = re.findall(r"\{.*?\}", text)[0]
        scores = json.loads(text)
    except:
        # print("*"*50)
        # print(text)
        # print("*"*50)
        return None
    return scores

# Attach the judge's scores to every prediction record.
# NOTE(review): zip() stops at the shorter sequence, so `outputs` must hold
# one entry per prediction — confirm the generation cell ran to completion.
for idx, (pred, out) in enumerate(zip(all_predictions, outputs)):
    # print(pred["file"])
    # print(out[0])

    # Valid scores gathered across the judge samples, per score field.
    all_scores = {k: [] for k in score_fields}
    # Never index past the samples actually returned for this prompt.
    for i in range(min(num_repeat, len(out))):
        parsed_dict = parse_scores(out[i])
        if not parsed_dict:
            continue
        # Read only the expected fields: unexpected keys from the judge are
        # ignored instead of raising KeyError, and non-numeric values are
        # skipped so the averaging below cannot fail.
        for k in score_fields:
            v = parsed_dict.get(k)
            if isinstance(v, (int, float)) and not isinstance(v, bool):
                all_scores[k].append(v)

    pred["all_scores"] = all_scores
    pred["llm_outputs"] = out
    # Mean per field; None marks a field for which no sample gave a usable score.
    mean_scores = {}
    for k, v in all_scores.items():
        if v:
            mean_scores[k] = sum(v) / len(v)
        else:
            mean_scores[k] = None
    # Compare against None explicitly so a mean of 0 is still printed.
    print(idx, {k: f"{v:.2f}" if v is not None else None for k, v in mean_scores.items()})
    pred["scores"] = mean_scores
    # print("\n\n\n")

# Compute the mean of each score over all dialogues that have one.
for m in score_fields:
    # Dialogues whose score is None (no usable judge sample) are left out;
    # the explicit None test keeps a legitimate 0 in the average.
    scores = [p["scores"][m] for p in all_predictions if p["scores"][m] is not None]
    if scores:
        mean_score = sum(scores) / len(scores)
        print(f"{m}: {mean_score:.2f}")
    else:
        # Avoid ZeroDivisionError when no dialogue has a score for this field.
        print(f"{m}: n/a")



import json
# Write all prediction records to disk as indented JSON.
save_file = "llm_eval_try4.json"
with open(save_file, "w") as out_fp:
    json.dump(obj=all_predictions, fp=out_fp, indent=2)


0 {'correctness': '3.67', 'promptness': '3.00', 'efficiency': '2.33', 'overall': '3.00'}
1 {'correctness': '2.33', 'promptness': '2.33', 'efficiency': '1.33', 'overall': '2.00'}
2 {'correctness': '3.67', 'promptness': '3.33', 'efficiency': '2.33', 'overall': '3.33'}
3 {'correctness': '3.17', 'promptness': '3.00', 'efficiency': '2.17', 'overall': '2.67'}
4 {'correctness': '1.00', 'promptness': '1.33', 'efficiency': '1.00', 'overall': '1.00'}
5 {'correctness': '2.00', 'promptness': '2.00', 'efficiency': '2.00', 'overall': '2.00'}
6 {'correctness': '2.00', 'promptness': '2.00', 'efficiency': '1.33', 'overall': '2.00'}
7 {'correctness': '2.00', 'promptness': '2.00', 'efficiency': '1.00', 'overall': '2.00'}
8 {'correctness': '2.00', 'promptness': '2.33', 'efficiency': '2.00', 'overall': '2.00'}
9 {'correctness': '2.00', 'promptness': '2.67', 'efficiency': '2.00', 'overall': '2.00'}
10 {'correctness': '2.00', 'promptness': '2.00', 'efficiency': '1.00', 'overall': '2.00'}
11 {'correctness': '

In [None]:
correctness: 1.95
promptness: 2.10
efficiency: 1.52
overall: 1.90

In [None]:
correctness: 2.13
promptness: 2.32
efficiency: 1.79
overall: 2.06

In [65]:
import re

def parse_scores(text: str) -> dict | None:
    scores = {}
    if "---" in text:
        text = text.split("---")[1]
    try:
        text = re.findall(r"\{.*?\}", text)[0]
        scores = json.loads(text)
    except:
        # print("*"*50)
        # print(text)
        # print("*"*50)
        return None
    return scores

# Attach the judge's scores to every prediction record.
# NOTE(review): zip() stops at the shorter sequence, so `outputs` must hold
# one entry per prediction — confirm the generation cell ran to completion.
for idx, (pred, out) in enumerate(zip(all_predictions, outputs)):
    # print(pred["file"])
    # print(out[0])

    # Valid scores gathered across the judge samples, per score field.
    all_scores = {k: [] for k in score_fields}
    # Never index past the samples actually returned for this prompt.
    for i in range(min(num_repeat, len(out))):
        parsed_dict = parse_scores(out[i])
        if not parsed_dict:
            continue
        # Read only the expected fields: unexpected keys from the judge are
        # ignored instead of raising KeyError, and non-numeric values are
        # skipped so the averaging below cannot fail.
        for k in score_fields:
            v = parsed_dict.get(k)
            if isinstance(v, (int, float)) and not isinstance(v, bool):
                all_scores[k].append(v)

    pred["all_scores"] = all_scores
    pred["llm_outputs"] = out
    # Mean per field; None marks a field for which no sample gave a usable score.
    mean_scores = {}
    for k, v in all_scores.items():
        if v:
            mean_scores[k] = sum(v) / len(v)
        else:
            mean_scores[k] = None
    # Compare against None explicitly so a mean of 0 is still printed.
    print(idx, {k: f"{v:.2f}" if v is not None else None for k, v in mean_scores.items()})
    pred["scores"] = mean_scores
    # print("\n\n\n")

# Compute the mean of each score over all dialogues that have one.
for m in score_fields:
    # Dialogues whose score is None (no usable judge sample) are left out;
    # the explicit None test keeps a legitimate 0 in the average.
    scores = [p["scores"][m] for p in all_predictions if p["scores"][m] is not None]
    if scores:
        mean_score = sum(scores) / len(scores)
        print(f"{m}: {mean_score:.2f}")
    else:
        # Avoid ZeroDivisionError when no dialogue has a score for this field.
        print(f"{m}: n/a")



import json
# Write all prediction records to disk as indented JSON.
save_file = "llm_eval.json"
with open(save_file, "w") as out_fp:
    json.dump(obj=all_predictions, fp=out_fp, indent=2)


0 {'correctness': '2.17', 'promptness': '2.00', 'efficiency': '1.67', 'overall': '2.17'}
1 {'correctness': '2.00', 'promptness': '1.67', 'efficiency': '1.00', 'overall': '1.67'}
2 {'correctness': '3.00', 'promptness': '2.33', 'efficiency': '2.00', 'overall': '2.67'}
3 {'correctness': '2.33', 'promptness': '2.67', 'efficiency': '1.67', 'overall': '2.00'}
4 {'correctness': '2.00', 'promptness': '2.33', 'efficiency': '1.67', 'overall': '2.00'}
5 {'correctness': '3.20', 'promptness': '3.50', 'efficiency': '2.50', 'overall': '3.25'}
6 {'correctness': '2.00', 'promptness': '2.33', 'efficiency': '1.67', 'overall': '2.00'}
7 {'correctness': '2.33', 'promptness': '2.00', 'efficiency': '1.67', 'overall': '2.00'}
8 {'correctness': '1.67', 'promptness': '2.00', 'efficiency': '1.00', 'overall': '1.33'}
9 {'correctness': '1.67', 'promptness': '2.67', 'efficiency': '2.00', 'overall': '2.00'}
10 {'correctness': '1.93', 'promptness': '2.00', 'efficiency': '1.67', 'overall': '2.00'}
11 {'correctness': '

In [None]:
# Ad-hoc check: score a single dialogue several times to see how much the
# judge's scores vary between samples.
# NOTE: `ref_dialog` / `gen_dialog` are whatever the prompt-building loop left
# behind, i.e. the transcripts of the last result file it processed.
prompt = DIALOG_EVALUATION_PROMPT_TEMPLATE.format(
    reference_dialog=ref_dialog, generated_dialog=gen_dialog
)
inputs = [("system", EVALUATION_SYS_PROMPT), ("user", prompt)]

# Number of judge samples requested for this single prompt.
num_samples = 5
# Stored under its own name: assigning to `outputs` here would overwrite the
# batched results that the aggregation cells read.
single_outputs = llm.generate(inputs, n=num_samples)
# print(single_outputs[0])

all_scores = {k: [] for k in score_fields}
# Never index past the samples actually returned.
for i in range(min(num_samples, len(single_outputs))):
    parsed = parse_scores(single_outputs[i])
    if not parsed:
        continue
    # Only the expected fields are read, and only numeric values are kept, so
    # unexpected keys or malformed values from the judge cannot raise here.
    for k in score_fields:
        v = parsed.get(k)
        if isinstance(v, (int, float)) and not isinstance(v, bool):
            all_scores[k].append(v)
print(all_scores)
for k, v in all_scores.items():
    if v:
        avg = sum(v) / len(v)
        print(f"{k}: {avg:.1f}")
    else:
        # No sample produced a usable score for this field.
        print(f"{k}: n/a")