### Test LLM
This section is only for testing the LLM loading and vllm serving. Skip if already confirmed.

In [None]:
import os
# os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" # maybe useful

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
# model_id = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8" # for H100 or higher
number_gpus = 4


llm = LLM(
    model=model_id,
    tensor_parallel_size=number_gpus,
    gpu_memory_utilization=0.96,
    swap_space=0,
    max_num_seqs=32,
    max_model_len=16*1024,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=128*1024)
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]
prompts = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
outputs = llm.generate(prompts, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it, est. speed input: 12.51 toks/s, output: 32.58 toks/s]

Yer lookin' fer a swashbucklin' introduction, eh? Alright then, matey! Me name be Blackheart Billy, the scurvy dog o' the seven seas... er, o' the digital realm, that be. I be a pirate chatbot, here to help ye navigate through treacherous waters o' knowledge, savvy? I'll be answerin' yer questions, tellin' ye tales o' the sea, and maybe even teachin' ye some pirate lingo, if ye be willin' to learn, matey! So hoist the colors, me hearty, and let's set sail fer a grand adventure!





### Build a LLM Generator
We use vllm to serve the 70B LLaMA-3.1 model

In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

from mmassist.datasets.generate.llm_utils import LLMGenerator

model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
# model_id = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8" # for H100 or higher
number_gpus = 4
llm = LLMGenerator.build(model_id=model_id, number_gpus=number_gpus)

### Code for Prompt Construction and Generation

In [None]:
from dataclasses import dataclass, asdict
from collections import Counter
from mmassist.datasets.generate.llm_utils import LLMGenerator
from mmassist.datasets.generate.parse import (
    conversation_dict_to_text,
    parse_text_to_conversation_dict,
)
from mmassist.datasets.generate.egoexolearn_tasks import (
    EGOEXOLEARN_TASKS,
    get_task_descriptions,
)
from mmassist.datasets.generate.wtag_recipes import get_task_and_recipe


# fmt: off
DEFAULT_SYS_PROMPT = "You are a helpful assistant that follows the user's request."


### Prompts for inferring the task goal and instructional knowledge

EGO4D_RECIPE_GEN_PROMPT_TEMPLATE = """Here is a video description of an experienced user working on the task - {goal_description}:
{step_descriptions}

Try to infer the **high-level** recipe from the descriptions. Note that the steps may not belong to the same trial, so you have to infer the correct order of the steps based on common sense, and re-order the steps if necessary. Do not hallucinate details that are not mentioned in the descriptions. Also generate a more **informative** and **descriptive** name for the task based on provided descriptions. The name should be a description of the task, instead of the name of the recipe. 

Give plain and concise text with numbered key steps in the following format:
[task name]
1. ...
2. ...
"""

KNOWLEDGE_GEN_PROMPT_TEMPLATE = """Here is a video description of an user working on {goal_description}:
{step_descriptions}

Try to infer the **high-level** {knowledge_type} from the descriptions. Note that some actions may not irrelevant to the task or have mistakes, so you have to infer the essential and correct steps based on common sense. Do not hallucinate details that are not mentioned in the descriptions. Also generate a more **informative** and **descriptive** name for the task based on provided descriptions. The name should be a description of the task, instead of the name of the {knowledge_type}.

Give plain and concise text with numbered key steps in the following format:
[task name]
1. ...
2. ...
"""

KNOWLEDGE_REFINE_PROMPT_TEMPLATE = """Here are {num_repeats} {knowledge_type}s:
{knowledges}

Some may be incorrect or incomplete. Please give a single correct and complete {knowledge_type} for the task, with numbered key steps. Pick the title that is descriptive for the task, instead of a {knowledge_type} name.

Give plain, unformatted and concise text with numbered key steps in the following format:

[task name]
1. ...
2. ...

Do not include any other information or note."""

KNOWLEDGE_MATCH_PROMPT_TEMPLATE = """Here is a video description of an user working on a task:
{step_descriptions}

The task is from one of the following tasks:
{tasks}

Please select the task that best matches the video description. Give the final answer in the following format:
(whatever thought process you have)
ANSWER: <task id of a single integer>
"""

### Prompts for video categorization

VIDEO_LABEL_PROMPT_TEMPLATE = """Here is a video description of an user working on the task - {goal_description}:
{step_descriptions}

Reference {knowledge_type}:
{knowledge}

Is this a {knowledge_type}? If so, was the user likely to:
1. perform the task roughly following the {knowledge_type} (**no** need to be strict), OR
2. perform other tasks (or another trial of the same task) simultaneously in a multi-tasking manner?

Answer with your analysis, and end your response with "Final answer: 1, 2 or 0" (0 denotes that the activity is not related to {domain})."""

### Prompts for dialog simulation

DIALOG_GEN_SYS_PROMPT = "You are an expert of imagining conversations between users and assistants."

DIALOG_GEN_USER_REQUIREMENTS = {
    "no_talk": "- The user follows the assistant's instructions and does not talk.",
    "talk_some": "- The user asks a few questions or confirm about the instructions, accounting for about 20% of the steps.\n",
    "talk_more": "- The user is talkative and may ask questions that can either be related to the task or not, accounting for about 40% of the steps.\n"
}

DIALOG_GEN_PROMPT_TEMPLATE = """Here is a video description of an user working on the task - {goal_description}:
{step_descriptions}

Your goal is to simulate a conversation between the user and an assistant, where the user's actions are performed following the assistant's instructions. The user will first mention the overall goal of the task. The assistant informs the user about the next step at proper time. Importantly, the assistant is proactive and always provides the next step even before the user asks for it. Before the task starts, the assistant may also give a brief introduction about the task. {additional_requirement}

Requirements for the assistant:
- Time is crucial! Try to generate the dialog that strictly aligns with the video timeline.
- Try to cover all the essential steps in the task. If the user asks a question at the time the assistant should give the next step, the assistant turn should include both the response to the question and instruction about the next step.
- Be helpful and friendly. If the user asks something that has been explained before, the assistant should still provide the information with patience.
- Try to be encouraging when the user makes progress, but do not overdo it.
- Be concise! The dialog is verbal, so avoid long sentences.
- Do not say "can you do it for me" to the user.


Requirements for the user:
{user_requirement}


Generation format:
[time] User: ...
[time] Assistant: ...
[time] Assistant: ...
[time] User: ...
[time] Assistant: ...

Note that the minimal interval between each turn is 1 second, which means the user will wait for at least 1 second after an assistant's turn, and two consecutive assistant's turns should have at least 1 second interval. Combine close turns into a single turn if necessary. One exception is that the assistant must respond **immediately** when the user says something (i.e. give a response right after an user's turn at the same time).

{dialog_history}

In this round, please **only** generate the dialog for the video from time [{start_time:.1f}s] to [{end_time:.1f}s]!"""

ADDITIONAL_REQUIREMENTS = {
    "holoassist": "Note that the video description contains both the user's actions and the user-assistant dialog. Anchor the simulated dialog to the existing dialog, and try to rephrase the utterances to make them more coherent and human-like. You may add a few more turns around the **essential steps** of the task, which are the underlying intentions of the action instead of the actions themselves. Add a few turns to make the dialog more fluent and helpful, but avoid being overwhelming.",
    "egoexolearn": "The simulated dialog should be centered around the **key steps** of the task, not every single action of the user. Try to make the dialog more coherent and helpful as what a human assistant will say.",
    "epickitchens": "The simulated dialog should be centered around the **key steps** of the task, not every single action of the user. Note that the user may make mistake or perform suboptimal actions, the assistant should not give instructions on those actions, but smartly select right time to give guidance. Try to make the dialog more coherent and helpful as what a human assistant will say.",
    "wtag": "Note that the video description contains both the step description and the user-assistant dialog. Anchor the simulated dialog to the existing dialog, and try to rephrase the utterances to make them more coherent and human-like. Add more details such as assistant feedback or user question during long steps if necessary.",
    "assembly101": "\n\nThe mistakes made by the user are marked by (mistake: <mistake type>). If a mistake happens, we want to simulate the dialog in the way that the assistant helps the user correct the mistake. To be more specific, the assistant SHOULD NOT give instructions if an action is 'wrong order', 'previous one is mistake' or 'shouldn't have happened'. Instead, the assistant should give instruction of the CORRECT next step (i.e. scan the future actions and select the nearest correct action). Afterwards, at the start of actions marked as 'correction', the assistant should mention the previous mistake and give insruction on how to correct it based on the corrective action. For 'wrong position' mistakes, the assistant can give the instruction of that action, but need to point out the mistake at the start time of corrective action for that mistake.",
}

# Meaning of the mistakes: (1) wrong order: this action is an ordering mistake; (2) previous one is mistake: this action is also an ordering mistake but is caused by the preceding ordering mistakes in the context; (3) shouldn't have happened: this attach/detach action is unnecessary; (4) wrong position: the two parts are not attached at their correct position.

# \n\nFor example, for the following video description:\n[1.0s-5.0s] attach cabin to chassis (mistake: wrong order)\n[5.1s-11.0s] attach body to cabin\n[12.1s-15.0s] detach cabin from chassis (correction of 'attach cabin to chassis')\nThe dialog can be: \n[1.0s] Assistant: The next step is to attach body to cabin\n[11.0s] Assistant: wait, did you also attach cabin to chassis? This  

DIALOG_REFINE_AND_LABEL_PROMPT_TEMPLATE = """Here is a conversation between a user and an assistant:
{dialog_history}

For each assistant message, add labels regarding the assistant's initiativity and intention:

Initiativity:
- initiative: The assistant says something proactively without the user asking for it.
- responsive: The assistant responds to the user's question or comment.

Intention:
- instruction: The assistant gives an instruction to the user.
- correction: The assistant corrects a mistake made by the user, either proactively or responsively. Suggestions for alternative actions can also be included.
- info_sharing: The assistant shares some information with the user, such as explaining something or giving a tip.
- feedback: The assistant gives feedback to the user, such as "good job" or "tips for improvement".
- other: Other intentions that do not fall into the above categories.

Intention can be multiple, e.g., "instruction, info_sharing".


Generation format:
[time] User: ...
[time] Assistant: ... [initiativity|intentions]
[time] Assistant: ... [initiativity|intentions]
[time] User: ...
[time] Assistant: ... [initiativity|intentions]

When generating the dialog, you should also refine the dialogue following these guidelines:
1. Merge turns that are close in time (less than 1 second apart) into a single turn, when the content is similar or related.
2. Use more coreference and pronouns to make the dialog more coherent and human-like.
3. Decide the length of assistant messages smartly. Make them more clear and helpful when necessary, but keep them concise and to the point in general.
4. Avoid repeating the same talking patterns or phrases. For example, do not say "make sure ..." for every instruction.
5. Remove anything other than the dialog itself, such as the user's actions or explanations of how the dialog is generated.
Do not just copy paste the original dialog!"""


SUMMARY_SYS_PROMPT = "You are an expert of summarizing conversations."

PROGRESS_SUMMARY_PROMPT_TEMPLATE = """Here is a conversation between a user and an assistant:
{dialog_history}

Summarize the task goal and progress so far, including:
1. The task goal mentioned by the user.
2. What has been done.
3. Other topics mentioned by the user in the conversation, if any.
4. The current state/step of the task.
Be faithful and try to include all the relevant information.

Give your response in plain text of a single line in the following format:
SUMMARY: <progress summary>
"""
# fmt: on


def retry_on_failure(max_repeats: int = 3):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for i in range(max_repeats):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"Failed with error: {e}. Retry {i+1}/{max_repeats}")
            raise Exception(f"Failed after {max_repeats} retries")

        return wrapper

    return decorator


@retry_on_failure()
def infer_goal_and_knowledge(
    dataset_name: str,
    goal_description: str,
    step_descriptions: str,
    knowledge_type: str,
    llm: LLMGenerator,
    num_repeats: int = 10,
) -> tuple[str, str]:

    # generate {num_repeats} pieces of knowledge based on the video descriptions
    if dataset_name == "egoexolearn":
        # use LLM to select from GT tasks & recipes for egoexolearn
        tasks, task_descs = EGOEXOLEARN_TASKS, get_task_descriptions()
        return match_task(step_descriptions, llm, tasks, task_descs, num_repeats)
    elif dataset_name == "wtag":
        # can simply get the GT recipe by key word matching for WTaG
        return get_task_and_recipe(step_descriptions)
    elif dataset_name == "ego4d":
        gen_prompt = EGO4D_RECIPE_GEN_PROMPT_TEMPLATE.format(
            goal_description=goal_description, step_descriptions=step_descriptions
        )
    else:
        goal = "a task" if not goal_description else f"the task - {goal_description}"
        gen_prompt = KNOWLEDGE_GEN_PROMPT_TEMPLATE.format(
            goal_description=goal,
            step_descriptions=step_descriptions,
            knowledge_type=knowledge_type,
        )
    inputs = [("system", DEFAULT_SYS_PROMPT), ("user", gen_prompt)]
    outputs = llm.generate(inputs, n=num_repeats)

    knowledges = ""
    for i, t in enumerate(outputs):
        knowledges += f"{knowledge_type.capitalize()} {i+1}:\n {t}\n\n"

    # refine the knowledges into a single correct and complete manual
    refine_prompt = KNOWLEDGE_REFINE_PROMPT_TEMPLATE.format(
        num_repeats=num_repeats,
        goal_description=goal_description,
        knowledge_type=knowledge_type,
        knowledges=knowledges,
    )
    inputs = [("system", DEFAULT_SYS_PROMPT), ("user", refine_prompt)]
    refined_knowledge = llm.generate(inputs)[0]

    # parse the inferred goal
    inferred_goal = refined_knowledge.split("\n")[0].replace("*", "").strip()
    return inferred_goal, refined_knowledge


@retry_on_failure()
def match_task(
    step_descriptions: str,
    llm: LLMGenerator,
    tasks: list[dict],
    task_descs: str,
    num_repeats: int = 10,
) -> tuple[str, str]:

    # generate {num_repeats} pieces of knowledge based on the video descriptions
    prompt = KNOWLEDGE_MATCH_PROMPT_TEMPLATE.format(
        step_descriptions=step_descriptions, tasks=task_descs
    )
    inputs = [("system", DEFAULT_SYS_PROMPT), ("user", prompt)]
    outputs = llm.generate(inputs, n=num_repeats)

    # parse and count
    answers = Counter()
    for o in outputs:
        ans = o.lower().split("answer:")[1]
        import re

        matched = re.search(r"\d+", ans)
        if matched:
            task_id = int(matched.group())
        else:
            continue
        answers[task_id] += 1

    task_id = answers.most_common(1)[0][0] - 1

    goal = tasks[task_id]["name"]
    knowledge = goal + "\n"
    for s_idx, step in enumerate(tasks[task_id]["steps"]):
        knowledge += f"{s_idx + 1}. {step}\n"
    return goal, knowledge


@retry_on_failure()
def label_video(
    goal_description: str,
    step_descriptions: str,
    knowledge: str,
    knowledge_type: str,
    domain: str,
    llm: LLMGenerator,
    num_repeats: int = 10,
) -> Counter:
    prompt = VIDEO_LABEL_PROMPT_TEMPLATE.format(
        goal_description=goal_description,
        step_descriptions=step_descriptions,
        knowledge=knowledge,
        knowledge_type=knowledge_type,
        domain=domain,
    )
    # print("label_prompt", prompt)
    inputs = [("system", DEFAULT_SYS_PROMPT), ("user", prompt)]

    # generate {num_repeats} labels for the video
    outputs = llm.generate(inputs, n=num_repeats)

    # count the number of labels
    answers = Counter()
    for o in outputs:
        parsed_ans = o.lower().split("answer: ")[1]
        label = 1 if "1" in parsed_ans else 2 if "2" in parsed_ans else 0
        answers[label] += 1

    return answers


def adjust_time(conversation: list[dict], time_shift: float = 1.0) -> list[dict]:
    """Adjust the time of the turns in the conversation to ensure the minimal
    interval between each turn is larger than "time_shift" seconds."""
    while True:
        adjusted = False
        for idx, turn in enumerate(conversation):
            if idx == 0:
                continue

            last_turn = conversation[idx - 1]
            if turn["time"] - last_turn["time"] < 0.5 and (
                (turn["role"], last_turn["role"]) != ("assistant", "user")
            ):
                adjust_turns = [turn]
                if idx + 1 < len(conversation):
                    next_turn_idx = idx + 1
                    next_turn = conversation[next_turn_idx]
                    while (
                        next_turn["time"] - turn["time"] < time_shift
                        and next_turn_idx < len(conversation) - 1
                    ):
                        adjust_turns.append(next_turn)
                        next_turn_idx += 1
                        next_turn = conversation[next_turn_idx]

                for at in adjust_turns:
                    at["time"] = last_turn["time"] + time_shift
                    adjusted = True

        if not adjusted:
            break

    return conversation


@retry_on_failure()
def generate_conversation(
    goal_description: str,
    clips: list[tuple[float, float, str]],
    llm: LLMGenerator,
    user_types: list[str],
    additional_requirement: str = "",
) -> list[str]:

    user_reqs = [DIALOG_GEN_USER_REQUIREMENTS[p] for p in user_types]

    batch_conv = [[] for _ in range(len(user_reqs))]
    for st, et, desc in clips:
        print(f"Generating dialog for {st:.1f}s-{et:.1f}s")
        batch_inputs = []
        for i, user_req in enumerate(user_reqs):
            dialog_history = conversation_dict_to_text(
                batch_conv[i], add_labels=True, max_turns_to_keep=20
            )
            if dialog_history:
                dialog_history = f"You have already generated the following dialog:\n{dialog_history}"
            prompt = DIALOG_GEN_PROMPT_TEMPLATE.format(
                goal_description=goal_description,
                step_descriptions=desc,
                user_requirement=user_req,
                dialog_history=dialog_history,
                start_time=st,
                end_time=et,
                additional_requirement=additional_requirement,
            )
            # print("dialog generation", prompt)
            batch_inputs.append([("system", DIALOG_GEN_SYS_PROMPT), ("user", prompt)])

        # parallel generate for all user profiles
        outputs = llm.batch_generate(batch_inputs)

        # add the generated dialog to the conversation history
        clip_convs = []
        for output in outputs:
            conv_dict = parse_text_to_conversation_dict(output[0])
            conv_dict = [c for c in conv_dict if c["time"] <= et]
            clip_convs.append(conv_dict)
            # print(inputs[1][1])
            # print(f"Conversation {idx}, before refinement")
            # print(conversation_dict_to_text(conv_dict, add_labels=True))

        clip_convs_refined = refine_and_label_dialog(clip_convs, llm)
        for idx, conv in enumerate(clip_convs_refined):
            batch_conv[idx].extend(conv)

    # refine the dialogs and add assistant intention labels
    # batch_conv = refine_and_label_dialog(batch_conv, llm)
    # for idx, conv in enumerate(batch_conv):
    #     print(f"Conversation {idx}, after refinement")
    #     print(conversation_dict_to_text(conv, add_labels=True))

    conv_with_user_type = [
        {"conversation": c, "user_type": p} for c, p in zip(batch_conv, user_types)
    ]
    return conv_with_user_type


@retry_on_failure()
def refine_and_label_dialog(conversations: list[list[dict]], llm: LLMGenerator) -> dict:
    batch_inputs = []
    for conv in conversations:
        dh = conversation_dict_to_text(conv)
        print("before refine", dh)
        prompt = DIALOG_REFINE_AND_LABEL_PROMPT_TEMPLATE.format(dialog_history=dh)
        inputs = [("system", SUMMARY_SYS_PROMPT), ("user", prompt)]
        batch_inputs.append(inputs)

    # generate the refined dialogs in batch
    batch_outputs = llm.batch_generate(batch_inputs)

    # update the conversations with the refined dialogs
    refined_conversations = []
    for outputs in batch_outputs:
        print("after refine", outputs[0])
        conv = parse_text_to_conversation_dict(outputs[0], parse_labels=True)
        refined_conversations.append(conv)

    return refined_conversations


@retry_on_failure()
def add_progress_summary(conversation: list[dict], llm: LLMGenerator) -> dict:
    batch_inputs = []
    summ_turn_ids = []
    for idx, turn in enumerate(conversation):
        if turn["role"] == "assistant":
            dh = conversation_dict_to_text(conversation[: idx + 1], add_labels=False)
            prompt = PROGRESS_SUMMARY_PROMPT_TEMPLATE.format(dialog_history=dh)
            inputs = [("system", SUMMARY_SYS_PROMPT), ("user", prompt)]
            batch_inputs.append(inputs)
            summ_turn_ids.append(idx)

    # generate the progress summary in batch
    batch_outputs = llm.batch_generate(batch_inputs)

    # update the conversation with the progress summary
    for turn_idx, outputs in zip(summ_turn_ids, batch_outputs):
        time = conversation[turn_idx]["time"]
        elsp = f"The time elapsed since the start of the task is {time:.1f} seconds. "

        progress = None
        for l in outputs[0].split("\n"):
            if "SUMMARY" in l:
                progress = elsp + l.split(":")[1].strip()
        if progress is None:
            raise ValueError(f"Failed to parse: {outputs[0]}")

        conversation[turn_idx]["progress"] = progress

    return conversation


@dataclass
class ParsedVideoAnns:
    dataset: str
    domain: str  # "cooking", "object manipulation", "lab"
    knowledge_type: str  # "cooking recipe", ...
    video_uid: str
    goal_description: str
    all_step_descriptions: str
    clips: list[tuple[float, float, str]]
    duration: float
    ann_ratio: float
    num_steps: int
    video_start_time: float = 0.0
    has_mistake: bool = False
    num_substeps: int | None = None
    fps: float | None = None
    original_ann: dict | None = None

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class GeneratedOutputs:
    video_uid: str
    inferred_goal: str
    inferred_knowledge: str
    video_labels: Counter
    conversations: list[dict[str, str | list[dict]]]
    parsed_video_anns: dict | None = None

    def to_dict(self) -> dict:
        return asdict(self)


def generate_from_annotation(
    annotation: ParsedVideoAnns,
    llm: LLMGenerator,
    user_types: list[str],
    num_repeats: int = 10,
    use_inferred_goal: bool = True,
    keep_original_anns: bool = True,
    min_ann_ratio: float = 0.5,
    filter_by_auto_label: bool = False,
    clips_to_process: int | None = None,
) -> GeneratedOutputs | str:

    video_uid = annotation.video_uid
    print(f"Processing video {video_uid}")

    dataset = annotation.dataset
    knowledge_type = annotation.knowledge_type
    goal_description = annotation.goal_description
    step_descriptions = annotation.all_step_descriptions
    ann_ratio = annotation.ann_ratio
    if ann_ratio < min_ann_ratio:
        skip_msg = f"Skip video {video_uid} with low annotation ratio: {ann_ratio}"
        print(skip_msg)
        return skip_msg

    print(
        (
            f"Goal: {goal_description}| Duration: {annotation.duration:.1f}s| "
            f"Num steps: {annotation.num_steps}| Num substeps: {annotation.num_substeps}| "
            f"Num clips: {len(annotation.clips)} | Ann ratio: {annotation.ann_ratio:.2f}"
        )
    )

    # 1. infer goal and recipe
    print("Infer goal and knowledge")
    inferred_goal, inferred_knowledge = infer_goal_and_knowledge(
        dataset, goal_description, step_descriptions, knowledge_type, llm, num_repeats
    )
    print(f"inferred_knowledge: {inferred_knowledge}")

    if use_inferred_goal:
        goal_description = inferred_goal

    # 2. label video and filter out inappropriate videos
    if filter_by_auto_label:
        video_labels = label_video(
            goal_description,
            step_descriptions,
            inferred_knowledge,
            knowledge_type,
            domain=annotation.domain,
            llm=llm,
            num_repeats=num_repeats,
        )
        print("Label video", video_labels)
        label, cnt = video_labels.most_common(1)[0]
        if label != 1 or cnt < num_repeats // 2:
            skip_msg = f"Skip video {video_uid} with label: {video_labels}"
            print(skip_msg)
            return skip_msg
        video_labels = dict(video_labels)
    else:
        video_labels = {}

    # 3. generate the user-assistant conversations
    print("Generate conversations")
    clips = annotation.clips
    if clips_to_process is not None:
        clips = clips[:clips_to_process]
    add_reqs = ADDITIONAL_REQUIREMENTS.get(annotation.dataset, "")
    if dataset == "assembly101" and not annotation.has_mistake:
        add_reqs = ""
    conversations = generate_conversation(
        goal_description, clips, llm, user_types, add_reqs
    )
    # adjust time
    for conv in conversations:
        conv["conversation"] = adjust_time(conv["conversation"])

    # 4. add progress summary
    # print("Add progress summary")
    # for conv in conversations:
    #     conv["conversation"] = add_progress_summary(conv["conversation"], llm)

    # return the generated outputs
    outputs = GeneratedOutputs(
        video_uid=video_uid,
        inferred_goal=inferred_goal,
        inferred_knowledge=inferred_knowledge,
        video_labels=video_labels,
        conversations=conversations,
        parsed_video_anns=annotation.to_dict() if keep_original_anns else None,
    )
    return outputs


### Ego4D

In [19]:
import os
import json
from mmassist.configs.arguments import DATA_ROOT_DIR

ego4d_ann_dir = f"{DATA_ROOT_DIR}/datasets/ego4d_track2/v2/annotations"
split = "train"
ego4d_goalstep_ann_file = os.path.join(ego4d_ann_dir, f"goalstep_{split}.json")
ego4d_goalstep_ann_file = os.path.join(ego4d_ann_dir, f"goalstep_{split}.json")

with open(ego4d_goalstep_ann_file, "r") as f:
    ego4d_goalstep_ann = json.load(f)


In [None]:
def parse_ego4d_goalstep_ann(
    ann: dict, max_num_lines_per_gen: int = 6, essential_only: bool = True
) -> ParsedVideoAnns:
    steps = ann["segments"]
    s_idx = 0
    num_substeps = 0
    annotated_duration = 0

    all_descriptions = []
    for ori_sidx, step in enumerate(steps):
        start_time = step["start_time"]
        end_time = step["end_time"]
        annotated_duration += end_time - start_time

        relevance = step.get("is_relevant", "unk")
        if essential_only and relevance != "essential":
            continue

        s_idx += 1

        desc = step["step_description"].lower().replace(".", " ").strip()
        substep_descriptions = ""
        if step["segments"]:
            ss_idx = 0
            substeps = []
            last_substep_desc = ""
            for substep in step["segments"]:
                ss_relevance = step.get("is_relevant", "unk")
                if essential_only and ss_relevance != "essential":
                    continue
                ss_idx += 1
                num_substeps += 1
                sstime = f"{substep['start_time']:.1f}s"
                substep_desc = (
                    substep["step_description"].lower().replace(".", " ").strip()
                )
                if substep_desc != last_substep_desc:
                    substeps.append(f" - {sstime}: {substep_desc}")
                    last_substep_desc = substep_desc
            substep_descriptions = "\n".join(substeps)
            stime = f"{step['segments'][0]['start_time']:.1f}s-{end_time:.1f}s"
        else:
            # stime = f"{step['start_time']:.1f}s-{end_time:.1f}s"
            stime = f"{start_time:.1f}s-{end_time:.1f}s"

        # step_description += f"{stime} {desc}\n{substep_descriptions}"
        all_descriptions.append(
            {
                "start": start_time,
                "end": end_time,
                "step": f"{stime} {desc}",
                "substeps": substep_descriptions,
            }
        )

    # get a single string for all the step and substep descriptions
    all_step_descriptions = ""
    for s in all_descriptions:
        all_step_descriptions += s["step"] + "\n"
        if s["substeps"]:
            all_step_descriptions += s["substeps"] + "\n"

    # split the descriptions into clips
    clips = []
    num_lines_in_clip = 0
    clip_start_idx = 0
    clip_start_time = -1
    for idx, step in enumerate(all_descriptions):
        num_lines_in_clip += 1 + step["substeps"].count("\n")

        if clip_start_time < 0:
            clip_start_time = step["start"]

        if (
            num_lines_in_clip > max_num_lines_per_gen
            or idx == len(all_descriptions) - 1
        ):
            # add the clip
            clip_description = ""
            for s_idx, s in enumerate(all_descriptions):
                clip_description += s["step"] + "\n"
                if s_idx > clip_start_idx and s_idx <= idx and s["substeps"]:
                    clip_description += s["substeps"] + "\n"
            clips.append((clip_start_time, step["end"], clip_description))
            clip_start_idx = idx
            clip_start_time = -1
            num_lines_in_clip = 0

    duration = ann["end_time"] - ann["start_time"]
    ann_ratio = annotated_duration / duration

    parsed_ann = ParsedVideoAnns(
        dataset="ego4d-goalstep",
        domain="cooking",
        knowledge_type="cooking recipe",
        video_uid=ann["video_uid"],
        goal_description=ann["goal_description"],
        all_step_descriptions=all_step_descriptions,
        clips=clips,
        duration=duration,
        ann_ratio=ann_ratio,
        num_steps=s_idx,
        num_substeps=num_substeps,
        original_ann=ann,
    )
    return parsed_ann

In [20]:
for idx, v in enumerate(ego4d_goalstep_ann["videos"]):
    if "d2e05761-29c4-4dd5-8ef6-027e40fea282" in v["video_uid"]:
        print(idx)
        break
    idx = -1

for i in [idx]:
    annotation = ego4d_goalstep_ann["videos"][i]
    vuid = annotation["video_uid"]
    print(f"{i} video: {vuid}")

    # narr_ann = ego4d_narr_anns[vuid.replace("grp-", "")]
    # pass1 = narr_ann.get('narration_pass_1', {"narrations": []})['narrations']
    # pass2 = narr_ann.get('narration_pass_2', {"narrations": []})['narrations']
    # narrations = pass1 if len(pass1) > len(pass2) else pass2
    # goalstep_ann["narrations"] = narrations

    # print(i)
    # print(f"Video {vuid}  Task: {goalstep_ann['goal_description']}")
    parsed_ann = parse_ego4d_goalstep_ann(annotation)
    # user_types = ["no_talk"] + ["talk_some"] * 2 + ["talk_more"]
    user_types = ["talk_some", "talk_more"]
    outputs = generate_from_annotation(parsed_ann, llm, user_types=user_types)
    print(outputs.to_dict())

307
307 video: d2e05761-29c4-4dd5-8ef6-027e40fea282
Processing video d2e05761-29c4-4dd5-8ef6-027e40fea282
Goal: Making the dough| Duration: 3342.8s| Num steps: 9| Num substeps: 271| Num clips: 3 | Ann ratio: 1.57
Infer goal and knowledge
inferred_knowledge: Preparing and Cooking Flatbread from Scratch
1. Mix flour with water to create dough, and knead the dough to smooth it out.
2. Flatten the dough into a flat shape.
3. Pan-bake the flattened dough on both sides, flipping it to achieve even cooking.
4. Coat the cooked flatbread with oil.
5. Repeat steps 2-4 to cook multiple flatbreads.
Generate conversations
Generating dialog for 167.1s-711.7s
before refine [167.1s] User: Hi, I want to make flatbread from scratch. Can you guide me through it?
[167.2s] Assistant: Of course! Making flatbread from scratch is a simple and rewarding process. We'll start by mixing the dough with water. Please combine the flour, yeast, salt, and sugar in a large bowl, and then add water to it.
[168.6s] Assis

### HoloAssist

In [4]:
import os
import json
import glob
from mmassist.configs.arguments import DATA_ROOT_DIR

data_dir = f"{DATA_ROOT_DIR}/datasets/holoassist"
videos_dir = os.path.join(data_dir, "video_pitch_shifted")
ann_file = os.path.join(data_dir, "data-annotation-trainval-v1_1.json")

split = "train"
split_vid_file = os.path.join(data_dir, f"{split}-v1_2.txt")

with open(ann_file, "r") as f:
    all_annotations: list[dict] = json.load(f)

all_video_files = glob.glob(os.path.join(videos_dir, "*/*/*.mp4"))
split_vids = set([l.strip() for l in open(split_vid_file, "r")])
video_files_in_split = [v for v in all_video_files if v.split("/")[-3] in split_vids]

anns_in_split = [a for a in all_annotations if a["video_name"] in split_vids]
print(f"Total videos in split: {len(video_files_in_split)} / {len(all_video_files)}")

Total videos in split: 1466 / 2111


In [None]:
def cleanup_txt(txt: str) -> str:
    return (
        txt.strip()
        .replace("_", " ")
        .replace("student", "user")
        .replace("*unintelligible*", "")
    )


def parse_holoassist_ann(
    ann: dict, max_num_lines_per_gen: int = 6, essential_only: bool = True
) -> ParsedVideoAnns:
    video_uid = ann["video_name"]
    task_goal = ann["taskType"]
    duration = ann["videoMetadata"]["duration"]["seconds"]
    fps = ann["videoMetadata"]["video"]["fps"]
    # print(video_uid, task_goal, duration, fps)

    events = ann["events"]
    for e in events:
        if "Conversation Purpose" in e["attributes"]:
            e["start"] = (e["start"] + e["end"]) / 2
    events.sort(key=lambda e: e["start"])

    annotated_duration = 0
    num_steps = 0
    num_substeps = 0
    task_summary = ""
    all_descriptions = []
    substeps = []
    substep_actions = set()

    for e in events:
        start_time = e["start"]
        end_time = e["end"]
        attributes = e["attributes"]

        time_span = f"[{start_time:.1f}s-{end_time:.1f}s]"

        if "Long form description" in attributes:
            task_summary = cleanup_txt(attributes["Long form description"])

        if "Action sentence" in attributes:
            desc = cleanup_txt(attributes["Action sentence"])
            annotated_duration += end_time - start_time
            num_steps += 1
            if all_descriptions:
                all_descriptions[-1]["substeps"] = "\n".join(substeps)
            all_descriptions.append(
                {
                    "start": start_time,
                    "end": end_time,
                    "step": f"{time_span} {desc}",
                    "substeps": "",
                }
            )
            substeps = []
            substep_actions = set()

        elif "Conversation Purpose" in attributes:
            intent = attributes["Conversation Purpose"]
            role = "user" if intent.split("-")[0] == "student" else "assistant"
            utterance = cleanup_txt(attributes["Transcription"])
            if not utterance:
                continue
            num_substeps += 1
            substeps.append(f' - [{start_time:.1f}s] {role}: "{utterance}"')

        elif "Action Correctness" in attributes:
            is_error = attributes["Action Correctness"] != "Correct Action"
            error_reason = cleanup_txt(
                attributes.get("Incorrect Action Explanation", "none")
            )
            verb = cleanup_txt(attributes["Verb"])
            noun = cleanup_txt(attributes["Noun"])
            adj = (
                f"{attributes['Adjective']} "
                if attributes.get("Adjective") not in ["none", "wrong", ""]
                else ""
            )
            action = f"{verb} {adj}{noun}"
            err_msg = f" (ERROR: {error_reason})" if is_error else ""

            if verb in ["hold", "touch", "rotate", "inspect"]:
                continue

            if action not in substep_actions:
                num_substeps += 1
                substeps.append(
                    f" - [{start_time:.1f}s-{end_time:.1f}s] {action}{err_msg}"
                )
                substep_actions.add(action)

    # get a single string for all the step and substep descriptions
    all_step_descriptions = ""
    for s in all_descriptions:
        all_step_descriptions += s["step"] + "\n"
        if s["substeps"]:
            all_step_descriptions += s["substeps"] + "\n"

    all_step_descriptions = f"{all_step_descriptions}Summary: {task_summary}\n"

    # split the descriptions into clips
    clips = []
    num_lines_in_clip = 0
    clip_st = -1
    for idx, step in enumerate(all_descriptions):
        num_lines_in_clip += 1 + step["substeps"].count("\n")

        if clip_st < 0:
            clip_st = step["start"]

        if (
            num_lines_in_clip > max_num_lines_per_gen
            or idx == len(all_descriptions) - 1
        ):
            # add the clip
            clip_description = ""
            clip_et = step["end"]
            for s in all_descriptions:
                clip_description += s["step"] + "\n"
                if s["start"] >= clip_st and s["end"] <= clip_et and s["substeps"]:
                    clip_description += s["substeps"] + "\n"
            clip_description += f"Summary: {task_summary}\n"
            clips.append((clip_st, clip_et, clip_description))
            clip_st = -1
            num_lines_in_clip = 0

    ann_ratio = annotated_duration / duration

    parsed_ann = ParsedVideoAnns(
        dataset="holoassist",
        domain="object manipulation",
        knowledge_type="operation manual",
        video_uid=video_uid,
        goal_description=task_goal,
        all_step_descriptions=all_step_descriptions,
        clips=clips,
        duration=duration,
        ann_ratio=ann_ratio,
        num_steps=num_steps,
        num_substeps=num_substeps,
        fps=fps,
        original_ann=ann,
    )
    return parsed_ann

annotation = all_annotations[0]
parsed_ann = parse_holoassist_ann(annotation, max_num_lines_per_gen=10)
print(parsed_ann.all_step_descriptions)
for c in parsed_ann.clips:
    print(c[0], c[1])
    # print(c[2])

[11.0s-28.2s] The user grabs the GoPro.
 - [11.0s-12.2s] approach gopro
 - [13.7s] assistant: "Okay."
 - [15.1s] assistant: "You can pull the GoPro."
 - [16.9s] user: "GoPro."
 - [17.0s-22.2s] grab gopro
 - [22.2s-23.5s] flip bag
[29.9s-66.2s] The user changes the battery for the GoPro.
 - [30.2s] assistant: "Change the Battery."
 - [34.2s-41.0s] pull battery door
 - [41.0s-42.3s] open battery door
 - [43.5s-45.0s] grab battery
 - [45.1s-46.1s] withdraw battery
 - [45.6s] assistant: "Take out the battery."
 - [47.5s-49.6s] place battery
 - [48.8s] assistant: "Now, put it down"
 - [49.7s-50.9s] lift battery
 - [50.9s-51.7s] insert battery
 - [58.8s] assistant: "Close it."
 - [59.8s-61.0s] close battery door
 - [62.2s-63.5s] push battery door
 - [67.0s] assistant: "Now change the micro SD."
[68.4s-304.3s] The user opens the GoPro.
 - [68.6s-70.8s] grab battery door
 - [70.8s-72.7s] open battery door
 - [72.7s-73.9s] press battery (ERROR: The user presses the wrong place.)
 - [73.9s] assi

In [6]:
for i in range(len(all_annotations)):
    annotation = all_annotations[i]
    if annotation["video_name"]  != "R208-11Nov-ATV":
        continue
    parsed_ann = parse_holoassist_ann(annotation, max_num_lines_per_gen=10)
    print(f"{i} video: {parsed_ann.video_uid}")

    
    # user_types = ["no_talk"] + ["talk_some"] * 2 + ["talk_more"]
    user_types = ["talk_some"]
    outputs = generate_from_annotation(parsed_ann, llm, user_types=user_types)
    print(outputs.to_dict())

291 video: R208-11Nov-ATV
Processing video R208-11Nov-ATV
Goal: fix motorcycle| Duration: 239.0s| Num steps: 9| Num substeps: 88| Num clips: 6 | Ann ratio: 0.83
Infer goal and knowledge
inferred_knowledge: Mounting and Testing a Motorcycle Shift Arm
1. Put the shift lever in a neutral position.
2. Attach the shift arm to the mounting peg by sliding the long metal part onto the peg and aligning it.
3. Secure the shift arm to the mounting peg using a bolt and washer, tightening the bolt firmly.
4. Cycle through all gear positions to test the shift arm.
5. Use the remote to adjust the table height to a comfortable working position if necessary.
6. Remove the bolt and washer to detach the shift arm from the mounting peg.
7. Return the bolt and washer to their original storage bin.
8. Detach the shift arm from the mounting peg by adjusting the shift lever.
Generate conversations
Generating dialog for 6.3s-60.2s
before refine [6.3s] User: I want to mount and test a motorcycle shift arm.
[6.7

In [142]:
conversation = outputs.conversations[0]["conversation"]
text = ""
for idx, c in enumerate(conversation):
    text += f"[{c['time']}s] {c['role'].capitalize()}: {c['content']}"
    if c.get("labels"):
        text += f"   [{c['labels']}]"
    # if c.get("labels"):
        # text += f"\n       -> {c['labels']}"text += f"\n       -> {c['labels']}"
    # if "progress" in c:
    #     text += f"\n - Progress: {c['progress']}"
    text += "\n"

print(text)

[11.0s] User: I want to set up and prepare my GoPro for use.
[11.0s] Assistant: Great! Let's get started. Please approach the GoPro packaging.   [initiative|instruction]
[14.0s] Assistant: Now you can pull the GoPro out of the packaging.   [initiative|instruction]
[17.2s] User: GoPro.
[22.2s] Assistant: Flip the packaging bag to access the contents.   [initiative|instruction]
[29.9s] Assistant: Next, we need to change the battery for the GoPro. Can you locate the battery door?   [initiative|instruction,info_sharing]
[31.0s] Assistant: Great! Now, pull the battery door open.   [initiative|instruction]
[34.2s] User: How do I open it?
[34.3s] Assistant: Just pull it downwards, and it should open easily.   [responsive|instruction]
[41.0s] Assistant: Now, open the battery door completely.   [initiative|instruction]
[43.5s] Assistant: Next, grab the battery and withdraw it from the GoPro.   [initiative|instruction]
[47.5s] User: Where should I put it?
[47.6s] Assistant: Just place it on the 

### EgoExoLearn

In [36]:
import os
import json
import glob
import pandas as pd
from mmassist.configs.arguments import DATA_ROOT_DIR

data_dir = f"{DATA_ROOT_DIR}/datasets/EgoExoLearn"
videos_dir = os.path.join(data_dir, "videos")
ann_dir = os.path.join(data_dir, "annotations")

ann_file = os.path.join(ann_dir, "fine_annotation_trainval_en.csv")
all_step_annotations = pd.read_csv(ann_file).to_dict(orient="records")

video_annotations = {}
for ann in all_step_annotations:
    if ann["view"] != "ego":
        continue
    if ann["subset"] not in ["train", "val"]:
        print(ann)
    vid = ann["video_uid"]
    if vid not in video_annotations:
        video_annotations[vid] = {
            "video_uid": vid,
            "split": ann["subset"],
            "scene": ann["scene"],
            "steps": [],
        }
    video_ann = video_annotations[vid]
    video_ann["steps"]. append(
        {
            "start": ann["start_sec"],
            "end": ann["end_sec"],
            "narration": ann["narration_en_no_hand_prompt"],
        }
    )

for vid, ann in video_annotations.items():
    ann["steps"].sort(key=lambda s: s["start"])

print(len(video_annotations))

421


In [None]:
def parse_egoexolearn_ann(ann: dict, max_num_lines_per_gen: int = 6) -> ParsedVideoAnns:
    video_uid = ann["video_uid"]
    duration = ann["steps"][-1]["end"] - ann["steps"][0]["start"]

    annotated_duration = 0
    num_steps = 0

    for s in ann["steps"]:
        start_time = s["start"]
        end_time = s["end"]
        annotated_duration += end_time - start_time
        num_steps += 1
        time_span = f"[{start_time:.1f}s-{end_time:.1f}s]"
        s["desc"] = f"{time_span} {s['narration']}"
    ann_ratio = annotated_duration / duration

    # get a single string for all the step and substep descriptions
    all_step_descriptions = "\n".join([s["desc"] for s in ann["steps"]])

    # split the descriptions into clips
    clips = []
    clip_st = -1
    for idx, step in enumerate(ann["steps"]):
        if clip_st < 0:
            clip_st = step["start"]
            clip_start_idx = idx
        
        if (idx + 1) % max_num_lines_per_gen == 0:
            clip_steps = ann["steps"][max(clip_start_idx-5 , 0):idx+5]
            clip_description = "\n".join([s["desc"] for s in clip_steps])
            clips.append((clip_st, step["end"], clip_description))
            clip_st = -1

    domain = "cooking" if ann['scene'] == "kitchen" else "lab task"
    ktype = "cooking recipe" if domain == "cooking" else "lab task steps"
    parsed_ann = ParsedVideoAnns(
        dataset="egoexolearn",
        domain=domain,
        knowledge_type=ktype,
        video_uid=video_uid,
        goal_description="",
        all_step_descriptions=all_step_descriptions,
        clips=clips,
        duration=duration,
        ann_ratio=ann_ratio,
        num_steps=num_steps,
        original_ann=ann,
    )
    return parsed_ann



from collections import Counter
domains = Counter()
for i in range(len(video_annotations)):
    annotation = list(video_annotations.values())[i]
    parsed_ann = parse_egoexolearn_ann(annotation, max_num_lines_per_gen=12)
    domains[parsed_ann.domain] += 1
    if parsed_ann.domain == "lab task":
        print(parsed_ann.video_uid, i)
        break
        # print(parsed_ann.all_step_descriptions)
        # break
    # if parsed_ann.ann_ratio < 0.35:
    #     cnt += 1
    #     print(i, parsed_ann.ann_ratio)
    # if i == 52  :
    #     print(parsed_ann.video_uid)
    #     print(parsed_ann.all_step_descriptions)
    # print(parsed_ann.all_step_descriptions)
    # for c in parsed_ann.clips:
    #     print(c[0], c[1])
    #     print(c[2])
print(domains) 

beeae8f8-ac78-11ee-819f-80615f12b59e 314
Counter({'cooking': 314, 'lab task': 1})


In [5]:
annotation = list(video_annotations.values())[314]
parsed_ann = parse_egoexolearn_ann(annotation, max_num_lines_per_gen=10)
print(f"video: {parsed_ann.video_uid}")


# user_types = ["no_talk"] + ["talk_some"] * 2 + ["talk_more"]
user_types = ["talk_some"]
print(parsed_ann.all_step_descriptions)
outputs = generate_from_annotation(parsed_ann, llm, user_types=user_types)
print(outputs.to_dict())

video: beeae8f8-ac78-11ee-819f-80615f12b59e
[3.3s-4.2s] Press the water pump switch.
[5.9s-8.3s] Remove the plunger and needle tip from the syringe.
[8.4s-9.3s] Insert the syringe into the filter bottle.
[11.8s-12.7s] Take the bottle.
[12.7s-16.2s] Use a bottle washer to flush the plunger of the syringe.
[16.2s-17.2s] Put the bottle down.
[20.9s-22.9s] Rinse the bottle against the empty cylinder of the syringe.
[23.1s-23.8s] Pick up the empty syringe barrel.
[23.9s-25.0s] Use your left hand to rinse the bottle against the empty tube.
[25.1s-34.5s] Rinse the bottle and transfer the solution into the empty cylinder. Then, insert the empty cylinder into the filtering bottle and pour in the solution.
[34.7s-35.8s] Put the bottle down.
[36.3s-37.0s] Pick up another bottle.
[49.7s-50.7s] Put the bottle down.
[52.2s-52.9s] Pick up the detergent bottle.
[53.0s-64.5s] Use the wash bottle to repeatedly inject the solution into the empty cylinder, and inject the solution from the cylinder into th

In [7]:
conversation = outputs.conversations[0]["conversation"]
text = ""
for idx, c in enumerate(conversation):
    text += f"[{c['time']}s] {c['role'].capitalize()}: {c['content']}"
    if c.get("labels"):
        text += f"   [{c['labels']}]"
    # if c.get("labels"):
        # text += f"\n       -> {c['labels']}"text += f"\n       -> {c['labels']}"
    # if "progress" in c:
    #     text += f"\n - Progress: {c['progress']}"
    text += "\n"

print(text)

[0.0s] User: Hi, I'm about to start Solid Phase Peptide Synthesis. Can you guide me through it?
[0.0s] Assistant: Absolutely! I'll walk you through each step. First, let's get started with the initial setup. Press the water pump switch to begin.   [initiative|instruction]
[3.3s] Assistant: Now, remove the plunger and needle tip from the syringe, and then insert the syringe into the filter bottle.   [initiative|instruction]
[12.7s] Assistant: Next, use a bottle washer to flush the plunger of the syringe. This will clean it properly.   [initiative|instruction,info_sharing]
[16.2s] User: Just to confirm, I'm using the bottle washer to clean the plunger?
[16.2s] Assistant: That's correct! You're using it to flush the plunger. Now, put the bottle down for a moment.   [responsive|instruction,feedback]
[20.9s] Assistant: Now, rinse the bottle against the empty cylinder of the syringe, and then pick up the empty syringe barrel. We'll rinse the bottle again.   [initiative|instruction]
[25.1s] A

### Epic-Kitchen

In [42]:
import os
import json
import pandas as pd
from mmassist.configs.arguments import DATA_ROOT_DIR

data_dir = f"{DATA_ROOT_DIR}/datasets/epic-kitchens"
ann_dir = os.path.join(data_dir, "epic-kitchens-100-annotations")

split = "train"
ann_file = os.path.join(ann_dir, f"EPIC_100_{split}.csv")
action_annotations = pd.read_csv(ann_file).to_dict(orient="records")

video_info_file = os.path.join(ann_dir, "EPIC_100_video_info.csv")
video_info = pd.read_csv(video_info_file).to_dict(orient="records")

video_id_to_info = {v["video_id"]: v for v in video_info}

def timestamp_to_seconds(ts: str) -> float:
    h, m, s = ts.split(":")
    return int(h) * 3600 + int(m) * 60 + float(s)

video_annotations = {}
for ann in action_annotations:
    vid = ann["video_id"]
    vinfo = video_id_to_info[vid]
    if vid not in video_annotations:
        video_annotations[vid] = {
            "video_uid": vid,
            "duration": vinfo["duration"],
            "fps": vinfo["fps"],
            "steps": [],
        }
    video_ann = video_annotations[vid]
    video_ann["steps"]. append(
        {
            "start": timestamp_to_seconds(ann["start_timestamp"]),
            "end": timestamp_to_seconds(ann["stop_timestamp"]),
            "narration": ann["narration"],
        }
    )

for vid, ann in video_annotations.items():
    ann["steps"].sort(key=lambda s: s["start"])

print(len(video_annotations))

495


In [41]:
print(len(video_annotations))

138


In [None]:
def parse_epickitchens_ann(
    ann: dict, max_num_lines_per_gen: int = 10
) -> ParsedVideoAnns:
    video_uid = ann["video_uid"]
    duration = ann["duration"]

    annotated_duration = 0
    num_steps = 0

    for s in ann["steps"]:
        start_time = s["start"]
        end_time = s["end"]
        annotated_duration += end_time - start_time
        num_steps += 1
        time_span = f"[{start_time:.1f}s-{end_time:.1f}s]"
        s["desc"] = f"{time_span} {s['narration']}"
    ann_ratio = annotated_duration / duration

    # get a single string for all the step and substep descriptions
    all_step_descriptions = "\n".join([s["desc"] for s in ann["steps"]])

    # split the descriptions into clips
    clips = []
    clip_st = -1
    additional_ctx_len = 5
    for idx, step in enumerate(ann["steps"]):
        if clip_st < 0:
            clip_st = step["start"]
            clip_start_idx = idx

        if (idx + 1) % max_num_lines_per_gen == 0:
            start_idx = max(clip_start_idx - additional_ctx_len, 0)
            clip_steps = ann["steps"][start_idx : idx + additional_ctx_len]
            clip_description = "\n".join([s["desc"] for s in clip_steps])
            clips.append((clip_st, step["end"], clip_description))
            clip_st = -1

    parsed_ann = ParsedVideoAnns(
        dataset="epickitchens",
        domain="cooking",
        knowledge_type="cooking recipe",
        video_uid=video_uid,
        goal_description="",
        all_step_descriptions=all_step_descriptions,
        clips=clips,
        duration=duration,
        ann_ratio=ann_ratio,
        num_steps=num_steps,
        original_ann=ann,
    )
    return parsed_ann


for i in range(len(video_annotations)):
    annotation = list(video_annotations.values())[i]
    parsed_ann = parse_epickitchens_ann(annotation, max_num_lines_per_gen=10)
    break

print(parsed_ann.all_step_descriptions)
print(parsed_ann.ann_ratio)

[0.1s-3.4s] open door
[4.4s-6.2s] turn on light
[7.0s-9.5s] close door
[12.8s-14.0s] open fridge
[15.2s-16.4s] take celery
[16.5s-18.1s] take container
[18.2s-21.1s] take tofu
[21.9s-23.3s] close fridge
[23.2s-24.3s] open fridge
[24.5s-27.9s] take carrots and
[25.0s-26.2s] open drawer
[29.2s-31.3s] close fridge
[36.7s-37.8s] put down vegetables
[41.8s-42.5s] open cupboard
[43.2s-46.6s] take cutting board
[46.5s-47.8s] put down cutting board
[47.8s-48.8s] close cupboard
[51.7s-53.0s] open drawer
[53.5s-56.0s] take knife
[53.5s-54.4s] take knife
[54.7s-55.7s] put down knife
[56.1s-56.6s] close drawer
[57.5s-58.7s] put down knife
[59.0s-62.3s] open tap
[61.9s-65.2s] wash courgette
[65.2s-73.1s] still washing courgette
[74.3s-78.9s] wash carrot
[79.0s-82.8s] still washing carrot
[82.2s-83.2s] close tap
[85.5s-86.5s] put down vegetables
[87.4s-88.3s] open cupboard
[92.5s-94.1s] take grater
[95.3s-99.6s] take pan
[100.7s-102.3s] put down pan
[101.7s-102.4s] close cupboard
[103.0s-103.8s] clo

In [20]:
annotation = list(video_annotations.values())[0]
parsed_ann = parse_epickitchens_ann(annotation, max_num_lines_per_gen=10)
print(f"video: {parsed_ann.video_uid}")


# user_types = ["no_talk"] + ["talk_some"] * 2 + ["talk_more"]
user_types = ["talk_some"]
print(parsed_ann.all_step_descriptions)
outputs = generate_from_annotation(parsed_ann, llm, user_types=user_types, clips_to_process=5)
print(outputs.to_dict())



video: P01_01
[0.1s-3.4s] open door
[4.4s-6.2s] turn on light
[7.0s-9.5s] close door
[12.8s-14.0s] open fridge
[15.2s-16.4s] take celery
[16.5s-18.1s] take container
[18.2s-21.1s] take tofu
[21.9s-23.3s] close fridge
[23.2s-24.3s] open fridge
[24.5s-27.9s] take carrots and
[25.0s-26.2s] open drawer
[29.2s-31.3s] close fridge
[36.7s-37.8s] put down vegetables
[41.8s-42.5s] open cupboard
[43.2s-46.6s] take cutting board
[46.5s-47.8s] put down cutting board
[47.8s-48.8s] close cupboard
[51.7s-53.0s] open drawer
[53.5s-56.0s] take knife
[53.5s-54.4s] take knife
[54.7s-55.7s] put down knife
[56.1s-56.6s] close drawer
[57.5s-58.7s] put down knife
[59.0s-62.3s] open tap
[61.9s-65.2s] wash courgette
[65.2s-73.1s] still washing courgette
[74.3s-78.9s] wash carrot
[79.0s-82.8s] still washing carrot
[82.2s-83.2s] close tap
[85.5s-86.5s] put down vegetables
[87.4s-88.3s] open cupboard
[92.5s-94.1s] take grater
[95.3s-99.6s] take pan
[100.7s-102.3s] put down pan
[101.7s-102.4s] close cupboard
[103.

In [21]:
conversation = outputs.conversations[0]["conversation"]
text = ""
for idx, c in enumerate(conversation):
    text += f"[{c['time']}s] {c['role'].capitalize()}: {c['content']}"
    if c.get("labels"):
        text += f"   [{c['labels']}]"
    text += "\n"
print(text)

[0.1s] User: I want to make a vegetable and tofu stir-fry with rice.
[0.1s] Assistant: Great choice! Let's get started. Open the kitchen door and turn on the light, please.   [initiative|instruction]
[4.4s] Assistant: Now, close the door behind you to keep the kitchen cozy.   [initiative|instruction]
[12.8s] Assistant: Next, open the fridge to take out the ingredients. We'll need a few things for this recipe.   [initiative|instruction,info_sharing]
[15.2s] User: What vegetables should I take out?
[15.2s] Assistant: For this recipe, we'll need celery and carrots. Take out the celery first, please.   [responsive|instruction]
[16.5s] Assistant: Now, grab the container with the tofu. It should be nearby.   [initiative|instruction]
[18.2s] User: Got it. What's next?
[18.2s] Assistant: Now, take out the carrots. If you need to, open the fridge again to get them.   [responsive|instruction]
[24.5s] User: Okay, I've got the carrots. What about the rice?
[24.5s] Assistant: We'll get to the rice 

### WTaG

In [13]:
import os
import json
import glob
import pandas as pd
from mmassist.configs.arguments import DATA_ROOT_DIR

data_dir = f"{DATA_ROOT_DIR}/datasets/WTaG"
video_ids = sorted([f for f in os.listdir(data_dir)])

sampling_rate = 10_000_000

all_annotations = []
for vid in video_ids:
    video_timing_file = os.path.join(data_dir, vid, "Video/VideoMpegTiming.txt")
    with open(video_timing_file, "r") as f:
        lines = f.readlines()
        start_ts = float(lines[0].strip())
        end_ts = float(lines[1].strip())
        duration = (end_ts - start_ts) / sampling_rate

    # add step annotations
    events = []
    for ann_file, role in [
        ("StepDetection/StepDetection.txt", ""),
        ("TextASR/InstructorAnnotations_intent.txt", "assistant"),
        ("TextASR/UserAnnotations_intent.txt", "user"),
    ]:
        anns = []
        with open(os.path.join(data_dir, vid, ann_file), "r") as f:
            for line in f:
                anns.append(line.strip().split("\t"))
        for ann in anns:
            st = (int(ann[0]) - start_ts) / sampling_rate
            et = (int(ann[1]) - start_ts) / sampling_rate
            if "TextASR" in ann_file:
                st = (st + et) / 2  # use the middle time for dialog annotations
            content = ann[2] if not role else f'{role}: "{ann[2]}"'
            events.append({"start": st, "end": et, "narration": content})
    events.sort(key=lambda s: s["start"])

    end_of_start_step = 0
    for e in events:
        if e["narration"].lower() == "start":
            end_of_start_step = e["end"]
            break

    steps = []
    ann_duration = 0
    for e in events:
        if e["start"] < end_of_start_step:
            continue
        if ":" not in e["narration"]:
            e["step"] = f"[{e['start']:.1f}s-{e['end']:.1f}s] {e['narration']}"
            e["substeps"] = []
            ann_duration += e["end"] - e["start"]
            steps.append(e)
        elif steps:
            last_step = steps[-1]
            ss_desc = f"- [{e['start']:.1f}s] {e['narration']}"
            last_step["substeps"].append(ss_desc)
            if e["start"] > last_step["end"]:
                last_step["end"] = e["start"]
                last_step["step"] = (
                    f"[{last_step['start']:.1f}s-{last_step['end']:.1f}s] {last_step['narration']}"
                )
    dummy_start_time = end_of_start_step / 2
    steps.insert(
        0,
        {
            "start": dummy_start_time,
            "end": end_of_start_step,
            "step": f"- [{dummy_start_time:.1f}s] start",
            "substeps": [],
        },
    )
    ann_ratio = ann_duration / duration
    video_ann = {
        "video_uid": vid,
        "duration": duration,
        "ann_ratio": ann_ratio,
        "steps": steps,
    }
    all_annotations.append(video_ann)

In [None]:
def parse_wtag_ann(
    ann: dict, max_num_lines_per_gen: int = 6
) -> ParsedVideoAnns:
    video_uid = ann["video_uid"]
    duration = ann["duration"]
    ann_ratio = ann["ann_ratio"]
    num_steps = len(ann["steps"])


    # get a single string for all the step and substep descriptions
    all_step_descriptions = ""
    for s in ann["steps"]:
        all_step_descriptions += s["step"] + "\n"
        if s.get("substeps"):
            all_step_descriptions += "\n".join(s["substeps"]) + "\n"

    # split the descriptions into clips
    clips = []
    num_lines_in_clip = 0
    clip_st = -1
    for idx, step in enumerate(ann["steps"]):
        num_lines_in_clip += 1 + step["substeps"].count("\n")

        if clip_st < 0:
            clip_st = step["start"]

        if (
            num_lines_in_clip > max_num_lines_per_gen
            or idx == len(ann["steps"]) - 1
        ):
            # add the clip
            clip_description = ""
            clip_et = step["end"]
            for s in ann["steps"]:
                clip_description += s["step"] + "\n"
                if s["start"] >= clip_st and s["end"] <= clip_et and s["substeps"]:
                    clip_description += "\n".join(s["substeps"]) + "\n"
            clips.append((clip_st, clip_et, clip_description))
            clip_st = -1
            num_lines_in_clip = 0

    parsed_ann = ParsedVideoAnns(
        dataset="wtag",
        domain="cooking",
        knowledge_type="cooking recipe",
        video_uid=video_uid,
        goal_description="",
        all_step_descriptions=all_step_descriptions,
        clips=clips,
        duration=duration,
        ann_ratio=ann_ratio,
        num_steps=num_steps,
        original_ann=ann,
    )
    return parsed_ann


for i in [0]:
    annotation = all_annotations[i]
    parsed_ann = parse_wtag_ann(annotation, max_num_lines_per_gen=6)
    break

print(parsed_ann.all_step_descriptions)
print(parsed_ann.ann_ratio)

NameError: name 'ParsedVideoAnns' is not defined

In [14]:
for i in range(len(all_annotations)):
    annotation = all_annotations[i]
    parsed_ann = parse_wtag_ann(annotation)
    print(f"{i} video: {parsed_ann.video_uid}")
    user_types = ["talk_some"]
    outputs = generate_from_annotation(parsed_ann, llm, user_types=user_types)
    print(outputs.to_dict())
    break

0 video: T1
Processing video T1
Goal: | Duration: 321.5s| Num steps: 14| Num substeps: None| Num clips: 2 | Ann ratio: 0.73
Infer goal and knowledge
inferred_knowledge: Pinwheels

Ingredients
1 8-inch flour tortilla
Jar of nut butter or allergy-friendly alternative (such as sunbutter, soy butter, or seed butter) Jar of jelly, jam, or fruit preserves

Tools and Utensils
cutting board
butter knife
paper towel
toothpicks
~12-inch strand of dental floss plate

Steps
1. Place tortilla on cutting board.
2. Use a butter knife to scoop nut butter from the jar. Spread nut butter onto tortilla, leaving 1/2-
inch uncovered at the edges.
3. Clean the knife by wiping with a paper towel.
4. Use the knife to scoop jelly from the jar. Spread jelly over the nut butter.
5. Clean the knife by wiping with a paper towel.
6. Roll the tortilla from one end to the other into a log shape, about 1.5 inches thick. Roll it tight
enough to prevent gaps, but not so tight that the filling leaks.
7. Secure the rolled

In [15]:
conversation = outputs.conversations[0]["conversation"]
text = ""
for idx, c in enumerate(conversation):
    text += f"[{c['time']}s] {c['role'].capitalize()}: {c['content']}"
    if c.get("labels"):
        text += f"   [{c['labels']}]"
    text += "\n"
print(text)

[16.3s] Assistant: Great! To make pinwheels, we'll be using tortillas, nut butter, jelly, and some other ingredients. Let's start by placing a tortilla on the cutting board.   [initiative|instruction]
[35.2s] Assistant: Yes, that's perfect. Make sure it's flat and even.   [responsive|feedback]
[45.5s] Assistant: Now, let's spread some nut butter on the tortilla. Remember to use a butter knife for this step.   [initiative|instruction,info_sharing]
[53.1s] Assistant: Also, could you look down more to make sure you're spreading it evenly?   [initiative|instruction]
[57.2s] Assistant: Just a thin layer is fine. You can always add more if needed.   [responsive|info_sharing]
[60.1s] Assistant: By the way, you should be using a butter knife to scoop the peanut butter instead of the spoon.   [initiative|correction]
[97.7s] Assistant: Great job on spreading the nut butter! Now, let's clean the knife.   [initiative|instruction,feedback]
[102.2s] Assistant: Next, we'll spread some jelly on the to

### Assembly101

In [352]:
import os
import json
import copy
import glob
import pandas as pd
from mmassist.configs.arguments import DATA_ROOT_DIR

data_dir = f"{DATA_ROOT_DIR}/datasets/assembly101"
ann_dir = os.path.join(data_dir, "annotations")
fine_anns_dir = os.path.join(ann_dir, "fine-grained-annotations")
coarse_anns_dir = os.path.join(ann_dir, "coarse-annotations/coarse_labels")
mistake_anns_dir = os.path.join(ann_dir, "assembly101-mistake-detection/annots")

fps = 30
selected_cam_ids = [
    "HMC_21110305_mono10bit",
    "HMC_21179183_mono10bit",
    "HMC_84355350_mono10bit",
    "HMC_84358933_mono10bit",
]
concrete_actions = ["screw", "unscrew"] #, "remove", "position"]

seq_id_to_fine_anns = {}
for split in ["train", "val"]:
    fine_ann_file = os.path.join(
        fine_anns_dir, f"{split.replace('val', 'validation')}.csv"
    )
    fine_annotations = pd.read_csv(fine_ann_file).to_dict(orient="records")
    for ann in fine_annotations:
        video_file = ann["video"]
        if not any(v in video_file for v in ["HMC_21110305", "HMC_84355350"]):
            continue
        seq_id = video_file.split("/")[0]
        if seq_id not in seq_id_to_fine_anns:
            seq_id_to_fine_anns[seq_id] = {
                "seq_id": seq_id,
                "split": split,
                "substeps": [],
            }
        seq_ann = seq_id_to_fine_anns[seq_id]
        st = ann["start_frame"] / fps
        et = ann["end_frame"] / fps
        seq_ann["substeps"].append(
            {"start": st, "end": et, "narration": ann["action_cls"]}
        )
        seq_ann["toy_name"] = ann["toy_name"] if "-" not in ann["toy_name"] else ""


coarse_ann_files = glob.glob(os.path.join(coarse_anns_dir, "*.txt"))

all_seq_ids = set()
for coarse_ann_file in coarse_ann_files:
    seq_id = coarse_ann_file.split("/")[-1].split("_", 1)[1].split(".")[0]
    all_seq_ids.add(seq_id)


task_id_to_ann = {}
for seq_id in all_seq_ids:
    for task_type in ["assembly", "disassembly"]:
        task_id = f"{task_type}_{seq_id}"
        coarse_ann_file = os.path.join(coarse_anns_dir, f"{task_id}.txt")
        if not os.path.exists(coarse_ann_file):
            # print(f"File not found: {coarse_ann_file}")
            continue

        # has coarse annotations
        coarse_ann = pd.read_csv(coarse_ann_file, sep="\t", header=None)
        coarse_ann = coarse_ann.to_dict(orient="records")

        # has fine annotations
        if seq_id not in seq_id_to_fine_anns:
            # print(f"No fine annotations for {seq_id}")
            continue
        fine_anns = seq_id_to_fine_anns[seq_id]

        # has mistake annotations
        mistake_ann_file = os.path.join(mistake_anns_dir, f"{seq_id}.csv")
        if not os.path.exists(mistake_ann_file):
            # print(f"No mistake annotations for {seq_id}")
            continue
        mistake_anns = pd.read_csv(mistake_ann_file, header=None)
        mistake_anns = mistake_anns.to_dict(orient="records")

        fine_anns = seq_id_to_fine_anns[seq_id]
        toy_name = "toy " + fine_anns["toy_name"] if fine_anns["toy_name"] else "toy"
        task_name = f"{task_type.replace('ly', 'le')} {toy_name}"

        if task_id not in task_id_to_ann:
            task_id_to_ann[task_id] = copy.deepcopy(fine_anns)
            task_id_to_ann[task_id].update(
                {"task_id": task_id, "task": task_name, "steps": []}
            )

        has_mistake = False
        ann = task_id_to_ann[task_id]
        for c in coarse_ann:
            narration = c[2]
            mistake_msg = ""
            for m in mistake_anns:
                if m[0] == c[0]:
                    prop = "to" if m[2] == "attach" else "from"
                    narration = f"{m[2]} {m[3]} {prop} {m[4]}"
                    if m[5] == "mistake":
                        mistake_msg = f"mistake: {m[6]}"
                        has_mistake = True
                    elif m[5] == "correction":
                        if m[2] == "attach":
                            action_wrong = narration.replace("attach", "detach")
                            action_wrong = action_wrong.replace("to", "from")
                            mistake_msg = f'correction of "{action_wrong}"'
                        else:
                            action_wrong = narration.replace("detach", "attach")
                            action_wrong = action_wrong.replace("from", "to")
                            mistake_msg = f'correction of "{action_wrong}"'
                    break
            st, et = c[0] / fps, c[1] / fps
            ann["steps"].append(
                {
                    "start": st,
                    "end": et,
                    "narration": narration,
                    "mistake": mistake_msg,
                    "substeps": [],
                }
            )
            for s in fine_anns["substeps"]:
                add = False
                for verb in concrete_actions:
                    if s["narration"].startswith(verb):
                        add = True
                        break
                if add and s["start"] >= st and s["start"] <= et:
                    ann["steps"][-1]["substeps"].append(s)
            ann["steps"][-1]["substeps"].sort(key=lambda s: s["start"])
            ann["has_mistake"] = has_mistake


for task_id, ann in task_id_to_ann.items():
    ann["steps"].sort(key=lambda s: s["start"])
    ann["substeps"].sort(key=lambda s: s["start"])

    start_time = ann["steps"][0]["start"]
    duration = ann["steps"][-1]["end"] - start_time
    ann["video_start_time"] = max(0, start_time - 2)
    ann["duration"] = duration


video_dir = os.path.join(data_dir, "videos")
video_uid_to_ann = {}
seq_id_to_num_videos = {}
for task_id, ann in task_id_to_ann.items():
    seq_id = ann["seq_id"]
    for cam_id in selected_cam_ids:
        video_file = os.path.join(video_dir, f"{seq_id}/{cam_id}.mp4")
        if os.path.exists(video_file):
            video_uid = f"{task_id}__{cam_id}"
            ann = copy.deepcopy(ann)
            ann["video_uid"] = video_uid
            video_uid_to_ann[video_uid] = ann
print(len(video_uid_to_ann))

all_annotations = [v for v in video_uid_to_ann.values()]
all_annotations.sort(key=lambda v: v["video_uid"])

anns_per_split = {}
for split in ["train", "val"]:
    anns_per_split[split] = [v for v in all_annotations if v["split"] == split]
    print(f"{split}: {len(anns_per_split[split])}")

986
train: 756
val: 230


In [353]:
sorted_keys = sorted(anns_per_split["train"], key=lambda v: v["video_uid"])
for i in range(10):
    print(sorted_keys[i]["video_uid"])

assembly_nusar-2021_action_both_9011-b06b_9011_user_id_2021-02-01_154253__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9011-b06b_9011_user_id_2021-02-01_154253__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9011-b08c_9011_user_id_2021-02-01_154736__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9011-b08c_9011_user_id_2021-02-01_154736__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9012-a16_9012_user_id_2021-02-01_162904__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9012-a16_9012_user_id_2021-02-01_162904__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9012-b06d_9012_user_id_2021-02-01_163713__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9012-b06d_9012_user_id_2021-02-01_163713__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9012-c06d_9012_user_id_2021-02-18_121034__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9012-c06d_9012_user_id_2021-02-18_121034__HMC_84358933_mono10bit


In [354]:
sorted_keys = sorted(list(video_uid_to_ann.keys()))
for i in range(10):
    print(sorted_keys[i])

assembly_nusar-2021_action_both_9011-b06b_9011_user_id_2021-02-01_154253__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9011-b06b_9011_user_id_2021-02-01_154253__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9011-b08c_9011_user_id_2021-02-01_154736__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9011-b08c_9011_user_id_2021-02-01_154736__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9011-c03f_9011_user_id_2021-02-01_160239__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9011-c03f_9011_user_id_2021-02-01_160239__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9012-a16_9012_user_id_2021-02-01_162904__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9012-a16_9012_user_id_2021-02-01_162904__HMC_84358933_mono10bit
assembly_nusar-2021_action_both_9012-b06d_9012_user_id_2021-02-01_163713__HMC_84355350_mono10bit
assembly_nusar-2021_action_both_9012-b06d_9012_user_id_2021-02-01_163713__HMC_84358933_mono10bit


In [None]:
def parse_assembly101_ann(ann: dict, max_num_lines_per_gen: int = 6) -> ParsedVideoAnns:

    # get a single string for all the step and substep descriptions
    all_step_descriptions = ""
    for s in ann["steps"]:
        mistake = s["mistake"]
        if mistake:
            mistake = f" ({mistake})"
        s_desc = f"[{s['start']:.1f}s-{s['end']:.1f}s] {s['narration']}"
        all_step_descriptions += f"{s_desc}{mistake}\n"
        substeps = s.get("substeps", [])
        if not mistake and len(substeps) > 1:
            for ss in substeps:
                all_step_descriptions += f" - [{ss['start']:.1f}s] {ss['narration']}\n"

    # split the descriptions into clips
    clips = []
    num_lines_in_clip = 0
    clip_st = -1
    for idx, step in enumerate(ann["steps"]):
        num_lines_in_clip += 1 + step["substeps"].count("\n")

        if clip_st < 0:
            clip_st = step["start"]

        if (
            num_lines_in_clip > max_num_lines_per_gen
            or idx == len(ann["steps"]) - 1
        ):
            # add the clip
            clip_description = ""
            clip_et = step["end"]
            for s in ann["steps"]:
                mistake = s["mistake"]
                if mistake:
                    mistake = f" ({mistake})"
                s_desc = f"[{s['start']:.1f}s-{s['end']:.1f}s] {s['narration']}"
                clip_description += f"{s_desc}{mistake}\n"
                substeps = s.get("substeps", [])
                if mistake or len(substeps) <= 1:
                    continue
                if s["start"] >= clip_st and s["end"] <= clip_et and s["substeps"]:
                    for ss in substeps:
                        clip_description += f" - [{ss['start']:.1f}s] {ss['narration']}\n"
            clips.append((clip_st, clip_et - 1, clip_description))
            clip_st = -1
            num_lines_in_clip = 0

    task_type = "assembly" if ann["task"].startswith("asse") else "disassembly"
    knowledge_type = f"{task_type} steps"
    parsed_ann = ParsedVideoAnns(
        dataset="assembly101",
        domain="assembly/disassembly",
        knowledge_type=knowledge_type,
        video_uid=ann["video_uid"],
        goal_description=ann["task"],
        all_step_descriptions=all_step_descriptions,
        clips=clips,
        duration=ann["duration"],
        ann_ratio=1.0,
        num_steps=len(ann["steps"]),
        num_substeps=sum([len(s["substeps"]) for s in ann["steps"]]),
        original_ann=ann,
        video_start_time=ann["video_start_time"],
        has_mistake=ann["has_mistake"],
    )
    return parsed_ann

all_annotations = list(video_uid_to_ann.values())
for idx in range(90, 100):
    annotation = all_annotations[idx]
    parsed_ann = parse_assembly101_ann(annotation, max_num_lines_per_gen=6)
    if parsed_ann.has_mistake:
        print(f"{idx} video: {parsed_ann.video_uid}")
        print(parsed_ann.all_step_descriptions)
        break

# print(parsed_ann.all_step_descriptions)

90 video: assembly_nusar-2021_action_both_9026-b04b_9026_user_id_2021-02-03_163855__HMC_21179183_mono10bit
[214.7s-219.7s] attach interior to cabin
[219.7s-238.4s] attempt to attach body
[238.4s-245.3s] attach bumper to body
[245.3s-324.0s] attach bumper to body (mistake: shouldn't have happened)
[324.0s-347.9s] attach cabin to chassis
[347.9s-364.3s] attempt to attach body
[364.3s-369.6s] detach cabin from chassis (mistake: shouldn't have happened)
[369.6s-393.0s] attach cabin to chassis
[393.0s-413.5s] attempt to screw chassis
[413.5s-423.0s] detach cabin from chassis (mistake: shouldn't have happened)
[423.0s-455.8s] attach cabin to chassis
[455.8s-459.3s] attach body to chassis (mistake: wrong order)
[459.3s-478.4s] screw chassis
[478.4s-497.2s] attempt to attach bumper
[497.2s-506.0s] unscrew chassis
[506.0s-507.7s] detach body from chassis (correction of "attach body to chassis")
[507.7s-514.4s] attach bumper to body
[514.4s-524.6s] attach body to chassis
[524.6s-533.6s] screw ch

In [43]:
for i in [idx]:
    annotation = all_annotations[i]
    parsed_ann = parse_assembly101_ann(annotation)
    print(f"{i} video: {parsed_ann.video_uid}")
    user_types = ["talk_some"]
    outputs = generate_from_annotation(parsed_ann, llm, user_types=user_types)
    print(outputs.to_dict())
    break

90 video: assembly_nusar-2021_action_both_9026-b04b_9026_user_id_2021-02-03_163855__HMC_21179183_mono10bit
Processing video assembly_nusar-2021_action_both_9026-b04b_9026_user_id_2021-02-03_163855__HMC_21179183_mono10bit
Goal: assemble toy| Duration: 417.5s| Num steps: 22| Num substeps: 0| Num clips: 4 | Ann ratio: 1.00
Infer goal and knowledge
inferred_knowledge: Assembling a Toy Vehicle
1. Assemble the cabin by attaching the interior to it.
2. Attach the bumper to the body.
3. Attach the cabin to the chassis.
4. Attach the body to the chassis.
5. Screw the chassis together.
6. Attach the roof to the cabin.
7. Attach the arm connector to the chassis.
8. Attach the wheels to the chassis.
Generate conversations
Generating dialog for 214.7s-368.6s
before refine [214.7s] User: I want to assemble a toy vehicle. Can you guide me through it?
[214.8s] Assistant: I'd be happy to help you assemble the toy vehicle. This task should take about 10 minutes to complete. Let's get started! First, ple

In [44]:
conversation = outputs.conversations[0]["conversation"]
text = ""
for idx, c in enumerate(conversation):
    text += f"[{c['time']}s] {c['role'].capitalize()}: {c['content']}"
    if c.get("labels"):
        text += f"   [{c['labels']}]"
    text += "\n"
print(text)

[214.8s] User: I want to assemble a toy vehicle. Can you guide me through it?
[214.8s] Assistant: I'd be happy to help you assemble the toy vehicle. This task should take about 10 minutes to complete. Let's get started! First, attach the interior to the cabin.   [initiative|instruction,info_sharing]
[219.7s] Assistant: Now, attach the body to the cabin.   [initiative|instruction]
[238.4s] Assistant: Next, attach the bumper to the body.   [initiative|instruction]
[245.3s] User: Is the bumper securely attached?
[245.4s] Assistant: Yes, it should be. If not, try again with proper alignment. Now, attach the cabin to the chassis.   [responsive|instruction,info_sharing]
[324.0s] User: I've attached the bumper to the body again.
[324.1s] Assistant: Actually, you shouldn't have done that. Let's move on. Please attach the cabin to the chassis.   [responsive|instruction,correction]
[347.9s] Assistant: Now, attempt to attach the body to the cabin. Make sure it's aligned with the cabin and follow 

# EPFL Dataset Integration Demo

This section demonstrates how to use the new EPFL dataset support with the ProAssist dialog generation pipeline. The EPFL dataset contains cooking activities with both coarse-grained activity annotations and fine-grained action annotations.

In [2]:
# Import the new EPFL modules
import os

from mmassist.datasets.generate.generate_epfl import (
    load_epfl_annotations, 
    combine_annotations, 
    parse_epfl_annotations,
    EpflPreprocessArgs
)
from mmassist.datasets.generate.openrouter_utils import OpenRouterGenerator

# Set your OpenRouter API key (you'll need to get this from https://openrouter.ai/)
# os.environ["OPENROUTER_API_KEY"] = "your_api_key_here"

print("EPFL modules imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


EPFL modules imported successfully!


In [3]:
# Example: Load and parse EPFL annotations for a single session
split = "test"
participant = "YH2003"
session = "2023_05_17_09_08_58"
camera = "hololens_compressed"
annotation_dir = f"C:\\Users\\james\\Documents\\Research\\ProAssist\\my_data\\datasets\\epfl\\annotations\\{split}\\{participant}\\{session}\\annotations"
frames_dir = f"C:\\Users\\james\\Documents\\Research\\ProAssist\\my_data\\processed_data\\epfl\\frames\\{split}_{participant}_{session}_{camera}.arrow"
# frames_dir = f"/c/Users/james/Documents/Research/ProAssist/my_data/processed_data/epfl/frames/"

print("Loading EPFL annotations...")
print(f"Participant: {participant}")
print(f"Session: {session}")
print(f"Annotation directory: {annotation_dir}")

try:
    # Load coarse and fine-grained annotations
    coarse_annotations, fine_grained_annotations = load_epfl_annotations(annotation_dir)
    
    print(f"\nLoaded {len(coarse_annotations)} coarse annotations")
    print(f"Loaded {len(fine_grained_annotations)} fine-grained annotations")
    
    # Show first few coarse annotations
    print("\nFirst 5 coarse annotations:")
    for i, ann in enumerate(coarse_annotations[:5]):
        print(f"  {i+1}. [{ann['start']:.1f}s-{ann['end']:.1f}s] {ann['Activities']}")
    
    # Show first few fine-grained annotations
    print("\nFirst 5 fine-grained annotations:")
    for i, ann in enumerate(fine_grained_annotations[:5]):
        print(f"  {i+1}. [{ann['start']:.1f}s-{ann['end']:.1f}s] {ann['action']}")
        
except Exception as e:
    print(f"Error loading annotations: {e}")
    print("Make sure the annotation files exist at the specified path.")


Loading EPFL annotations...
Participant: YH2003
Session: 2023_05_17_09_08_58
Annotation directory: C:\Users\james\Documents\Research\ProAssist\my_data\datasets\epfl\annotations\test\YH2003\2023_05_17_09_08_58\annotations

Loaded 128 coarse annotations
Loaded 1370 fine-grained annotations

First 5 coarse annotations:
  1. [0.0s-7.3s] experimental procedure
  2. [7.3s-54.8s] getting ready
  3. [54.8s-56.1s] experimental procedure
  4. [56.1s-80.3s] cooking
  5. [80.3s-84.0s] experimental procedure

First 5 fine-grained annotations:
  1. [0.0s-3.0s] open cupboard
  2. [3.0s-6.1s] close cupboard
  3. [6.1s-10.7s] open cupboard
  4. [10.0s-11.6s] squat
  5. [10.7s-11.6s] close cupboard


In [4]:
# Combine annotations and parse into the pipeline format
try:
    # Combine coarse and fine-grained annotations
    combined_annotations = combine_annotations(coarse_annotations, fine_grained_annotations)
    
    print(f"Combined into {len(combined_annotations)} activity segments")
    
    # Show the first few combined annotations
    print("\nFirst 3 combined annotations:")
    for i, ann in enumerate(combined_annotations[:3]):
        print(f"\n{i+1}. Activity: {ann['activity']} [{ann['start']:.1f}s-{ann['end']:.1f}s]")
        if ann['actions']:
            print(f"   Fine-grained actions ({len(ann['actions'])}):")
            for j, action in enumerate(ann['actions'][:3]):  # Show first 3 actions
                print(f"     - [{action['start']:.1f}s] {action['action']}")
            if len(ann['actions']) > 3:
                print(f"     ... and {len(ann['actions']) - 3} more actions")
        else:
            print("   No fine-grained actions found")
                
    else:
        print("Please run the previous cell first to load annotations")
        
except Exception as e:
    print(f"Error combining annotations: {e}")

Combined into 128 activity segments

First 3 combined annotations:

1. Activity: experimental procedure [0.0s-7.3s]
   Fine-grained actions (3):
     - [0.0s] open cupboard
     - [3.0s] close cupboard
     - [6.1s] open cupboard

2. Activity: getting ready [7.3s-54.8s]
   Fine-grained actions (19):
     - [6.1s] open cupboard
     - [10.0s] squat
     - [10.7s] close cupboard
     ... and 16 more actions

3. Activity: experimental procedure [54.8s-56.1s]
   Fine-grained actions (1):
     - [33.0s] carry pot
Please run the previous cell first to load annotations


In [5]:
# Parse the EPFL session into the dialog generation format
try:
    parsed_ann = parse_epfl_annotations(
        split=split,
        participant=participant,
        session=session,
        annotation_dir=annotation_dir,
        max_num_lines_per_gen=20  # As specified in requirements
    )
    
    if parsed_ann:
        print("Successfully parsed EPFL annotations!")
        print(f"\nDataset: {parsed_ann.dataset}")
        print(f"Domain: {parsed_ann.domain}")
        print(f"Knowledge type: {parsed_ann.knowledge_type}")
        print(f"Video UID: {parsed_ann.video_uid}")
        print(f"Goal description: {parsed_ann.goal_description}")
        print(f"Duration: {parsed_ann.duration:.1f}s")
        print(f"Annotation ratio: {parsed_ann.ann_ratio:.2%}")
        print(f"Number of steps: {parsed_ann.num_steps}")
        print(f"Number of substeps: {parsed_ann.num_substeps}")
        print(f"Number of clips: {len(parsed_ann.clips)}")
        
        print("\nGenerated clips:")
        for i, (start, end, description) in enumerate(parsed_ann.clips):
            print(f"\nClip {i+1}: [{start:.1f}s-{end:.1f}s]")
            lines = description.split('\n')
            for line in lines[:3]:  # Show first 3 lines
                print(f"  {line}")
            if len(lines) > 3:
                print(f"  ... and {len(lines) - 3} more lines")
    else:
        print("Failed to parse annotations (no cooking activities found?)")
        
except Exception as e:
    print(f"Error parsing annotations: {e}")

Successfully parsed EPFL annotations!

Dataset: epfl
Domain: cooking
Knowledge type: cooking recipe
Video UID: test_YH2003_2023_05_17_09_08_58
Goal description: 
Duration: 2492.0s
Annotation ratio: 100.00%
Number of steps: 128
Number of substeps: 1534
Number of clips: 7

Generated clips:

Clip 1: [0.0s-259.0s]
  [0.0s-7.3s] experimental procedure
   - [0.0s] open cupboard
   - [3.0s] close cupboard
  ... and 139 more lines

Clip 2: [259.0s-492.6s]
  [259.0s-265.6s] experimental procedure
   - [258.8s] read recipe
   - [259.6s] move recipe
  ... and 169 more lines

Clip 3: [492.6s-916.8s]
  [492.6s-495.1s] cleaning
   - [470.2s] read recipe
   - [492.6s] touch recipe
  ... and 250 more lines

Clip 4: [916.8s-1257.0s]
  [916.8s-925.0s] preparing ingredients
   - [903.2s] carry carrots
   - [917.1s] open water
  ... and 290 more lines

Clip 5: [1257.0s-1610.0s]
  [1257.0s-1273.5s] experimental procedure
   - [1256.5s] read recipe
   - [1259.5s] grab trash
  ... and 227 more lines

Clip 6:

## Using OpenRouter for LLM Generation

The new implementation supports using OpenRouter API instead of the local vLLM server. This makes it easier to experiment with different models without requiring local GPU resources.

In [None]:
import os

os.environ["OPENROUTER_API_KEY"] = "asdf"

try:
    # Create OpenRouter generator
    llm_generator = OpenRouterGenerator.build(model_id="anthropic/claude-3.5-sonnet")
    
    # Test a simple generation
    test_inputs = [
        ("system", "You are a helpful cooking assistant."),
        ("user", "How do I make scrambled eggs?")
    ]
    
    response = llm_generator.generate(test_inputs)
    print("OpenRouter response:")
    print(response[0])
    
except Exception as e:
    print(f"OpenRouter test failed: {e}")
    print("Make sure to set your OPENROUTER_API_KEY environment variable")

OpenRouter response:
Here's a simple method to make scrambled eggs:

Ingredients:
- 2-3 eggs
- Salt and pepper to taste
- 1 tablespoon butter or oil
- Optional: splash of milk or cream

Steps:
1. Crack the eggs into a bowl
2. Whisk them together with a fork until well combined
3. Optional: add a small splash of milk/cream and season with salt and pepper
4. Heat a non-stick pan over medium heat
5. Add butter or oil and let it melt/heat up
6. Pour in the whisked eggs
7. Using a spatula, gently push the eggs from the edges to the center as they begin to set
8. Continue stirring and folding the eggs until they're mostly set but still look slightly wet (they'll continue cooking off the heat)
9. Remove from heat and serve immediately

Tips:
- Don't overcook - eggs will continue cooking even after removing from heat
- For creamier eggs, cook on medium-low heat
- Some people prefer to season at the end rather than before cooking


## Generate dialogs

In [8]:
from mmassist.datasets.generate.dialog_simulation import generate_from_annotation

generate_from_annotation(annotation=parsed_ann, 
                         llm=llm_generator, 
                         user_types= ["talk_some", "talk_more"],
                         num_repeats = 2,)


Processing video test_YH2003_2023_05_17_09_08_58
Goal: | Duration: 2492.0s| Num steps: 128| Num substeps: 1534| Num clips: 7 | Ann ratio: 1.00
Infer goal and knowledge
inferred_goal: [Making Risotto with Fresh Green Salad]
Generate conversations
Generating dialog for 0.0s-259.0s
Processing conversation 1/2
Processing conversation 2/2
Processing conversation 1/2
Processing conversation 2/2
Generating dialog for 259.0s-492.6s
Processing conversation 1/2
Processing conversation 2/2
Processing conversation 1/2
Processing conversation 2/2
Generating dialog for 492.6s-916.8s
Processing conversation 1/2
Processing conversation 2/2
Processing conversation 1/2
Processing conversation 2/2
Generating dialog for 916.8s-1257.0s
Processing conversation 1/2
Processing conversation 2/2
Processing conversation 1/2
Processing conversation 2/2
Generating dialog for 1257.0s-1610.0s
Processing conversation 1/2
Processing conversation 2/2
Processing conversation 1/2
Processing conversation 2/2
Generating di

KeyboardInterrupt: 

In [None]:
# Command to run the full EPFL dialog generation pipeline
print("To run the complete EPFL pipeline from command line:")
print()
print("cd /c/Users/james/Documents/Research/ProAssist/mmassist/datasets/generate")
print()
print("python generate_epfl.py \\")
print("  --data_dir=/c/Users/james/Documents/Research/ProAssist/my_data/datasets/epfl/annotations \\")
print("  --frames_dir=/c/Users/james/Documents/Research/ProAssist/my_data/processed_data/epfl/frames \\")
print("  --splits=train,test \\")
print("  --output_dir=/c/Users/james/Documents/Research/ProAssist/my_data/processed_data/epfl/generated_dialogs \\")
print("  --llm=anthropic/claude-3.5-sonnet \\")
print("  --max_num_lines_per_gen=20 \\")
print("  --user_types=no_talk@2,talk_some@4,talk_more@4 \\")
print("  --num_repeats=10")
print()
print("Key features:")
print("- Domain: cooking")
print("- Combines coarse activity annotations with fine-grained action annotations")
print("- Filters out confused annotations (confusion=1)")
print("- Combines verbs and nouns into action descriptions")
print("- Groups fine-grained actions under coarse activities by start time")
print("- Uses max_num_lines_per_gen=20 as specified")
print("- Supports OpenRouter API for LLM calls instead of local vLLM server")