## Dialogue Visualization
This notebook is used to visualize the live dialogues from the ProAssist dataset.

In [1]:
import os
import json
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from mmassist.eval.runners.stream_inference import FrameOutput, annotate_and_save_video
from pprint import pprint

def process_and_save_video(video, sample, dataset_name, processor, save_root_dir, conv_meta=None):
    """Render one sample's reference dialogue as an annotated video on disk.

    The video is written to ``<save_root_dir>/<unique_id>/video.mp4`` where
    ``unique_id`` is ``"{dataset_name}__{video_uid}__{user_type}"`` with every
    "/" in ``dataset_name`` replaced by "-".

    Args:
        video: Processed conversation data; must provide ``video["sample_idx"]``
            and be accepted by ``processor.processed_conv_data_to_stream``.
        sample: Raw sample dict providing ``"video_uid"`` and
            ``"metadata"]["user_type"``. It is mutated: ``sample["unique_id"]``
            is set to the generated identifier.
        dataset_name: Dataset name used as the first part of the identifier.
        processor: Object exposing ``processed_conv_data_to_stream`` and
            ``cleanup_text``.
        save_root_dir: Root directory under which the per-sample directory is
            created.
        conv_meta: Optional JSON-serializable object; when not ``None`` it is
            dumped to ``meta.json`` next to the video.

    Returns:
        A list with one ``FrameOutput(...).to_dict(ignore_keys="")`` result per
        element yielded by the stream.
    """
    metadata = sample["metadata"]
    user_type = metadata["user_type"]
    video_uid = sample["video_uid"]

    # "/" in the dataset name would otherwise split the identifier into
    # nested directories when it is joined onto save_root_dir.
    dataset_name = dataset_name.replace("/", "-")
    unique_id = f"{dataset_name}__{video_uid}__{user_type}"
    sample["unique_id"] = unique_id

    print(video["sample_idx"], video_uid)

    save_dir = os.path.join(save_root_dir, unique_id)
    os.makedirs(save_dir, exist_ok=True)
    save_file = os.path.join(save_dir, "video.mp4")

    # Convert the conversation to the stream format and collect one record
    # per stream element. Only the first image / index / timestamp of each
    # element is used.
    outputs = []
    for frames in processor.processed_conv_data_to_stream(video):
        # NOTE: "input_messsages" (three s) is the attribute name read here
        # in the original code; it is kept as-is.
        text_inputs = [(t["role"], t["content"]) for t in frames.input_messsages]
        cleaned_ass_text = processor.cleanup_text(frames.ref_output_str)[0]
        outputs.append(
            FrameOutput(
                gen="",  # no model generation: only the reference text is shown
                ref=cleaned_ass_text,
                image=frames.images[0],
                text_inputs=text_inputs,
                frame_idx_in_stream=frames.frame_idxs_in_stream[0],
                frame_idx_in_original_video=frames.frame_idxs_in_original_video[0],
                timestamp_in_stream=frames.timestamps[0],
            ).to_dict(ignore_keys="")
        )

    annotate_and_save_video(
        outputs,
        save_file,
        fps=4,
        assistant_name_gt="ASSISTANT",
        pause_time=2,
        add_system_prompt=False,
    )

    if conv_meta is not None:
        # Build the path from save_dir instead of rewriting save_file, so a
        # "video.mp4" substring elsewhere in the path is never touched.
        meta_save_file = os.path.join(save_dir, "meta.json")
        with open(meta_save_file, "w") as f:
            json.dump(conv_meta, f, indent=4)

    return outputs

  from .autonotebook import tqdm as notebook_tqdm


###  Video Clip with Iterative Progress Summarization
Long videos need to be split into multiple clips to fit within the model's context length. We propose iterative progress summarization (IPS), in which the model is prompted at the end of each clip to summarize the progress so far. This summary is then used as the context for the next clip. The following code generates examples that show how IPS works.

In [5]:
import random

from mmassist.data import build_train_dataset
from mmassist.configs.arguments import parse_args
from mmassist.model.tokenization_proact import (
    build_tokenizer_and_update_config,
    ProActConfig,
)
from mmassist.eval.runners.stream_inference import StreamProcessor


# Datasets to visualize; uncomment a name to include it.
dataset_names = [
    # "ego4d",
    # "holoassist",
    # "egoexolearn",
    # "epickitchens",
    # "wtag",
    "assembly101",
]
data_type = "dialog-klg-sum_val"
L = 4096  # copied into model_args.max_seq_len and into the dataset name below
I = 5  # copied into model_args.img_patch_token_size and into the dataset name below

save_root = "gt_video_clip_addklg_addsum_1104"
num_videos_per_user_type = 3

for dataset_name in dataset_names:
    dataset_full_name = f"{dataset_name}/{data_type}_L{L}_I{I}+SEP"

    # Start from the default arguments and point them at this dataset.
    model_args, train_args = parse_args(no_args=True)
    model_args.max_seq_len = L
    model_args.img_patch_token_size = I
    train_args.train_datasets = dataset_full_name
    all_args_dict = dict(model_args.to_dict())
    all_args_dict.update(train_args.to_dict())  # train args win on key clashes

    # Dataset, tokenizer and stream processor used for rendering.
    dataset = build_train_dataset(**all_args_dict, keep_images=True)
    config = ProActConfig.from_dict(model_args.to_dict())
    tokenizer = build_tokenizer_and_update_config(config)
    processor = StreamProcessor(tokenizer, tokenizer.chat_formatter, fps=2)
    dataset = dataset.datasets[0]

    # Group the raw samples by user type. Each sample is tagged in place with
    # its index and with the number of timed turns seen before the first turn
    # whose "time" exceeds 120 (turns without a time are skipped).
    user_type_to_sample = {}
    for sample_idx, record in enumerate(dataset.data):
        early_turns = 0
        for turn in record["conversation"]:
            turn_time = turn.get("time")
            if turn_time is None:
                continue
            if turn_time > 120:
                break
            early_turns += 1
        group_key = record["metadata"]["user_type"]
        record["sample_idx"] = sample_idx
        record["num_dialog_turns"] = early_turns
        user_type_to_sample.setdefault(group_key, []).append(record)

    # Rank each group by early-turn count, most turns first. If none of the
    # top candidates has "0" in its user_id, promote the first ranked sample
    # that does to the front.
    for user_type, samples in user_type_to_sample.items():
        ranked = list(samples)
        ranked.sort(key=lambda rec: rec["num_dialog_turns"], reverse=True)

        top_candidates = ranked[:num_videos_per_user_type]
        if not any("0" in rec["metadata"]["user_id"] for rec in top_candidates):
            for position, rec in enumerate(ranked):
                if "0" in rec["metadata"]["user_id"]:
                    print(f"move sample {rec['sample_idx']} to the front")
                    ranked.insert(0, ranked.pop(position))
                    break

        user_type_to_sample[user_type] = ranked

    # Render up to num_videos_per_user_type distinct videos per user type.
    for user_type, samples in user_type_to_sample.items():
        existed_video_uids = set()
        print(len(samples))
        for sample in samples:
            video_uid = sample["video_uid"]
            if video_uid not in existed_video_uids:
                if len(existed_video_uids) == num_videos_per_user_type:
                    break
                video = dataset[sample["sample_idx"]]
                process_and_save_video(video, sample, dataset_full_name, processor, save_root)
                existed_video_uids.add(video_uid)

140
40 disassembly_nusar-2021_action_both_9022-a18_9022_user_id_2021-02-23_104757__HMC_84358933_mono10bit
159 disassembly_nusar-2021_action_both_9046-b06b_9046_user_id_2021-02-22_105953__HMC_84358933_mono10bit
198 disassembly_nusar-2021_action_both_9054-a18_9054_user_id_2021-02-08_153620__HMC_21110305_mono10bit
142
178 disassembly_nusar-2021_action_both_9051-c13a_9051_user_id_2021-02-22_121941__HMC_84355350_mono10bit
181 disassembly_nusar-2021_action_both_9051-c13a_9051_user_id_2021-02-22_121941__HMC_84358933_mono10bit
199 disassembly_nusar-2021_action_both_9054-a18_9054_user_id_2021-02-08_153620__HMC_21110305_mono10bit
148
119 disassembly_nusar-2021_action_both_9034-c02b_9034_user_id_2021-02-23_173828__HMC_84358933_mono10bit
48 disassembly_nusar-2021_action_both_9023-c09c_9023_user_id_2021-02-23_134459__HMC_84355350_mono10bit
320 disassembly_nusar-2021_action_both_9073-a10_9073_user_id_2021-02-25_150711__HMC_84358933_mono10bit


### Full Video

In [3]:
import random

from mmassist.data import build_train_dataset
from mmassist.configs.arguments import parse_args
from mmassist.model.tokenization_proact import (
    build_tokenizer_and_update_config,
    ProActConfig,
)
from mmassist.eval.runners.stream_inference import StreamProcessor


# Datasets to visualize; uncomment a name to include it.
dataset_names = [
    # "ego4d",
    # "holoassist",
    # "egoexolearn",
    # "epickitchens",
    # "wtag",
    "assembly101",
]
# data_type = "dialog-klg-sum_val"
data_type = "dialog_val"
L = 0
I = 1

save_root = "gt_video_full"
num_videos_per_user_type = 5

for dataset_name in dataset_names:
    # Derive the suffix from L and I (as the clip cell does) so that editing
    # the constants above actually changes which dataset is loaded.
    dataset_full_name = f"{dataset_name}/{data_type}_L{L}_I{I}"

    # update the args
    model_args, train_args = parse_args(no_args=True)
    train_args.train_datasets = dataset_full_name
    all_args_dict = {**model_args.to_dict(), **train_args.to_dict()}

    # build the dataset, tokenizer and processor
    dataset = build_train_dataset(**all_args_dict, keep_images=True)
    config = ProActConfig.from_dict(model_args.to_dict())
    tokenizer = build_tokenizer_and_update_config(config)
    processor = StreamProcessor(tokenizer, tokenizer.chat_formatter, fps=2)
    dataset = dataset.datasets[0]

    # Group samples by user type. Each sample is tagged in place with its
    # index and with the number of timed turns seen before the first turn
    # whose "time" exceeds 120 (turns without a time are skipped).
    user_type_to_sample = {}
    for idx, d in enumerate(dataset.data):
        num_dialog_turns = 0
        for turn in d["conversation"]:
            if turn.get("time") is None:
                continue
            if turn["time"] > 120:
                break
            num_dialog_turns += 1
        user_type = d["metadata"]["user_type"]
        d["sample_idx"] = idx
        d["num_dialog_turns"] = num_dialog_turns
        user_type_to_sample.setdefault(user_type, []).append(d)

    # Rank each group by early-turn count (most first). If none of the top
    # candidates has "0" in its user_id, move the first ranked sample that
    # does to the front.
    for user_type, samples in user_type_to_sample.items():
        sorted_samples = sorted(
            samples, key=lambda x: x["num_dialog_turns"], reverse=True
        )
        top_samples = sorted_samples[:num_videos_per_user_type]
        if not any("0" in s["metadata"]["user_id"] for s in top_samples):
            # Pop by position: avoids list.index(), which would compare
            # whole sample dicts by value to find the element again.
            for pos, s in enumerate(sorted_samples):
                if "0" in s["metadata"]["user_id"]:
                    print(f"move sample {s['sample_idx']} to the front")
                    sorted_samples.insert(0, sorted_samples.pop(pos))
                    break

        user_type_to_sample[user_type] = sorted_samples

    # Render up to num_videos_per_user_type distinct videos per user type and
    # print the dialogue of each rendered video.
    for user_type, samples in user_type_to_sample.items():
        # if user_type != "talk_more":
        #     continue
        existed_video_uids = set()
        print(len(samples))
        for sample in samples:
            idx = sample["sample_idx"]
            video_uid = sample["video_uid"]

            if video_uid in existed_video_uids:
                continue
            if len(existed_video_uids) == num_videos_per_user_type:
                break
            print(idx)

            video = dataset[idx]
            # `streams` holds the per-frame output dicts returned by
            # process_and_save_video, not the raw stream objects.
            streams = process_and_save_video(video, sample, dataset_full_name, processor, save_root)
            # NOTE(review): process_and_save_video passes frame_idx_in_stream
            # and frame_idx_in_original_video to FrameOutput; confirm that
            # to_dict() also exposes the "frame_idx_in_video" key read below.
            for e in streams:
                if e["text_inputs"]:
                    for role, text in e["text_inputs"]:
                        print(e["frame_idx_in_video"], f"{role.upper()}: {text}")
                if e["ref"]:
                    print(e["frame_idx_in_video"], "ASSISTANT:", e["ref"])
            existed_video_uids.add(video_uid)

112
33
33 disassembly_nusar-2021_action_both_9022-a18_9022_user_id_2021-02-23_104757__HMC_84358933_mono10bit
0 SYSTEM: You are a helpful and proactive assistant. Always be ready to assist and provide useful information ahead of time.
4 USER: I want to disassemble this toy fire truck into its main components.
4 ASSISTANT: Great goal! Let's start by detaching the door. Please unscrew the side door using a screwdriver.
17 ASSISTANT: If the screw is still tight, try to loosen it with your hand.
33 ASSISTANT: Still having trouble? Use the screwdriver again to unscrew the side door.
49 ASSISTANT: Almost there! Try to unscrew the side door with your hand one more time.
62 ASSISTANT: Great job! The door should be detached now. Next, let's detach the fire extinguisher.
78 ASSISTANT: Just pull it straight out, it should come off easily.
105 ASSISTANT: Now, let's move on to detaching the bumper. Use a screwdriver to start unscrewing it.
117 ASSISTANT: Try to loosen the bumper with your hand. If i