In [1]:
%cd /data/gunsbrother/prjs/ltvu/llms/Video-LLaVA

/data/gunsbrother/prjs/ltvu/llms/Video-LLaVA


In [2]:
!hostname

ariel-v8


In [3]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
from transformers import logging
logging.set_verbosity_error()

import subprocess
from pathlib import Path

import re
import json
import torch
from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria


In [4]:

# setup
disable_torch_init()
model_path = 'LanguageBind/Video-LLaVA-7B'
cache_dir = 'cache_dir'
device = 'cuda'
load_4bit, load_8bit = True, False
model_name = get_model_name_from_path(model_path)
tokenizer, model, processor, _ = load_pretrained_model(model_path, None, model_name, load_8bit, load_4bit, device=device, cache_dir=cache_dir)
video_processor = processor['video']
conv_mode = "llava_v1"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:

def trim_video(clip_start_sec=0, clip_duration_sec=60):
    video_path = Path('/data/datasets/ego4d_data/v2/clips_320p-non_official/0aca0078-b6ab-41fb-9dc5-a70b8ad137b2.mp4')
    p_splitted_video_dir = Path('/tmp/video-llava')
    p_splitted_video_dir.mkdir(exist_ok=True, parents=True)
    clip_end_sec = clip_start_sec + clip_duration_sec
    p_splitted_video = p_splitted_video_dir / f'{video_path.stem}_{clip_start_sec:03d}_{clip_end_sec:03d}{video_path.suffix}'
    cmd = [
        'ffmpeg',
        '-i', str(video_path),
        '-ss', str(clip_start_sec),
        '-t', str(clip_duration_sec),
        '-c', 'copy', '-avoid_negative_ts', '1', '-y',
        str(p_splitted_video)
    ]
    print(' '.join(cmd))
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    print(str(p_splitted_video))
    return p_splitted_video

# load the video
START_SEC = 0
DURATION_SEC = 30
p_trimmed = trim_video(START_SEC, DURATION_SEC)
video_tensor = video_processor(str(p_trimmed), return_tensors='pt')['pixel_values']
if type(video_tensor) is list:
    tensor = [video.to(model.device, dtype=torch.float16) for video in video_tensor]
else:
    tensor = video_tensor.to(model.device, dtype=torch.float16)


ffmpeg -i /data/datasets/ego4d_data/v2/clips_320p-non_official/0aca0078-b6ab-41fb-9dc5-a70b8ad137b2.mp4 -ss 0 -t 30 -c copy -avoid_negative_ts 1 -y /tmp/video-llava/0aca0078-b6ab-41fb-9dc5-a70b8ad137b2_000_030.mp4
/tmp/video-llava/0aca0078-b6ab-41fb-9dc5-a70b8ad137b2_000_030.mp4


In [6]:
# load the caption
SEP = '</s>'
REPLACE_PATTERNS = [{
    'replaced_with': 'the man',
    'patterns': [
        r'[Tt]he man',
        r'[Tt]he person(?!(?:\s+in))',  # not followed by in
        r'[Tt]he person in this video',
    ]}, {
    'replaced_with': '',
    'patterns': [
        r'(, which.*| that.*| to )?help(s|ing)?.*answer.*\".*\"',
        r' in the video',
    ]}, {
    'replaced_with': 'A',
    'patterns': [
        r'[Ii]n this video, we can see a',
    ]}
]

with open("ltvu/captions/test/03_20240219v0/gathered.json") as f:
    data = json.load(f)
q_inst = data['0aca0078-b6ab-41fb-9dc5-a70b8ad137b2']['q_instances']['9e5cd376-1b29-5861-8115-be750272d0a9']
q_captions = q_inst['captions']
caption_prompts = ['Captions:']
for s, e, *_caps in q_captions:
    if s < START_SEC: continue
    if e > START_SEC + DURATION_SEC: break
    _cap = _caps[-1]
    _cap = f'{s}s: {_cap.replace(SEP, "")}'
    caption_prompts.append(_cap)
caption_prompt = '\n'.join(caption_prompts)

print(caption_prompt)
for pattern_dict in REPLACE_PATTERNS:
    repl = pattern_dict['replaced_with']
    for pattern in pattern_dict['patterns']:
        caption_prompt = re.sub(pattern, repl, caption_prompt)
print('\n\n\n')
print(caption_prompt)


Captions:
0.0s: The objects that help in answering the question "where did I put the piece of timber?" are a wooden floor and a wooden table.
1.5s: The man is using a wooden floor and a wooden table to help him answer the question "where did I put the piece of timber?"
3.0s: The man in the video is using a tool to push down on a board, which is a piece of timber. He is also using a hammer to hit the board, which is another piece of timber.
4.5s: The man in the video is using a piece of paper to mark the location of the piece of timber.
6.0s: The man is using a long metal instrument to sharpen a smaller metal instrument, which is the object that helps him answer the question "where did I put the piece of timber?"
7.5s: The man in the video is using a stick to sharpen the knife, which can help him remember where he put the piece of timber.
9.0s: The person is using a tool to push down on a piece of wood, which is the object that helps in answering the question "where did I put the piece 

In [13]:

query = "where did I put the piece of timber?"
prompts = [
    '{captions}\n\n'
    'Above is a sequence of short-term captions of this video generated by an video LLM after '
    'watching each short clip to later help another long-term LLM in answering the question "{query}". '
    'Each caption starts with its corresponding timestamp \'${{sec}}s:\'. '
    'What do you think is the most salient moment within this video for answering the question? And explain why.',

    'What temporal window within this video does correspond to that moment? '
    'Tell me the start, and the end of that moment in seconds. '
    # 'The temporal window can be longer than 1.5s. '
    'Answer briefly.',

    'The IoU score between your answer and the GT will be...?'
    # 'Answer briefly.'
]
prompts = [
    prompt.format(captions=caption_prompt, query=query)
    for prompt in prompts
]

def converse(prompt, conv, model, tokenizer, tensor):
    if conv.messages:
        inp = prompt
    else:
        inp = ' '.join([DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames) + '\n' + prompt
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = outputs
    return outputs

conv = conv_templates[conv_mode].copy()
for i, prompt in enumerate(prompts):
    print(f'{f" Round {i} ":=^80s}')
    print(f'{" User ":-^80s}\n{prompt}')
    outputs = converse(prompt, conv, model, tokenizer, tensor)
    print(f'{" Assistant ":-^80s}\n{outputs}')
    print(f'{"":=^80s}\n')

------------------------------------- User -------------------------------------
Captions:
0.0s: The objects are a wooden floor and a wooden table.
1.5s: the man is using a wooden floor and a wooden table
3.0s: the man is using a tool to push down on a board, which is a piece of timber. He is also using a hammer to hit the board, which is another piece of timber.
4.5s: the man is using a piece of paper to mark the location of the piece of timber.
6.0s: the man is using a long metal instrument to sharpen a smaller metal instrument
7.5s: the man is using a stick to sharpen the knife, which can help him remember where he put the piece of timber.
9.0s: the man is using a tool to push down on a piece of wood
10.5s: The person is using a wooden stick to mark the piece of wood
12.0s: The person is holding a wooden stick and a hammer
13.5s: The person uses a wooden stick and a knife to cut the wood
15.0s: The person is using a tool to sharpen the piece of wood, and there is a piece of wood on 

# Step 3

In [None]:
prompts3 = [
    'Here\'are other debators\' opinions:\n'
    '{debate}\n\n'
    ''

    f'Another discussant explained that the section from {debator1_start} seconds to {debator1_end} seconds of this video '
    f'is appropriate for solving the question "{query}".\n\n'
    f'"{explanation}"\n\n'
    f'What do you think about this? The discussant has rated their answer with a score of {score} and assessed their '
    f'confidence as {confidence}.',

    f'Please discuss any shortcomings in the explanation given by the discussant while solving the problem {query}. '
    f'The discussant\'s explanation is as follows:\n{explanation}',

    f'You answered that the video section capable of solving "{query}" is from {debator2_start} seconds to '
    f'{debator2_end} seconds. If you think a revision is necessary, please modify your answer.'
]
converse(prompt, conv, model, tokenizer, tensor)

In [27]:
narr = json.load(open('/data/datasets/ego4d_data/v2/annotations/narration.json'))
narr

In [None]:
narr[0]