# Otter Video Demo

The current Otter Video model is Otter-v0.2-DC (0612), meaning it was trained on MIMIC-IT-DC as of June 12th. The code reads a video and uniformly extracts a fixed number of frames (32 by default in this notebook's `extract_frames`), so avoid using excessively long videos if you want the model to generate specific descriptions.

If your machine has over 16G of GPU memory, you can run our model locally in fp16 mode for tasks like video labeling and identifying harmful content. For machines with over 36G of GPU memory in total (multiple cards can be combined by using [device_map='auto'](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) to split one model across different cards), you can run our model in the more accurate fp32 mode.

In [1]:
import mimetypes
import os
from typing import Union
import cv2
import requests
import torch
import transformers
from PIL import Image
import sys

sys.path.append("../..")
from otter.modeling_otter import OtterForConditionalGeneration

# Disable warnings
requests.packages.urllib3.disable_warnings()

[2023-08-25 16:29:11,803] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# ------------------- Main Function -------------------
# Precision used when loading the checkpoint: one of "fp16", "bf16", "fp32".
load_bit = "fp32"

# Single source of truth for the load_bit -> torch dtype mapping, so that
# `precision` and `tensor_dtype` can never disagree.
_DTYPE_BY_LOAD_BIT = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
if load_bit not in _DTYPE_BY_LOAD_BIT:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"Unsupported load_bit {load_bit!r}; expected one of {sorted(_DTYPE_BY_LOAD_BIT)}.")

tensor_dtype = _DTYPE_BY_LOAD_BIT[load_bit]
precision = {"torch_dtype": tensor_dtype}

# This model version is trained on MIMIC-IT DC dataset.
# model = OtterForConditionalGeneration.from_pretrained("luodian/OTTER-9B-DenseCaption", device_map="auto", **precision)
model = OtterForConditionalGeneration.from_pretrained("/mnt/bn/ecom-govern-maxiangqian-lq/lj/Otter/exp_result/final_hfckpt", device_map="auto", **precision)

# Pad on the left for the model's tokenizer; the same tokenizer object is
# also exposed as `tokenizer` for later cells.
model.text_tokenizer.padding_side = "left"
tokenizer = model.text_tokenizer
image_processor = transformers.CLIPImageProcessor()
model.eval()

Using pad_token, but it is not set yet.


The current model version is configured for Otter-Image with max_num_frames set to None.
Total Trainable param: 1.441012 B


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OtterForConditionalGeneration(
  (lang_encoder): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32004, 4096, padding_idx=0)
      (layers): ModuleList(
        (0-2): 3 x OtterLayer(
          (decoder_layer): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
              (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
              (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
              (act_fn): SiLUActivation

In [3]:
# Re-bind the module-level `tokenizer` name to the model's text tokenizer.
tokenizer = model.text_tokenizer

In [5]:
# Display the tokenizer's pad token id (notebook cell output).
tokenizer.pad_token_id

32003

In [53]:
# ------------------- Utility Functions -------------------


def get_content_type(file_path):
    """Return the MIME type guessed from *file_path*'s name, or None if unknown."""
    # guess_type returns (type, encoding); only the type is of interest here.
    return mimetypes.guess_type(file_path)[0]


# ------------------- Image and Video Handling Functions -------------------


def extract_frames(video_path, num_frames=32):
    """Sample up to *num_frames* RGB frames spread evenly over a video.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.
        num_frames: Number of frame positions to sample.

    Returns:
        A list of ``PIL.Image.Image`` objects in RGB mode. Positions whose
        read fails are skipped, so the list may be shorter than
        *num_frames* (and is empty if nothing could be read).
    """
    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        # Clamp to zero so a negative frame count never produces negative seek positions.
        total_frames = max(int(video.get(cv2.CAP_PROP_FRAME_COUNT)), 0)

        for i in range(num_frames):
            # Compute each index from the total rather than from a floored
            # step: a floored step is 0 when the video has fewer frames than
            # num_frames (every sample would be frame 0) and otherwise
            # leaves the tail of the video unsampled.
            frame_index = i * total_frames // num_frames
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = video.read()
            if ret:
                # OpenCV decodes to BGR; convert to RGB before wrapping in PIL.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame).convert("RGB")
                frames.append(frame)
    finally:
        # Release the capture even if seeking or decoding raises.
        video.release()
    return frames


def get_image(url: str) -> Union[Image.Image, list]:
    """Load an image, or the sampled frames of a video, from a path or URL.

    A *url* containing ``"://"`` is treated as remote; anything else is
    treated as a local file path. The content type comes from the file name
    for local paths and from the ``Content-Type`` header of a HEAD request
    for remote URLs.

    Args:
        url: Local file path or remote URL of an image or video.

    Returns:
        A ``PIL.Image.Image`` for images, or the list returned by
        ``extract_frames`` for videos.

    Raises:
        ValueError: If the content type cannot be determined or is neither
            an image nor a video.
    """
    import tempfile

    is_remote = "://" in url

    # NOTE(review): verify=False disables TLS certificate verification for
    # all remote requests below; kept as in the original demo.
    if is_remote:
        # Follow redirects so the header describes the final resource
        # rather than the redirect response itself.
        content_type = requests.head(url, stream=True, verify=False, allow_redirects=True).headers.get("Content-Type")
    else:
        content_type = get_content_type(url)

    if not content_type:
        # Unknown extension or missing header: report it as a bad input
        # instead of failing with a TypeError on the membership test below.
        raise ValueError(f"Could not determine content type of {url!r}. Expected image or video.")

    if "image" in content_type:
        if is_remote:
            return Image.open(requests.get(url, stream=True, verify=False).raw)
        return Image.open(url)

    if "video" in content_type:
        if not is_remote:
            return extract_frames(url)

        # Download to a unique temporary file (a fixed name in the working
        # directory can collide between runs) and always clean it up.
        tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
        try:
            with tmp:
                tmp.write(requests.get(url, stream=True, verify=False).content)
            return extract_frames(tmp.name)
        finally:
            os.remove(tmp.name)

    raise ValueError("Invalid content type. Expected image or video.")


# ------------------- OTTER Prompt and Response Functions -------------------


def get_formatted_prompt(prompt: str) -> str:
    """Wrap *prompt* in the Otter chat template, ending at the answer marker."""
    return "<image>User: " + prompt + " GPT:<answer>"


def get_response(vision_x, prompt: str, model=None, image_processor=None, tensor_dtype=None, batch_size=2) -> str:
    """Generate the model's answer to *prompt* for the given visual input.

    Args:
        vision_x: Visual input tensor; it is cast to the dtype of the
            model's parameters and moved to the model's device.
        prompt: User question, wrapped with ``get_formatted_prompt``.
        model: Model exposing ``text_tokenizer``, ``parameters()``,
            ``device`` and ``generate``.
        image_processor: Not used by this function.
        tensor_dtype: Not used by this function; the dtype is read from the
            model's parameters instead.
        batch_size: Number of times the tokenized prompt is repeated along
            the batch dimension.

    Returns:
        The decoded text of the first generated sequence, cut down to what
        lies between ``<answer>`` and ``<|endofchunk|>`` and stripped of
        surrounding whitespace and double quotes.
    """
    text_tokenizer = model.text_tokenizer
    encoded = text_tokenizer([get_formatted_prompt(prompt)], return_tensors="pt")

    # Match the visual input to the dtype of the model's parameters.
    param_dtype = next(model.parameters()).dtype
    vision_x = vision_x.to(dtype=param_dtype)

    # Repeat the single prompt batch_size times along the batch dimension.
    input_ids = encoded["input_ids"].repeat(batch_size, 1)
    attention_mask = encoded["attention_mask"].repeat(batch_size, 1)

    # Token sequences that generation is not allowed to produce.
    banned_ids = text_tokenizer(["User:", "GPT1:", "GFT:", "GPT:"], add_special_tokens=False).input_ids

    device = model.device
    output_ids = model.generate(
        vision_x=vision_x.to(device),
        lang_x=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        max_new_tokens=512,
        num_beams=3,
        no_repeat_ngram_size=3,
        bad_words_ids=banned_ids,
    )

    # Only the first sequence of the batch is decoded.
    decoded = text_tokenizer.decode(output_ids[0])
    answer = decoded.split("<answer>")[-1].strip()
    answer = answer.split("<|endofchunk|>")[0].strip()
    return answer.strip('"')

# video_url = "/mnt/bn/ecom-govern-maxiangqian-lq/lj/data/dwq/test/test_creative/C_KT_6_0151_0235.mp4"
# prompts_input = "Why is the video as a whole comedic?"
# frames_list = get_image(video_url)
# print(f"\nPrompt: {prompts_input}")
# response = get_response(frames_list, prompts_input, model, image_processor, tensor_dtype)
# print(f"Response: {response}")

In [54]:
import json
import os
from tqdm import tqdm

def get_test_video_path(root_dir, name):
    """Return the path of test video *name* under *root_dir*.

    The sub-directory is chosen from the first letter of *name*: ``H`` maps
    to ``test_humor``, ``M`` to ``test_magic``, and anything else to
    ``test_creative``.
    """
    for prefix, subdir in (("H", "test_humor"), ("M", "test_magic")):
        if name.startswith(prefix):
            return os.path.join(root_dir, subdir, name)
    return os.path.join(root_dir, "test_creative", name)

# Load the test annotations, grouped by video, from
# /mnt/bn/ecom-govern-maxiangqian-lq/lj/data/dwq/annotation_with_ID/funqa_test_group_by_video.json
with open('/mnt/bn/ecom-govern-maxiangqian-lq/lj/data/dwq/annotation_with_ID/funqa_test_group_by_video.json', 'r') as f:
    datas = json.load(f)

for video_name, instructions in tqdm(datas.items(), total=len(datas)):
    video_url = get_test_video_path('/mnt/bn/ecom-govern-maxiangqian-lq/lj/data/dwq/test', video_name)
    print(video_url)

    # Turn the image / frame list into a 6-dimensional tensor for the model.
    frames_list = get_image(video_url)
    if isinstance(frames_list, Image.Image):
        pixel_values = image_processor.preprocess([frames_list], return_tensors="pt")["pixel_values"]
        vision_x = pixel_values.unsqueeze(1).unsqueeze(0)
    elif isinstance(frames_list, list):  # list of video frames
        pixel_values = image_processor.preprocess(frames_list, return_tensors="pt")["pixel_values"]
        vision_x = pixel_values.unsqueeze(0).unsqueeze(0)
    else:
        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")

    # Repeat the visual input along the batch dimension to match get_response.
    batch_size = 2
    vision_x = vision_x.repeat(batch_size, 1, 1, 1, 1, 1)

    for data in tqdm(instructions):
        prompts_input = data['instruction']
        task = data['task']

        # For tasks H1/C1/M1 the reference output is copied as the
        # prediction; these records are not written to the results file.
        if task in ('H1', 'C1', 'M1'):
            data['predict'] = data['output']
            continue

        print(prompts_input)
        response = get_response(vision_x, prompts_input, model, image_processor, tensor_dtype, batch_size=batch_size)
        print(response)
        data['predict'] = response

        # Append one JSON line per answered instruction.
        with open('/mnt/bn/ecom-govern-maxiangqian-lq/lj/Otter/infer_data/test_res.jsonl', 'a+') as f:
            f.write(json.dumps(data) + '\n')
            
        

  0%|          | 0/424 [00:00<?, ?it/s]

/mnt/bn/ecom-govern-maxiangqian-lq/lj/data/dwq/test/test_humor/H_A_101_1433_1631.mp4




Provide a detailed account of the video's funny moment.


Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_vars.py", line 624, in change_attr_expression
    value = eval(expression, frame.f_globals, frame.f_locals)
  File "<string>", line 1, in <module>
NameError: name 'tensor' is not defined


In a cup of water, a black and white kitten is swimming.
Explain the comedic scene depicted in the video.


 30%|███       | 6/20 [06:33<15:18, 65.64s/it]
  0%|          | 0/424 [06:34<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Interactive demo: ask for a video (or image) path, then answer prompts
# about it until the user types "quit", at which point a new path is asked for.
while True:
    video_url = input("Enter video path: ")  # Replace with the path to your video file, could be any common format.

    frames_list = get_image(video_url)

    # get_response expects an already preprocessed, batched tensor (it calls
    # vision_x.to(...)), not the raw PIL image / frame list, so build the
    # tensor here the same way the batch-inference cell does.
    if isinstance(frames_list, Image.Image):
        vision_x = image_processor.preprocess([frames_list], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
    elif isinstance(frames_list, list):  # list of video frames
        vision_x = image_processor.preprocess(frames_list, return_tensors="pt")["pixel_values"].unsqueeze(0).unsqueeze(0)
    else:
        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")

    # The visual batch must match the batch_size the prompt is repeated to.
    batch_size = 2
    vision_x = vision_x.repeat(batch_size, 1, 1, 1, 1, 1)

    while True:
        prompts_input = input("Enter prompts: ")

        if prompts_input.lower() == "quit":
            break

        print(f"\nPrompt: {prompts_input}")
        response = get_response(vision_x, prompts_input, model, image_processor, tensor_dtype, batch_size=batch_size)
        print(f"Response: {response}")

In [35]:
import torch

# Scratch check of Tensor.repeat: a random (1, 19) tensor stands in for the
# tokenized prompt ids and is tiled batch_size times along dimension 0.
batch_size = 2
new_lang_x_input_ids = torch.randn(1, 19).repeat(batch_size, 1)

In [50]:
# Display the shape of the current `vision_x` tensor (notebook cell output).
vision_x.shape

torch.Size([1, 1, 32, 3, 224, 224])