In [1]:
import os
import re
import json

import requests

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

import av

import torch
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration

def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (Sequence[int]): Frame indices to decode (list or 1-D integer
            array). Decoding is limited to the range indices[0]..indices[-1],
            so the indices are expected in ascending order. A frame index that
            appears more than once is still decoded only once.

    Returns:
        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).

    Raises:
        ValueError: If `indices` is empty, or if none of the requested frames
            could be decoded from the container.
    '''
    # A set gives O(1) membership per decoded frame instead of scanning the
    # whole index array for every frame of the video.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise ValueError('indices must contain at least one frame index')

    start_index = int(indices[0])
    end_index = int(indices[-1])

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            # Nothing past the last requested index is needed; stop decoding early.
            break
        if i >= start_index and i in wanted:
            frames.append(frame)

    if not frames:
        # np.stack([]) would fail with an unrelated-looking message; be explicit.
        raise ValueError('no frames could be decoded for the requested indices')

    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

2024-06-10 20:30:07.124048: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-10 20:30:07.592992: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-10 20:30:07.593035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
def download_video(url):
    '''
    Download a video to the fixed local path `test_vlm.mp4`.

    The target file is overwritten on every call. Any failure is reported
    with `print` rather than raised, and signalled through the return value.

    Args:
        url (str): URL of the video to download.

    Returns:
        str: Path of the downloaded file, or '' if the download failed.
    '''
    out_path = 'test_vlm.mp4'

    try:
        # With stream=True the connection stays open until the body is fully
        # consumed or the response is closed; the context manager guarantees
        # it is released even when writing fails midway.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(out_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except Exception as e:
        print(f"An error occurred: {e}")
        return ''

    return out_path

In [3]:
# Checkpoint shared by the model weights and the matching processor.
MODEL_ID = "LanguageBind/Video-LLaVA-7B-hf"

processor = VideoLlavaProcessor.from_pretrained(MODEL_ID)

# Half-precision weights; device placement is delegated to device_map='auto',
# so no explicit model.to(...) call is made here.
model = VideoLlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map='auto',
)

# Inference only: switch to eval mode (the returned model is the cell output).
model.eval()

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   9%|9         | 451M/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.81G [00:00<?, ?B/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/148 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/66.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


VideoLlavaForConditionalGeneration(
  (video_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(257, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): Q

In [4]:
# Example category names offered to the model, flattened into one
# comma-separated string.
# NOTE(review): 'Food' is listed twice - confirm whether that is intended.
tags = ', '.join((
    'Talks', 'Cartoon', 'Anime', 'Video game', 'Nature', 'Fashion', 'Travel',
    'Beauty', 'Education&science', 'Cars&motorcycles', 'Movie', 'Sport', 'Astrology',
    'Woman', 'Food', 'Animals', 'Fun', 'Dances', 'Gadgets', 'Roblox', 'Dota2', 'Fifa', 'GTA',
    'Simpsons', 'Minecraft', 'RickAndMorty', 'Genshin', 'CounterStrike',
    'Food', 'Cooking', 'Animation', 'Art', 'Asian', 'People', 'None', 'iOS', 'Android',
    'Fitness', 'Lifehacks', 'Boobs', 'Ass',
))

# Prompts asked about every video, in this order; the last one embeds `tags`.
questions = [
    'Describe what is shown in the video.',
    'Describe what happens in the video.',
    'Select one to five categories for this video. Here are some examples: ' + tags,
]

In [5]:
def descripe_video(video_url, max_download_attempts=5):
    '''
    Download a video and ask the model every prompt in `questions` about it.

    Eight frames are sampled uniformly from the video and passed, together
    with each question, to the module-level `processor` and `model`.

    Args:
        video_url (str): URL of the video to describe.
        max_download_attempts (int): How many times the download is tried
            before giving up.

    Returns:
        str: The model's answers to all questions, joined with '. '.

    Raises:
        RuntimeError: If the video cannot be downloaded within
            `max_download_attempts` attempts, or the container reports no
            video frames.
    '''
    # download_video() returns '' on failure. Retry transient errors, but stop
    # after a bounded number of attempts instead of looping forever on a dead URL.
    for _ in range(max_download_attempts):
        video_path = download_video(video_url)
        if video_path != '':
            break
    else:
        raise RuntimeError(
            f'could not download {video_url} after {max_download_attempts} attempts'
        )

    # The context manager closes the container once the frames are decoded,
    # so file handles do not accumulate across hundreds of videos.
    with av.open(video_path) as container:
        # sample uniformly 8 frames from the video
        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            # A zero frame count would make the sampling step 0 below.
            raise RuntimeError(f'no video frames reported for {video_url}')
        indices = np.arange(0, total_frames, total_frames / 8).astype(int)
        clip = read_video_pyav(container, indices)

    # Follow the model's own placement (it is loaded with device_map='auto')
    # rather than assuming a CUDA device exists.
    device = model.device

    answers = []
    for question in questions:
        prompt = f"USER: <video>{question} ASSISTANT:"

        inputs = processor(text=prompt, videos=clip, return_tensors="pt")
        for name in inputs:
            inputs[name] = inputs[name].to(device)

        # Generate
        # NOTE(review): max_length counts prompt tokens as well as generated
        # ones - confirm 256 still leaves room for the answer.
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_length=256)
        decoded = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        # The decoded text echoes the prompt; keep only what follows the
        # 'ASSISTANT:' marker, falling back to the full text if it is absent.
        parts = decoded.split('ASSISTANT:')
        reply = parts[1] if len(parts) > 1 else decoded
        answers.append(reply.strip())

    return '. '.join(answers)

In [6]:
# data = pd.read_csv('yappy_hackaton_2024_400k.csv').iloc[:10000]

In [7]:
# Load the dataset table; its `urls` column holds the video links that are
# flattened into a list in the next cell.
data = pd.read_csv(filepath_or_buffer='ЛЦТ датасет - All.csv')

In [8]:
# Flatten the `urls` column into one list of links: each cell may hold several
# whitespace-separated URLs. Missing cells (NaN) are skipped, because a float
# NaN has no .split() and would abort the whole cell.
urls = [url for line in data.urls.dropna() for url in line.split()]

In [9]:
# Sanity check: total number of video URLs collected from the dataset.
len(urls)

385

In [10]:
result = []
# One {'link', 'error'} record per video that could not be described, so
# dropped items can be inspected or retried afterwards.
failed = []
for video_url in tqdm(urls):
    try:
        description = descripe_video(video_url)
    except Exception as error:
        # Best-effort batch: one bad video must not stop the run, but the
        # failure is recorded and reported instead of vanishing silently.
        failed.append({'link': video_url, 'error': repr(error)})
        print(f'Failed to describe {video_url}: {error!r}')
        continue
    result.append({
        'link': video_url,
        'description': description
    })

  0%|          | 0/385 [00:00<?, ?it/s]

  return torch.tensor(value)


An error occurred: HTTPSConnectionPool(host='cdn-st.rutubelist.ru', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='cdn-st.rutubelist.ru', port=443): Read timed out. (read timeout=30)


In [11]:
# result

In [12]:
# Write the descriptions to disk; the `with` block closes the file so the
# JSON is fully flushed instead of relying on garbage collection.
with open('videollava-validation.json', 'w') as out_file:
    json.dump(result, out_file)

In [13]:
# Number of videos for which a description was produced (failed ones are skipped).
len(result)

384