In [1]:
import json

import av
import torch
import numpy as np
import pandas as pd
import requests

from tqdm import tqdm

from transformers import AutoProcessor, AutoModel

# Seed NumPy's global RNG: sample_frame_indices below draws its window
# position from np.random.randint, so this makes that sampling repeatable.
np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    Frames are returned in decode order. An index that occurs several times in
    `indices` yields the corresponding frame that many times, so the output has
    one entry per requested index that exists in the stream (this matters when
    a short video is sampled with repeated indices).

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        IndexError: if `indices` is empty.
        ValueError: if none of the requested indices could be decoded.
    '''
    # Map each requested index to how many times it was requested. A dict gives
    # O(1) lookups per decoded frame and keeps track of duplicates.
    wanted = {}
    for idx in indices:
        idx = int(idx)
        wanted[idx] = wanted.get(idx, 0) + 1
    if not wanted:
        raise IndexError("indices must contain at least one frame index")

    # Decoding can stop as soon as the largest requested index has been seen.
    last_index = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_index:
            break
        repeats = wanted.get(i, 0)
        if repeats:
            frames.extend([frame.to_ndarray(format="rgb24")] * repeats)

    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Choose `clip_len` frame indices from a randomly positioned window.

    The window spans `clip_len * frame_sample_rate` frames; its right edge is
    drawn from NumPy's global random state.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    window = int(clip_len * frame_sample_rate)
    # Right edge of the window (exclusive upper bound is seg_len).
    last = np.random.randint(window, seg_len)
    first = last - window
    # Evenly spaced positions across the window, kept strictly below `last`.
    positions = np.linspace(first, last, num=clip_len)
    return np.clip(positions, first, last - 1).astype(np.int64)

2024-06-12 09:33:29.102742: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-12 09:33:29.614882: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-12 09:33:29.614925: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
def download_video(url, out_path='test_vlm.mp4'):
    '''
    Download `url` to a local file, streaming the body in chunks.

    Args:
        url (`str`): Address of the video to fetch.
        out_path (`str`, *optional*): Destination file. Defaults to
            'test_vlm.mp4'; the file is overwritten on every call.
    Returns:
        `str`: `out_path` on success, or '' on any failure. Errors are printed
        rather than raised, and a partially written file may be left behind.
    '''
    try:
        # Using the response as a context manager releases the pooled
        # connection even when the status check or the write fails; with
        # stream=True it would otherwise stay open.
        with requests.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()
            with open(out_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except Exception as e:
        # Best-effort by design: callers test for the '' sentinel.
        print(f"An error occurred: {e}")
        return ''

    return out_path

In [3]:
# Load the processor and model from the local "finetuned-xclip-base-patch16"
# checkpoint and move the model to the GPU.
processor = AutoProcessor.from_pretrained("finetuned-xclip-base-patch16")
model = AutoModel.from_pretrained("finetuned-xclip-base-patch16")
model.to('cuda')

# Bias-free linear layer mapping 512 input features to 1024 output features.
projector = torch.nn.Linear(512, 1024, bias=False)
# map_location='cpu' makes loading independent of the device the weights were
# saved from; load_state_dict copies the values and .to('cuda') moves the layer.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
projector.load_state_dict(torch.load('projector.pth', map_location='cpu'))
projector.to('cuda')

# Switch both modules to inference mode. The last expression is kept as-is so
# the cell still displays the projector's repr.
model.eval()
projector.eval()

Linear(in_features=512, out_features=1024, bias=False)

In [4]:
# data = pd.read_csv('yappy_hackaton_2024_400k.csv').iloc[:1000]

In [5]:
# Load the dataset CSV; its `urls` column is what the following cell reads.
data = pd.read_csv('ЛЦТ датасет - All.csv')

In [6]:
# Every entry of the `urls` column is split on whitespace and the pieces are
# collected, in order, into one flat list.
urls = [url for row in data.urls for url in row.split()]

In [7]:
def get_vidio_features(video_url, max_retries=5):
    '''
    Download a video and compute its projected video embedding.

    Args:
        video_url (`str`): Address of the video to embed.
        max_retries (`int`, *optional*): Number of download attempts before
            giving up. Defaults to 5.
    Returns:
        `List[float]`: `projector` applied to the model's `video_embeds`,
        first batch element, converted to a plain Python list.
    Raises:
        RuntimeError: if the download did not succeed within `max_retries` attempts.
        ValueError: if the first video stream reports no frames.
    '''
    # download_video returns '' on failure. Bound the retries so a permanently
    # unavailable URL cannot stall the whole run.
    video_path = ''
    for _ in range(max_retries):
        video_path = download_video(video_url)
        if video_path != '':
            break
    else:
        raise RuntimeError(
            f"could not download {video_url} after {max_retries} attempts"
        )

    # The with-block closes the container (and its file handle) once the
    # frames have been decoded.
    with av.open(video_path) as container:
        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            raise ValueError(f"no frames reported for {video_url}")

        # sample uniformly 8 frames from the video
        indices = np.arange(0, total_frames, total_frames / 8).astype(int)
        clip = read_video_pyav(container, indices)

    inputs = processor(text='', videos=list(clip), return_tensors="pt")
    for i in inputs:
        inputs[i] = inputs[i].to('cuda')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        video_features = projector(model(**inputs).video_embeds)[0].tolist()
    return video_features

In [8]:
result = []
# (url, error) pairs for videos that could not be embedded, so that skipped
# items can be inspected or retried after the run.
failed_urls = []
for video_url in tqdm(urls):
    try:
        video_features = get_vidio_features(video_url)
    except Exception as e:
        # Keep processing the remaining videos, but leave a trace of the failure
        # instead of dropping it silently.
        failed_urls.append((video_url, repr(e)))
        print(f"Skipping {video_url}: {e!r}")
        continue
    result.append({
        'link': video_url,
        'video_embedding': video_features
    })

  return torch.tensor(value)
  1%|██                                                                                                                                                                                                     | 4/385 [00:03<05:29,  1.16it/s]

An error occurred: HTTPSConnectionPool(host='cdn-st.rutubelist.ru', port=443): Read timed out. (read timeout=300)


 25%|████████████████████████████████████████████████▊                                                                                                                                                     | 95/385 [07:18<06:59,  1.45s/it]

An error occurred: HTTPSConnectionPool(host='cdn-st.rutubelist.ru', port=443): Read timed out. (read timeout=300)


 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 227/385 [14:46<03:35,  1.36s/it]

An error occurred: HTTPSConnectionPool(host='cdn-st.rutubelist.ru', port=443): Read timed out. (read timeout=300)


 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 266/385 [20:35<03:00,  1.51s/it]

An error occurred: HTTPSConnectionPool(host='cdn-st.rutubelist.ru', port=443): Read timed out. (read timeout=300)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [27:49<00:00,  4.34s/it]


In [9]:
# json.dump(result, open('xclip.json', 'w'))

In [10]:
# Write the collected embeddings to disk. The with-block flushes and closes the
# file deterministically instead of relying on garbage collection.
with open('finetuned-xclip-validation.json', 'w') as out_file:
    json.dump(result, out_file)

In [11]:
# Sanity check: length of the embedding list stored for the first result entry.
len(result[0]['video_embedding'])

1024

In [12]:
# Number of entries collected in `result` (videos that raised were skipped).
len(result)

384