In [32]:
!pip install av==11.0.0 decord sentence_transformers faiss-cpu -q

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sentence_transformers import InputExample, losses, SentenceTransformer
import random
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import faiss
from decord import VideoReader
import numpy as np

In [34]:
with open('/home/jovyan/lost+found/results_2.json', 'r', encoding='utf-8') as f:
    data2 = json.load(f)

In [35]:
train_data_cat = pd.read_csv("/home/jovyan/lost+found/train_data_categories.csv")
IAB_tags = pd.read_csv("/home/jovyan/lost+found/IAB_tags.csv")

In [36]:
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len - converted_len <= 0:
        start_idx = 0
        end_idx = seg_len
    else:
        end_idx = np.random.randint(converted_len, seg_len)
        start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx - 1, num=clip_len)
    indices = np.clip(indices, 0, seg_len - 1).astype(np.int64)
    return indices

In [37]:
from transformers import XCLIPProcessor, XCLIPVisionModel

model_name = "microsoft/xclip-base-patch16-zero-shot" # microsoft/xclip-base-patch32

processor = XCLIPProcessor.from_pretrained(model_name)
video_model = XCLIPVisionModel.from_pretrained(model_name).to('cuda')



In [38]:
class CrossAttention(nn.Module):
    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., attn_head_dim=None, out_dim=None):
        super().__init__()
        if out_dim is None:
            out_dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5
        assert all_head_dim == dim

        self.q = nn.Linear(dim, all_head_dim, bias=False)
        self.k = nn.Linear(dim, all_head_dim, bias=False)
        self.v = nn.Linear(dim, all_head_dim, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.k_bias = None
            self.v_bias = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, out_dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, k=None, v=None):
        B, N, C = x.shape
        N_k = k.shape[1]
        N_v = v.shape[1]

        q_bias, k_bias, v_bias = None, None, None
        if self.q_bias is not None:
            q_bias = self.q_bias
            k_bias = self.k_bias
            v_bias = self.v_bias

        q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
        q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)  # (B, N_head, N_q, dim)

        k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
        k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)

        v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
        v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))  # (B, N_head, N_q, N_k)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x

In [39]:
data = train_data_cat
taxonomy = IAB_tags

In [40]:
text_model = SentenceTransformer('intfloat/multilingual-e5-large-instruct').to('cuda')
dim = 1024 # размер вектора эмбеддинга

In [41]:
audio_df = pd.DataFrame(list(data2.items()), columns=['filename', 'transcription'])
audio_df['filename'] = audio_df['filename'].apply(lambda l: l.split('.')[0])

In [42]:
import os

directory = '/home/jovyan/lost+found/videos_2/'

# Получаем список всех файлов и папок в директории
files = os.listdir(directory)

# Фильтруем, оставляя только файлы (если нужно)
files = [f[:-4] for f in files if os.path.isfile(os.path.join(directory, f))]

In [43]:
print(audio_df.shape)
is_cool = [i in files for i in audio_df.filename.tolist()]
audio_df = audio_df[is_cool]
print(audio_df.shape)

(1049, 2)
(934, 2)


In [44]:
print(data.shape)
is_cool = [i in files for i in data.video_id.tolist()]
data = data[is_cool]
print(data.shape)

(1049, 4)
(934, 4)


In [45]:
data = pd.merge(data.reset_index(drop=True), audio_df.reset_index(drop=True), how='left', left_on='video_id', right_on='filename')
data = data.drop('filename', axis=1)

In [46]:
data.sample(2)

Unnamed: 0,video_id,title,description,tags,transcription
222,4354a1ad8bf75f42466420f4b52dcbcd,Артмеханика. Концерт группы Диктофон.,Концерт группы Диктофон.,"Массовая культура, Карьера, События и достопри...",Woo!
502,a8033ba9f69362b009d852b2a9d76749,Три лошадиные силы I Выпуск №14,В этом выпуске парни помогают Деду Морозу разв...,"Транспорт, Массовая культура","Дома, что ли?"


In [47]:
audio_model = SentenceTransformer('intfloat/multilingual-e5-large-instruct').to('cuda')
dim = 1024 # размер вектора эмбеддинга

In [48]:
template = [(isinstance(i, str)) for i in  data["tags"].tolist()]
data = data[template]

In [49]:
title_description = []
tag_list = []
video_ids = []
audio_text = []

for i in data.iterrows():
  tags = i[1].tags.split(", ")
  title = i[1].title
  description = i[1].description
  video_id = i[1].video_id
  for j in range(len(tags)):
    tag_list.append(tags[j])
    title_description.append("Title: " + title + ". Description: " + description)
    video_ids.append(video_id)
    audio_text.append(i[1].transcription)

df_work = pd.DataFrame({"video_id": video_ids, "title_description": title_description,
                        "audio_text": audio_text, "tag_list": tag_list})

In [50]:
#df_work['title_description_vector'] = df_work['title_description'].apply(lambda l: text_model.encode(l, convert_to_tensor=True).cpu().numpy())

In [51]:
#df_work['audio_text_vector'] = df_work['audio_text'].apply(lambda l: audio_model.encode(l, convert_to_tensor=True).cpu().numpy())

In [52]:
train_df, test_df = train_test_split(df_work, test_size=0.2, random_state=42)

In [55]:
class FusionEmbedModel(nn.Module):
  def __init__(self, video_emb_size, text_emb_size, audio_text_emb_size,
               out_emb_size=768, num_heads=8):
    super().__init__()
    self.description_title_video_cross_attn = CrossAttention(
            dim=text_emb_size,
            num_heads=num_heads,
            out_dim=text_emb_size
        )

    self.audio_text_video_cross_attn = CrossAttention(
        
        dim=audio_text_emb_size,
        num_heads=num_heads,
        out_dim=audio_text_emb_size
    )

    total_emb_size = text_emb_size + audio_text_emb_size
    self.linear_video_proj = nn.Linear(video_emb_size, text_emb_size)
    self.out_proj = nn.Linear(total_emb_size, out_emb_size)

  def forward(self, description_title_emb, audio_text_emb, video_emb):
        # description_title_emb: (B, N_desc, C_text)
        # audio_text_emb: (B, N_audio, C_audio_text)
        # video_emb: (B, N_video, C_video)

        video_emb = self.linear_video_proj(video_emb)

        description_title_video_attn = self.description_title_video_cross_attn(
            x=description_title_emb, k=video_emb, v=video_emb
        )  # (B, N_desc, C_text)

        audio_text_video_attn = self.audio_text_video_cross_attn(
            x=audio_text_emb, k=video_emb, v=video_emb
        )  # (B, N_audio, C_audio_text)

        # average pooling -> bottleneck (we can try different strategies)
        description_title_attn_pooled = description_title_video_attn.mean(dim=1)  # (B, C_text)
        audio_text_attn_pooled = audio_text_video_attn.mean(dim=1)    # (B, C_audio_text)

        cat_embs = torch.cat([
            description_title_attn_pooled,
            audio_text_attn_pooled
        ], dim=-1)  # (B, total_emb_size)

        universal_emb = self.out_proj(cat_embs)  # (B, out_emb_size)
        return universal_emb

In [56]:
video_model.eval()
video_embs_dict = dict()
video_embs = []
for i in train_df.iterrows():
  # видео эмбеддинги засовываем в train_df в новую колонку
  video_path = "/home/jovyan/lost+found/videos_2/" + str(i[1].video_id) + ".mp4" # поменять на директорию, где хранятся видео, типо video_id = "folder/" + video_id
  if video_path in video_embs_dict:
    video_embs.append(video_embs_dict[video_path])
  else:
    vr = VideoReader(video_path)
    seg_len = len(vr)
    clip_len = 32 # для конкретной модели (microsoft/xclip-base-patch16-zero-shot)
    frame_sample_rate = 1
    indices = sample_frame_indices(clip_len=clip_len, frame_sample_rate=frame_sample_rate, seg_len=seg_len)
    video = vr.get_batch(indices).asnumpy()  # (clip_len, H, W, C)
    inputs = processor(videos=list(video), return_tensors="pt", padding=True).to('cuda')
    pixel_values = inputs['pixel_values']
    batch_size, num_frames, channels, height, width = pixel_values.shape
    pixel_values = pixel_values.view(-1, channels, height, width)  # [batch_size * num_frames, channels, height, width]
    inputs['pixel_values'] = pixel_values.to('cuda')
    with torch.no_grad():
      outputs = video_model(**inputs)
    frame_embeddings = outputs.last_hidden_state[:, 0, :]
    frame_embeddings = frame_embeddings.view(batch_size, num_frames, -1)
    # average pooling over frame_embeddings
    video_embedding = frame_embeddings.mean(dim=1) # (batch_size, emb_size) # надеюсь emb_size=1024 иначе еще линейный слой
    video_embs.append(video_embedding)
    video_embs_dict[video_path] = video_embedding

  return self.preprocess(images, **kwargs)


In [57]:
train_df['video_emb'] = [i.cpu().detach() for i in video_embs]

In [58]:
train_df.sample(2)

Unnamed: 0,video_id,title_description,audio_text,tag_list,video_emb
1237,9d37073c63008c784bf95a8a4d1715d5,Title: Punch Box. Серия 1. Пельмень vs Керам. ...,"Первый кулак России, который сделан нокаут в т...",Массовая культура,"[[tensor(-0.1769), tensor(0.2236), tensor(0.00..."
1107,cb8f4c48fd10af6768c3a89a5f9ab2f7,Title: Команда 3/21 в ГрандТуре «Байкальская м...,Погнали,Путешествия,"[[tensor(-0.2604), tensor(-0.0189), tensor(0.0..."


In [59]:
general_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large-instruct')

In [62]:
target_model = SentenceTransformer('intfloat/multilingual-e5-large-instruct').to('cuda')
dim = 1024 # размерность эмбеддинга

In [95]:
fusion_model = FusionEmbedModel(video_emb_size=768,
                                text_emb_size=1024,
                                audio_text_emb_size=1024,
                                out_emb_size=1024, num_heads=8).to('cuda')

In [98]:
fusion_optimizer = torch.optim.AdamW(fusion_model.parameters(), lr=3e-4)
target_optimizer = torch.optim.AdamW(target_model.parameters(), lr=3e-4)
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)

class TripletDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.reset_index(drop=True)
        self.classes = dataframe['tag_list'].unique()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        anchor_class = self.data.iloc[idx]['tag_list']

        positive_text = self.data.iloc[idx]['title_description']
        positive_audio = self.data.iloc[idx]['audio_text']
        positive_video = self.data.iloc[idx]['video_emb']

        positive_samples = self.data[(self.data['tag_list'] == anchor_class) & (self.data.index != idx)]
        if len(positive_samples) > 0:
            anchor_sample = positive_samples.sample(1).iloc[0]
        else:
            anchor_sample = self.data.iloc[idx]

        anchor_text = anchor_sample['title_description']
        anchor_audio = anchor_sample['audio_text']
        anchor_video = anchor_sample['video_emb']

        negative_samples = self.data[self.data['tag_list'] != anchor_class]
        negative_sample = negative_samples.sample(1).iloc[0]
        negative_text = negative_sample['title_description']
        negative_audio = negative_sample['audio_text']
        negative_video = negative_sample['video_emb']

        return (
            anchor_text, anchor_audio, anchor_video,
            positive_text, positive_audio, positive_video,
            negative_text, negative_audio, negative_video
        )



train_dataset = TripletDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

def tokenize_batch(texts):
    tokens = general_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    return {key: value.to('cuda') for key, value in tokens.items()}

def train_triplet_loss_model(fusion_model, target_model, train_loader,
                             fusion_optimizer, target_optimizer, epochs=2):
    fusion_model.train()
    target_model.train()
    text_model.train()
    audio_model.train()
    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for batch in tqdm(train_loader):
            (anchor_text, anchor_audio, anchor_video,
             positive_text, positive_audio, positive_video,
             negative_text, negative_audio, negative_video) = batch

            anchor_text_embs = text_model.encode(anchor_text, convert_to_tensor=True).unsqueeze(1)
            anchor_audio_embs = audio_model.encode(anchor_audio, convert_to_tensor=True).unsqueeze(1)
            anchor_video_embs = anchor_video.to('cuda')

            anchor_embeddings = fusion_model(
                description_title_emb=anchor_text_embs,
                audio_text_emb=anchor_audio_embs,
                video_emb=anchor_video_embs
            )

            positive_text_embs = text_model.encode(positive_text, convert_to_tensor=True).unsqueeze(1)
            positive_audio_embs = audio_model.encode(positive_audio, convert_to_tensor=True).unsqueeze(1)
            positive_video_embs = positive_video.to('cuda')

            positive_embeddings = fusion_model(
                description_title_emb=positive_text_embs,
                audio_text_emb=positive_audio_embs,
                video_emb=positive_video_embs
            )

            negative_text_embs = text_model.encode(negative_text, convert_to_tensor=True).unsqueeze(1)
            negative_audio_embs = audio_model.encode(negative_audio, convert_to_tensor=True).unsqueeze(1)
            negative_video_embs = negative_video.to('cuda')

            negative_embeddings = fusion_model(
                description_title_emb=negative_text_embs,
                audio_text_emb=negative_audio_embs,
                video_emb=negative_video_embs
            )

            loss = triplet_loss(anchor_embeddings, positive_embeddings, negative_embeddings)

            fusion_optimizer.zero_grad()
            target_optimizer.zero_grad()
            loss.backward()
            fusion_optimizer.step()
            target_optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')



train_triplet_loss_model(fusion_model=fusion_model, target_model=target_model,
                         train_loader=train_loader, fusion_optimizer=fusion_optimizer,
                         target_optimizer=target_optimizer, epochs=5)

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/71 [00:00<?, ?it/s][A
  1%|▏         | 1/71 [00:00<00:34,  2.03it/s][A
  3%|▎         | 2/71 [00:00<00:31,  2.21it/s][A
  4%|▍         | 3/71 [00:01<00:34,  2.00it/s][A
  6%|▌         | 4/71 [00:01<00:32,  2.03it/s][A
  7%|▋         | 5/71 [00:02<00:29,  2.25it/s][A
  8%|▊         | 6/71 [00:02<00:28,  2.26it/s][A
 10%|▉         | 7/71 [00:03<00:28,  2.24it/s][A
 11%|█▏        | 8/71 [00:03<00:26,  2.38it/s][A
 13%|█▎        | 9/71 [00:04<00:27,  2.26it/s][A
 14%|█▍        | 10/71 [00:04<00:27,  2.25it/s][A
 15%|█▌        | 11/71 [00:04<00:26,  2.23it/s][A
 17%|█▋        | 12/71 [00:05<00:25,  2.35it/s][A
 18%|█▊        | 13/71 [00:05<00:25,  2.25it/s][A
 20%|█▉        | 14/71 [00:06<00:24,  2.36it/s][A
 21%|██        | 15/71 [00:06<00:23,  2.35it/s][A
 23%|██▎       | 16/71 [00:07<00:23,  2.37it/s][A
 24%|██▍       | 17/71 [00:07<00:24,  2.24it/s][A
 25%|██▌       | 18/71 [00:07<00:23,  2.30it/s][A
 27%|██▋  

Epoch 1, Loss: 0.6722388042950295



  0%|          | 0/71 [00:00<?, ?it/s][A
  1%|▏         | 1/71 [00:00<00:35,  1.95it/s][A
  3%|▎         | 2/71 [00:00<00:30,  2.27it/s][A
  4%|▍         | 3/71 [00:01<00:32,  2.08it/s][A
  6%|▌         | 4/71 [00:01<00:32,  2.08it/s][A
  7%|▋         | 5/71 [00:02<00:32,  2.05it/s][A
  8%|▊         | 6/71 [00:02<00:31,  2.05it/s][A
 10%|▉         | 7/71 [00:03<00:29,  2.14it/s][A
 11%|█▏        | 8/71 [00:03<00:31,  2.01it/s][A
 13%|█▎        | 9/71 [00:04<00:28,  2.16it/s][A
 14%|█▍        | 10/71 [00:04<00:28,  2.16it/s][A
 15%|█▌        | 11/71 [00:05<00:28,  2.14it/s][A
 17%|█▋        | 12/71 [00:05<00:27,  2.14it/s][A
 18%|█▊        | 13/71 [00:06<00:25,  2.30it/s][A
 20%|█▉        | 14/71 [00:06<00:25,  2.27it/s][A
 21%|██        | 15/71 [00:06<00:24,  2.30it/s][A
 23%|██▎       | 16/71 [00:07<00:22,  2.46it/s][A
 24%|██▍       | 17/71 [00:07<00:24,  2.20it/s][A
 25%|██▌       | 18/71 [00:08<00:25,  2.11it/s][A
 27%|██▋       | 19/71 [00:08<00:23,  2.19it/s]

Epoch 2, Loss: 0.6214179582369159



  0%|          | 0/71 [00:00<?, ?it/s][A
  1%|▏         | 1/71 [00:00<00:26,  2.68it/s][A
  3%|▎         | 2/71 [00:00<00:34,  1.98it/s][A
  4%|▍         | 3/71 [00:01<00:32,  2.12it/s][A
  6%|▌         | 4/71 [00:01<00:33,  1.99it/s][A
  7%|▋         | 5/71 [00:02<00:34,  1.90it/s][A
  8%|▊         | 6/71 [00:02<00:31,  2.05it/s][A
 10%|▉         | 7/71 [00:03<00:28,  2.26it/s][A
 11%|█▏        | 8/71 [00:03<00:28,  2.20it/s][A
 13%|█▎        | 9/71 [00:04<00:29,  2.13it/s][A
 14%|█▍        | 10/71 [00:04<00:29,  2.07it/s][A
 15%|█▌        | 11/71 [00:05<00:29,  2.03it/s][A
 17%|█▋        | 12/71 [00:05<00:29,  2.01it/s][A
 18%|█▊        | 13/71 [00:06<00:27,  2.10it/s][A
 20%|█▉        | 14/71 [00:06<00:27,  2.06it/s][A
 21%|██        | 15/71 [00:07<00:27,  2.04it/s][A
 23%|██▎       | 16/71 [00:07<00:25,  2.14it/s][A
 24%|██▍       | 17/71 [00:08<00:26,  2.00it/s][A
 25%|██▌       | 18/71 [00:08<00:25,  2.11it/s][A
 27%|██▋       | 19/71 [00:09<00:23,  2.23it/s]

Epoch 3, Loss: 0.605287535929344



  0%|          | 0/71 [00:00<?, ?it/s][A
  1%|▏         | 1/71 [00:00<00:34,  2.03it/s][A
  3%|▎         | 2/71 [00:00<00:29,  2.37it/s][A
  4%|▍         | 3/71 [00:01<00:30,  2.20it/s][A
  6%|▌         | 4/71 [00:01<00:31,  2.15it/s][A
  7%|▋         | 5/71 [00:02<00:31,  2.09it/s][A
  8%|▊         | 6/71 [00:02<00:28,  2.25it/s][A
 10%|▉         | 7/71 [00:03<00:27,  2.33it/s][A
 11%|█▏        | 8/71 [00:03<00:26,  2.36it/s][A
 13%|█▎        | 9/71 [00:03<00:24,  2.51it/s][A
 14%|█▍        | 10/71 [00:04<00:23,  2.62it/s][A
 15%|█▌        | 11/71 [00:04<00:23,  2.60it/s][A
 17%|█▋        | 12/71 [00:05<00:23,  2.56it/s][A
 18%|█▊        | 13/71 [00:05<00:23,  2.49it/s][A
 20%|█▉        | 14/71 [00:05<00:23,  2.38it/s][A
 21%|██        | 15/71 [00:06<00:23,  2.40it/s][A
 23%|██▎       | 16/71 [00:06<00:23,  2.37it/s][A
 24%|██▍       | 17/71 [00:07<00:22,  2.40it/s][A
 25%|██▌       | 18/71 [00:07<00:22,  2.39it/s][A
 27%|██▋       | 19/71 [00:07<00:21,  2.48it/s]

Epoch 4, Loss: 0.5748615367731578



  0%|          | 0/71 [00:00<?, ?it/s][A
  1%|▏         | 1/71 [00:00<00:33,  2.07it/s][A
  3%|▎         | 2/71 [00:00<00:26,  2.59it/s][A
  4%|▍         | 3/71 [00:01<00:31,  2.19it/s][A
  6%|▌         | 4/71 [00:01<00:30,  2.20it/s][A
  7%|▋         | 5/71 [00:02<00:28,  2.35it/s][A
  8%|▊         | 6/71 [00:02<00:26,  2.44it/s][A
 10%|▉         | 7/71 [00:02<00:26,  2.43it/s][A
 11%|█▏        | 8/71 [00:03<00:26,  2.38it/s][A
 13%|█▎        | 9/71 [00:03<00:26,  2.32it/s][A
 14%|█▍        | 10/71 [00:04<00:25,  2.41it/s][A
 15%|█▌        | 11/71 [00:04<00:25,  2.37it/s][A
 17%|█▋        | 12/71 [00:05<00:24,  2.38it/s][A
 18%|█▊        | 13/71 [00:05<00:27,  2.13it/s][A
 20%|█▉        | 14/71 [00:06<00:25,  2.23it/s][A
 21%|██        | 15/71 [00:06<00:26,  2.13it/s][A
 23%|██▎       | 16/71 [00:07<00:26,  2.10it/s][A
 24%|██▍       | 17/71 [00:07<00:26,  2.07it/s][A
 25%|██▌       | 18/71 [00:08<00:25,  2.08it/s][A
 27%|██▋       | 19/71 [00:08<00:23,  2.25it/s]

Epoch 5, Loss: 0.5704924837804176



  0%|          | 0/71 [00:00<?, ?it/s][A
  1%|▏         | 1/71 [00:00<00:37,  1.88it/s][A
  3%|▎         | 2/71 [00:00<00:31,  2.19it/s][A
  4%|▍         | 3/71 [00:01<00:32,  2.11it/s][A
  6%|▌         | 4/71 [00:01<00:32,  2.08it/s][A
  7%|▋         | 5/71 [00:02<00:27,  2.39it/s][A
  8%|▊         | 6/71 [00:02<00:27,  2.32it/s][A
 10%|▉         | 7/71 [00:03<00:29,  2.19it/s][A
 11%|█▏        | 8/71 [00:03<00:29,  2.16it/s][A
 13%|█▎        | 9/71 [00:03<00:25,  2.42it/s][A
 14%|█▍        | 10/71 [00:04<00:26,  2.35it/s][A
 15%|█▌        | 11/71 [00:04<00:24,  2.45it/s][A
 17%|█▋        | 12/71 [00:05<00:22,  2.60it/s][A
 18%|█▊        | 13/71 [00:05<00:22,  2.63it/s][A
 20%|█▉        | 14/71 [00:05<00:23,  2.47it/s][A
 21%|██        | 15/71 [00:06<00:24,  2.30it/s][A
 23%|██▎       | 16/71 [00:07<00:26,  2.10it/s][A
 24%|██▍       | 17/71 [00:07<00:25,  2.15it/s][A
 25%|██▌       | 18/71 [00:07<00:25,  2.11it/s][A
 27%|██▋       | 19/71 [00:08<00:24,  2.16it/s]

KeyboardInterrupt: 

In [99]:
# METRICS
import pandas as pd
import argparse
import ast
import numpy as np

def iou_metric(ground_truth, predictions):
    iou =  len(set.intersection(set(ground_truth), set(predictions)))
    iou = iou/(len(set(ground_truth).union(set(predictions))))
    return iou

def split_tags(tag_list):
    final_tag_list = []
    for tag in tag_list:
        tags = tag.split(": ")
        if len(tags) == 3:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
            final_tag_list.append(tags[0]+ ": " + tags[1] + ": " + tags[2])
        elif len(tags) == 2:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
        elif len(tags) == 1:
            final_tag_list.append(tags[0])
        else:
            print("NOT IMPLEMENTED!!!!", tag)
    return final_tag_list

def find_iou_for_sample_submission(pred_submission, true_submission):
    ground_truth_df = true_submission
    ground_truth_df["tag_list"] = ground_truth_df["tag_list"].str.split(', ')
    ground_truth_df["tags_split"] = ground_truth_df["tag_list"].apply(lambda l: split_tags(l))

    predictions_df = pred_submission
#     predictions_df["predicted_tags"] = predictions_df["predicted_tags"].apply(ast.literal_eval
    predictions_df["predicted_tags_split"] = predictions_df["predicted_tags"] = predictions_df["predicted_tags"].apply(
                                                                lambda l: split_tags(l) if not isinstance(l, float) else l
                                                )

    iou=0
    counter = 0
    for i, row in ground_truth_df.iterrows():
        predicted_tags = predictions_df[predictions_df["video_id"]==row["video_id"]]["predicted_tags_split"].values[0]
        iou_temp=iou_metric(row['tags_split'], predicted_tags)
        iou+=iou_temp
        counter+=1

    return iou/counter

In [66]:
# INFERENCE (по test_df)

In [67]:
video_model.eval()
video_embs_dict_test = dict()
video_embs_test = []
for i in test_df.iterrows():
  # видео эмбеддинги засовываем в test_df в новую колонку
  video_path = "/home/jovyan/lost+found/videos_2/" + str(i[1].video_id) + ".mp4" # поменять на директорию, где хранятся видео, типо video_id = "folder/" + video_id
  if video_path in video_embs_dict_test:
    video_embs_test.append(video_embs_dict_test[video_path])
  else:
    vr = VideoReader(video_path)
    seg_len = len(vr)
    clip_len = 32 # для конкретной модели (microsoft/xclip-base-patch16-zero-shot)
    frame_sample_rate = 1
    indices = sample_frame_indices(clip_len=clip_len, frame_sample_rate=frame_sample_rate, seg_len=seg_len)
    video = vr.get_batch(indices).asnumpy()  # (clip_len, H, W, C)
    inputs = processor(videos=list(video), return_tensors="pt", padding=True).to('cuda')
    pixel_values = inputs['pixel_values']
    batch_size, num_frames, channels, height, width = pixel_values.shape
    pixel_values = pixel_values.view(-1, channels, height, width)  # [batch_size * num_frames, channels, height, width]
    inputs['pixel_values'] = pixel_values.to('cuda')
    with torch.no_grad():
      outputs = video_model(**inputs)
    frame_embeddings = outputs.last_hidden_state[:, 0, :]
    frame_embeddings = frame_embeddings.view(batch_size, num_frames, -1)
    # average pooling over frame_embeddings
    video_embedding = frame_embeddings.mean(dim=1) # (batch_size, emb_size) # надеюсь emb_size=1024 иначе еще линейный слой
    video_embs_test.append(video_embedding)
    video_embs_dict_test[video_path] = video_embedding

  return self.preprocess(images, **kwargs)


In [78]:
test_df.sample(2)

Unnamed: 0,video_id,title_description,audio_text,tag_list,video_emb,title_description_vector,audio_text_vector,common_vector
377,a40df9cd5c1517f478b458d74081ab12,Title: Один выходной | Сезон 2 | Выпуск 1 | Ск...,"Можно все, вообще нет правила и в этом кайф. Т...",События и достопримечательности: Активный отдых,"[[tensor(-0.2840), tensor(-0.0927), tensor(-0....","[0.02019478, 0.021183835, -0.028113348, -0.039...","[0.0122921225, -0.0072347517, -0.012975653, -0...","[-0.07921195, -0.041533127, -0.03266341, -0.11..."
1175,fbbd10e0e69381ef8d56896392eef543,Title: Тот Самый Мент I Выпуск 74 I Начальник ...,Присаживайтесь в Москве,Массовая культура: Юмор и сатира,"[[tensor(-0.3112), tensor(0.0429), tensor(-0.0...","[0.025576632, 0.012932097, -0.018839719, -0.06...","[0.019080197, 0.01883682, -0.023289552, -0.045...","[-0.084199436, -0.07979075, -0.09679501, -0.11..."


In [100]:
test_df['video_emb'] = [i.cpu().detach() for i in video_embs_test]

In [101]:
test_df['title_description_vector'] = test_df['title_description'].apply(lambda l: text_model.encode(l, convert_to_tensor=True).cpu().numpy())

In [102]:
test_df['audio_text_vector'] = test_df['audio_text'].apply(lambda l: audio_model.encode(l, convert_to_tensor=True).cpu().numpy())

In [103]:
def compute_common_vector(l):
    description_title_emb = torch.tensor(l['title_description_vector']).unsqueeze(0).unsqueeze(1).to('cuda')  # (1, 1, emb_dim)
    audio_text_emb = torch.tensor(l['audio_text_vector']).unsqueeze(0).unsqueeze(1).to('cuda')  # (1, 1, emb_dim)
    video_emb = torch.tensor(l['video_emb']).unsqueeze(0).to('cuda')  # (1, video_emb_dim)

    with torch.no_grad():
        common_vector = fusion_model(
            description_title_emb=description_title_emb,
            audio_text_emb=audio_text_emb,
            video_emb=video_emb
        )  # (1, output_dim)

    return common_vector.cpu().numpy()[0]

test_df['common_vector'] = test_df.apply(compute_common_vector, axis=1)

  video_emb = torch.tensor(l['video_emb']).unsqueeze(0).to('cuda')  # (1, video_emb_dim)


In [104]:
def get_tags():
    target_model.eval()
    tags = {}
    with torch.no_grad():
        for i, row in tqdm(taxonomy.iterrows()):
            if isinstance(row['Уровень 1 (iab)'], str):
                tags[row['Уровень 1 (iab)']] = target_model.encode(row['Уровень 1 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
            if isinstance(row['Уровень 2 (iab)'], str):
                tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']] = target_model.encode(row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
            if isinstance(row['Уровень 3 (iab)'], str):
                tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)']] = target_model.encode(row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
        return tags

tags = get_tags()
tags_list = list(tags.keys())
vectors = np.array(list(tags.values()))

611it [00:16, 37.24it/s]


In [105]:
index = faiss.index_factory(dim, "Flat", faiss.METRIC_INNER_PRODUCT)
print(index.ntotal)
index.add(vectors)
print(index.ntotal)

0
610


In [83]:
test_df.sample(2)

Unnamed: 0,video_id,title_description,audio_text,tag_list,video_emb,title_description_vector,audio_text_vector,common_vector
1030,8a762b2f22accc32159926c8054fd8e9,Title: location | Pompeya - Odelay. Descriptio...,Маленький,Музыка и аудио: Разное (Музыка и аудио),"[[tensor(-0.2467), tensor(0.1080), tensor(-0.0...","[0.010618988, 0.018526142, -0.019410837, -0.04...","[-0.0023610704, 0.03212811, -0.014340383, -0.0...","[-0.09672257, -0.027643707, -0.037916858, -0.0..."
819,78002eb306ae22e44147464bc45842f8,Title: Много половин. Акмаль и его девушка Саш...,Сложно не заметить легендарную кредитную карту...,Карьера,"[[tensor(-0.6046), tensor(-0.0152), tensor(0.0...","[0.013182346, 0.016781544, -0.025050532, -0.02...","[0.011375706, 0.017268492, -0.037079215, -0.04...","[-0.07911661, -0.06265162, -0.11458967, -0.096..."


In [106]:
sample_submission.sample(2)

Unnamed: 0,video_id,predicted_tags,predicted_tags_split
260,441c3db4884b412ee3b99db89c350fe3,,
893,df635402135e4ee4f14188be19f35c83,,


In [107]:
topn = 1

sample_submission = pd.DataFrame(data=data['video_id'].to_list(), columns=['video_id'])
sample_submission['predicted_tags'] = np.nan
sample_submission['predicted_tags'] = sample_submission['predicted_tags'].astype('object')

for i, row in test_df.iterrows():
#   topn = int(model_num_tags.predict([row['common_vector']])[0])
  
  scores, predictions = index.search(np.array([row['common_vector']]), topn)
  index_i = sample_submission[sample_submission.video_id == row.video_id].index
  sample_submission.at[index_i[0], 'predicted_tags'] = [
      tags_list[tag] for i, tag in enumerate(predictions[0])
  ]

test_df_copy = test_df.copy()
find_iou_for_sample_submission(sample_submission, test_df_copy)

0.008244994110718492

In [None]:
def get_tags():
    tags = {}
    for i, row in tqdm(taxonomy.iterrows()):
        if isinstance(row['Уровень 1 (iab)'], str):
            tags[row['Уровень 1 (iab)']] = model.encode(row['Уровень 1 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
        if isinstance(row['Уровень 2 (iab)'], str):
            tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']] = model.encode(row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
        if isinstance(row['Уровень 3 (iab)'], str):
            tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)']] = model.encode(row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
    return tags

tags = get_tags()
tags_list = list(tags.keys())
vectors = np.array(list(tags.values()))

In [None]:
index = faiss.index_factory(dim, "Flat", faiss.METRIC_INNER_PRODUCT)
print(index.ntotal)
index.add(vectors)
print(index.ntotal)

In [None]:
topn = 1

sample_submission = pd.DataFrame(data=test_df['video_id'].to_list(), columns=['video_id'])
sample_submission['predicted_tags'] = np.nan
sample_submission['predicted_tags'] = sample_submission['predicted_tags'].astype('object')

for i, row in data.iterrows():
#   topn = int(model_num_tags.predict([row['common_vector']])[0])
  scores, predictions = index.search(np.array([row['common_vector']]), topn)
  index_i = sample_submission[sample_submission.video_id == row.video_id].index
  sample_submission.at[index_i[0], 'predicted_tags'] = [
      tags_list[tag] for i, tag in enumerate(predictions[0])
  ]

data_copy = data.copy()
find_iou_for_sample_submission(sample_submission, data_copy)

In [None]:
sample_submission.to_csv("sample_submission.csv", index_label=0)