In [2]:
# pip install torch torchvision

In [3]:
# !pip install av
# !pip install matplotlib

In [4]:
# !pip install timm

In [5]:
# !pip install Pillow==9.5.0

In [6]:
# !pip3 install --upgrade jupyter-console
# !pip3 install --upgrade jupyter

In [7]:
# !git clone https://github.com/MKLab-ITI/visil.git visil_pytorch

In [8]:
pip install -r visil_pytorch/requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import zipfile,os.path
def unzip(source_filename, dest_dir):
    with zipfile.ZipFile(source_filename) as zf:
        for member in zf.infolist():
            # Path traversal defense copied from
            # http://hg.python.org/cpython/file/tip/Lib/http/server.py#l789
            words = member.filename.split('/')
            path = dest_dir
            for word in words[:-1]:
                while True:
                    drive, word = os.path.splitdrive(word)
                    head, word = os.path.split(word)
                    if not drive:
                        break
                if word in (os.curdir, os.pardir, ''):
                    continue
                path = os.path.join(path, word)
            zf.extract(member, path)

In [10]:
# import urllib.request

# # URL of the file
# url = "http://ndd.iti.gr/visil/ckpt.zip"

# # Download the file and save it locally
# urllib.request.urlretrieve(url, "ckpt.zip")

# print("File downloaded successfully!")

In [11]:
# !wget http://ndd.iti.gr/visil/ckpt.zip
# !unzip ckpt.zip

In [12]:
# unzip("data_finetune.zip", "data_finetune")

# Data processing

In [13]:
import os
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
# import skvideo.io
import torch, torchvision
import matplotlib.pyplot as plt
# import timm
import time
import gc
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
# import cv2

In [14]:
def empty_cache():
    torch.cuda.empty_cache()
    gc.collect()

In [15]:
def seed_everything(seed):
    """
    Обеспечивает воспроизводимость экспериментов
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [16]:
y = pd.read_csv("y_finetune.csv")
# y = y.sort_values('created')
y

Unnamed: 0,uuid,duplicate_for,is_duplicate,type
0,3dd424ce-f0e5-4727-88a3-8f318b612afd,1e9efc51-a74c-4f32-b03e-71905f8d6dd1,False,hard
1,d444f2d0-a7cd-4c9b-bc56-8d5ef88ec015,1e9efc51-a74c-4f32-b03e-71905f8d6dd1,False,hard
2,49292bf9-dc53-44d5-9980-a658ab3a3921,1e9efc51-a74c-4f32-b03e-71905f8d6dd1,False,hard
3,6a816304-7c1b-4c16-81c6-09f46bb0ad63,1e9efc51-a74c-4f32-b03e-71905f8d6dd1,False,hard
4,1f4df893-663e-4ca3-a6ca-46819aebd8a3,3dd424ce-f0e5-4727-88a3-8f318b612afd,False,hard
...,...,...,...,...
755,2f33171f-a356-4dfc-8fc5-b0c492f8d091,15208cba-2aee-41e1-8073-18bb68dad838,False,false_positives
756,33afb61d-c38c-4e3a-95e5-78b06c38cf0e,0670afa5-d74e-4ae9-82dd-8dc7aee1ffc7,False,false_positives
757,367b94a1-4757-4cf3-a726-ca0088f67dd8,3ba31a54-e0a0-48ff-82e8-af0db8a5e873,False,false_positives
758,1eb88ab3-ebbd-41e8-a5b3-7fb19914a677,2c5a5717-72b0-4374-be04-b9d823425101,False,false_positives


In [17]:
y['is_duplicate'].value_counts(normalize=True)

is_duplicate
False    0.543421
True     0.456579
Name: proportion, dtype: float64

# Dataloader

In [18]:
from torch.utils.data import Dataset, DataLoader

class VideoDataset(Dataset):
    def __init__(self, videos1, videos2, labels):
        self.videos1 = videos1
        self.videos2 = videos2
        self.labels = [int(l) for l in labels]

    def __len__(self):
        return len(self.videos1)

    def load_frames(self, video):
        return torch.from_numpy(load_video(os.path.join("data_finetune/data_finetune/data_finetune", video+'.mp4')))

    def __getitem__(self, idx):
        videos = {'video1' : self.videos1[idx], 'video2': self.videos2[idx]}
        
        frames = {}
        frames['video1'] = self.load_frames(videos['video1'])
        frames['video2'] = self.load_frames(videos['video2'])


        return videos, frames, self.labels[idx]

dataset_train = VideoDataset(y['uuid'].values, y['duplicate_for'].values, y['is_duplicate'])

In [19]:
class DataCollator:
    def __init__(self):
        pass
        # self.cfg = cfg
        # self.processor = processor
        # self.max_length = max_length

    def __call__(self, batch):
        # print(batch)
        videos_batch = {'video1' : [], 'video2': []}
        frames_batch = {'video1' : [], 'video2': []}
        lengths = {'video1' : [], 'video2': []}
        labels = []

        for videos, frames, label in batch:
            labels.append(label)
            for video_type in ['video1', 'video2']:
                
                videos_batch[video_type].append(videos[video_type])

                video_frames = frames[video_type]

                lengths[video_type].append(len(video_frames))
                frames_batch[video_type].append(video_frames)
            
        for video_type in ['video1', 'video2']:
            frames_batch[video_type] = torch.concatenate(frames_batch[video_type], dim=0)
        
        return videos_batch, frames_batch, lengths, torch.tensor(labels)

data_collator = DataCollator()

In [20]:
dataloader_train = DataLoader(dataset_train, batch_size=2, collate_fn=data_collator, shuffle=True)

In [21]:
from visil_pytorch.utils import load_video

In [1]:
for b in dataloader_train:
    break
# b

In [23]:
import sys
# sys.path.append('visil_pytorch/')

from visil_pytorch.utils import load_video

In [24]:
for videos, frames, lengths, labels in dataloader_train:
    break

In [25]:
videos

{'video1': ['cbacb5ab-5b69-400e-8572-28d26d0b56e8',
  '3c21a653-5701-4aa0-99e6-2787a08f50fe'],
 'video2': ['1659eef1-e919-4077-82da-cca40d277bf1',
  '48776192-a1d2-4e88-b8a4-e5f65edaa06b']}

In [26]:
frames['video1'].shape

torch.Size([31, 224, 224, 3])

In [27]:
lengths['video1']

[8, 23]

In [28]:
labels

tensor([1, 0])

# Model

In [29]:
import os
os.chdir('visil_pytorch')

In [30]:
import torch

from model.visil import *


device = torch.device('cuda')
# device = torch.device('cpu')

In [31]:
os.chdir('../')

In [32]:

class CosineContrastiveLoss(nn.Module):
    def __init__(self, margin=0.5):
        super(CosineContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, embedding1, embedding2, label):
        # Cosine similarity ranges from -1 to 1
        cos_sim = F.cosine_similarity(embedding1, embedding2)
        loss_pos = (label * (1 - cos_sim)).mean()  # For positive pairs (similar)
        # loss_neg = ((1 - label) * torch.clamp(cos_sim - self.margin, min=0.0)).mean()  # For negative pairs (dissimilar)
        loss_neg = ((1 - label) * (cos_sim + 1)).mean()  # For negative pairs (dissimilar)
        return loss_pos + loss_neg

criterion = CosineContrastiveLoss(margin=0.5)


In [35]:
class ViSiL(nn.Module):
    
    def __init__(self, network='resnet50', pretrained=False, dims=3840,
                 whiteninig=True, attention=True, video_comperator=True, symmetric=False):
        super(ViSiL, self).__init__()
        
        if pretrained and not symmetric:
            self.cnn = Feature_Extractor('resnet50', True, 3840)
            self.visil_head = ViSiLHead(3840, True, True, False)
            self.visil_head.load_state_dict(
                torch.hub.load_state_dict_from_url(
                    'http://ndd.iti.gr/visil/visil.pth'))
        elif pretrained and symmetric:
            self.cnn = Feature_Extractor('resnet50', True, 512)
            self.visil_head = ViSiLHead(512, True, True, True)
            self.visil_head.load_state_dict(
                torch.hub.load_state_dict_from_url(
                    'http://ndd.iti.gr/visil/visil_symmetric.pth'))
        else:
            self.cnn = Feature_Extractor(network, whiteninig, dims)
            self.visil_head = ViSiLHead(dims, attention, video_comperator, symmetric)
    
    def calculate_video_similarity(self, query, target):
        return self.visil_head(query, target)

    def calculate_f2f_matrix(self, query, target):
        return self.visil_head.frame_to_frame_similarity(query, target)

    def calculate_visil_output(self, query, target):
        sim = self.visil_head.frame_to_frame_similarity(query, target)
        return self.visil_head.visil_output(sim)
        
    def forward(self, video_tensors, lengths):
        # print(1, time.time())
        # print(video_tensor.shape)
        cnn_features = self.cnn(video_tensors)
        features = []
        # print(features.shape)
        # print(2, time.time())
        sum_length = 0
        for length in lengths:
            feature  = cnn_features[sum_length:sum_length+length]
            # print(feature.shape, feature[0])
            sum_length += length

            feature = self.visil_head.prepare_tensor(feature)
            features.append(feature.mean(dim=0))
            
        features = torch.stack(features)
        return features

In [36]:
# Initialize pretrained ViSiL model
model = ViSiL(pretrained=True).to(device)
model.eval();



In [38]:
videos

{'video1': ['cbacb5ab-5b69-400e-8572-28d26d0b56e8',
  '3c21a653-5701-4aa0-99e6-2787a08f50fe'],
 'video2': ['1659eef1-e919-4077-82da-cca40d277bf1',
  '48776192-a1d2-4e88-b8a4-e5f65edaa06b']}

In [39]:
# loss

# Training

In [40]:
from sklearn.metrics import roc_auc_score

In [41]:
# del frames, video1_embeds, video2_embeds
empty_cache()

In [42]:
from torch import optim

optimizer = optim.Adam(model.parameters(), lr=3e-5)

seed_everything(42)

num_epochs=2
device='cuda'


model = model.to(device)

model.load_state_dict(torch.load("model_0.pt"))

  model.load_state_dict(torch.load("model_0.pt"))


<All keys matched successfully>

In [None]:

for epoch in range(1, num_epochs+1):
    model.train()
    running_loss = 0.0
    # correct, total = 0, 0
    preds = []
    targets = []
    for i, (videos, frames, lengths, labels) in tqdm(enumerate(dataloader_train), total=len(dataloader_train)):
        frames = {k:v.to(device) for k, v in frames.items()}

        labels[labels == 0] = -1
        

        optimizer.zero_grad()

        # 1/0
        
        # Forward pass
        video1_embeds = model(frames['video1'], lengths['video1'])
        video1_embeds = video1_embeds.reshape(video1_embeds.shape[0], -1)
        video2_embeds = model(frames['video2'], lengths['video2'])
        video2_embeds = video2_embeds.reshape(video2_embeds.shape[0], -1)

        loss = criterion(video1_embeds, video2_embeds, labels.to(device))

        for (p, t) in zip(F.cosine_similarity(video1_embeds, video2_embeds).cpu().tolist(), labels):
            preds.append(p)
            targets.append(t if t > 0 else 0)
            # if p == t or (p == 0 and t == -1):
            #     correct += 1
            # total += 1

        # 1/0
        if i % 10 == 0:
            # print('ACC', correct/total)
            try:
                print("AUC", roc_auc_score(targets, preds))
            except:
                print("AUC", -1)
            print('Loss', running_loss /( i + 1))
            print('-'*50)

        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        del frames, video1_embeds, video2_embeds
        empty_cache()

        # print(loss.item())

    avg_loss = running_loss / len(dataloader_train)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    torch.save(model.state_dict(), f'model_{epoch}.pt')


  0%|          | 0/380 [00:00<?, ?it/s]

AUC -1
Loss 0.0
--------------------------------------------------
AUC 0.9173553719008265
Loss 0.964179662140933
--------------------------------------------------
AUC 0.9568181818181818
Loss 1.0244512274151756
--------------------------------------------------
AUC 0.9241452991452992
Loss 1.0839240243357997
--------------------------------------------------
AUC 0.9239130434782609
Loss 1.1475701026800202
--------------------------------------------------
AUC 0.8912037037037036
Loss 1.1119052089896857
--------------------------------------------------
AUC 0.8822252374491181
Loss 1.151971215107402
--------------------------------------------------
AUC 0.8787212787212788
Loss 1.1311018739787626
--------------------------------------------------
AUC 0.875
Loss 1.1634588120160279
--------------------------------------------------
AUC 0.8690115221346271
Loss 1.167564984533813
--------------------------------------------------
AUC 0.8637481554353172
Loss 1.1457638572348225
--------------------

  0%|          | 0/380 [00:00<?, ?it/s]

AUC -1
Loss 0.0
--------------------------------------------------
AUC 0.7946428571428572
Loss 1.182900147004561
--------------------------------------------------
AUC 0.8795454545454545
Loss 1.0249127206348239
--------------------------------------------------
AUC 0.9072916666666666
Loss 0.9541967559245325
--------------------------------------------------
AUC 0.9083878643664485
Loss 0.9549648878051014
--------------------------------------------------
AUC 0.8947775628626693
Loss 0.9296212354127098
--------------------------------------------------
AUC 0.873144399460189
Loss 0.9676878305732227
--------------------------------------------------
AUC 0.8945374800637957
Loss 0.9674933968295514
--------------------------------------------------
AUC 0.8832183908045977
Loss 0.9713745279076659
--------------------------------------------------
AUC 0.8915065845112964
Loss 1.002977505817518
--------------------------------------------------
AUC 0.8794117647058823
Loss 1.0389385090606047
-------