In [None]:
from tqdm import tqdm
import numpy as np
import random
import torch
import re

from torch.utils.data import Dataset, RandomSampler, DataLoader, SequentialSampler
from transformers import Blip2ForConditionalGeneration, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

from dataloader import MSVDDataset
from model import VCModel
from utils import train, test, get_groundtruth_captions, get_predicted_captions, score

In [None]:
def prep_optimizer(model, len_train_iter, config, device):
    """Move ``model`` to ``device`` and build its AdamW optimizer and LR schedule.

    Parameters whose name contains one of the ``no_decay`` substrings are
    optimised without weight decay; every other parameter uses
    ``config.weight_decay``. The learning rate follows
    ``get_linear_schedule_with_warmup`` with the first 10% of the optimizer
    steps used as warmup.

    Args:
        model: module to optimise; must provide ``named_parameters()`` and ``to()``.
        len_train_iter: number of batches in one epoch of the training loader.
        config: object exposing ``weight_decay``, ``lr``,
            ``gradient_accumulation_steps`` and ``epochs``.
        device: device the model is moved to.

    Returns:
        Tuple ``(optimizer, lr_scheduler, model)``; ``model`` is the value
        returned by ``model.to(device)``.
    """
    # Move the model *before* constructing the optimizer: the torch.optim docs
    # require optimised parameters to live in their final location when the
    # optimizer is built, since a device move may replace parameter objects.
    model = model.to(device)

    # NOTE(review): the match is a case-sensitive substring test, so norm layers
    # named e.g. ``layer_norm`` would still get weight decay on their weights --
    # confirm against the parameter names of the model being trained.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    print(len(decay_params), len(no_decay_params))

    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': config.weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config.lr)

    # One optimizer step per ``gradient_accumulation_steps`` batches (floored),
    # repeated for every epoch.
    num_training_steps = int(len_train_iter / config.gradient_accumulation_steps) * config.epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    return optimizer, lr_scheduler, model

In [None]:
def MSVD_Dataloader(path, phase, tokenizer, config):
    """Create a ``DataLoader`` over ``MSVDDataset`` for one phase.

    Args:
        path: forwarded to ``MSVDDataset`` as ``path``.
        phase: forwarded to ``MSVDDataset`` as ``phase``; the value ``'train'``
            selects random sampling, anything else sequential sampling.
        tokenizer: forwarded to ``MSVDDataset`` as ``tokenizer``.
        config: object exposing ``max_frames``, ``max_seq_len`` and ``batch_size``.

    Returns:
        A ``torch.utils.data.DataLoader`` with memory pinning enabled.
    """
    msvd_split = MSVDDataset(
        path=path,
        phase=phase,
        tokenizer=tokenizer,
        max_frame=config.max_frames,
        max_seq_len=config.max_seq_len,
    )

    # Training draws every example once per epoch in random order; any other
    # phase walks the dataset in its stored order.
    order = (
        RandomSampler(msvd_split, replacement=False)
        if phase == 'train'
        else SequentialSampler(msvd_split)
    )

    return DataLoader(
        msvd_split,
        batch_size=config.batch_size,
        shuffle=False,  # ordering is delegated entirely to the sampler
        sampler=order,
        pin_memory=True,
    )

In [None]:
class config:
    """Static settings for this notebook, read as class attributes.

    The class is never instantiated here; it is passed as-is to the data
    loader factory, the optimizer factory and ``VCModel``.
    """

    # --- Settings not read in this notebook; presumably consumed by VCModel
    # --- (TODO confirm against model.py).
    local_rank = 0
    semantic_dim = 512

    visual_num_hidden_layers = 4
    d_graph = 1024
    node_feat_dim = 512
    edge_dim = 1024
    hidden_size = 768
    project_edge_dim = None
    no_skip = False
    last_average = False
    no_beta_transformer = False

    # --- Pretrained checkpoint name used to load the tokenizer.
    decoder_name = 'Salesforce/blip2-flan-t5-xl'

    # --- Dataset / batching limits handed to MSVDDataset and DataLoader.
    max_frames = 20
    max_seq_len = 20
    batch_size = 32

    # --- Optimisation schedule.
    epochs = 10
    lr = 1e-4
    weight_decay = 0.001
    gradient_accumulation_steps = 2
    gradient_clip = 5.0

    # --- Runtime environment.
    path = '/kaggle/input/msvd-and-msrvtt'
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



def _seed_everything(value):
    """Seed Python, NumPy and torch (CPU and all CUDA devices) with ``value``
    and switch cuDNN to deterministic, non-benchmarking mode."""
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed_all(value)
    # Trade cuDNN autotuning for run-to-run repeatability.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed = 42
_seed_everything(seed)

In [None]:
# Tokenizer for the configured decoder checkpoint; shared by the datasets,
# the model and the caption helpers.
tokenizer = AutoTokenizer.from_pretrained(config.decoder_name)

# Build the train loader first, then the validation loader, from the same root.
train_dataloader, val_dataloader = [
    MSVD_Dataloader(config.path, phase, tokenizer, config)
    for phase in ('train', 'val')
]

# Ground-truth captions are collected once up front; they are reused by the
# scoring step of every epoch.
val_vid2GTs = get_groundtruth_captions(tokenizer, val_dataloader)

model = VCModel(tokenizer, config, Blip2ForConditionalGeneration)

# The schedule length is derived from the number of training batches per epoch.
optimizer, lr_scheduler, model = prep_optimizer(
    model=model,
    len_train_iter=len(train_dataloader),
    config=config,
    device=config.device,
)

In [None]:
# Epochs are numbered from 1 so the printed progress reads "1/N" .. "N/N".
for epoch in range(1, config.epochs + 1):
    # One pass over the training loader; the returned value is reported as the
    # train loss.
    train_loss = train(
        epoch,
        train_dataloader,
        model,
        tokenizer,
        config.gradient_accumulation_steps,
        optimizer,
        lr_scheduler,
        config.gradient_clip,
        config.device,
    )

    # Validation: first the loss, then generated captions scored against the
    # ground truth gathered before training started.
    val_loss = test(epoch, val_dataloader, model, tokenizer, config.device)
    val_vid2pred = get_predicted_captions(model, tokenizer, val_dataloader, config.device)
    metric_score = score(val_vid2pred, val_vid2GTs)

    # Emit the three report lines for this epoch.
    print(
        "Epoch {}/{} Train loss: {} ".format(epoch, config.epochs, train_loss),
        "Epoch {}/{} Validation loss: {}".format(epoch, config.epochs, val_loss),
        "Score: {}".format(metric_score),
        sep="\n",
    )