<a href="https://colab.research.google.com/github/Telescope-U/Video-Comment-Generator/blob/master/%5BMscProject%5DGatedAttentionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

import numpy as np
import pandas as pd

import math
import time
import os
import random
from tqdm import tqdm

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on Google Drive. When that folder does not exist, every path
# below becomes relative to the current working directory instead.
WORK_DIR = '/content/drive/MyDrive/Colab Notebooks/[Msc]Video-Comment-Generator/'
WORK_DIR = WORK_DIR if os.path.isdir(WORK_DIR) else ''

# One folder per split.
TRAIN_FOLDER, VALID_FOLDER, TEST_FOLDER = (
    f'{WORK_DIR}Dataset/{split_name}/' for split_name in ('Train', 'Valid', 'Test')
)

# Unsplit source tables.
VIDEO_PATH = f'{WORK_DIR}Dataset/video_data.csv'
COMMENT_PATH = f'{WORK_DIR}Dataset/comment_data.csv'

# 1. Dataset

In [None]:
def split_ids(video_ids, train=0.7, valid=0.1, test=0.2):
    """Shuffle a copy of ``video_ids`` and cut it into three consecutive parts.

    ``train`` and ``valid`` are fractions of the total; each part size is
    rounded down. The third part is whatever remains, so ``test`` is accepted
    but never read. Shuffling uses the global ``random`` state and leaves the
    caller's sequence untouched.

    Returns:
        ``(train_ids, valid_ids, test_ids)`` as slices of the shuffled copy.
    """
    shuffled = video_ids.copy()
    random.shuffle(shuffled)

    total = len(shuffled)
    valid_start = math.floor(total * train)
    test_start = valid_start + math.floor(total * valid)

    return shuffled[:valid_start], shuffled[valid_start:test_start], shuffled[test_start:]

In [None]:
# Seed the global RNG that split_ids shuffles with, so the same videos land in
# the same split on every run. Without this the split files written later in
# the notebook change on each execution, and a checkpoint trained on one split
# can end up evaluated on videos it was trained on.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

video_df = pd.read_csv(VIDEO_PATH)
comment_df = pd.read_csv(COMMENT_PATH)

# Split by video id (not by comment) so all comments of a video share a split.
train_ids, valid_ids, test_ids = split_ids(video_df['vid'].unique())
print(len(train_ids), len(valid_ids), len(test_ids))

In [None]:
import string
def clean_text(text):
    """Return ``str(text)`` in lower case with every ``string.punctuation`` character removed."""
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return str(text).translate(drop_punctuation).lower()

# Normalise the three text columns in place (lower case, punctuation removed).
# clean_text calls str() on each value first, so a missing value (NaN) would
# become the literal text 'nan'.
video_df['title'] = video_df['title'].apply(clean_text)
video_df['transcript'] = video_df['transcript'].apply(clean_text)
comment_df['en_content'] = comment_df['en_content'].apply(clean_text)

In [None]:
# Split and save dataset
# Both tables are partitioned by video id, so a video and its comments always
# end up in the same split.
train_video_df = video_df[video_df['vid'].isin(train_ids)]
valid_video_df = video_df[video_df['vid'].isin(valid_ids)]
test_video_df = video_df[video_df['vid'].isin(test_ids)]

train_comment_df = comment_df[comment_df['vid'].isin(train_ids)]
valid_comment_df = comment_df[comment_df['vid'].isin(valid_ids)]
test_comment_df = comment_df[comment_df['vid'].isin(test_ids)]

# Write into the same folders get_data() reads from. The previous hard-coded
# 'Dataset/...' paths were relative to the working directory, so with a
# non-empty WORK_DIR the files were saved in one place and loaded from another.
for split_folder, split_comment_df, split_video_df in (
    (TRAIN_FOLDER, train_comment_df, train_video_df),
    (TEST_FOLDER, test_comment_df, test_video_df),
    (VALID_FOLDER, valid_comment_df, valid_video_df),
):
    os.makedirs(split_folder, exist_ok=True)
    split_comment_df.to_csv(os.path.join(split_folder, 'comment.csv'), index=False)
    split_video_df.to_csv(os.path.join(split_folder, 'video.csv'), index=False)

In [None]:
def get_data(folder):
    """Load one split from ``folder``.

    Reads ``comment.csv`` and ``video.csv`` and returns them as
    ``{'comment': DataFrame, 'video': DataFrame}``.
    """
    return {
        table: pd.read_csv(os.path.join(folder, f'{table}.csv'))
        for table in ('comment', 'video')
    }
        

In [None]:
# Each *_data is a dict with a 'comment' and a 'video' DataFrame (see get_data).
train_data = get_data(TRAIN_FOLDER)
valid_data = get_data(VALID_FOLDER)
test_data = get_data(TEST_FOLDER)

## 1.1 Vocabulary

In [None]:
from collections import Counter
tokenizer = get_tokenizer("basic_english")
# Special tokens are listed first; the lookups shown further down confirm
# <unk>=0, <sos>=2 and <eos>=3.
specials = ['<unk>','<pad>', '<sos>', '<eos>']

word_list = []

def yield_tokens():
    """Yield one token list per comment, title and transcript of the train and valid splits."""
    for split in (train_data, valid_data):
        text_columns = (
            split['comment']['en_content'],
            split['video']['title'],
            split['video']['transcript'],
        )
        for column in text_columns:
            for text in column:
                yield tokenizer(str(text))

# Tokens seen fewer than twice are left out and resolve to <unk> through the default index.
vocabulary = build_vocab_from_iterator(yield_tokens(), specials=specials, min_freq=2)
vocabulary.set_default_index(vocabulary['<unk>'])

In [None]:
# Vocabulary size, special tokens included.
len(vocabulary)

50437

In [None]:
# Sanity check: map a whitespace-split sentence to vocabulary indices.
sentence = "i am happy".split()
indexs = vocabulary.forward(sentence)
indexs

[6, 166, 198]

In [None]:
# Indices of the special tokens.
vocabulary.forward(['<eos>', '<unk>', '<sos>'])

[3, 0, 2]

In [None]:
# Inverse mapping: indices back to tokens.
vocabulary.lookup_tokens(indexs)

['i', 'am', 'happy']

## 1.2 Dataset & Dataloader

In [None]:
class TextDataset(Dataset):
    """One sample per comment, paired with the text of the video it belongs to.

    Args:
        data: dict with a ``'video'`` DataFrame (columns ``vid``, ``title``,
            ``transcript``) and a ``'comment'`` DataFrame (columns ``vid``,
            ``en_content``).
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary whose ``forward`` maps tokens to indices.
        video_length: fixed length of the source sequence, tags and padding included.
        comment_length: fixed length of the target sequence, tags and padding included.
    """

    def __init__(self, data, tokenizer=tokenizer, vocab=vocabulary, video_length=7000, comment_length = 32):
        self.video_length = video_length
        self.comment_length = comment_length

        self.tokenizer = tokenizer
        self.vocab = vocab

        self.video_df = data['video']
        self.comment_df = data['comment']

    def __getitem__(self, index):
        """Return ``(video_idxs, text_length, comment_idxs)`` tensors for comment ``index``."""
        comment_row = self.comment_df.iloc[index]
        vid = comment_row['vid']

        # trg: the comment, tagged and fixed to comment_length.
        comment_tokens = self.tokenizer(str(comment_row['en_content']))
        comment_idxs = self.vocab.forward(self.tag(comment_tokens, self.comment_length))

        # src: title followed by transcript of the matching video.
        # .item() raises unless exactly one video row has this vid.
        video = self.video_df[self.video_df['vid'] == vid]
        title = str(video['title'].item())
        transcript = str(video['transcript'].item())
        video_tokens = self.tokenizer(' '.join([title, transcript]))
        video_idxs = self.vocab.forward(self.tag(video_tokens, self.video_length))

        # The reported length is always the padded length, not the number of
        # real tokens, so every sequence in a batch has the same length.
        text_length = self.video_length

        return torch.tensor(video_idxs), torch.tensor(text_length),torch.tensor(comment_idxs)

    def __len__(self):
        return self.comment_df.shape[0]

    def tag(self, words, length):
        """Return ``<sos> + words + <eos>`` fixed to ``length`` tokens.

        ``words`` is cut to ``length - 2`` tokens first, so the result never
        exceeds ``length`` (for ``length >= 2``) and every sample in a batch
        can be stacked. The input list is not modified.
        """
        kept = list(words[:max(length - 2, 0)])
        tagged = ['<sos>'] + kept + ['<eos>']
        return tagged + ['<pad>'] * (length - len(tagged))
        

In [None]:
def collate_fn(data):
    """Stack ``(video_idxs, text_length, comment_idxs)`` samples into a batch.

    Samples are ordered by ``text_length``, longest first, as
    ``pack_padded_sequence`` expects by default.

    Returns:
        video: int32 tensor ``[batch, video_length]`` on ``device``.
        length: int64 tensor ``[batch]`` kept on the CPU.
        comment: int32 tensor ``[batch, comment_length]`` on ``device``.
    """
    # sorted() leaves the caller's list untouched.
    batch = sorted(data, key=lambda row: int(row[1]), reverse=True)

    # torch.stack keeps the integer dtype; building torch.Tensor from a list of
    # numpy arrays is slow and passes the indices through float32.
    video = torch.stack([row[0] for row in batch]).int().to(device)
    length = torch.stack([row[1] for row in batch])
    comment = torch.stack([row[2] for row in batch]).int().to(device)
    return video, length, comment

### Dataset and Dataloader Init

In [None]:
# One dataset per split, all with the default sequence lengths.
train_dataset = TextDataset(train_data)
test_dataset = TextDataset(test_data)
valid_dataset = TextDataset(valid_data)

# Only the training loader shuffles; the validation loader uses a batch size of 16 rather than 32.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=16, collate_fn=collate_fn)


# 2. Model



* `nn.Embedding(num_embedding, embedding_dim)`
    * `num_embedding`: vocabulary_size
    * `embedding_dim`: embedding vector size -> output size
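    * output shape: input shape plus a trailing `embedding_dim` axis, e.g. `(src_len, batch_size, embedding_dim)` for an input of shape `(src_len, batch_size)`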
* `nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)`
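    * **QUESTION**: what is the `output` of LSTM?
        * Is it equal to the last hidden state?
        * **Answer**: `output` holds the top layer's hidden state at *every* time step, with shape `(seq_len, batch_size, hidden_size)` when `batch_first=False`. For a unidirectional LSTM, `output[-1]` equals `h_n[-1]` only for sequences that fill the whole length; for shorter, padded sequences use `h_n[-1]`.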

## 2.1 Encoder

In [None]:
class Encoder(nn.Module):
    """Embedding + multi-layer unidirectional LSTM over the source sequence.

    Args:
        input_dim: source vocabulary size.
        embed_dim: embedding vector size.
        hidden_dim: LSTM hidden size.
        rnn_layers: number of stacked LSTM layers.
        dropout_ratio: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, rnn_layers, dropout_ratio):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embed_dim)

        self.dropout = nn.Dropout(dropout_ratio)

        self.rnn = nn.LSTM(embed_dim, hidden_dim, rnn_layers)

    def forward(self, src, src_len):
        """Encode a batch of source sequences.

        Args:
            src: token indices, ``[src_len, batch_size]``.
            src_len: length of each sequence, ``[batch_size]``, sorted in
                decreasing order (``pack_padded_sequence`` default).

        Returns:
            outputs: ``[src_len, batch_size, hidden_dim]`` top-layer states,
                zero beyond each sequence's length.
            context: ``[batch_size, hidden_dim]`` top-layer state at each
                sequence's last valid step.
            hidden, cell: ``[rnn_layers, batch_size, hidden_dim]`` final states.
        """
        embedded_seq = self.dropout(self.embedding(src))
        # embedded_seq [src_len, batch_size, embed_dim]

        # pack_padded_sequence needs the lengths on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_seq, src_len.to('cpu'))

        packed_outputs, (hidden, cell) = self.rnn(packed_embedded)
        # hidden, cell [rnn_layers, batch_size, hidden_dim]

        # total_length keeps the time axis equal to src's, so a padding mask
        # built from src always lines up with outputs.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, total_length=src.shape[0])
        # outputs [src_len, batch_size, hidden_dim]

        # Top layer's final hidden state. It equals outputs[-1] when every
        # sequence fills the whole length, stays correct for shorter sequences,
        # and keeps the batch axis for a batch of one (squeeze(0) dropped it).
        context = hidden[-1]
        # context [batch_size, hidden_dim]
        return outputs, context, hidden, cell

## 2.2 Attention

In [None]:
class Attention(nn.Module):
    """Additive attention of one decoder-side state over the encoder outputs.

    Args:
        enc_hid_dim: size of each encoder output vector.
        dec_hid_dim: size of the query state; also the energy size.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attention = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        # Maps each dec_hid_dim energy vector to a single score.
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        """Return attention weights over the source positions.

        Args:
            hidden: ``[batch_size, dec_hid_dim]`` query state.
            encoder_outputs: ``[src_len, batch_size, enc_hid_dim]``.
            mask: ``[batch_size, src_len]``, True (non-zero) at real tokens
                and False (zero) at padding.

        Returns:
            ``[batch_size, src_len]`` weights that sum to 1 per row and are
            effectively zero at padding positions.
        """
        src_len = encoder_outputs.shape[0]

        # Repeat the query once per source position.
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # hidden [batch_size, src_len, dec_hid_dim]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs [batch_size, src_len, enc_hid_dim]

        concat_attention_input = torch.cat((hidden, encoder_outputs), dim=2)
        # [batch_size, src_len, enc_hid_dim + dec_hid_dim]

        energy = torch.tanh(self.attention(concat_attention_input))
        # energy [batch_size, src_len, dec_hid_dim]

        attention = self.v(energy).squeeze(2)
        # attention [batch_size, src_len]

        # Suppress the padding positions (mask == 0). Filling where the mask is
        # True would remove the real tokens and leave only padding to attend to.
        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim=1)

## 2.3 Decoder

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder whose output is combined with a gated context.

    Args:
        output_dim: target vocabulary size.
        embed_dim: embedding vector size.
        hidden_dim: LSTM hidden size, also the size of the gated context.
        rnn_layers: number of stacked LSTM layers.
        dropout_ratio: dropout probability applied to the embeddings.
        gated_attention: callable ``(encoder_outputs, decoder_output)``
            returning a ``[1, batch_size, hidden_dim]`` context.
    """

    def __init__(self, output_dim, embed_dim, hidden_dim, rnn_layers, dropout_ratio, gated_attention):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, rnn_layers)
        self.dropout = nn.Dropout(dropout_ratio)
        self.gated = gated_attention
        self.fcl = nn.Linear(hidden_dim + hidden_dim, output_dim)

    def forward(self, trg_word, encoder_outputs,hidden=None, cell=None):
        '''
        Decode one time step.

        trg_word: [batch_size] indices of the previous target token
        encoder_outputs: tensor handed unchanged to the gated attention
        hidden: [rnn_layers, batch_size, hidden_dim] or None
        cell: [rnn_layers, batch_size, hidden_dim] or None

        The LSTM starts from a zero state unless both hidden and cell are given.

        Returns (prediction, hidden, cell); prediction is
        [batch_size, output_dim] unnormalised scores.
        '''
        trg_seq = trg_word.unsqueeze(0)
        # trg_seq [1, batch_size]

        embedded = self.dropout(self.embedding(trg_seq))
        # embedded [1, batch_size, embed_dim]

        # Identity checks: comparing a tensor with None through != is not a
        # reliable presence test.
        if hidden is not None and cell is not None:
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        else:
            output, (hidden, cell) = self.rnn(embedded)

        # output [1, batch_size, hidden_dim]
        # hidden, cell [rnn_layers, batch_size, hidden_dim]

        gated_weight = self.gated(encoder_outputs, output).squeeze(0)
        # gated_weight [batch_size, hidden_dim]
        output = output.squeeze(0)
        # output [batch_size, hidden_dim]

        fcl_input = torch.cat((output, gated_weight), dim=1)
        # fcl_input [batch_size, hidden_dim + hidden_dim]
        prediction = self.fcl(fcl_input)

        return prediction, hidden, cell

## 2.4 Seq2Seq model

In [None]:
class GatedAttention(nn.Module):
    """Additive attention over encoder states, scaled element-wise by a sigmoid gate.

    The gate is computed from the decoder output alone; all inputs share one
    ``hidden_dim``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.encoder_linear = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.decoder_linear = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.attention = nn.Linear(hidden_dim, 1, bias=False)
        self.gate = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, encoder_outputs, decoder_outputs):
        """Return the gated context, ``[1, batch_size, hidden_dim]``.

        encoder_outputs: [src_len, batch_size, hidden_dim]
        decoder_outputs: [1, batch_size, hidden_dim]
        """
        projected_enc = self.encoder_linear(encoder_outputs)
        projected_dec = self.decoder_linear(decoder_outputs)
        # Broadcasting over the leading axis plays the role of repeating the
        # decoder projection src_len times.
        scores = self.attention(torch.tanh(projected_enc + projected_dec))
        # scores [src_len, batch_size, 1]

        # Normalise across source positions, then move that axis last.
        weights = F.softmax(scores, dim=0).permute(1, 2, 0)
        # weights [batch_size, 1, src_len]

        context = torch.bmm(weights, encoder_outputs.permute(1, 0, 2)).permute(1, 0, 2)
        # context [1, batch_size, hidden_dim]

        gate = torch.sigmoid(self.gate(decoder_outputs))
        # gate [1, batch_size, hidden_dim]

        return gate.mul(context)

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that attends over the source once per batch.

    The attention summary of the encoder outputs is computed before decoding
    starts and the same summary is handed to the decoder at every step.

    Args:
        encoder: module returning ``(outputs, context, hidden, cell)``.
        decoder: module called as ``decoder(word, summary, hidden, cell)``.
        attention: module called as ``attention(context, outputs, mask)``.
        vocab: supports ``len()`` and ``vocab['<pad>']``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, attention, vocab, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.vocab = vocab
        self.pad_idx = vocab['<pad>']
        self.device = device

    def create_mask(self, src):
        """Return a ``[batch_size, src_len]`` mask that is True where ``src`` is not padding."""
        return (src != self.pad_idx).permute(1, 0)

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg_len - 1`` steps and return the per-step scores.

        src: [src_len, batch_size]
        src_len: [batch_size]
        trg: [trg_len, batch_size]; row 0 is the first decoder input
        teacher_forcing_ratio: probability, drawn per step with
            ``random.random()``, of feeding the ground-truth token rather
            than the model's own top prediction

        Returns a ``[trg_len, batch_size, vocab_size]`` tensor; row 0 is never
        written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        decoder_outputs = torch.zeros(trg_len, batch_size, len(self.vocab), device=self.device)

        encoder_outputs, encoder_context, hidden, cell = self.encoder(src, src_len)
        # encoder_outputs [src_len, batch_size, enc_hidden_dim]
        # encoder_context [batch_size, enc_hidden_dim]

        attn_weights = self.attention(encoder_context, encoder_outputs, self.create_mask(src))
        # attn_weights [batch_size, src_len]

        # Weighted sum of the encoder outputs, moved back to time-major layout.
        summary = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.permute(1, 0, 2))
        summary = summary.permute(1, 0, 2)
        # summary [1, batch_size, enc_hidden_dim]

        target_word = trg[0, :]
        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(target_word, summary, hidden, cell)
            decoder_outputs[t] = step_scores

            use_ground_truth = random.random() < teacher_forcing_ratio
            target_word = trg[t] if use_ground_truth else step_scores.argmax(1)

        return decoder_outputs

## 2.5 Train & Evaluation

In [None]:
# from torchtext.data.metrics import bleu_score
def train(model, loader, optimizer, clip, loss_fn=nn.CrossEntropyLoss()):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, src_len, trg)`` with time-major inputs.
        loader: yields ``(video_text, text_len, comment)`` batch-major batches.
        optimizer: optimiser stepped once per batch.
        clip: maximum gradient norm.
        loss_fn: loss over flattened scores and targets.
            NOTE(review): the default does not ignore ``<pad>`` targets, so
            padding counts towards the loss -- confirm this is intended.
    """
    model.train()
    running_loss = 0
    for video_text, text_len, comment in tqdm(loader):
        optimizer.zero_grad()

        # The model expects [seq_len, batch_size].
        src = video_text.permute(1,0)
        trg = comment.permute(1,0)

        scores = model(src, text_len, trg)
        vocab_size = scores.shape[-1]

        # Step 0 is the <sos> position and is never predicted, so drop it.
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].contiguous().long().view(-1)

        batch_loss = loss_fn(flat_scores, flat_targets)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [None]:
def evaluate(model, loader,loss_fn=nn.CrossEntropyLoss(), teacher_forcing_ratio=0.0):
    """Return the mean loss per batch over ``loader`` without updating the model.

    Args:
        model: called as ``model(src, src_len, trg, teacher_forcing_ratio)``.
        loader: yields ``(video_text, text_length, comment)`` batch-major batches.
        loss_fn: loss over flattened scores and targets.
        teacher_forcing_ratio: forwarded to the model. It defaults to 0 so the
            decoder only sees its own predictions; relying on the model's
            training default made the metric random and fed it ground truth.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for video_text, text_length, comment in tqdm(loader):
            # The model expects [seq_len, batch_size].
            video_text = video_text.permute(1,0)
            comment = comment.permute(1,0)

            output = model(video_text, text_length, comment, teacher_forcing_ratio)
            output_dim = output.shape[-1]

            # Step 0 is the <sos> position and is never predicted, so drop it.
            output = output[1:].view(-1, output_dim)
            comment = comment[1:].contiguous().long().view(-1)

            epoch_loss += loss_fn(output, comment).item()
    return epoch_loss / len(loader)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (seconds) into whole ``(minutes, seconds)``."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

# 3. Train

In [None]:
# Model Parameters
# Source and target text share one vocabulary, so both sizes are equal.
INPUT_DIM = len(vocabulary)
OUTPUT_DIM = len(vocabulary)

ENCODER_EMBEDDED_DIM = 256
DECODER_EMBEDDED_DIM = 256
# One hidden size for encoder, decoder and both attention modules.
HIDDEN_DIM = 512
NUM_LAYERS = 2
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENCODER_EMBEDDED_DIM, HIDDEN_DIM, NUM_LAYERS,ENCODER_DROPOUT).to(device)
attention = Attention(HIDDEN_DIM, HIDDEN_DIM).to(device)
gated_attention = GatedAttention(HIDDEN_DIM).to(device)
# The gated attention module is owned by the decoder.
decoder = Decoder(OUTPUT_DIM, DECODER_EMBEDDED_DIM, HIDDEN_DIM, NUM_LAYERS, DECODER_DROPOUT,gated_attention).to(device)

model = Seq2Seq(encoder, decoder, attention,vocabulary, device).to(device)

In [None]:
def init_weights(m):
    """Re-draw every parameter of ``m`` (sub-modules included) from U(-0.08, 0.08)."""
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)
        
# Module.apply visits every sub-module and init_weights walks each one's
# parameters recursively, so nested parameters are re-drawn more than once.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(50437, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2)
  )
  (decoder): Decoder(
    (embedding): Embedding(50437, 256)
    (rnn): LSTM(256, 512, num_layers=2)
    (dropout): Dropout(p=0.5, inplace=False)
    (gated): GatedAttention(
      (encoder_linear): Linear(in_features=512, out_features=512, bias=False)
      (decoder_linear): Linear(in_features=512, out_features=512, bias=False)
      (attention): Linear(in_features=512, out_features=1, bias=False)
      (gate): Linear(in_features=512, out_features=512, bias=True)
    )
    (fcl): Linear(in_features=1024, out_features=50437, bias=True)
  )
  (attention): Attention(
    (attention): Linear(in_features=1024, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
  )
  (vocab): Vocab()
)

In [None]:
# model_path = os.path.join(WORK_DIR, 'Models','Pad+GateModel4')
# results = torch.load(os.path.join(model_path, 'model.pt'))
# best_valid_loss = results['valid_loss']
# model.load_state_dict(results['state_dict'])

<All keys matched successfully>

In [None]:
EPOCHS = 10
CLIP = 1

# Folder for the best checkpoint and the per-epoch metrics log. It is defined
# here so the cell runs on a fresh kernel; before, model_path and
# best_valid_loss existed only in a commented-out cell.
model_path = os.path.join(WORK_DIR, 'Models', 'Pad+GateModel4')
os.makedirs(model_path, exist_ok=True)
checkpoint_path = os.path.join(model_path, 'model.pt')

# Start from the score of an existing checkpoint, if any, so a worse epoch
# never overwrites a better model saved by an earlier session.
best_valid_loss = float('inf')
if os.path.isfile(checkpoint_path):
    best_valid_loss = torch.load(checkpoint_path, map_location='cpu')['valid_loss']

optimizer = optim.Adam(model.parameters())


for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer=optimizer, clip=CLIP)
    valid_loss = evaluate(model, valid_loader)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # 'state_dict' must stay the last key: the CSV row below is written from
    # every value except the last one.
    result_dict = {
        'epoch':epoch + 1,
        'train_loss':train_loss,
        'valid_loss':valid_loss,
        'train_ppl':math.exp(train_loss),
        'valid_ppl':math.exp(valid_loss),
        'state_dict': model.state_dict(),
    }

    # Keep only the checkpoint with the lowest validation loss.
    if valid_loss < best_valid_loss:
        torch.save(result_dict, checkpoint_path)
        best_valid_loss = valid_loss

    # Append one metrics row per epoch (header-less CSV).
    with open(os.path.join(model_path, 'evluation-results.csv'), 'a') as f:
        f.write(','.join([str(item) for item in list(result_dict.values())[:-1]]))
        f.write('\n')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {result_dict["train_ppl"]:7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {result_dict["valid_ppl"]:7.3f}')

  import sys
100%|██████████| 4272/4272 [2:30:13<00:00,  2.11s/it]
100%|██████████| 1240/1240 [11:30<00:00,  1.80it/s]


Epoch: 07 | Time: 161m 44s
	Train Loss: 2.642 | Train PPL:  14.048
	 Val. Loss: 2.946 |  Val. PPL:  19.038


100%|██████████| 4272/4272 [2:30:16<00:00,  2.11s/it]
100%|██████████| 1240/1240 [11:31<00:00,  1.79it/s]


Epoch: 08 | Time: 161m 47s
	Train Loss: 2.528 | Train PPL:  12.523
	 Val. Loss: 2.965 |  Val. PPL:  19.392


100%|██████████| 4272/4272 [2:30:08<00:00,  2.11s/it]
100%|██████████| 1240/1240 [11:33<00:00,  1.79it/s]


Epoch: 09 | Time: 161m 41s
	Train Loss: 2.446 | Train PPL:  11.538
	 Val. Loss: 2.990 |  Val. PPL:  19.888


100%|██████████| 4272/4272 [2:30:21<00:00,  2.11s/it]
100%|██████████| 1240/1240 [11:34<00:00,  1.79it/s]

Epoch: 10 | Time: 161m 55s
	Train Loss: 2.379 | Train PPL:  10.791
	 Val. Loss: 3.033 |  Val. PPL:  20.756





# 4. Evaluation

## TestDataset

In [None]:
class TestTextDataset(TextDataset):
    """``TextDataset`` variant for inference that also returns each sample's video id.

    Construction, ``__len__`` and ``tag`` are inherited, so both datasets
    always encode their samples the same way instead of keeping two copies of
    the preprocessing in step by hand.
    """

    def __getitem__(self, index):
        """Return ``(video_idxs, text_length, comment_idxs, vid)`` for comment ``index``."""
        video_idxs, text_length, comment_idxs = super().__getitem__(index)
        vid = self.comment_df.iloc[index]['vid']
        return video_idxs, text_length, comment_idxs, vid
        

In [None]:
# model_path = os.path.join(WORK_DIR, 'Models','Pad+GateModel2', 'epoch8.pt')
model_path = os.path.join(WORK_DIR, 'Models','Pad+GateModel4', 'model-10.pt')
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
result = torch.load(model_path, map_location=device)
model.load_state_dict(result['state_dict'])

<All keys matched successfully>

In [None]:
def test_collate_fn(data):
    """Stack ``(video_idxs, text_length, comment_idxs, vid)`` samples into a batch.

    Samples are ordered by ``text_length``, longest first; the video ids are
    returned in that same order.

    Returns:
        video: int32 tensor ``[batch, video_length]`` on ``device``.
        length: int64 tensor ``[batch]`` kept on the CPU.
        comment: int32 tensor ``[batch, comment_length]`` on ``device``.
        vid: list of video ids, one per sample.
    """
    # sorted() leaves the caller's list untouched.
    batch = sorted(data, key=lambda row: int(row[1]), reverse=True)

    # torch.stack keeps the integer dtype; building torch.Tensor from a list of
    # numpy arrays is slow and passes the indices through float32.
    video = torch.stack([row[0] for row in batch]).int().to(device)
    length = torch.stack([row[1] for row in batch])
    comment = torch.stack([row[2] for row in batch]).int().to(device)
    vid = [row[3] for row in batch]
    return video, length, comment, vid

### Generate comments on Test dataset and Save the result

In [None]:
model.eval()
test_dataset = TestTextDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=test_collate_fn)
# NOTE(review): with ratio > 0 the decoder is fed ground-truth tokens at some
# steps, so the "generated" text is partly conditioned on the reference.
# The value is stored in every row; confirm it is intended.
ratio = 0.5
special_tokens = ('<pad>', '<eos>', '<sos>')
predictions_path = WORK_DIR+'Models/Pad+GateModel4/'+'predictions-10.csv'

with torch.no_grad():
    for video_texts, text_lens, comments, vid in tqdm(test_loader):
        results = []
        video_texts = video_texts.permute(1,0)
        comments = comments.permute(1,0)
        outputs = model(video_texts,text_lens, comments, ratio)
        # outputs [trg_len, batch_size, vocab_size]

        # Step 0 is never written by the model and stays all zeros, so its
        # argmax is index 0 and would prepend '<unk>' to every generated
        # comment. Decode from step 1 onwards.
        generated_indices = outputs[1:].argmax(2).permute(1, 0).tolist()
        # Plain Python ints, as lookup_tokens expects.
        reference_indices = comments.permute(1, 0).tolist()

        for i in range(len(generated_indices)):
            generated_comment = vocabulary.lookup_tokens(generated_indices[i])
            generated_comment = ' '.join([word for word in generated_comment if word not in special_tokens])
            comment = vocabulary.lookup_tokens(reference_indices[i])
            comment = ' '.join([word for word in comment if word not in special_tokens])
            # str(vid) keeps the join below working when the ids are not strings.
            results.append(['PadGateModel4', str(vid[i]), str(ratio), comment, generated_comment, '\n'])

        # Append after every batch so partial results survive an interrupted run.
        # The row layout (trailing comma before the newline) is kept unchanged
        # so new rows match rows already in the file.
        with open(predictions_path, 'a') as f:
            f.writelines([','.join(line) for line in results])

  
100%|██████████| 304/304 [12:06<00:00,  2.39s/it]
