<a href="https://colab.research.google.com/github/Telescope-U/Video-Comment-Generator/blob/GateModel/%5BMscProject%5DGatedAttentionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

import numpy as np
import pandas as pd

import math
import time
import os
import random
from tqdm import tqdm

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Colab only: mount Google Drive so files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive; when that directory does not exist,
# fall back to paths relative to the current working directory.
WORK_DIR = '/content/drive/MyDrive/Colab Notebooks/[Msc]Video-Comment-Generator/'
if not os.path.isdir(WORK_DIR):
    WORK_DIR = ''
# One folder per split; get_data() reads comment.csv and video.csv from each.
TRAIN_FOLDER = WORK_DIR + 'Dataset/Train/'
VALID_FOLDER = WORK_DIR + 'Dataset/Valid/'
TEST_FOLDER = WORK_DIR + 'Dataset/Test/'
# Unsplit source tables (only referenced by the commented-out splitting cells below).
VIDEO_PATH = WORK_DIR + 'Dataset/video_data.csv'
COMMENT_PATH = WORK_DIR + 'Dataset/comment_data.csv'

# 1. Dataset

In [None]:
# 数据分离部分
# def split_ids(video_ids, train=0.7, valid=0.1, test=0.2):
#     list_copy = video_ids.copy()
#     random.shuffle(list_copy)
#     train_size = math.floor(len(list_copy)*train)
#     valid_size = math.floor(len(list_copy)*valid)
#     return list_copy[:train_size], list_copy[train_size:(train_size+valid_size)], list_copy[(train_size+valid_size):]

In [None]:
# video_df = pd.read_csv(VIDEO_PATH)
# comment_df = pd.read_csv(COMMENT_PATH)
# train_ids, valid_ids, test_ids = split_ids(video_df['vid'].unique())
# print(len(train_ids), len(valid_ids), len(test_ids))

In [None]:
# import string
# def clean_text(text):
#     text = str(text)
#     for i in string.punctuation:
#         text = text.replace(i, '')
#     return text.lower()

# video_df['title'] = video_df['title'].apply(clean_text)
# video_df['transcript'] = video_df['transcript'].apply(clean_text)
# comment_df['en_content'] = comment_df['en_content'].apply(clean_text)

In [None]:
# # 分离训练数据并且存储数据
# train_video_df = video_df[video_df['vid'].isin(train_ids)]
# valid_video_df = video_df[video_df['vid'].isin(valid_ids)]
# test_video_df = video_df[video_df['vid'].isin(test_ids)]

# train_comment_df = comment_df[comment_df['vid'].isin(train_ids)]
# valid_comment_df = comment_df[comment_df['vid'].isin(valid_ids)]
# test_comment_df = comment_df[comment_df['vid'].isin(test_ids)]

# train_comment_df.to_csv('Dataset/Train/comment.csv', index = None)
# test_comment_df.to_csv('Dataset/Test/comment.csv', index = None)
# valid_comment_df.to_csv("Dataset/Valid/comment.csv", index = None)

# train_video_df.to_csv('Dataset/Train/video.csv', index = None)
# test_video_df.to_csv('Dataset/Test/video.csv', index = None)
# valid_video_df.to_csv("Dataset/Valid/video.csv", index = None)

In [None]:
def get_data(folder):
    """Load the two CSV tables of one dataset split.

    Reads ``comment.csv`` and then ``video.csv`` from ``folder``.

    Returns:
        dict with the keys ``'comment'`` and ``'video'``, each holding the
        corresponding pandas DataFrame.
    """
    tables = {}
    for table_name in ('comment', 'video'):
        csv_path = os.path.join(folder, table_name + '.csv')
        tables[table_name] = pd.read_csv(csv_path)
    return tables
        

In [None]:
# Each split is a dict {'comment': DataFrame, 'video': DataFrame} as returned by get_data().
train_data = get_data(TRAIN_FOLDER)
valid_data = get_data(VALID_FOLDER)
test_data = get_data(TEST_FOLDER)

## 1.1 Vocabulary

In [None]:
from collections import Counter
# torchtext's built-in "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")
# Special tokens; they are passed to build_vocab_from_iterator below
# (the recorded outputs show <unk>=0, <sos>=2, <eos>=3).
specials = ['<unk>','<pad>', '<sos>', '<eos>']

# NOTE(review): word_list is not used anywhere in the visible notebook.
word_list = []

def yield_tokens():
    """Yield one token list per text cell of the train and validation splits.

    For each split, the comment texts are visited first, then the video
    titles, then the video transcripts. Every cell is converted with ``str``
    before it is tokenized.
    """
    for split in (train_data, valid_data):
        text_columns = (
            split['comment']['en_content'],
            split['video']['title'],
            split['video']['transcript'],
        )
        for series in text_columns:
            for cell in series:
                yield tokenizer(str(cell))
        
# Build the vocabulary from the train+valid token stream with min_freq=2 and the special tokens.
vocabulary = build_vocab_from_iterator(yield_tokens(), specials=specials, min_freq=2)
# Tokens that are not in the vocabulary map to the index of <unk>.
vocabulary.set_default_index(vocabulary['<unk>'])

In [None]:
len(vocabulary)

50437

In [None]:
sentence = "i am happy".split()
indexs = vocabulary.forward(sentence)
indexs

[6, 166, 198]

In [None]:
vocabulary.forward(['<eos>', '<unk>', '<sos>'])

[3, 0, 2]

In [None]:
vocabulary.lookup_tokens(indexs)

['i', 'am', 'happy']

## 1.2 Dataset & Dataloader

In [None]:
class TextDataset(Dataset):
    """Dataset with one sample per comment row.

    Each sample pairs the video text (title + transcript) of the commented
    video with the comment itself, both converted to fixed-length lists of
    vocabulary indices wrapped in <sos>/<eos> and padded with <pad>.

    Args:
        data: dict with the DataFrames ``data['video']`` and ``data['comment']``.
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary object whose ``forward`` maps tokens to indices.
        video_length: fixed length of the video text sequence (tags included).
        comment_length: fixed length of the comment sequence (tags included).
    """

    def __init__(self, data, tokenizer=tokenizer, vocab=vocabulary, video_length=7000, comment_length = 32):
        self.video_length = video_length
        self.comment_length = comment_length

        self.tokenizer = tokenizer
        self.vocab = vocab

        self.video_df = data['video']
        self.comment_df = data['comment']

    def __getitem__(self, index):
        """Return ``(video_idxs, text_length, comment_idxs)`` tensors for comment row ``index``."""
        comment_row = self.comment_df.iloc[index]
        vid = comment_row['vid']

        # Target: the comment. tag() truncates over-long comments, so every
        # sample has exactly comment_length entries and batches stay rectangular.
        comment_tokens = self.tokenizer(str(comment_row['en_content']))
        comment_idxs = self.vocab.forward(self.tag(comment_tokens, self.comment_length))

        # Source: title and transcript of the video this comment belongs to.
        # .item() raises unless exactly one video row matches the id.
        video = self.video_df[self.video_df['vid'] == vid]
        title = str(video['title'].item())
        transcript = str(video['transcript'].item())
        video_tokens = self.tokenizer(' '.join([title, transcript]))
        video_idxs = self.vocab.forward(self.tag(video_tokens, self.video_length))

        # NOTE(review): the reported length is the padded length, not the number
        # of real tokens, so any consumer of it treats padding as input. This is
        # kept unchanged because other cells rely on the full-length shape.
        text_length = self.video_length

        return torch.tensor(video_idxs), torch.tensor(text_length),torch.tensor(comment_idxs)

    def __len__(self):
        """Number of comment rows."""
        return self.comment_df.shape[0]

    def tag(self, words, length):
        """Return ``words`` wrapped in <sos>/<eos> and padded to ``length`` tokens.

        Input longer than ``length - 2`` tokens is truncated first, so the
        result has exactly ``length`` entries whenever ``length >= 2``.
        The input list is left unmodified.
        """
        kept = list(words[:max(length - 2, 0)])
        tagged = ['<sos>'] + kept + ['<eos>']
        return tagged + ['<pad>'] * (length - len(tagged))
        

In [None]:
def collate_fn(data):
    """Collate ``(video_idxs, length, comment_idxs)`` samples into batch tensors.

    The samples are sorted in place by length, longest first, which is the
    order ``pack_padded_sequence`` expects by default.

    Returns:
        ``(video, length, comment)``: ``video`` and ``comment`` are int32
        tensors moved to ``device``; ``length`` is a float32 tensor left on
        the CPU.
    """
    data.sort(key=lambda sample: sample[1], reverse=True)
    videos, lengths, comments = zip(*data)
    # Stack the tensors directly: avoids the slow list-of-ndarrays constructor
    # and the intermediate float32 conversion of the token indices.
    video_batch = torch.stack(videos).int().to(device)
    length_batch = torch.stack(lengths).float()
    comment_batch = torch.stack(comments).int().to(device)
    return video_batch, length_batch, comment_batch

### Dataset 和 Dataloader 实例化

In [None]:
# One dataset per split, using the default tokenizer, vocabulary and sequence lengths.
train_dataset = TextDataset(train_data)
test_dataset = TextDataset(test_data)
valid_dataset = TextDataset(valid_data)

# Only the training loader shuffles; all three batch with collate_fn.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=16, collate_fn=collate_fn)


# 2. Model



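* `nn.Embedding(num_embeddings, embedding_dim)`
    * `num_embeddings`: vocabulary size
    * `embedding_dim`: size of each embedding vector, i.e. the output feature dimension
    * output shape: the input shape with `embedding_dim` appended, e.g. `(seq_len, batch_size, embedding_dim)`
* `nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)`
    * **QUESTION**: what is the `output` of LSTM?
        * Is it equal to the last hidden state? Not quite: `output` holds the top layer's hidden state at every time step, while `h_n` holds the final hidden state of every layer. For a unidirectional LSTM on unpadded input, `output[-1]` equals `h_n[-1]`.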

## 2.1 Encoder

In [None]:
class Encoder(nn.Module):
    """Embed a padded source batch and encode it with a multi-layer LSTM.

    No explicit time loop is needed: the LSTM already returns the top-layer
    hidden state of every time step, i.e. H = [h_1, h_2, ..., h_T].
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, rnn_layers, dropout_ratio):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embed_dim)

        self.dropout = nn.Dropout(dropout_ratio)

        self.rnn = nn.LSTM(embed_dim, hidden_dim, rnn_layers)

    def forward(self, src, src_len):
        """Encode ``src``.

        Args:
            src: token indices, [src_len, batch_size] (sequence first).
            src_len: per-sample sequence lengths, [batch_size], sorted in
                decreasing order as pack_padded_sequence requires by default.

        Returns:
            outputs: [max_len, batch_size, hidden_dim], top-layer state per step.
            context: [batch_size, hidden_dim], the last time step of ``outputs``.
            hidden:  [rnn_layers, batch_size, hidden_dim], final hidden states.
            cell:    [rnn_layers, batch_size, hidden_dim], final cell states.
        """
        embedded_seq = self.dropout(self.embedding(src))
        # embedded_seq [src_len, batch_size, embed_dim]

        # The lengths tensor must live on the CPU for packing.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_seq, src_len.to('cpu'))

        # An LSTM returns (hidden, cell); a GRU would return only hidden.
        packed_outputs, (hidden, cell) = self.rnn(packed_embedded)

        # Unpack back into a padded tensor; the returned lengths are not needed.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        # Use the last time step as the context vector. Indexing already
        # removes the time axis; a further squeeze(0) would wrongly drop the
        # batch axis whenever batch_size == 1.
        context = outputs[-1]  # [batch_size, hidden_dim]
        return outputs, context, hidden, cell

demo 中有很多处理数据形状的，其实按需求加维减维就好了 不需要特别头疼

## 2.2 Attention

In [None]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores every source position against one decoder-side state and returns
    the softmax-normalised weights.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attention = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        # Projects each energy vector to a single scalar score.
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        """Compute attention weights.

        Args:
            hidden: [batch_size, dec_hid_dim].
            encoder_outputs: [src_len, batch_size, enc_hid_dim].
            mask: boolean [batch_size, src_len]; positions that are True are
                filled with -1e10 before the softmax.

        Returns:
            Weights of shape [batch_size, src_len]; each row sums to 1.
        """
        steps = encoder_outputs.shape[0]

        # Copy the state once per source position: [batch_size, src_len, dec_hid_dim]
        tiled_state = hidden.unsqueeze(1).repeat(1, steps, 1)

        # Batch-first view of the encoder outputs: [batch_size, src_len, enc_hid_dim]
        keys = encoder_outputs.permute(1, 0, 2)

        # [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attention(torch.cat((tiled_state, keys), dim=2)))

        # [batch_size, src_len]
        scores = self.v(energy).squeeze(2)
        scores = scores.masked_fill(mask, -1e10)

        return F.softmax(scores, dim=1)

## 2.3 Decoder

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder whose output is combined with a gated context.

    ``gated_attention`` is a callable taking ``(encoder_outputs, decoder_output)``
    and returning a tensor of shape [1, batch_size, hidden_dim].
    """

    def __init__(self, output_dim, embed_dim, hidden_dim, rnn_layers, dropout_ratio, gated_attention):
        super().__init__()
        # The attention mechanism is applied inside the decoder.
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, rnn_layers)
        self.dropout = nn.Dropout(dropout_ratio)
        self.gated = gated_attention
        self.fcl = nn.Linear(hidden_dim + hidden_dim, output_dim)

    def forward(self, trg_word, encoder_outputs,hidden=None, cell=None):
        """Decode one target step.

        Args:
            trg_word: current input token per sample, [batch_size].
            encoder_outputs: passed unchanged to ``gated_attention`` as its
                first argument.
            hidden: previous hidden state, [rnn_layers, batch_size, hidden_dim].
            cell: previous cell state, [rnn_layers, batch_size, hidden_dim].
                The LSTM starts from its default state unless BOTH hidden
                and cell are given.

        Returns:
            prediction: [batch_size, output_dim], unnormalised vocabulary scores.
            hidden, cell: the updated LSTM state.
        """
        # A single word is not a sequence yet; add a time axis of length 1.
        trg_seq = trg_word.unsqueeze(0)
        # trg_seq [1, batch_size]

        embedded = self.dropout(self.embedding(trg_seq))
        # embedded [1, batch_size, embed_dim]

        # Identity checks: comparing a tensor with None via != relies on
        # fallback comparison behaviour.
        if hidden is not None and cell is not None:
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        else:
            output, (hidden, cell) = self.rnn(embedded)
        # output [1, batch_size, hidden_dim]
        # hidden [rnn_layers, batch_size, hidden_dim]
        # cell   [rnn_layers, batch_size, hidden_dim]

        gated_weight = self.gated(encoder_outputs, output).squeeze(0)
        # gated_weight [batch_size, hidden_dim]
        output = output.squeeze(0)
        # output [batch_size, hidden_dim]

        fcl_input = torch.cat((output, gated_weight), dim=1)
        # fcl_input [batch_size, hidden_dim + hidden_dim]
        prediction = self.fcl(fcl_input)

        return prediction, hidden, cell

## 2.4 Seq2Seq model

In [None]:
class GatedAttention(nn.Module):
    """Additive attention whose context vector is scaled by a sigmoid gate.

    The gate is computed from the decoder output alone and multiplied
    element-wise with the attention context.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.encoder_linear = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.decoder_linear = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.attention = nn.Linear(hidden_dim, 1, bias=False)
        self.gate = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, encoder_outputs, decoder_outputs):
        """Return the gated context.

        Args:
            encoder_outputs: [src_len, batch_size, hidden_dim].
            decoder_outputs: [1, batch_size, hidden_dim].

        Returns:
            Tensor of shape [1, batch_size, hidden_dim].
        """
        weighted_h = self.encoder_linear(encoder_outputs)
        # weighted_h [src_len, batch_size, hidden_dim]
        weighted_s = self.decoder_linear(decoder_outputs)
        # weighted_s [1, batch_size, hidden_dim]

        # Broadcasting adds weighted_s to every source position; no explicit
        # repeat along src_len is needed.
        weighted = torch.tanh(weighted_h + weighted_s)
        # weighted [src_len, batch_size, hidden_dim]

        score = self.attention(weighted)
        # score [src_len, batch_size, 1]
        score = F.softmax(score, dim=0)  # normalise over the source positions

        score = score.permute(1, 2, 0)
        # score [batch_size, 1, src_len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs [batch_size, src_len, hidden_dim]

        context = torch.bmm(score, encoder_outputs)
        # context [batch_size, 1, hidden_dim]
        context = context.permute(1, 0, 2)
        # context [1, batch_size, hidden_dim]

        m = torch.sigmoid(self.gate(decoder_outputs))
        # m [1, batch_size, hidden_dim]

        gated_weight = m.mul(context)
        # gated_weight [1, batch_size, hidden_dim]

        return gated_weight

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    The attention module is applied once, using the encoder context vector as
    the query; the resulting weighted encoder summary is what the decoder
    receives at every step.
    """

    def __init__(self, encoder, decoder, attention, vocab, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.vocab = vocab
        self.pad_idx = vocab['<pad>']
        self.device = device

    def create_mask(self, src):
        """Return a boolean mask that is True at the <pad> positions of ``src``.

        ``Attention.forward`` fills the positions where the mask is True with
        -1e10, so the mask has to mark the padding (not the real tokens) for
        the padding to be excluded from the attention weights.

        Args:
            src: token indices, [src_len, batch_size].

        Returns:
            Boolean tensor of shape [batch_size, src_len].
        """
        mask = src == self.pad_idx
        mask = mask.permute(1, 0)
        # mask [batch_size, src_len]
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        """Run the model over one batch.

        Args:
            src: [src_len, batch_size] source token indices.
            src_len: [batch_size] source lengths.
            trg: [trg_len, batch_size] target token indices; row 0 is <sos>.
            teacher_forcing_ratio: probability, drawn per step, of feeding the
                ground-truth token instead of the model's own prediction as
                the next decoder input.

        Returns:
            Tensor [trg_len, batch_size, vocab_size] of vocabulary scores.
            Row 0 is never written and stays all zeros.
        """
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        vocab_size = len(self.vocab)

        # Buffer collecting the decoder predictions of every step.
        decoder_outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        encoder_outputs, encoder_context, hidden, cell = self.encoder(src, src_len)
        # encoder_outputs [src_len, batch_size, enc_hidden_dim]
        # encoder_context [batch_size, enc_hidden_dim]

        # The first decoder input is <sos>.
        target_word = trg[0, :]

        # Attention over the source with padded positions masked out.
        mask = self.create_mask(src)
        a = self.attention(encoder_context, encoder_outputs, mask)
        # a [batch_size, src_len]
        a = a.unsqueeze(1)
        # a [batch_size, 1, src_len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs [batch_size, src_len, enc_hidden_dim]

        # Batched matrix product: one weighted sum of encoder states per sample.
        weighted_encoder_outputs = torch.bmm(a, encoder_outputs)
        # weighted_encoder_outputs [batch_size, 1, enc_hidden_dim]
        weighted_encoder_outputs = weighted_encoder_outputs.permute(1, 0, 2)
        # weighted_encoder_outputs [1, batch_size, enc_hidden_dim]

        # NOTE(review): the decoder is fed the single weighted summary instead
        # of the full encoder outputs; the original author flagged this input
        # as possibly incorrect and in need of checking.
        for t in range(1, trg_len):
            outputs, hidden, cell = self.decoder(target_word, weighted_encoder_outputs, hidden, cell)

            # outputs holds the predicted vocabulary scores for step t.
            decoder_outputs[t] = outputs

            teacher_force = random.random() < teacher_forcing_ratio

            # Most likely token according to the model.
            top1_word = outputs.argmax(1)
            target_word = trg[t] if teacher_force else top1_word

        return decoder_outputs

## 2.5 Train & Evaluation

In [None]:
# from torchtext.data.metrics import bleu_score
def train(model, loader, optimizer, clip, loss_fn=nn.CrossEntropyLoss()):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: called as ``model(src, src_len, trg)`` with sequence-first tensors.
        loader: yields batch-first ``(video_text, text_len, comment)`` batches.
        optimizer: optimizer stepped once per batch.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        loss_fn: loss applied to the flattened scores and targets.

    Returns:
        The mean batch loss of the pass.
    """
    model.train()
    running_loss = 0
    for src_batch, src_lengths, trg_batch in tqdm(loader):
        optimizer.zero_grad()

        # The loader is batch first; the recurrent model wants sequence first.
        src_seq = src_batch.permute(1, 0)
        trg_seq = trg_batch.permute(1, 0)

        scores = model(src_seq, src_lengths, trg_seq)
        vocab_dim = scores.shape[-1]

        # Step 0 corresponds to <sos> and is excluded from the loss; the rest
        # is flattened to (steps * batch, vocab) scores and (steps * batch) targets.
        flat_scores = scores[1:].view(-1, vocab_dim)
        flat_targets = trg_seq[1:].contiguous().long().view(-1)

        loss = loss_fn(flat_scores, flat_targets)
        loss.backward()

        # Rescale the gradients so that their overall norm does not exceed clip.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(loader)

In [None]:
def evaluate(model, loader,loss_fn=nn.CrossEntropyLoss(), teacher_forcing_ratio=0.5):
    """Compute the mean loss of ``model`` over ``loader`` without updating it.

    Args:
        model: called as ``model(src, src_len, trg, teacher_forcing_ratio)``
            with sequence-first tensors.
        loader: yields batch-first ``(video_text, text_length, comment)`` batches.
        loss_fn: loss applied to the flattened scores and targets.
        teacher_forcing_ratio: forwarded to the model. The default 0.5 matches
            what the model used when no ratio was passed; pass 0 to score the
            model purely on its own predictions.

    Returns:
        The mean batch loss.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for video_text, text_length, comment in tqdm(loader):
            # The loader is batch first; the recurrent model wants sequence first.
            video_text = video_text.permute(1,0)
            comment = comment.permute(1,0)

            output = model(video_text, text_length, comment, teacher_forcing_ratio)

            # Step 0 corresponds to <sos> and is excluded from the loss; the
            # rest is flattened to (steps * batch, vocab) / (steps * batch).
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            comment = comment[1:].contiguous().long().view(-1)

            loss = loss_fn(output, comment)
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

# 3. Train

In [None]:
# 训练参数
INPUT_DIM = len(vocabulary)
OUTPUT_DIM = len(vocabulary)

ENCODER_EMBEDDED_DIM = 256
DECODER_EMBEDDED_DIM = 256
HIDDEN_DIM = 1024
NUM_LAYERS = 2
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

# input_dim, embedding_dim, hidden_dim, rnn_layers, dropout_ratio
encoder = Encoder(INPUT_DIM, ENCODER_EMBEDDED_DIM, HIDDEN_DIM, NUM_LAYERS,ENCODER_DROPOUT).to(device)
attention = Attention(HIDDEN_DIM, HIDDEN_DIM).to(device)
gated_attention = GatedAttention(HIDDEN_DIM).to(device)
decoder = Decoder(OUTPUT_DIM, DECODER_EMBEDDED_DIM, HIDDEN_DIM, NUM_LAYERS, DECODER_DROPOUT,gated_attention).to(device)

model = Seq2Seq(encoder, decoder, attention,vocabulary, device).to(device)

In [None]:
def init_weights(m):
    """Fill every parameter reachable from ``m`` with values drawn uniformly from [-0.08, 0.08]."""
    for _, weight in m.named_parameters():
        nn.init.uniform_(weight.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(50437, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): LSTM(256, 1024, num_layers=2)
  )
  (decoder): Decoder(
    (embedding): Embedding(50437, 256)
    (rnn): LSTM(256, 1024, num_layers=2)
    (dropout): Dropout(p=0.5, inplace=False)
    (gated): GatedAttention(
      (encoder_linear): Linear(in_features=1024, out_features=1024, bias=False)
      (decoder_linear): Linear(in_features=1024, out_features=1024, bias=False)
      (attention): Linear(in_features=1024, out_features=1, bias=False)
      (gate): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (fcl): Linear(in_features=2048, out_features=50437, bias=True)
  )
  (attention): Attention(
    (attention): Linear(in_features=2048, out_features=1024, bias=True)
    (v): Linear(in_features=1024, out_features=1, bias=False)
  )
  (vocab): Vocab()
)

In [None]:
# model_path = os.path.join(WORK_DIR, 'Models','Pad+GateModel2', f'epoch5.pt')
# results = torch.load(model_path)
# model.load_state_dict(results['state_dict'])

In [None]:
# 超参数
# model 4 是带initweight
# Training hyperparameters
# Model 4 is the variant trained with init_weights applied.
EPOCHS = 10
CLIP = 1
# Optional: resume from an earlier checkpoint.
# model_path = os.path.join(WORK_DIR, 'Models','Pad+GateModel2', f'epoch5.pt')
# results = torch.load(model_path)
# model.load_state_dict(results['state_dict'])

optimizer = optim.Adam(model.parameters())
val_results = dict()
train_results = dict()
best_valid_loss = float('inf')

# Create the checkpoint folder up front so that torch.save cannot fail on a
# missing directory after a full epoch of training.
checkpoint_dir = os.path.join(WORK_DIR, 'Models','Pad+GateModel4')
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer= optimizer, clip=CLIP)
    valid_loss = evaluate(model, valid_loader)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    result_dict = {
        'train_loss':train_loss,
        'valid_loss':valid_loss,
        'state_dict': model.state_dict(),
    }

    val_results[epoch+1] = valid_loss
    train_results[epoch+1] = train_loss
    # Only write a checkpoint when the validation loss improves.
    if valid_loss < best_valid_loss:
        torch.save(result_dict, os.path.join(checkpoint_dir, f'epoch{epoch+1}.pt'))
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  import sys
  8%|▊         | 704/8544 [44:12<8:12:08,  3.77s/it]

In [None]:
# Manually store the most recent result_dict as epoch 1 of 'Pad+GateModel2'.
# NOTE(review): the training loop above writes to 'Pad+GateModel4' instead; confirm the intended folder.
torch.save(result_dict, os.path.join(WORK_DIR, 'Models','Pad+GateModel2', f'epoch1.pt'))

In [None]:
# Re-print the latest losses together with their perplexities (exp of the loss).
print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# Tabulate the per-epoch losses (rows ordered by epoch), add the perplexities
# (exp of the loss) and store the table next to the checkpoints.
result_df = pd.DataFrame({'train_loss':train_results,'valid_loss': val_results}).reindex(sorted(train_results.keys()))
result_df['model'] = 'Pad+GateModel2'
result_df['train_ppl'] = result_df['train_loss'].apply(math.exp)
result_df['valid_ppl'] = result_df['valid_loss'].apply(math.exp)

column_order = ['model', 'train_loss', 'valid_loss','train_ppl', 'valid_ppl']
result_df = result_df.reindex(columns = column_order)
result_df.to_csv(os.path.join(WORK_DIR, 'Models','Pad+GateModel2','results.csv'))

# 4. Evaluation

## 自定义testDataset

In [None]:
class TestTextDataset(Dataset):
    """Test-time dataset: like the training dataset, but every sample also carries its video id.

    Each sample pairs the video text (title + transcript) of the commented
    video with the comment itself, both converted to fixed-length lists of
    vocabulary indices wrapped in <sos>/<eos> and padded with <pad>.

    Args:
        data: dict with the DataFrames ``data['video']`` and ``data['comment']``.
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary object whose ``forward`` maps tokens to indices.
        video_length: fixed length of the video text sequence (tags included).
        comment_length: fixed length of the comment sequence (tags included).
    """

    def __init__(self, data, tokenizer=tokenizer, vocab=vocabulary, video_length=7000, comment_length = 32):
        self.video_length = video_length
        self.comment_length = comment_length

        self.tokenizer = tokenizer
        self.vocab = vocab

        self.video_df = data['video']
        self.comment_df = data['comment']

    def __getitem__(self, index):
        """Return ``(video_idxs, text_length, comment_idxs, vid)`` for comment row ``index``."""
        comment_row = self.comment_df.iloc[index]
        vid = comment_row['vid']

        # Target: the comment. tag() truncates over-long comments, so every
        # sample has exactly comment_length entries and batches stay rectangular.
        comment_tokens = self.tokenizer(str(comment_row['en_content']))
        comment_idxs = self.vocab.forward(self.tag(comment_tokens, self.comment_length))

        # Source: title and transcript of the video this comment belongs to.
        # .item() raises unless exactly one video row matches the id.
        video = self.video_df[self.video_df['vid'] == vid]
        title = str(video['title'].item())
        transcript = str(video['transcript'].item())
        video_tokens = self.tokenizer(' '.join([title, transcript]))
        video_idxs = self.vocab.forward(self.tag(video_tokens, self.video_length))

        # NOTE(review): the reported length is the padded length, not the number
        # of real tokens, so any consumer of it treats padding as input. This is
        # kept unchanged because other cells rely on the full-length shape.
        text_length = self.video_length

        return torch.tensor(video_idxs), torch.tensor(text_length),torch.tensor(comment_idxs), vid

    def __len__(self):
        """Number of comment rows."""
        return self.comment_df.shape[0]

    def tag(self, words, length):
        """Return ``words`` wrapped in <sos>/<eos> and padded to ``length`` tokens.

        Input longer than ``length - 2`` tokens is truncated first, so the
        result has exactly ``length`` entries whenever ``length >= 2``.
        The input list is left unmodified.
        """
        kept = list(words[:max(length - 2, 0)])
        tagged = ['<sos>'] + kept + ['<eos>']
        return tagged + ['<pad>'] * (length - len(tagged))
        

In [None]:
model_path = os.path.join(WORK_DIR, 'Models','Pad+GateModel2', 'epoch8.pt')
# map_location lets a checkpoint that was saved on a GPU be restored on
# whichever device this session runs on (including CPU-only sessions).
result = torch.load(model_path, map_location=device)
model.load_state_dict(result['state_dict'])

In [None]:
def test_collate_fn(data):
    """Collate ``(video_idxs, length, comment_idxs, vid)`` samples into a batch.

    The samples are sorted in place by length, longest first, which is the
    order ``pack_padded_sequence`` expects by default.

    Returns:
        ``(video, length, comment, vid)``: ``video`` and ``comment`` are int32
        tensors moved to ``device``; ``length`` is a float32 tensor left on
        the CPU; ``vid`` is the list of video ids in batch order.
    """
    data.sort(key=lambda sample: sample[1], reverse=True)
    videos, lengths, comments, vids = zip(*data)
    # Stack the tensors directly: avoids the slow list-of-ndarrays constructor
    # and the intermediate float32 conversion of the token indices.
    video_batch = torch.stack(videos).int().to(device)
    length_batch = torch.stack(lengths).float()
    comment_batch = torch.stack(comments).int().to(device)
    return video_batch, length_batch, comment_batch, list(vids)

In [None]:
import csv

# Generate comments for the test split at several teacher-forcing ratios and
# append one CSV row per sample: model, vid, ratio, reference comment, generated comment.
model.eval()
test_dataset = TestTextDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=test_collate_fn)
special_tokens = ('<pad>', '<eos>', '<sos>')
predictions_path = WORK_DIR+'Models/Pad+GateModel2/'+'predictions.csv'
with torch.no_grad():
    # Integer steps give exactly 0.0, 0.1, ..., 0.7. Accumulating 0.1 in a
    # float drifts (0.30000000000000004, ...) and runs one extra pass at ~0.8.
    for ratio_step in range(8):
        ratio = ratio_step / 10
        for video_texts, text_lens, comments, vid in tqdm(test_loader):
            # The loader is batch first; the recurrent model wants sequence first.
            video_texts = video_texts.permute(1,0)
            comments = comments.permute(1,0)
            outputs = model(video_texts,text_lens, comments, ratio)

            # Step 0 of the model output is never written (all-zero scores),
            # so decode from step 1 onwards; convert to plain int lists for the vocabulary lookup.
            predicted_ids = outputs[1:].argmax(2).permute(1, 0).tolist()
            reference_ids = comments.permute(1, 0).tolist()

            results = []
            for sample_vid, predicted, reference in zip(vid, predicted_ids, reference_ids):
                generated_comment = ' '.join(
                    word for word in vocabulary.lookup_tokens(predicted) if word not in special_tokens)
                comment = ' '.join(
                    word for word in vocabulary.lookup_tokens(reference) if word not in special_tokens)
                results.append(['PadGateModel', str(sample_vid), str(ratio), comment, generated_comment])

            # csv.writer quotes fields as needed and terminates each row itself.
            with open(predictions_path, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerows(results)

In [None]:
# 目前的prediction.csv 是 epoch 8

In [None]:
# Tabulate the prediction rows still held in `results` (the last processed batch).
# Each row starts with: model, vid, ratio, reference comment, generated comment;
# anything after the fifth field (e.g. a line terminator) is ignored.
output_df = pd.DataFrame(
    [row[:5] for row in results],
    columns=['model', 'vid', 'ratio', 'comment', 'prediction'],
)

In [None]:
import re

# Rebuild the loss table from the checkpoints stored on disk.
folder =  os.path.join(WORK_DIR, 'Models','Pad+GateModel2')
val_results = dict()
train_results = dict()
for file in os.listdir(folder):
    # Only 'epoch<N>.pt' files are checkpoints; the folder also receives
    # results.csv and predictions.csv. Parsing all digits keeps epoch 10+ correct.
    checkpoint_match = re.fullmatch(r'epoch(\d+)\.pt', file)
    if checkpoint_match is None:
        continue
    epoch = int(checkpoint_match.group(1))
    # map_location lets GPU-saved checkpoints load on the current device.
    result = torch.load(os.path.join(folder, file), map_location=device)
    val_results[epoch] = result['valid_loss']
    train_results[epoch] =  result['train_loss']

result_df = pd.DataFrame({'train_loss':train_results,'valid_loss': val_results})
result_df = result_df.reindex(sorted(train_results.keys()))
result_df['model'] = 'Pad+GateModel2'
# Perplexity is exp(loss).
result_df['train_ppl'] = result_df['train_loss'].apply(lambda x: math.exp(x))
result_df['valid_ppl'] = result_df['valid_loss'].apply(lambda x: math.exp(x))

result_df = result_df.reindex(columns = ['model', 'train_loss', 'valid_loss','train_ppl', 'valid_ppl'])
result_df.to_csv('results.csv')

In [None]:
import matplotlib.pyplot as plt
# Plot the train and validation perplexity columns of result_df against its index.
plt.plot(result_df['train_ppl'], label='train ppl')
plt.plot(result_df['valid_ppl'], label='valid ppl')
plt.legend()

In [None]:
import re

# Re-evaluate every stored checkpoint on the validation set and print the
# fresh loss next to the losses recorded at training time.
folder =  os.path.join(WORK_DIR, 'Models','Pad+GateModel2')
val_results = dict()
train_results = dict()
for file in os.listdir(folder):
    # Only 'epoch<N>.pt' files are checkpoints; the folder also receives
    # results.csv and predictions.csv. Parsing all digits keeps epoch 10+ correct.
    checkpoint_match = re.fullmatch(r'epoch(\d+)\.pt', file)
    if checkpoint_match is None:
        continue
    epoch = int(checkpoint_match.group(1))
    # map_location lets GPU-saved checkpoints load on the current device.
    result = torch.load(os.path.join(folder, file), map_location=device)
    model.load_state_dict(result['state_dict'])
    # NOTE(review): the label below says "0-valid loss", but evaluate() is
    # called without a teacher-forcing argument here; confirm which ratio is intended.
    valid_loss = evaluate(model, valid_loader)
    print(f'epoch{epoch}: ')
    print(f"train loss:{result['train_loss']} | 0.5-valid loss:{result['valid_loss']} | 0-valid loss:{valid_loss}" )