<a href="https://colab.research.google.com/github/Telescope-U/Video-Comment-Generator/blob/master/%5BMscProject%5DSeq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

import numpy as np
import pandas as pd

import math
import time
import os
import random
from tqdm import tqdm

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
# Mount Google Drive so the paths under /content/drive used below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on Google Drive; every other path below is derived from it.
WORK_DIR = '/content/drive/MyDrive/Colab Notebooks/[Msc]Video-Comment-Generator/'

# One folder per dataset split.
TRAIN_FOLDER = f'{WORK_DIR}Dataset/Train/'
VALID_FOLDER = f'{WORK_DIR}Dataset/Valid/'
TEST_FOLDER = f'{WORK_DIR}Dataset/Test/'

# Raw, unsplit tables.
VIDEO_PATH = f'{WORK_DIR}Dataset/video_data.csv'
COMMENT_PATH = f'{WORK_DIR}Dataset/comment_data.csv'

# 1. Dataset

In [None]:

def split_ids(video_ids, train=0.7, valid=0.1, test=0.2, seed=None):
    """Shuffle ``video_ids`` and cut them into train / valid / test groups.

    Args:
        video_ids: any iterable of ids (list, tuple, numpy array, pandas
            Series, ...). It is copied, never modified.
        train: fraction of ids placed in the first group.
        valid: fraction of ids placed in the second group.
        test: kept for backward compatibility only; the third group always
            receives every id not taken by the first two.
        seed: optional seed for a private random generator, which makes the
            split reproducible. With ``None`` the module-level ``random``
            state is used.

    Returns:
        A ``(train_ids, valid_ids, test_ids)`` tuple of lists.

    Raises:
        ValueError: if a fraction is negative or ``train + valid`` exceeds 1.
    """
    if train < 0 or valid < 0 or train + valid > 1:
        raise ValueError('train and valid must be non-negative and sum to at most 1')

    # list() accepts any iterable and always yields an independent copy.
    shuffled = list(video_ids)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    train_size = math.floor(len(shuffled) * train)
    valid_end = train_size + math.floor(len(shuffled) * valid)
    return shuffled[:train_size], shuffled[train_size:valid_end], shuffled[valid_end:]

In [None]:
# Load the raw tables, then split the distinct video ids into three groups
# and report how many ids landed in each.
video_df, comment_df = pd.read_csv(VIDEO_PATH), pd.read_csv(COMMENT_PATH)
train_ids, valid_ids, test_ids = split_ids(video_df['vid'].unique())
print(*(len(ids) for ids in (train_ids, valid_ids, test_ids)))

In [None]:
import string
def clean_text(text):
    """Return ``str(text)`` lower-cased with all ASCII punctuation removed.

    Non-string values are stringified first, so a missing value such as NaN
    becomes the literal text ``'nan'``.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return str(text).translate(drop_punctuation).lower()

# Normalise every free-text column that later feeds the vocabulary and datasets.
for frame, column in ((video_df, 'title'), (video_df, 'transcript'), (comment_df, 'en_content')):
    frame[column] = frame[column].apply(clean_text)

In [None]:
# Rows of each table that belong to the videos of each split.
train_video_df = video_df[video_df['vid'].isin(train_ids)]
valid_video_df = video_df[video_df['vid'].isin(valid_ids)]
test_video_df = video_df[video_df['vid'].isin(test_ids)]

train_comment_df = comment_df[comment_df['vid'].isin(train_ids)]
valid_comment_df = comment_df[comment_df['vid'].isin(valid_ids)]
test_comment_df = comment_df[comment_df['vid'].isin(test_ids)]

# Persist each split into the folder that get_data() reads from. The original
# code wrote to 'Dataset/...' relative to the working directory, which is not
# the Drive location the rest of the notebook loads.
for folder, split_video_df, split_comment_df in (
    (TRAIN_FOLDER, train_video_df, train_comment_df),
    (VALID_FOLDER, valid_video_df, valid_comment_df),
    (TEST_FOLDER, test_video_df, test_comment_df),
):
    video_csv = os.path.join(folder, 'video.csv')
    comment_csv = os.path.join(folder, 'comment.csv')

    # split_ids() is random, so re-running this cell produces a different
    # split. Keep a split that is already on disk: the vocabulary size and any
    # saved checkpoints depend on it. Delete the files to force a re-split.
    if os.path.exists(video_csv) and os.path.exists(comment_csv):
        print(f'Keeping existing split in {folder}')
        continue

    os.makedirs(folder, exist_ok=True)
    split_comment_df.to_csv(comment_csv, index=False)
    split_video_df.to_csv(video_csv, index=False)

In [None]:
def get_data(folder):
    """Load one dataset split from ``folder``.

    Reads ``comment.csv`` and ``video.csv`` inside ``folder`` and returns them
    as ``{'comment': DataFrame, 'video': DataFrame}``.
    """
    return {
        table: pd.read_csv(os.path.join(folder, f'{table}.csv'))
        for table in ('comment', 'video')
    }
        

In [None]:
# Load the three splits; each is a dict with a 'comment' and a 'video' DataFrame.
train_data, valid_data, test_data = (
    get_data(folder) for folder in (TRAIN_FOLDER, VALID_FOLDER, TEST_FOLDER)
)

In [None]:
# Display the training comments table (one row per comment, keyed by video id).
train_data['comment']

Unnamed: 0,vid,en_content
0,zCe72UH4Mto,full vlog posted
1,zCe72UH4Mto,i knew it was impossible for you to have just ...
2,zCe72UH4Mto,imagine if the friends that come over are like...
3,zCe72UH4Mto,yall own a whole grocery store
4,zCe72UH4Mto,holy cow i cant even imagine how much the elec...
...,...,...
136696,_1AjMAPtmtQ,its definitely worrying that climate change is...
136697,_1AjMAPtmtQ,last year in western oregon it was 117f or 47c...
136698,_1AjMAPtmtQ,im fine in a dry 110115f climate but i cannot ...
136699,_1AjMAPtmtQ,i aint seen our allies go through this of ever...


## 1.1 Vocabulary

In [None]:
from collections import Counter
tokenizer = get_tokenizer("basic_english")
# Reserved tokens: unknown word, padding, start of sequence, end of sequence.
specials = ['<unk>','<pad>', '<sos>', '<eos>']

word_list = []

def yield_tokens():
    """Yield one token list per text cell of the train and valid splits.

    For each split the order is: every comment, then every video title, then
    every video transcript. Each cell is stringified before tokenising.
    """
    for split in (train_data, valid_data):
        text_columns = (
            split['comment']['en_content'],
            split['video']['title'],
            split['video']['transcript'],
        )
        for column in text_columns:
            for text in column:
                yield tokenizer(str(text))

# Tokens seen fewer than twice are left out; anything not in the vocabulary
# is mapped to '<unk>' through the default index.
vocabulary = build_vocab_from_iterator(yield_tokens(), specials=specials, min_freq=2)
vocabulary.set_default_index(vocabulary['<unk>'])

In [None]:
# Number of entries in the vocabulary, special tokens included.
len(vocabulary)

50437

In [None]:
# Sanity check: map a short whitespace-split sentence to vocabulary indices.
sentence = "i am happy".split()
indexs = vocabulary.forward(sentence)
indexs

[6, 166, 198]

In [None]:
# Inverse mapping: turn the indices from the previous cell back into tokens.
vocabulary.lookup_tokens(indexs)

['i', 'am', 'happy']

## 1.2 Dataset & Dataloader

In [None]:
class TextDataset(Dataset):
    """Pairs every comment (target) with the text of its video (source).

    Each item is ``(video_idxs, comment_idxs)``: two 1-D tensors of vocabulary
    indices, moved to the notebook-level ``device``. The source is the video
    title followed by its transcript. Both sequences are wrapped in
    ``<sos>`` / ``<eos>`` and padded with ``<pad>`` to a fixed length, so the
    default DataLoader collation can stack them.

    Args:
        data: dict with a ``'video'`` DataFrame (columns ``vid``, ``title``,
            ``transcript``) and a ``'comment'`` DataFrame (columns ``vid``,
            ``en_content``).
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary whose ``forward`` maps tokens to indices.
        video_length: fixed length of the source sequence, tags included.
        comment_length: fixed length of the target sequence, tags included.
    """

    def __init__(self, data, tokenizer=tokenizer, vocab=vocabulary, video_length=7000, comment_length = 32):
        self.video_length = video_length
        self.comment_length = comment_length

        self.tokenizer = tokenizer
        self.vocab = vocab

        self.video_df = data['video']
        self.comment_df = data['comment']

    def __getitem__(self, index):
        row = self.comment_df.iloc[index]
        vid = row['vid']

        # trg: the comment itself
        tokenized_comment = self.tokenizer(str(row['en_content']))
        comment_idxs = self.vocab.forward(self.tag(tokenized_comment, self.comment_length))

        # src: title + transcript of the commented video.
        # .item() raises unless exactly one video row matches this vid.
        video = self.video_df[self.video_df['vid'] == vid]
        title = str(video['title'].item())
        transcript = str(video['transcript'].item())

        tokenized_video_texts = self.tokenizer(' '.join([title, transcript]))
        video_idxs = self.vocab.forward(self.tag(tokenized_video_texts, self.video_length))

        return torch.tensor(video_idxs).to(device), torch.tensor(comment_idxs).to(device)

    def __len__(self):
        return self.comment_df.shape[0]

    def tag(self, words, length):
        """Return ``<sos> words <eos>`` truncated and padded to ``length`` tokens.

        ``words`` is cut to ``length - 2`` tokens first so the result is always
        exactly ``length`` long (for ``length >= 2``). Previously only the
        video text was truncated, so a long comment produced an over-long
        tensor that the default collation cannot stack. The input list is not
        modified.
        """
        kept = list(words[:max(length - 2, 0)])
        tagged = ['<sos>'] + kept + ['<eos>']
        return tagged + ['<pad>'] * (length - len(tagged))
        

In [None]:
# Wrap every split in a dataset, then in a loader. Only the training loader shuffles.
BATCH_SIZE = 32

train_dataset, test_dataset, valid_dataset = (
    TextDataset(split) for split in (train_data, test_data, valid_data)
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)


In [None]:
# Peek at the first training batch only: print the shape and device of the
# source (video text) and target (comment) tensors, then stop.
for video, comment in train_loader:
    print(video.shape,video.device)
    print(comment.shape,comment.device)
    break

torch.Size([32, 7000]) cuda:0
torch.Size([32, 32]) cuda:0


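* `nn.Embedding(num_embeddings, embedding_dim)`
    * `num_embeddings`: vocabulary size
    * `embedding_dim`: embedding vector size
    * output shape: the input shape with `embedding_dim` appended, e.g. a `(seq_len, batch_size)` index tensor gives `(seq_len, batch_size, embedding_dim)`
* `nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)`
    * **QUESTION**: what is the `output` of LSTM?
        * equal to last hidden of output?
        * **Answer**: `output` holds the hidden state of the *top* layer at *every* time step, shape `(seq_len, batch_size, hidden_size)`. For a unidirectional LSTM its last time step `output[-1]` equals `hidden[-1]`, while `hidden` and `cell` hold the final state of every layer, shape `(num_layers, batch_size, hidden_size)`.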

In [None]:
class Encoder(nn.Module):
    """Embeds a source sequence and summarises it with a multi-layer LSTM.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size.
        rnn_layers: number of stacked LSTM layers.
        dropout_ratio: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, rnn_layers, dropout_ratio):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.dropout = nn.Dropout(p=dropout_ratio)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=rnn_layers)

    def forward(self, src):
        """Return the final ``(hidden, cell)`` state of the LSTM for ``src``.

        ``src`` is a sequence-first index tensor ``(src_len, batch_size)``;
        the LSTM is built with the default ``batch_first=False``. The
        per-step outputs are discarded.
        """
        # (src_len, batch_size) -> (src_len, batch_size, embedding_dim)
        token_vectors = self.dropout(self.embedding(src))
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next token over the vocabulary.

    Args:
        output_dim: target vocabulary size (embedding rows and logit count).
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size.
        rnn_layers: number of stacked LSTM layers.
        dropout_ratio: dropout probability applied to the embeddings.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, rnn_layers,  dropout_ratio):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=rnn_layers)
        self.fcl = nn.Linear(in_features=hidden_dim, out_features=output_dim, bias=False)
        self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, trg_word, hidden, cell):
        """Advance the decoder by one token.

        Args:
            trg_word: ``(batch_size,)`` indices of the current input token.
            hidden, cell: LSTM state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the
            ``(batch_size, output_dim)`` logits and the other two are the
            updated LSTM state.
        """
        # Add a leading sequence axis of length 1: (batch_size,) -> (1, batch_size).
        one_step = trg_word.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(one_step))

        step_output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        # Drop the length-1 sequence axis before projecting to vocabulary logits.
        prediction = self.fcl(step_output.squeeze(0))
        return prediction, hidden, cell
        
        
        
        

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module mapping ``src`` to an initial ``(hidden, cell)`` state.
        decoder: module mapping ``(token, hidden, cell)`` to
            ``(logits, hidden, cell)`` for a single step.
        vocab: target vocabulary; only ``len(vocab)`` is used, as the size of
            the logit axis.
    """

    def __init__(self, encoder, decoder, vocab):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.vocab = vocab

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` conditioned on ``src``.

        Args:
            src: sequence-first source indices ``(src_len, batch_size)``.
            trg: sequence-first target indices ``(trg_len, batch_size)``;
                ``trg[0]`` is fed as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own argmax as the next input. Use ``0`` for free
                running generation.

        Returns:
            Logits of shape ``(trg_len, batch_size, len(vocab))``. Row 0 is
            never predicted and stays all zeros, so callers should skip it.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]

        # Allocate directly on the device of the inputs instead of relying on
        # a notebook-level global and a CPU -> device copy.
        decoder_outputs = torch.zeros(trg_len, batch_size, len(self.vocab), device=trg.device)

        # The encoder's final state initialises the decoder.
        hidden, cell = self.encoder(src)

        target_word = trg[0, :]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(target_word, hidden, cell)
            decoder_outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio
            top1_word = output.argmax(1)
            target_word = trg[t] if teacher_force else top1_word

        return decoder_outputs
        
        
        

## 2.2 Train & Evaluation

In [None]:
def train(model, loader, optimizer, clip, loss_fn=nn.CrossEntropyLoss()):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, trg)`` with sequence-first tensors and
            expected to return ``(trg_len, batch, vocab)`` logits.
        loader: yields batch-first ``(video_text, comment)`` index tensors.
        optimizer: optimiser stepped once per batch.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        loss_fn: loss applied to the flattened logits and targets.
    """
    # NOTE(review): the default loss has no ignore_index, so any padding
    # tokens present in the targets are scored like real words.
    model.train()
    running_loss = 0
    for src_batch, trg_batch in tqdm(loader):
        optimizer.zero_grad()

        # batch-first -> sequence-first, as the recurrent layers expect
        src_seq = src_batch.permute(1, 0)
        trg_seq = trg_batch.permute(1, 0)

        logits = model(src_seq, trg_seq)
        vocab_size = logits.shape[-1]

        # Step 0 is the start token and is never predicted, so drop it on both sides.
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg_seq[1:].contiguous().long().view(-1)

        batch_loss = loss_fn(flat_logits, flat_targets)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [None]:
def evaluate(model, loader,loss_fn=nn.CrossEntropyLoss()):
    """Return the mean per-batch loss of ``model`` over ``loader``, without gradients.

    Args:
        model: called as ``model(src, trg, 0)`` with sequence-first tensors
            and expected to return ``(trg_len, batch, vocab)`` logits.
        loader: yields batch-first ``(video_text, comment)`` index tensors.
        loss_fn: loss applied to the flattened logits and targets.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for video_text, comment in tqdm(loader):

            # batch-first -> sequence-first
            video_text = video_text.T.contiguous()
            comment = comment.T.contiguous()

            # Teacher forcing is switched off (ratio 0): the model must feed
            # on its own predictions, and the loss no longer depends on the
            # random per-step coin flip, so it is repeatable.
            output = model(video_text, comment, 0)
            output_dim = output.shape[-1]

            # Step 0 is the start token and is never predicted.
            output = output[1:].view(-1, output_dim)
            # Cast to long like train() does; the loss needs int64 class indices.
            comment = comment[1:].long().view(-1)

            loss = loss_fn(output, comment)
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns ``(minutes, seconds)``, both truncated to integers.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Source and target share one vocabulary, so both dimensions are its size.
INPUT_DIM = OUTPUT_DIM = len(vocabulary)

ENCODER_EMBEDDED_DIM = 256
DECODER_EMBEDDED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

# The encoder's final state seeds the decoder, so both use the same
# HIDDEN_DIM and NUM_LAYERS.
encoder = Encoder(
    input_dim=INPUT_DIM,
    embedding_dim=ENCODER_EMBEDDED_DIM,
    hidden_dim=HIDDEN_DIM,
    rnn_layers=NUM_LAYERS,
    dropout_ratio=ENCODER_DROPOUT,
).to(device)
decoder = Decoder(
    output_dim=OUTPUT_DIM,
    embedding_dim=DECODER_EMBEDDED_DIM,
    hidden_dim=HIDDEN_DIM,
    rnn_layers=NUM_LAYERS,
    dropout_ratio=DECODER_DROPOUT,
).to(device)

model = Seq2Seq(encoder=encoder, decoder=decoder, vocab=vocabulary).to(device)

In [None]:
# Restore the weights saved after epoch 5 of an earlier run.
model_path = os.path.join(WORK_DIR, 'Models', 'Seq2SeqModel2', 'epoch5.pt')
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only one.
results = torch.load(model_path, map_location=device)
model.load_state_dict(results['state_dict'])

<All keys matched successfully>

In [None]:
EPOCHS = 10
CLIP = 1
# Number of epochs the current weights have already been trained for.
# Leave at 0 for a fresh run; set it to N after loading epochN.pt so the
# checkpoints of the earlier run are not overwritten.
START_EPOCH = 0
# NOTE(review): only model weights are stored in the checkpoints, so Adam
# restarts with a fresh state when training is resumed.
optimizer = optim.Adam(model.parameters())

checkpoint_dir = os.path.join(WORK_DIR, 'Models', 'Seq2SeqModel2')
os.makedirs(checkpoint_dir, exist_ok=True)

best_valid_loss = float('inf')

for epoch in range(START_EPOCH, START_EPOCH + EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer=optimizer, clip=CLIP)
    valid_loss = evaluate(model, valid_loader)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    result_dict = {
        'train_loss': train_loss,
        'valid_loss': valid_loss,
        'state_dict': model.state_dict(),
    }
    # One checkpoint per epoch ...
    torch.save(result_dict, os.path.join(checkpoint_dir, f'epoch{epoch+1}.pt'))
    # ... plus a copy of the best one so far by validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(result_dict, os.path.join(checkpoint_dir, 'best.pt'))

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
class TestTextDataset(Dataset):
    """Test-time variant of the text dataset that also returns the video id.

    Each item is ``(video_idxs, text_length, comment_idxs, vid)``. The tensors
    stay on the CPU; moving them to the device is left to the collate function.

    Args:
        data: dict with a ``'video'`` DataFrame (columns ``vid``, ``title``,
            ``transcript``) and a ``'comment'`` DataFrame (columns ``vid``,
            ``en_content``).
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary whose ``forward`` maps tokens to indices.
        video_length: fixed length of the source sequence, tags included.
        comment_length: fixed length of the target sequence, tags included.
    """

    def __init__(self, data, tokenizer=tokenizer, vocab=vocabulary, video_length=7000, comment_length = 32):
        self.video_length = video_length
        self.comment_length = comment_length

        self.tokenizer = tokenizer
        self.vocab = vocab

        self.video_df = data['video']
        self.comment_df = data['comment']

    def __getitem__(self, index):
        row = self.comment_df.iloc[index]
        vid = row['vid']

        # trg: the reference comment
        tokenized_comment = self.tokenizer(str(row['en_content']))
        comment_idxs = self.vocab.forward(self.tag(tokenized_comment, self.comment_length))

        # src: title + transcript of the commented video.
        # .item() raises unless exactly one video row matches this vid.
        video = self.video_df[self.video_df['vid'] == vid]
        title = str(video['title'].item())
        transcript = str(video['transcript'].item())

        tokenized_video_texts = self.tokenizer(' '.join([title, transcript]))

        # NOTE(review): this is the padded length, identical for every item,
        # not the number of real tokens. Confirm which one consumers expect.
        text_length = self.video_length

        video_idxs = self.vocab.forward(self.tag(tokenized_video_texts, self.video_length))

        return torch.tensor(video_idxs), torch.tensor(text_length),torch.tensor(comment_idxs), vid

    def __len__(self):
        return self.comment_df.shape[0]

    def tag(self, words, length):
        """Return ``<sos> words <eos>`` truncated and padded to ``length`` tokens.

        ``words`` is cut to ``length - 2`` tokens first so the result is always
        exactly ``length`` long (for ``length >= 2``). Previously only the
        video text was truncated, so a long comment produced an over-long
        sequence that cannot be batched with the others. The input list is
        not modified.
        """
        kept = list(words[:max(length - 2, 0)])
        tagged = ['<sos>'] + kept + ['<eos>']
        return tagged + ['<pad>'] * (length - len(tagged))
    
def test_collate_fn(data):
    """Collate ``(video_idxs, text_length, comment_idxs, vid)`` items into a batch.

    Items are ordered by ``text_length``, longest first (ties keep their
    original order).

    Returns:
        ``(video, length, comment, vid)`` where ``video`` and ``comment`` are
        int32 tensors on the notebook-level ``device``, ``length`` is a
        float32 CPU tensor and ``vid`` is a list of video ids.
    """
    ordered = sorted(data, key=lambda row: row[1], reverse=True)

    # torch.stack keeps the integer dtype and avoids the slow
    # torch.Tensor(list-of-ndarrays) path, which also passed the token ids
    # through float32.
    video = torch.stack([row[0] for row in ordered]).int().to(device)
    length = torch.stack([torch.as_tensor(row[1]) for row in ordered]).float()
    comment = torch.stack([row[2] for row in ordered]).int().to(device)
    vid = [row[3] for row in ordered]
    return video, length, comment, vid

In [None]:
model.eval()
test_dataset = TestTextDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=test_collate_fn)
# Teacher-forcing ratio used while generating; it is recorded in every output row.
ratio = 0.5
predictions_path = WORK_DIR+'Models/Seq2SeqModel2/'+'predictions.csv'


def _indices_to_text(token_ids):
    """Turn vocabulary indices into a sentence, stopping at the first <eos>.

    <sos> and <pad> tokens are dropped.
    """
    words = []
    for token in vocabulary.lookup_tokens(token_ids):
        if token == '<eos>':
            break
        if token not in ('<pad>', '<sos>'):
            words.append(token)
    return ' '.join(words)


with torch.no_grad():
    for video_texts, text_lens, comments, vid in tqdm(test_loader):
        results = []
        # Batches are batch-first; the recurrent model expects sequence-first input.
        outputs = model(video_texts.permute(1, 0), comments.permute(1, 0), ratio)

        # Step 0 of the output is never predicted (all-zero logits), so its
        # argmax would always come out as index 0; skip it.
        # (trg_len - 1, batch, vocab) -> (batch, trg_len - 1) plain Python ints.
        predicted = outputs[1:].argmax(2).permute(1, 0).tolist()
        references = comments.tolist()

        for i in range(len(predicted)):
            generated_comment = _indices_to_text(predicted[i])
            comment = _indices_to_text(references[i])
            # NOTE(review): the model label says 'PadGateModel' although the
            # file is written under Seq2SeqModel2 - confirm it is intended.
            results.append(['PadGateModel',vid[i], str(ratio), comment, generated_comment, '\n'])

        # Append mode: re-running this cell adds duplicate rows to the file.
        with open(predictions_path, 'a', encoding='utf-8') as f:
            f.writelines([','.join(line) for line in results])

100%|██████████| 304/304 [10:57<00:00,  2.16s/it]
