# HW.8 Seq2Seq

- 英文翻譯中文

    - 输入： 一句英文 （e.g. tom is a student .）
    - 输出： 中文翻译 （e.g. 湯姆 是 個 學生 。）
- TODO

    - 实现 Attention Mechanism
    - 实现 Beam Search
    - 实现 Scheduled Sampling

## Dataset



In [1]:
!gdown --id '1r4px0i-NcrnXy1-tkBsIwvYwbWnxAhcg' --output data.tar.gz
!tar -zxvf data.tar.gz
!mkdir ckpt
!ls

Downloading...
From: https://drive.google.com/uc?id=1r4px0i-NcrnXy1-tkBsIwvYwbWnxAhcg
To: /content/data.tar.gz
5.83MB [00:00, 18.5MB/s]
cmn-eng/
cmn-eng/int2word_cn.json
cmn-eng/int2word_en.json
cmn-eng/preprocess/
cmn-eng/preprocess/build_dataset.py
cmn-eng/preprocess/build_dictionary.sh
cmn-eng/preprocess/cmn.txt
cmn-eng/preprocess/cn.txt
cmn-eng/preprocess/dict.txt.big
cmn-eng/preprocess/dict.txt.small
cmn-eng/preprocess/en.txt
cmn-eng/preprocess/en_code.txt
cmn-eng/preprocess/en_refine.txt
cmn-eng/preprocess/en_vocab.txt
cmn-eng/preprocess/tokenizer.py
cmn-eng/testing.txt
cmn-eng/training.txt
cmn-eng/validation.txt
cmn-eng/word2int_cn.json
cmn-eng/word2int_en.json
ckpt  cmn-eng  data.tar.gz  drive  sample_data


In [2]:
import json
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimz
import numpy as np
import pandas as pd
import jieba
import random
from torch.utils.data import Dataset,DataLoader
from gensim.models import word2vec
import warnings
warnings.filterwarnings('ignore')

## Data preprocess

In [3]:
class data_preprocess():
    """Vocabulary loading, embedding construction and sequence encoding.

    The first four vocabulary indices are treated as special tags
    (<PAD>, <BOS>, <EOS>, <UNK>): index 0 is used for padding, 1 is
    prepended and 2 is appended to every encoded sequence.
    """

    def __init__(self, config):
        """Read `data_path` and `embed_dim` from config and load both vocabularies."""
        self.root = config.data_path
        self.embed_dim = config.embed_dim
        self.cn_int2word, self.cn_word2int = self.get_dictionary('cn')
        self.en_int2word, self.en_word2int = self.get_dictionary('en')

    def get_dictionary(self, language):
        """
        Load the vocabulary
        return:
            int2word, word2int : dic of vocabulary
        """
        # Explicit UTF-8 so decoding does not depend on the host locale.
        with open(os.path.join(self.root, f'int2word_{language}.json'), 'r', encoding = 'utf-8') as f:
            int2word = json.load(f)
        with open(os.path.join(self.root, f'word2int_{language}.json'), 'r', encoding = 'utf-8') as f:
            word2int = json.load(f)
        return int2word, word2int

    def train_word2vec(self, en_data, cn_data):
        """
        Train the word2vec model
        return:
            None
        """
        # NOTE(review): `size=` is the pre-4.0 gensim keyword - confirm the installed version.
        self.en_word2vec = word2vec.Word2Vec(en_data, size = self.embed_dim, window = 5, min_count = 2)
        self.cn_word2vec = word2vec.Word2Vec(cn_data, size = self.embed_dim, window = 5, min_count = 2)

    def added_embedding(self):
        """
        Initial tag embedding <PAD><BOS><EOS><UNK>
        return:
            tensor of 4 * embed_dim; row 0 (<PAD>) is all zeros,
            rows 1-3 are drawn uniformly from [0, 1)
        """
        # torch.empty would leave the <PAD> row as uninitialised memory.
        pad_vector = torch.zeros(1, self.embed_dim)
        vector = torch.empty(3, self.embed_dim)
        torch.nn.init.uniform_(vector)
        return torch.cat([pad_vector, vector], 0)

    def _build_matrix(self, word2int, w2v):
        """Build one vocab_size * embed_dim matrix from a vocabulary and its word2vec model."""
        special = self.added_embedding()
        table = np.empty((len(word2int), self.embed_dim))
        for word, index in word2int.items():
            # Words missing from word2vec fall back to the <UNK> row.
            table[index, :] = w2v[word] if word in w2v else special[3, :]
        # Rows 0-3 always come from the freshly initialised tag embeddings.
        return torch.cat([special, torch.tensor(table[4:], dtype = torch.float)], 0)

    def build_embedding(self):
        """
        Build the embedding matrix for nn.Embedding
        return:
            en_embedding,cn_embedding : tensor of vocab_size * embed_dim
        """
        self.en_embed_matrix = self._build_matrix(self.en_word2int, self.en_word2vec)
        self.cn_embed_matrix = self._build_matrix(self.cn_word2int, self.cn_word2vec)
        return self.en_embed_matrix, self.cn_embed_matrix

    def pad_sequence(self, seq, max_len):
        """
        Padding sequence '[BOS] sequences [EOS]' fixed length = max_len
        The input list is left unmodified.
        return:
            fixed length sequence
        """
        # Copy instead of insert() so the caller's list is not mutated.
        body = [1] + list(seq)
        # Reserve the last slot for <EOS> when the sequence is too long.
        body = body[:max_len - 1]
        body.append(2)
        return body + [0] * (max_len - len(body))

    def seq_preprocess(self, sequence, max_len):
        """
        Take the input word sequence to index sequence and padding to fixed length
        `sequence` is one line of the form '<english>\t<chinese>'.
        return:
            en_out,cn_out : list of fixed length
        """
        x = sequence.strip('\n').split('\t')
        # word to index
        en_unk = self.en_word2int['<UNK>']
        cn_unk = self.cn_word2int['<UNK>']
        en_idx = [self.en_word2int.get(word, en_unk) for word in x[0].split()]
        cn_idx = [self.cn_word2int.get(word, cn_unk) for word in x[1].split()]
        # padding
        en_out = self.pad_sequence(en_idx, max_len)
        cn_out = self.pad_sequence(cn_idx, max_len)
        return en_out,cn_out
     

### Load word2vec data

In [4]:
def load_train_data(path):
    """
    Load the corpus used to train word2vec.

    Each non-empty line of the file is expected to be
    '<english sentence>\t<chinese sentence>' with whitespace-separated tokens.
    Blank lines are skipped.
    return:
        en_data, cn_data : corpus for word2vec training (lists of token lists)
    """
    en_data, cn_data = [], []
    # Explicit UTF-8: the corpus contains Chinese text and must not depend on the locale.
    with open(path, 'r', encoding = 'utf-8') as f:
        for line in f:
            line = line.strip('\n')
            if not line:
                # A blank line has no tab-separated pair to read.
                continue
            seq = line.split('\t')
            en_data.append(seq[0].split())
            cn_data.append(seq[1].split())
    return en_data,cn_data

### Build data process

In [5]:
def build_data_process(path_list, config):
    """
    Gather the corpora in `path_list`, train word2vec on them and build
    the embedding matrices.
    return:
        process : class data_preprocess
        en_embedding : english word2vec | matrix of vocab_size * embed_dim
        cn_embedding : chinese word2vec | matrix of vocab_size * embed_dim
    """
    en_corpus, cn_corpus = [], []
    for corpus_path in path_list:
        en_sentences, cn_sentences = load_train_data(corpus_path)
        en_corpus.extend(en_sentences)
        cn_corpus.extend(cn_sentences)

    process = data_preprocess(config)
    process.train_word2vec(en_corpus, cn_corpus)
    en_embedding, cn_embedding = process.build_embedding()
    return process, en_embedding, cn_embedding

In [6]:
class TextDataSet(Dataset):
    """Dataset of (english, chinese) index sequences read from a tab-separated file.

    Every non-empty line of `path` is passed to `process.seq_preprocess`
    together with `max_len`; the two returned sequences are stored in
    `self.en` and `self.cn`. Blank lines are skipped.
    """

    def __init__(self, path, process, max_len):
        self.en, self.cn = [],[]
        # Explicit UTF-8 so reading Chinese text does not depend on the locale.
        with open(path, 'r', encoding = 'utf-8') as f:
            for line in f:
                if not line.strip('\n'):
                    # Blank lines carry no sentence pair.
                    continue
                en_seq, cn_seq = process.seq_preprocess(line, max_len)
                self.en.append(en_seq)
                self.cn.append(cn_seq)
        assert len(self.cn) == len(self.en)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.en)

    def __getitem__(self,index):
        """Return the pair at `index` as two numpy arrays (english, chinese)."""
        return np.array(self.en[index]), np.array(self.cn[index])

## Model

### Dynamic RNN

In [7]:
class Dynamic_RNN(nn.Module):
    """RNN / LSTM / GRU wrapper that runs on padded batches of variable length.

    The batch is sorted by decreasing length, packed, run through the
    recurrent layer and then restored to the caller's original order.
    """

    def __init__(self, input_dim, hidden_dim, Type = 'LSTM', num_layers = 1, nonlinearity = 'tanh', bias = True, batch_first = True, dropout = 0, bidirectional = False):
        """
        Type selects the recurrent layer: 'RNN', 'LSTM', anything else -> GRU.
        `nonlinearity` only applies to Type == 'RNN'; the remaining
        arguments are forwarded to the torch layer unchanged.
        """
        super(Dynamic_RNN, self).__init__()
        self.batch_first = batch_first
        self.rnn_type = Type
        if Type == 'RNN':
            # Forward `nonlinearity`, which was previously accepted but ignored.
            self.RNN = nn.RNN(input_size = input_dim, hidden_size = hidden_dim, num_layers = num_layers,
                              nonlinearity = nonlinearity, bias = bias,
                              dropout = dropout, batch_first = batch_first, bidirectional = bidirectional)
        elif Type == 'LSTM':
            self.RNN = nn.LSTM(input_size = input_dim, hidden_size = hidden_dim, num_layers = num_layers,bias = bias,
                               dropout = dropout, batch_first = batch_first, bidirectional = bidirectional)
        else:
            self.RNN = nn.GRU(input_size = input_dim, hidden_size = hidden_dim, num_layers = num_layers,bias = bias,
                              dropout = dropout, batch_first = batch_first, bidirectional = bidirectional)

    def forward(self,x,x_len,hidden):
        """
        x      : padded input batch
        x_len  : 1-D tensor with the true length of every sequence
        hidden : initial state (tuple (h_0, c_0) for LSTM) or None
        return:
            out, (h_t, c_t) in the original batch order; c_t is None
            unless the layer is an LSTM
        """
        # The batch axis of the input/output depends on batch_first;
        # hidden states always carry the batch on dim 1.
        batch_dim = 0 if self.batch_first else 1

        x_sort_idx = torch.sort(-x_len)[1].long()
        x_unsort_idx = torch.sort(x_sort_idx)[1].long()
        # pack_padded_sequence needs the lengths as an int64 tensor on the CPU.
        lengths = x_len[x_sort_idx].to(device = 'cpu', dtype = torch.int64)
        x = x.index_select(batch_dim, x_sort_idx)

        # A caller-supplied initial state must follow the same reordering as x.
        if hidden is not None:
            if self.rnn_type == 'LSTM':
                hidden = tuple(state.index_select(1, x_sort_idx) for state in hidden)
            else:
                hidden = hidden.index_select(1, x_sort_idx)

        x_embed_pad = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first = self.batch_first)
        if self.rnn_type == 'LSTM':
            out_pack,(h_t,c_t) = self.RNN(x_embed_pad,hidden)
            c_t = c_t.index_select(1, x_unsort_idx)
        else:
            out_pack, h_t = self.RNN(x_embed_pad,hidden)
            c_t = None
        h_t = h_t.index_select(1, x_unsort_idx)

        out = torch.nn.utils.rnn.pad_packed_sequence(out_pack, batch_first = self.batch_first)[0]
        out = out.index_select(batch_dim, x_unsort_idx)
        return out,(h_t,c_t)

### Encoder

In [8]:
class Encoder(nn.Module):
    """Encoder: pretrained embedding followed by a bidirectional GRU."""

    def __init__(self, embed_matrix, layers = 1, hidden_dim = 128):
        """
        embed_matrix : pretrained embedding weights (vocab_size * embed_dim)
        layers       : number of stacked GRU layers
        hidden_dim   : hidden size of each GRU direction
        """
        super(Encoder, self).__init__()
        self.embed_dim = embed_matrix.shape[1]
        self.embed = nn.Embedding.from_pretrained(embed_matrix)
        self.rnn = Dynamic_RNN(self.embed_dim, hidden_dim, Type = 'GRU',num_layers = layers, bidirectional = True)

    def forward(self, inputs):
        """
        inputs : index tensor in which 0 marks padding
        return:
            out : GRU outputs for every position
            h_t : final GRU hidden state
        """
        # Sequence length = number of non-padding (non-zero) indices per row.
        # Computed directly instead of wrapping an existing tensor in torch.tensor().
        inputs_len = (inputs != 0).sum(dim = -1).to(torch.float)
        tokens = self.embed(inputs)
        out,(h_t,_) = self.rnn(tokens, inputs_len, None)
        return out,h_t

### Attention

In [9]:
class Attention(nn.Module):
    """Dot-product attention followed by a linear projection.

    The query is first projected by `dense2`, scored against the keys by
    a dot product, the softmax-weighted sum of the values is formed, and
    finally [query, context] is projected by `dense1`.
    """

    def __init__(self, enc_dim, dec_dim, num_layers):
        super().__init__()
        # Both sizes are doubled internally.
        self.enc_dim = 2 * enc_dim
        self.dec_dim = 2 * dec_dim
        self.softmax = nn.Softmax(dim = 2)
        self.dense1 = nn.Linear(self.enc_dim + self.dec_dim, self.dec_dim, bias = False)
        self.dense2 = nn.Linear(self.enc_dim * num_layers, self.enc_dim)

    def forward(self, k, q, v):
        """
        k, v : keys / values - assumed (batch, src_len, enc_dim), TODO confirm
        q    : query - assumed (q_len, batch, enc_dim * num_layers), TODO confirm
        return:
            dense1 applied to the concatenation of projected query and context
        """
        query = self.dense2(q)
        # Move the batch axis first and the feature axis to the middle for the dot product.
        query_cols = query.permute(1, 2, 0).contiguous()
        scores = torch.matmul(k, query_cols)
        # Put the key axis last so the softmax (dim=2) normalises over it.
        weights = self.softmax(scores.permute(0, 2, 1).contiguous())
        context = torch.matmul(weights, v)
        query_rows = query.permute(1, 0, 2).contiguous()
        return self.dense1(torch.cat([query_rows, context], dim = -1))

### Decoder

In [10]:
class Decoder(nn.Module):
    """Decoder: pretrained embedding, unidirectional GRU, attention and an output MLP."""

    def __init__(self, embed_matrix, attention, layers = 1, hidden_dim = 128, dropout = 0.5):
        """
        embed_matrix : pretrained embedding weights (vocab_size * embed_dim)
        attention    : module called as attention(keys, query, values)
        layers       : number of stacked GRU layers
        hidden_dim   : the GRU hidden size is 2 * hidden_dim
        dropout      : dropout rate used by both the GRU and the output MLP
        """
        super(Decoder, self).__init__()
        self.embed_dim = embed_matrix.shape[1]
        self.out_dim = embed_matrix.shape[0]
        self.hid_dim = hidden_dim * 2
        self.embedding = nn.Embedding.from_pretrained(embed_matrix)
        self.attn = attention
        # Use the `dropout` argument instead of a hard-coded 0.5.
        self.RNN = nn.GRU(self.embed_dim, self.hid_dim, num_layers = layers, dropout = dropout ,batch_first = True, bidirectional = False)
        self.dense = nn.Sequential(
                        nn.Dropout(dropout),
                        nn.Linear(self.hid_dim*2, self.hid_dim*4),
                        nn.Linear(self.hid_dim*4, self.hid_dim*2),
                        nn.Linear(self.hid_dim*2, self.hid_dim),
                        nn.Linear(self.hid_dim, self.out_dim),
                    )

    def forward(self, inputs, hidden, encoder_out):
        """
        inputs      : target-side index tensor (batch first)
        hidden      : initial GRU hidden state
        encoder_out : encoder outputs, used as attention keys and values
        return:
            out : vocabulary scores for every input position
            h_t : final hidden states of all layers concatenated per batch
                  element, shape (1, batch, layers * hid_dim)
        """
        tokens = self.embedding(inputs)
        rnn_out, h_t = self.RNN(tokens, hidden)
        # h_t is (layers, batch, hid). Bring the batch axis first before
        # flattening so each row holds the layer states of ONE batch element;
        # a plain view(1, batch, -1) would mix different batch elements.
        h_t = h_t.permute(1, 0, 2).reshape(1, tokens.shape[0], -1)
        attn_out = self.attn(encoder_out, h_t, encoder_out)
        # Broadcast the single attention vector over every time step.
        attn_out = attn_out.expand(-1,rnn_out.shape[1],-1)
        dense_in = torch.cat([rnn_out, attn_out], dim = -1)
        out = self.dense(dense_in)
        return out,h_t

### Scheduled Sampling

In [11]:
def scheudle_sampling(Type,rate = 0.7):
    """
    Teacher-forcing ratio for the given phase.
    `rate` is accepted but currently not used.
    return:
        0.5 when Type == 'train', otherwise 0
    """
    return 0.5 if Type == 'train' else 0

### Beam Search

In [12]:
def beam_search(beam_list, k):
    """
    Placeholder for beam search - not implemented yet.
    The body is empty: both arguments are ignored and None is returned.
    """
    pass

### Seq2Seq

In [13]:
class Seq2Seq(nn.Module):
    """Sequence-to-sequence model: encode the source once, then decode step by step."""

    def __init__(self, encoder, decoder, device, num_layers):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.num_layers = num_layers

    def forward(self, inputs, target, teacher_forcing_ratio):
        """
        inputs                : source index tensor passed to the encoder
        target                : target index tensor (batch, target_len)
        teacher_forcing_ratio : probability of feeding the ground-truth token
        return:
            outputs : decoder scores (batch, target_len, vocab_size); step 0 stays zero
            preds   : predicted indices (batch, target_len); column 0 is 1
        """
        batch_size, target_len = target.shape[0], target.shape[1]
        vocab_size = self.decoder.out_dim

        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
        first_col = torch.ones(batch_size, 1)
        other_cols = torch.zeros(batch_size, target_len - 1)
        preds = torch.cat([first_col, other_cols], dim = 1)

        # encoder_out : [batch, seq_len,  num_directions * hidden_size]
        # encoder_ht  : [num_layers * num_directions, batch, hidden_size]
        encoder_out, encoder_ht = self.encoder(inputs)
        # Separate layers from directions, then join the two directions of every layer.
        per_layer = encoder_ht.view(self.num_layers, 2, batch_size, -1).contiguous()
        encoder_ht = torch.cat((per_layer[:, -2, :, :], per_layer[:, -1, :, :]), dim = 2)

        # The decoder input grows by one column per step; unfilled columns stay 0.
        decoder_in = torch.zeros((batch_size, target_len)).long()
        decoder_in[:, 0] = target[:, 0]
        for t in range(1, target_len):
            decoder_out, _ = self.decoder(decoder_in.to(self.device), encoder_ht, encoder_out)

            if t == decoder_out.shape[1] + 1:
                break
            prev = t - 1
            outputs[:, t] = decoder_out[:, prev]
            best = decoder_out.argmax(-1)

            # Teacher forcing is only used when the target column is not entirely zero.
            use_target = random.random() < teacher_forcing_ratio and torch.sum(target[:, t]) != 0
            decoder_in[:, t] = target[:, t] if use_target else best[:, prev]

            preds[:, t] = best[:, prev]

        return outputs, preds

## Utils

- Basic operation
    - Save Model
    - Load Model
    - Build Model
    - Tokens to Sequence
    - Compute BLEU score
    


### Save Model

In [14]:
def save_model(model, store_model_path, step):
    """
    Save the model's state_dict to '<store_model_path>/model_<step>.ckpt'.
    The '.ckpt' suffix matches the one load_model appends when reading.
    """
    torch.save(model.state_dict(), f'{store_model_path}/model_{step}.ckpt')

### Load Model

In [15]:
def load_model(model, load_model_path):
    """
    Load a state_dict from '<load_model_path>.ckpt' into `model`.
    return:
        the same model instance with the loaded weights
    """
    # map_location='cpu' lets a checkpoint saved from a GPU run load on a
    # CPU-only host; load_state_dict then copies the values into the
    # model's parameters on whatever device they live on.
    state_dict = torch.load(f'{load_model_path}.ckpt', map_location = 'cpu')
    model.load_state_dict(state_dict)
    return model

### Build Model

In [16]:
def build_model(config,en_embed,cn_embed):
    """
    Assemble the attention, encoder, decoder and Seq2Seq modules plus an
    Adam optimizer, optionally restore saved weights, and move the model
    to config.device.
    return:
        model, optimizer
    """
    hidden_dim, num_layers = config.hidden_dim, config.num_layers

    attention = Attention(hidden_dim, hidden_dim, num_layers)
    encoder = Encoder(en_embed, layers = num_layers, hidden_dim = hidden_dim)
    decoder = Decoder(cn_embed, attention, layers = num_layers,
                      hidden_dim = hidden_dim, dropout = config.dropout)
    model = Seq2Seq(encoder, decoder, config.device, num_layers)

    # The optimizer is created before the optional weight loading and device move.
    optimizer = optimz.Adam(model.parameters(),
                            lr = config.learning_rate,
                            weight_decay = config.weight_decay)

    if config.load_model:
        model = load_model(model, config.load_model_path)

    return model.to(config.device), optimizer

### Tokens to Sequence

In [17]:
def tokens_to_sequence(outputs, int2word):
    """
    Map every row of token indices to its list of words.
    Each row is cut at the first '<EOS>' (the marker itself is dropped);
    int2word is keyed by the index written as a string.
    return :
        list of word lists, one per row of `outputs`
    """
    def words_until_eos(tokens):
        # Lazily look up words so nothing after <EOS> is ever touched.
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                return
            yield word

    return [list(words_until_eos(tokens)) for tokens in outputs]

### Compute BLEU score

In [18]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def compute_bleu(sentences, targets):
    """
    Compute the BLEU score between the predict sequence and target.
    Both sides are split to character level first and scored with
    unigram weights (1, 0, 0, 0).
    return :
        sum of the sentence BLEU scores of one batch
    """
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        """
        Split the sentence into character level list
        return:
            tmp : the list of character tokens
        """
        tmp = []
        for token in sentence:
            # '<UNK>', digit strings and tokens whose first character is a
            # single UTF-8 byte are kept whole; everything else is split
            # into its individual characters.
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding = 'utf-8')) == 1:
                tmp.append(token)
            else:
                tmp.extend(token)
        return tmp

    score = 0
    for sentence, target in zip(sentences, targets):
        # Score the character-level hypothesis: the previous code stored the
        # split result under a misspelled name and scored the unsplit tokens.
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1,0,0,0))
    return score

## Config

In [19]:
class Config(object):
    """Hyper-parameters, paths and device selection for training and evaluation."""

    def __init__(self):
        # Model / batch sizes
        self.batch_size = 64
        self.embed_dim = 256
        self.hidden_dim = 512
        self.num_layers = 2
        self.dropout = 0.5

        # Optimisation
        self.learning_rate = 1e-4
        self.weight_decay = 0
        self.epoch_num = 30

        # Fixed length every sequence is padded / truncated to
        self.max_len = 8

        # Checkpoint loading and storage
        self.load_model = False
        self.load_model_path = ''
        self.store_model_path = "./ckpt"

        # Dataset location
        self.data_path = "./cmn-eng"

        # Prefer the GPU when one is available
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'


## Train & Test

### Train model

In [20]:
def train(model, optimizer, train_iter, loss_func, device):
    """
    Run one training epoch over `train_iter`.
    Gradients are clipped to a norm of 1 before every optimizer step.
    return:
        model, optimizer, mean loss per batch
    """
    model.train()
    model.zero_grad()
    running_loss = 0.0

    for batch in train_iter:
        sources, targets = batch[0].to(device), batch[1].to(device)
        logits, _ = model(sources, targets, scheudle_sampling('train'))

        # Position 0 is skipped on both sides; remaining steps are
        # flattened so the loss sees one row per target token.
        flat_logits = logits[:, 1:].reshape(-1, logits.size(2))
        flat_targets = targets[:, 1:].reshape(-1)
        loss = loss_func(flat_logits, flat_targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        running_loss += loss.item()

    return model, optimizer, running_loss / len(train_iter)

### Test model

In [21]:
def test(model, test_loader, loss_function, process, device):
    """
    Evaluate the model on `test_loader` without teacher forcing.
    return:
        mean loss per batch, mean BLEU score per sample
    """
    model.eval()
    loss_sum, bleu_score = 0.0, 0.0
    n = 0
    # No gradients are needed for evaluation; skipping autograd saves memory.
    with torch.no_grad():
        for data in test_loader:
            sources,targets = data[0].to(device),data[1].to(device)
            batch_size = sources.shape[0]
            outputs, preds = model(sources, targets, scheudle_sampling('val'))

            # Position 0 is skipped on both sides before computing the loss.
            outputs = outputs[:,1:].reshape(-1, outputs.shape[2])
            targets = targets[:,1:].reshape(-1)

            loss = loss_function(outputs, targets)
            loss_sum += loss.item()

            # Restore the (batch, steps) layout to decode sentence by sentence.
            targets = targets.view(batch_size, -1)
            preds = tokens_to_sequence(preds[:,1:], process.cn_int2word)
            targets = tokens_to_sequence(targets, process.cn_int2word)

            bleu_score += compute_bleu(preds, targets)
            n += batch_size

    return loss_sum / len(test_loader), bleu_score / n
        

### Train process

In [22]:
def train_process(config):
    """
    Build the data pipeline and model, train for config.epoch_num epochs
    and save a checkpoint whenever both validation loss and BLEU improve.
    return:
        model, criterion
    """
    # Take the corpus location from the config instead of hard-coding it,
    # consistent with where the vocabulary files are read from.
    train_path = os.path.join(config.data_path, 'training.txt')
    val_path = os.path.join(config.data_path, 'validation.txt')

    process, en_embed, cn_embed =  build_data_process([train_path, val_path],config)
    assert len(process.en_word2int) == en_embed.shape[0]
    assert len(process.cn_word2int) == cn_embed.shape[0]

    train_data = TextDataSet(train_path, process, config.max_len)
    train_loader = DataLoader(train_data, batch_size = config.batch_size, shuffle = True, drop_last = False)

    val_data = TextDataSet(val_path, process, config.max_len)
    # Validation batches are not shuffled so the batch-averaged loss is
    # comparable from epoch to epoch.
    val_loader = DataLoader(val_data, batch_size = config.batch_size, shuffle = False)

    model, optimizer = build_model(config, en_embed, cn_embed)
    criterion = nn.CrossEntropyLoss()
    print('------------------ Model Info ------------------')
    print(model)
    print('------------------ Optimizer -------------------')
    print(optimizer)
    print('------------------ Train Epoch -----------------')
    best_loss, best_bleu = 99999,0
    for epoch in range(config.epoch_num):
        model, optimizer, train_loss = train(model, optimizer, train_loader, criterion, config.device)

        val_loss, bleu = test(model, val_loader, criterion, process, config.device)

        # A checkpoint is written only when BOTH metrics improve.
        if val_loss < best_loss and bleu > best_bleu:
            best_loss, best_bleu = val_loss, bleu
            save_model(model,config.store_model_path,epoch+1)
            print(f"Epoch num is {epoch+1}, Best val loss is {val_loss}, Best Bleu socre is {best_bleu}")
        else:
            print(f"Epoch num is {epoch+1}, Bleu score is {bleu}")

    return model, criterion

### Test process

In [23]:
def test_process(config, model, criterion):
    """
    Evaluate a trained model on the testing split.
    return:
        test_loss, bleu_score
    """
    # Take the corpus location from the config instead of hard-coding it.
    test_path = os.path.join(config.data_path, 'testing.txt')

    process, en_embed, cn_embed = build_data_process([test_path], config)
    assert len(process.en_word2int) == en_embed.shape[0]
    assert len(process.cn_word2int) == cn_embed.shape[0]

    data = TextDataSet(test_path, process, config.max_len)
    # Evaluation does not need shuffling; a fixed order keeps the
    # batch-averaged loss reproducible.
    data_loader = DataLoader(data, batch_size = config.batch_size, shuffle = False, drop_last = False)

    test_loss, bleu_score = test(model, data_loader, criterion, process, config.device)

    return test_loss, bleu_score

In [24]:
def main():
    """Train with the default Config, then evaluate on the test split."""
    cfg = Config()
    trained_model, loss_fn = train_process(cfg)
    # The evaluation results are computed but not used further here.
    _test_loss, _bleu = test_process(cfg, trained_model, loss_fn)

In [25]:
# Run the full pipeline: training followed by evaluation on the test split.
main()

------------------ Model Info ------------------
Seq2Seq(
  (encoder): Encoder(
    (embed): Embedding(3922, 256)
    (rnn): Dynamic_RNN(
      (RNN): GRU(256, 512, num_layers=2, batch_first=True, bidirectional=True)
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(3805, 256)
    (attn): Attention(
      (softmax): Softmax(dim=2)
      (dense1): Linear(in_features=2048, out_features=1024, bias=False)
      (dense2): Linear(in_features=2048, out_features=1024, bias=True)
    )
    (RNN): GRU(256, 1024, num_layers=2, batch_first=True, dropout=0.5)
    (dense): Sequential(
      (0): Dropout(p=0.5, inplace=False)
      (1): Linear(in_features=2048, out_features=4096, bias=True)
      (2): Linear(in_features=4096, out_features=2048, bias=True)
      (3): Linear(in_features=2048, out_features=1024, bias=True)
      (4): Linear(in_features=1024, out_features=3805, bias=True)
    )
  )
)
------------------ Optimizer -------------------
Adam (
Parameter Group 0
    amsgrad: False
   