In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import os
import random
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from gensim.models import KeyedVectors

from utils import load_embed, save_embed, get_embedding

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
from data import Vocab
import pandas as pd

In [3]:
config = {
    'experiment_name': 'seq2seq_baseline',
    'task': 'train',
    'make_dict': False,
    'data_preprocessing': False,
    'src_lang': 'chinese',
    'tgt_lang': 'english',
    'max_length': 60,

    'ckpt_dir': 'ckpt/',

    'training':{
        'num_epochs': 20,
        'learning_rate': 0.01,
        'optimizer': 'sgd'
    },
    
    'embedding':{
        'cn_embed_path': 'data/sgns.merge.bigram.bz2',
        'en_embed_path': 'data/wiki.en.vec',
        'cur_cn_embedding_path': 'data/cn_embed.pkl',
        'cur_en_embedding_path': 'data/en_embed.pkl'
    },
        
    'model':{
        'fc_dim': 100,
        'name': 'seq2seq',
        'embed_size': 300,
        'batch_size': 64,
        'embedding_freeze': False,
        'encoder':{
            'hidden_size': 150,
            'num_layers': 1,
            'bidirectional': False,
            'dropout': 0.5,
        },  
        'decoder':{
            'hidden_size': 150,
            'num_layers': 1,
            'bidirectional': False,
            'dropout': 0.5,
        },
        'xgboost':{
            # 
        }
    },   
    
    'result':{
        'filename':'result.txt',
        'filepath':'res/',
    }
}

In [4]:
cn = pd.read_csv('data/cn_split.csv')
en = pd.read_csv('data/en.csv')
pair = pd.concat([cn, en], axis=1)

In [5]:
# split dataset
msk = np.random.rand(len(pair)) < 0.8
train = pair[msk]
valid = pair[~msk]

In [6]:
from torch.utils.data import Dataset
from collections import Counter

class myDS(Dataset):

    def __init__(self, df, src_lang, tgt_lang, src_sents, tgt_sents):
        # Assign vocabularies.
        self.src = df[src_lang].tolist()
        self.tgt = df[tgt_lang].tolist()
        self.src_vocab = Vocab(src_sents, sos_token='<sos>', eos_token='<eos>', unk_token='<unk>')
        self.tgt_vocab = Vocab(tgt_sents, sos_token='<sos>', eos_token='<eos>', unk_token='<unk>')
        
    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        # Split sentence into words.
        src_words = self.src[idx].split()
        tgt_words = self.tgt[idx].split()

        # Add <SOS> and <EOS> tokens.
        src_words = [self.src_vocab.sos_token] + src_words + [self.src_vocab.eos_token]
        tgt_words = [self.tgt_vocab.sos_token] + tgt_words + [self.tgt_vocab.eos_token]

        # Lookup word ids in vocabularies.
        src_ids = [self.src_vocab.word2id(word) for word in src_words]
        tgt_ids = [self.tgt_vocab.word2id(word) for word in tgt_words]

#         print(src_words)
#         print(tgt_words)
        
        return src_ids, tgt_ids

In [7]:
# src_lang = 'chinese'
# tgt_lang = 'english'

src_lang = config['src_lang']
tgt_lang = config['tgt_lang']

# All senteneces (including train and valid)
src_sents = pair[src_lang].tolist()
tgt_sents = pair[tgt_lang].tolist() 

train_ds = myDS(train, src_lang, tgt_lang, src_sents, tgt_sents)
valid_ds = myDS(valid, src_lang, tgt_lang, src_sents, tgt_sents)

In [8]:
print('Preparing {} - {} NMT Model.\n'.format(src_lang, tgt_lang))
print('Preparing {} Training sentence pairs; {} Validation pairs.\n\nWith {} source language words; {} target language words.'.format(
    train_ds.__len__(), valid_ds.__len__(), len(train_ds.src_vocab._id2word), len(train_ds.tgt_vocab._id2word)))

Preparing chinese - english NMT Model.

Preparing 80044 Training sentence pairs; 19956 Validation pairs.

With 53717 source language words; 35029 target language words.


In [82]:
train_dataloader = DataLoader(dataset=train_ds, shuffle=True, batch_size=64)

In [83]:
src_embed = config['src_embedding_matrix']

In [89]:
for idx, data in enumerate(train_dataloader, 0):
    src = data[0]
    outs= []
    h, c = enc.initHiddenCell()
    for i in range(len(src)):
        out,h,c = enc(src[i], h, c)
#         print(out)
        outs.append(out)
    
    for j in range(len(tgt)):
        
    
    
    print(src)
    print(tgt)
    break

tensor([[[-0.0984,  0.3723, -0.5471,  0.6153,  0.1835,  0.1941, -0.1816,
          -0.0765,  0.4914,  0.0046,  0.3725, -0.1949,  0.0384,  0.1225,
          -0.0020,  0.3702,  0.0790, -0.5191,  0.3687,  0.3941,  0.2960,
          -0.0864,  0.0015, -0.4496, -0.0343,  0.0303,  0.1771,  0.1550,
           0.0901,  0.2889, -0.0573,  0.0665,  0.0841, -0.2436, -0.2808,
          -0.3468,  0.3524, -0.2269, -0.1008,  0.1054, -0.0217, -0.3162,
          -0.1317, -0.2450,  0.1043,  0.3874,  0.0106,  0.0504, -0.3197,
          -0.2544,  0.5210,  0.1725, -0.1383, -0.0931, -0.0950,  0.1365,
           0.3695, -0.1193,  0.0436, -0.4731, -0.4016,  0.1256, -0.1041,
           0.0385, -0.1871, -0.0948,  0.0303,  0.1189, -0.2352,  0.1065,
           0.0145, -0.1852,  0.0066, -0.3150, -0.1500,  0.1299,  0.3527,
           0.0238,  0.4453,  0.3363,  0.5722, -0.1144, -0.2895, -0.0655,
          -0.0344, -0.0216, -0.1409, -0.3890,  0.2270,  0.1759,  0.2728,
          -0.3442,  0.2637,  0.0383,  0.1057,  0.27

In [9]:
def get_en_embedding(word_dict, embedding_path, embedding_dim=300):
    """

    :param word_dict: vocabulary words' list
    :param embedding_path: pre-trained embedding path
    :param embedding_dim: embedding dimensions
    :return:
    """
    # find existing word embeddings
    word_vec = {}
    with open(embedding_path) as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word in word_dict:
                word_vec[word] = np.array(list(map(float, vec.split())))
    
    
    print('Found {0}/{1} words with embedding vectors'.format(
        len(word_vec), len(word_dict)))
    missing_word_num = len(word_dict) - len(word_vec)
    missing_ratio = round(float(missing_word_num) / len(word_dict), 4) * 100
    print('Missing Ratio: {}%'.format(missing_ratio))

    # handling unknown embeddings
    for word in word_dict:
        if word not in word_vec:
            # If word not in word_vec, create a random embedding for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            word_vec[word] = new_embedding
    print("Filled missing words' embeddings.")
    print("Embedding Matrix Size: ", len(word_vec))

    return word_vec

In [10]:
def get_cn_embeding(word_dict, full_embedding, embedding_dim=300):
    
    word_vec = {}
    count = 0

    for word in word_dict:
        if word in full_embedding.vocab:
            word_vec[word] = full_embedding[word]
            count+=1
        else: 
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            word_vec[word] = new_embedding

    print('Found {0}/{1} words with full embedding vectors'.format(
        count, len(word_dict)))

    missing_word_num = len(word_dict) - count
    missing_ratio = round(float(missing_word_num) / len(word_dict), 4) * 100
    print('Missing Ratio: {}%'.format(missing_ratio))
    print("Filled missing words' embeddings.")
    print("Embedding Matrix Size: ", len(word_vec))
    
    return word_vec

In [11]:
def get_embedding(config):
    
    src_lang = config['src_lang']
    
    """
    English Embedding
    """
    cur_en_embed_path = config['embedding']['cur_en_embedding_path']
    full_en_embed_path = config['embedding']['en_embed_path']

    if os.path.exists(cur_en_embed_path) and not config['make_dict']:
        en_embed = load_embed(cur_en_embed_path)
        print('Loaded existing english embedding, containing {} words.'.format(len(en_embed)))
    else:
        print('Making embedding...')
        en_embed = get_embedding(train_ds.tgt_vocab._id2word, full_en_embed_path)
        save_embed(en_embed,cur_en_embed_path)
        print('Saved generated embedding.')
        
    """
    Chinese Embedding
    """
    cur_cn_embed_path = config['embedding']['cur_cn_embedding_path']
    full_cn_embed_path = config['embedding']['cn_embed_path']

    if os.path.exists(cur_cn_embed_path) and not config['make_dict']:
        cn_embed = load_embed(cur_cn_embed_path)
        print('Loaded existing chinese embedding, containing {} words.'.format(len(cn_embed)))
    else:
        print('loading full w2v embeddings...')
        word_vectors = KeyedVectors.load_word2vec_format('data/sgns.merge.bigram.bz2') 
        print('start extracting...')
        src_embed = get_cn_embeding(train_ds.src_vocab._id2word, word_vectors)
        save_embed(src_embed, 'data/cn_embed.pkl')
    
    
    if src_lang == 'chinese':
        src_embed = cn_embed
        tgt_embed = en_embed
    else:
        src_embed = en_embed
        tgt_embed = cn_embed

    src_vocab_size = len(src_embed)
    tgt_vocab_size = len(tgt_embed)

    # initialize nn embedding
    src_embedding = nn.Embedding(src_vocab_size, config['model']['embed_size'])
    tgt_embedding = nn.Embedding(tgt_vocab_size, config['model']['embed_size'])

    embed_list = []
    for word in train_ds.src_vocab._id2word:
        embed_list.append(src_embed[word])
    weight_matrix = np.array(embed_list)
    # pass weights to nn embedding
    src_embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).type(torch.FloatTensor), requires_grad = False)

    embed_list = []
    for word in train_ds.tgt_vocab._id2word:
        embed_list.append(tgt_embed[word])
    weight_matrix = np.array(embed_list)
    # pass weights to nn embedding
    tgt_embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).type(torch.FloatTensor), requires_grad = False)
    
    return src_embedding, src_vocab_size, tgt_embedding, tgt_vocab_size


In [12]:
# embedding
config['src_embedding_matrix'], config['src_vocab_size'], config['tgt_embedding_matrix'], config['tgt_vocab_size'] = get_embedding(config)


Loaded existing english embedding, containing 35029 words.
Loaded existing chinese embedding, containing 53717 words.


In [13]:
src_embed = config['src_embedding_matrix']


### English Embedding

In [37]:
cur_en_embed_path = config['embedding']['cur_en_embedding_path']
full_en_embed_path = config['embedding']['en_embed_path']

if os.path.exists(cur_en_embed_path) and not config['make_dict']:
    en_embed = load_embed(cur_en_embed_path)
    print('Loaded existing embedding.')
else:
    print('Making embedding...')
    en_embed = get_embedding(train_ds.tgt_vocab._id2word, full_en_embed_path)
    save_embed(en_embed,cur_en_embed_path)
    print('Saved generated embedding.')

Loaded existing embedding.


### Chinese Embedding

In [17]:
cur_cn_embed_path = config['embedding']['cur_cn_embedding_path']
full_cn_embed_path = config['embedding']['cn_embed_path']

""""""

if os.path.exists(cur_cn_embed_path) and not config['make_dict']:
    cn_embed = load_embed(cur_cn_embed_path)
    print('Loaded existing chinese embedding, containing {} words.'.format(len(cn_embed)))
else:
    print('loading full w2v embeddings...')
    word_vectors = KeyedVectors.load_word2vec_format('data/sgns.merge.bigram.bz2') 
    print('start extracting...')
    src_embed = get_cn_embeding(train_ds.src_vocab._id2word, word_vectors)
    save_embed(src_embed, 'data/cn_embed.pkl')

Loaded existing chinese embedding, containing 53717 words.


In [150]:
if src_lang == 'chinese':
    src_embed = cn_embed
    tgt_embed = en_embed
else:
    src_embed = en_embed
    tgt_embed = cn_embed
    
src_vocab_size = len(src_embed)
tgt_vocab_size = len(tgt_embed)

# initialize nn embedding
src_embedding = nn.Embedding(src_vocab_size, config['model']['embed_size'])
tgt_embedding = nn.Embedding(tgt_vocab_size, config['model']['embed_size'])

embed_list = []
for word in train_ds.src_vocab._id2word:
    embed_list.append(src_embed[word])
weight_matrix = np.array(embed_list)
# pass weights to nn embedding
src_embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).type(torch.FloatTensor), requires_grad = False)

embed_list = []
for word in train_ds.tgt_vocab._id2word:
    embed_list.append(tgt_embed[word])
weight_matrix = np.array(embed_list)
# pass weights to nn embedding
tgt_embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).type(torch.FloatTensor), requires_grad = False)


## Model

In [40]:

# model
enc = LSTMEncoder(config)
dec = AttnLSTMDecoder(config)

# data loader
train_dataloader = DataLoader(dataset=train_ds, shuffle=True, batch_size=config['model']['batch_size'])
teacher_forcing_ratio = 1.0

# loss
criterion = nn.NLLLoss()

# optimizer
enc_optimizer = optim.SGD(enc.parameters(), lr=config['training']['learning_rate'])
dec_optimizer = optim.SGD(dec.parameters(), lr=config['training']['learning_rate'])

SOS_TOKEN = train_ds.tgt_vocab._word2id[train_ds.tgt_vocab.sos_token]
EOS_TOKEN = train_ds.tgt_vocab._word2id[train_ds.tgt_vocab.eos_token]

epoch = 1

best_record = 100.0

while epoch < config['training']['num_epochs']:

    # Train
    print('Start Epoch {} Training...'.format(epoch))
    
    train_loss = []
    
    for idx, data in enumerate(train_dataloader, 0):

        src = data[0]
        tgt = data[1]

        loss = 0    

        # Encoder 
        enc_outputs = torch.zeros(config['max_length'], enc.hidden_size, device=device)
        enc_h, enc_c = enc.initHiddenCell()
        for i in range(len(src)):
            enc_out, enc_h, enc_c = enc(src[i], enc_h, enc_c)
            enc_outputs[i] = enc_out[0, 0]


        dec_in = torch.tensor(SOS_TOKEN, device=device).repeat(config['model']['batch_size'])
        dec_h = enc_h
        dec_c = enc_c

        # Decoder

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for j in range(len(tgt)-1):
                dec_out, dec_h, dec_att = dec(dec_in, dec_h, dec_c, enc_outputs)
                print(criterion(dec_out, tgt[j+1]))
                loss += criterion(dec_out, tgt[j+1])
                dec_in = tgt[j+1]  # Teacher forcing
                break
        else:
            # Without teacher forcing: use its own predictions as the next input
            for j in range(len(tgt)-1):
                dec_out, dec_h, dec_att = dec(dec_in, dec_h, dec_c, enc_outputs)

                topv, topi = dec_out.topk(1)

                dec_in = topi.squeeze().detach()  # detach from history as input

                loss += criterion(dec_out, tgt[j+1])
    #             if dec_in == dec.embedding(torch.tensor(EOS_TOKEN, device=device)):
    #                 break

        train_loss.append(loss.data[0])

        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()

        loss.backward()

        enc_optimizer.step()
        dec_optimizer.step()

        if(len(train_loss)) % 10 == 0:
            print("{}/{} loss: {} ".format(idx+1, len(train_ds.src), round(np.mean(train_loss),4)))

        
#     # Valid
#     print('Epoch {} Validating...'.format(epoch))

#     # loss
#     valid_loss = []
    
#     # dataloader
#     valid_dataloader = DataLoader(dataset=valid_ds, shuffle=True, num_workers=2, batch_size=config['model']['batch_size'])

#     for idx, data in enumerate(valid_dataloader, 0):
#         src = data[0]
#         tgt = data[1]

#         # Encoder 
#         enc_outputs = torch.zeros(config['max_length'], enc.hidden_size, device=device)
#         enc_h, enc_c = enc.initHiddenCell()
#         for i in range(len(src)):
#             enc_out, enc_h, enc_c = enc(src[i], enc_h, enc_c)
#             enc_outputs[i] = enc_out[0, 0]


#         dec_in = torch.tensor(SOS_TOKEN, device=device).repeat(64)
#         dec_h = enc_h
#         dec_c = enc_c

#         for j in range(len(tgt)-1):
#             dec_out, dec_h, dec_att = dec(dec_in, dec_h, dec_c, enc_outputs)
#             loss += criterion(dec_out, tgt[j+1])
#             dec_in = tgt[j+1]  # Teacher forcing
            
#         valid_loss.append(loss.data[0])
    
#     print('Epoch {} Validation Loss: {}'.format(epoch, np.mean(valid_loss)))
    
#     epoch += 1
    
#     # Keep track of best record
#     if np.mean(valid_loss) < best_record:
#         best_record = np.mean(valid_loss)
#         # save the best model
#         state_dict = {
#             'epoch': epoch,
#             'siamese': siamese.state_dict(),
#             'optimizer': optimizer.state_dict(),
#         }
#         torch.save(state_dict, ckpt_path)
#         print('Model saved!\n')
        

  "num_layers={}".format(dropout, num_layers))


Start Epoch 1 Training...
tensor(10.4760, grad_fn=<NllLossBackward>)
tensor(10.4622, grad_fn=<NllLossBackward>)
tensor(10.4784, grad_fn=<NllLossBackward>)
tensor(10.4679, grad_fn=<NllLossBackward>)
tensor(10.4711, grad_fn=<NllLossBackward>)
tensor(10.4740, grad_fn=<NllLossBackward>)
tensor(10.4682, grad_fn=<NllLossBackward>)
tensor(10.4790, grad_fn=<NllLossBackward>)
tensor(10.4559, grad_fn=<NllLossBackward>)
tensor(10.4582, grad_fn=<NllLossBackward>)
10/80044 loss: 10.469099998474121 
tensor(10.4569, grad_fn=<NllLossBackward>)
tensor(10.4493, grad_fn=<NllLossBackward>)
tensor(10.4611, grad_fn=<NllLossBackward>)
tensor(10.4542, grad_fn=<NllLossBackward>)
tensor(10.4493, grad_fn=<NllLossBackward>)
tensor(10.4502, grad_fn=<NllLossBackward>)
tensor(10.4435, grad_fn=<NllLossBackward>)
tensor(10.4566, grad_fn=<NllLossBackward>)
tensor(10.4684, grad_fn=<NllLossBackward>)
tensor(10.4478, grad_fn=<NllLossBackward>)
20/80044 loss: 10.461400032043457 
tensor(10.4723, grad_fn=<NllLossBackward>)
t

KeyboardInterrupt: 

In [31]:
class AttnLSTMDecoder(nn.Module):
    def __init__(self, config):
        super(AttnLSTMDecoder, self).__init__()
        self.hidden_size = config['model']['decoder']['hidden_size']
        self.output_size = config['tgt_vocab_size']
        self.num_layers = config['model']['decoder']['num_layers']
        self.dropout_p = config['model']['decoder']['dropout']
        self.bidir = config['model']['decoder']['bidirectional']
        self.embed_size = config['model']['embed_size']
        self.embedding = config['src_embedding_matrix']
        self.batch_size = config['model']['batch_size']
        
        self.max_length = config['max_length']
        
        
        self.attn = nn.Linear(self.hidden_size + self.embed_size, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size + self.embed_size, self.embed_size)
        self.dropout = nn.Dropout(self.dropout_p)
        
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, dropout=self.dropout_p,
                            num_layers=self.num_layers, bidirectional=self.bidir)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, (hidden,cell) = self.lstm(output, (hidden, cell))

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights


In [32]:
from torch.autograd import Variable
import torch.nn as nn

class LSTMEncoder(nn.Module):
    def __init__(self, config):
        super(LSTMEncoder, self).__init__()
        self.embed_size = config['model']['embed_size']
        self.batch_size = config['model']['batch_size']
#         self.batch_size = config['model']['batch_size']

        self.hidden_size = config['model']['encoder']['hidden_size']
        self.num_layers = config['model']['encoder']['num_layers']
        self.bidir = config['model']['encoder']['bidirectional']
        if self.bidir:
            self.direction = 2
        else: self.direction = 1
        self.dropout = config['model']['encoder']['dropout']

        self.embedding = config['src_embedding_matrix']
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, dropout=self.dropout,
                            num_layers=self.num_layers, bidirectional=self.bidir)

    def initHiddenCell(self):
        rand_hidden = Variable(torch.randn(self.direction * self.num_layers, self.batch_size, self.hidden_size))
        rand_cell = Variable(torch.randn(self.direction * self.num_layers, self.batch_size, self.hidden_size))
        if torch.cuda.is_available():
            rand_hidden = rand_hidden.cuda()
            rand_cell = rand_cell.cuda()
        return rand_hidden, rand_cell

    def forward(self, input, hidden, cell):
        input = self.embedding(input).view(1,self.batch_size, -1)
        output, (hidden, cell) = self.lstm(input, (hidden, cell))
        return output, hidden, cell

In [120]:
class EncoderRNN(nn.Module):
    def __init__(self, config):
        super(EncoderRNN, self).__init__()
        self.input_size = config['src_vocab_size']
        self.batch_size = config['model']['batch_size']
        self.hidden_size = config['model']['encoder']['hidden_size']
        self.embed_size = config['model']['embed_size']
        self.n_layers = config['model']['encoder']['num_layers']
        self.dropout = config['model']['encoder']['dropout']
        self.embedding = config['src_embedding_matrix']
        if config['model']['encoder']['bidirectional']:
            self.direction = 2
        else: self.direction = 1
        self.gru = nn.GRU(self.embed_size, self.hidden_size, self.n_layers, dropout=self.dropout, bidirectional=config['model']['encoder']['bidirectional'])

    def forward(self, input_seqs, input_lengths, hidden=None):
        '''
        :param input_seqs: 
            Variable of shape (num_step(T),batch_size(B)), sorted decreasingly by lengths(for packing)
        :param input:
            list of sequence length
        :param hidden:
            initial state of GRU
        :returns:
            GRU outputs in shape (T,B,hidden_size(H))
            last hidden stat of RNN(i.e. last output for GRU)
        '''
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)  # unpack (back to padded)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]  # Sum bidirectional outputs
        return outputs, hidden
    
#     def initHiddenCell(self):
#         rand_hidden = Variable(torch.randn(self.direction * self.n_layers, self.batch_size, self.hidden_size))
#         rand_cell = Variable(torch.randn(self.direction * self.n_layers, self.batch_size, self.hidden_size))
#         if torch.cuda.is_available():
#             rand_hidden = rand_hidden.cuda()
#             rand_cell = rand_cell.cuda()
#         return rand_hidden, rand_cell

In [90]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, config):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = config['model']['decoder']['hidden_size']
        self.output_size =  config['tgt_vocab_size']
        self.dropout_p = config['model']['decoder']['dropout']
        self.max_length = config['max_length']

        self.embedding = config['tgt_embedding_matrix']
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class DynamicEncoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, n_layers=1, dropout=0.5):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(input_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, n_layers, bidirectional=True)

    def forward(self, input_seqs, input_lens, hidden=None):
        """
        forward procedure. **No need for inputs to be sorted**
        :param input_seqs: Variable of [T,B]
        :param hidden:
        :param input_lens: *numpy array* of len for each input sequence
        :return:
        """
        batch_size = input_seqs.size(1)
        embedded = self.embedding(input_seqs)
        embedded = embedded.transpose(0, 1)  # [B,T,E]
        sort_idx = np.argsort(-input_lens)
        unsort_idx = cuda_(torch.LongTensor(np.argsort(sort_idx)))
        input_lens = input_lens[sort_idx]
        sort_idx = cuda_(torch.LongTensor(sort_idx))
        embedded = embedded[sort_idx].transpose(0, 1)  # [T,B,E]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lens)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        outputs = outputs.transpose(0, 1)[unsort_idx].transpose(0, 1).contiguous()
        hidden = hidden.transpose(0, 1)[unsort_idx].transpose(0, 1).contiguous()
        return outputs, hidden

In [None]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.normal_(mean=0, std=stdv)

    def forward(self, hidden, encoder_outputs):
        '''
        :param hidden: 
            previous hidden state of the decoder, in shape (layers*directions,B,H)
        :param encoder_outputs:
            encoder outputs from Encoder, in shape (T,B,H)
        :return
            attention energies in shape (B,T)
        '''
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)
        H = hidden.repeat(max_len,1,1).transpose(0,1)
        encoder_outputs = encoder_outputs.transpose(0,1) # [B*T*H]
        attn_energies = self.score(H,encoder_outputs) # compute attention score
        return F.softmax(attn_energies).unsqueeze(1) # normalize with softmax

    def score(self, hidden, encoder_outputs):
        energy = F.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2))) # [B*T*2H]->[B*T*H]
        energy = energy.transpose(2,1) # [B*H*T]
        v = self.v.repeat(encoder_outputs.data.shape[0],1).unsqueeze(1) #[B*1*H]
        energy = torch.bmm(v,energy) # [B*1*T]
        return energy.squeeze(1) #[B*T]

In [None]:
class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size, output_size, n_layers=1, dropout_p=0.1):
        super(BahdanauAttnDecoderRNN, self).__init__()
        # Define parameters
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        # Define layers
        self.embedding = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn('concat', hidden_size)
        self.gru = nn.GRU(hidden_size + embed_size, hidden_size, n_layers, dropout=dropout_p)
        #self.attn_combine = nn.Linear(hidden_size + embed_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, word_input, last_hidden, encoder_outputs):
        '''
        :param word_input:
            word input for current time step, in shape (B)
        :param last_hidden:
            last hidden stat of the decoder, in shape (layers*direction*B*H)
        :param encoder_outputs:
            encoder outputs in shape (T*B*H)
        :return
            decoder output
        Note: we run this one step at a time i.e. you should use a outer loop 
            to process the whole sequence
        Tip(update):
        EncoderRNN may be bidirectional or have multiple layers, so the shape of hidden states can be 
        different from that of DecoderRNN
        You may have to manually guarantee that they have the same dimension outside this function,
        e.g, select the encoder hidden state of the foward/backward pass.
        '''
        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, word_input.size(0), -1) # (1,B,V)
        word_embedded = self.dropout(word_embedded)
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,V)
        context = context.transpose(0, 1)  # (1,B,V)
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((word_embedded, context), 2)
        #rnn_input = self.attn_combine(rnn_input) # use it in case your size of rnn_input is different
        output, hidden = self.gru(rnn_input, last_hidden)
        output = output.squeeze(0)  # (1,B,V)->(B,V)
        # context = context.squeeze(0)
        # update: "context" input before final layer can be problematic.
        # output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        output = F.log_softmax(self.out(output))
        # Return final output, hidden state
        return output, hidden

### Evaluate

In [63]:
sentence = ""
src = tensorFromSentence(train_ds.src_vocab, sentence)

# # Encoder
# enc_outputs = torch.zeros(config['max_length'], enc.hidden_size, device=device)
# enc_h, enc_c = enc.initHiddenCell()

# for i in range(len(src)):
#     enc_out, enc_h, enc_c = enc(src[i], enc_h, enc_c)

In [61]:
def indexesFromSentence(vocab, sentence):
    return [vocab.word2id(word) for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_TOKEN)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [None]:

def evaluate(enc, dec, src, max_length=config['max_length']):
    with torch.no_grad():
        
        input_tensor = tensorFromSentence(train_ds.src_vocab, sentence)
        input_length = input_tensor.size()[0]
        

        # Encoder
        enc_outputs = torch.zeros(config['max_length'], enc.hidden_size, device=device)
        enc_h, enc_c = enc.initHiddenCell()
        for i in range(len(src)):
            enc_out, enc_h, enc_c = enc(src[i], enc_h, enc_c)
            if i >= config['max_length']:
                break
            enc_outputs[i] = enc_out[0, 0]

        decoder_input = torch.tensor([[SOS_TOKEN]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_TOKEN:
                decoded_words.append('<eos>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]