This is a seq2seq model that accepts a number as input (e.g. "12345") and appends a 0 to the end of it (e.g. "123450").

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

**HyperParameters**

In [None]:
# --- Data ---
train_batch_size = 256  # samples per training batch
max_len = 10  # length every encoded sequence is padded / truncated to

# --- Model ---
embedding_dim = 100  # size of each token embedding vector
# GRU
num_layers = 1
hidden_size = 64

# --- Train ---
epochs = 5
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Test ---
test_batch_size = 1000

**Num_Sequence**

In [None]:
class Num_sequence:
  '''Vocabulary mapping digit characters and special tags to integer ids.

  Ids 0-3 are reserved for the special tags; the digits '0'-'9' receive
  ids 4-13 in order.
  '''
  UNK_TAG = '<UNK>'
  PAD_TAG = '<PAD>'
  SOS_TAG = '<SOS>' # start of sequence
  EOS_TAG = '<EOS>' # end of sequence

  UNK = 0
  PAD = 1
  SOS = 2
  EOS = 3

  def __init__(self) -> None:
    # token -> id
    self.dict = {
        self.PAD_TAG: self.PAD,
        self.UNK_TAG: self.UNK,
        self.SOS_TAG: self.SOS,
        self.EOS_TAG: self.EOS,
    }

    # Digits are appended after the special tags, so '0' -> 4 ... '9' -> 13
    for i in range(10):
      self.dict[str(i)] = len(self.dict)

    # id -> token
    self.inverse_dict = {index: token for token, index in self.dict.items()}

  def transform(self, sentence, max_len=None, add_eos=False):
    '''string 2 vector
    :param sentence: str or list of str, "123..." or ["1", "2", "5", ...]
    :param max_len: int or None; when given, the result has exactly max_len
                    ids (truncated or padded with PAD). None keeps the
                    original length.
    :param add_eos: when True, one of the max_len positions is used by EOS,
                    placed right after the (possibly truncated) tokens and
                    before any padding. Requires max_len.
    :return: [int, int, int ...]
    :raises ValueError: if add_eos is True and max_len is None or < 1
    '''
    if add_eos:
      if max_len is None or max_len < 1:
        raise ValueError("max_len must be a positive int when add_eos=True")
      # Reserve one slot for EOS
      max_len = max_len - 1

    # Work on a list copy: this lets a plain str be padded with tag tokens
    # and guarantees the caller's sequence is never modified.
    tokens = list(sentence)

    if max_len is not None:
      if len(tokens) > max_len: # cut if sentence > max_len
        tokens = tokens[:max_len]
      else: # add padding if sentence < max_len
        tokens = tokens + [self.PAD_TAG] * (max_len - len(tokens))

    if add_eos:
      if tokens and tokens[-1] == self.PAD_TAG:
        # Padding present: EOS goes in front of the first PAD
        tokens.insert(tokens.index(self.PAD_TAG), self.EOS_TAG)
      else:
        # No padding (or nothing left after truncation): EOS goes last
        tokens.append(self.EOS_TAG)

    # Unknown tokens fall back to the UNK id
    return [self.dict.get(token, self.UNK) for token in tokens]

  def inverse_transform(self, indices):
    '''vector 2 string
    :param indices: [int, int, int, ...]
    :return: "123123..." - the tokens joined together, stopping at the first
             EOS (EOS itself and everything after it are dropped). Unknown
             ids are rendered as the UNK tag.
    '''
    result = []
    for i in indices:
      token = self.inverse_dict.get(i, self.UNK_TAG)
      if token == self.EOS_TAG:  # delete everything after EOS
        break
      result.append(token)
    return "".join(result)

  def __len__(self):
    '''Number of entries in the vocabulary (special tags + digits).'''
    return len(self.dict)

In [None]:
# Shared vocabulary instance; collate_fn and the models below read it as a global
num_sequence = Num_sequence()

In [None]:
# Test num_Sequence: encode a digit string to ids, then decode the ids back
print(num_sequence.dict)
s = "123123"
retVal = num_sequence.transform(s)  # no max_len: no padding or truncation
print(retVal)
retVal = num_sequence.inverse_transform(retVal)
print(retVal)

{'<PAD>': 1, '<UNK>': 0, '<SOS>': 2, '<EOS>': 3, '0': 4, '1': 5, '2': 6, '3': 7, '4': 8, '5': 9, '6': 10, '7': 11, '8': 12, '9': 13}
[5, 6, 7, 5, 6, 7]
123123


**Dataset**  
Prepare dataset and dataloader

1. In the targets of the samples, SOS and EOS are needed to mark the start and the end of the sequence.  
2. Add EOS to the target, then transform it.  

In [None]:
class NumDataset(Dataset):
  '''Synthetic dataset of random integers in [0, 1e8).

  Each sample pairs the digits of a number with the same digits followed
  by '0'. The train and test splits use different fixed seeds.
  '''
  def __init__(self, train=True):
    '''
    :param train: True -> 400000 samples (seed 10); False -> 100000 samples (seed 11)
    '''
    seed = 10 if train else 11
    self.size = 400000 if train else 100000
    # A private RandomState seeded like np.random.seed(seed), so building a
    # dataset no longer resets the global NumPy random state.
    rng = np.random.RandomState(seed)
    self.data = rng.randint(0, 10**8, size=[self.size])

  def __getitem__(self, index):
    '''
    :param index: position of the sample
    :return: (input digits, target digits, input length, target length);
             the target is the input with '0' appended and no EOS yet
    '''
    digits = list(str(self.data[index]))
    target = digits + ['0']
    return digits, target, len(digits), len(target)

  def __len__(self):
    return len(self.data)

In [None]:
def collate_fn(batch):
  '''
  Turn a list of dataset samples into padded id tensors.

  :param batch: [(input, label, input_length, label_length), (input, label, input_length, label_length)]
  :return: (input ids, target ids, input lengths, target lengths) as
           LongTensors, with the samples ordered by input length, longest first
  '''
  # Longest input first (big -> small)
  ordered = sorted(batch, key=lambda sample: sample[2], reverse=True)
  inputs, targets, input_lengths, target_lengths = zip(*ordered)

  # Inputs are padded to max_len; targets also get an EOS inside max_len
  input_ids = [num_sequence.transform(tokens, max_len=max_len) for tokens in inputs]
  target_ids = [num_sequence.transform(tokens, max_len=max_len, add_eos=True) for tokens in targets]

  return (
      torch.LongTensor(input_ids),
      torch.LongTensor(target_ids),
      torch.LongTensor(input_lengths),
      torch.LongTensor(target_lengths),
  )

In [None]:
# Training split, shuffled each epoch; collate_fn pads every batch to max_len
data_set = NumDataset(train=True)
train_data_loader = DataLoader(data_set, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
# Look at the first batch only, to check the tensor shapes produced by collate_fn
for input, target, input_length, target_length in train_data_loader:
  print(input.size())
  print(target.size())
  print(input_length.shape)
  print(target_length.shape)
  break

torch.Size([256, 10])
torch.Size([256, 10])
torch.Size([256])
torch.Size([256])


In [None]:
# `+=` with a one-element list ...
temp = list("abcd")
temp += ["ADD"]
print(temp)

# ... and `append` both add the whole string as a single element
temp = list("abcd")
temp.append("ADD")
print(temp)

['a', 'b', 'c', 'd', 'ADD']
['a', 'b', 'c', 'd', 'ADD']


**Encoder**

Before using the GRU, there are two APIs for accelerating the calculation.  
1. pad_packed_sequence(out, batch_first, padding_value) *unpack*
2. pack_padded_sequence(embedded, real_length, batch_first) *pack*
3. Before using the two APIs, sort the batch by length in descending order.

In [None]:
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [None]:
class Encoder(nn.Module):
  '''Embeds the input token ids and runs them through a GRU.'''
  def __init__(self):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=len(num_sequence), embedding_dim=embedding_dim, padding_idx=num_sequence.PAD)
    self.gru = nn.GRU(input_size=embedding_dim, num_layers=num_layers, hidden_size=hidden_size, batch_first=True, bidirectional=False, dropout=0)

  def forward(self, input, input_length):
    '''
    :param input: [batch_size, max_len] token ids, sorted by length (longest first)
    :param input_length: [batch_size] real (unpadded) length of each row
    :return: output [batch_size, seq_len, hidden_size] where seq_len is the
             longest length in the batch, hidden [num_layers, batch_size, hidden_size]
    '''
    embedded = self.embedding(input) # [batch_size, max_len, embedding_dim]

    # pack to accelerate calculation; the lengths must live on the CPU
    packed = pack_padded_sequence(embedded, input_length.cpu(), batch_first=True)

    packed_output, hidden = self.gru(packed)

    # unpack; padded time steps are filled with 0.0. The fill value applies
    # to hidden-state features, so the PAD token id must not be used here.
    output, _ = pad_packed_sequence(packed_output, batch_first=True, padding_value=0.0)

    # hidden: [1*1, batch_size, hidden_size]
    # output: [batch_size, seq_len, hidden_size]
    return output, hidden


In [None]:
# Smoke test: run one batch through a fresh (untrained) encoder and print the output shapes
encoder = Encoder()
print(encoder)
for input, target, input_length, target_length in train_data_loader:
  out, hidden = encoder(input, input_length)
  print(out.size())
  print(hidden.size())
  break

Encoder(
  (embedding): Embedding(14, 100, padding_idx=1)
  (gru): GRU(100, 64, batch_first=True)
)
torch.Size([256, 8, 64])
torch.Size([1, 256, 64])


**Decoder**

1. The output of the decoder at each time step is a classification problem. We choose the token with the highest probability. 
2. The output of the decoder is [batch_size, max_len, vocab_size].
3. Loss function: Cross Entropy

In [None]:
import torch.nn.functional as F

In [None]:
class Decoder(nn.Module):
  '''GRU decoder that generates max_len tokens greedily, starting from SOS.

  At every step the most probable token of the previous step is fed back
  as the next input.
  '''
  def __init__(self):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=len(num_sequence), embedding_dim=embedding_dim,padding_idx=num_sequence.PAD)
    # hidden_state = [1, batch_size, hidden_size]
    self.gru = nn.GRU(input_size=embedding_dim,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      batch_first=True,
                      bidirectional=False,
                      dropout=0)
    # Projects each GRU output onto the vocabulary
    self.fc = nn.Linear(hidden_size, len(num_sequence))

  def forward(self, target, encoder_hidden):
    '''
    :param target: [batch_size, max_len] target ids; accepted for interface
                   compatibility but not read (no teacher forcing is applied)
    :param encoder_hidden: [1, batch_size, hidden_size] final encoder state
    :return: decoder_outputs [batch_size, max_len, vocab_size] log-probabilities,
             decoder_hidden [1, batch_size, hidden_size]
    '''
    decoder_outputs, decoder_hidden, _ = self._greedy_decode(encoder_hidden)
    return decoder_outputs, decoder_hidden

  def forward_step(self, decoder_input, decoder_hidden):
    '''
    calculate output at each time stamp
    :param decoder_input: [batch_size, 1]
    :param decoder_hidden: [1, batch_size, hidden_size]
    :return: log-probabilities [batch_size, vocab_size] and the new hidden
             state [1, batch_size, hidden_size]
    '''
    decoder_input_embedded = self.embedding(decoder_input)  # [batch_size, 1, embedding_dim]

    # out: [batch_size, 1, hidden_size] (seq_len is 1: one token per step)
    # decoder_hidden: [1, batch_size, hidden_size]
    out, decoder_hidden = self.gru(decoder_input_embedded, decoder_hidden)
    out = out.squeeze(dim=1) # [batch_size, hidden_size]
    out = self.fc(out)  # [batch_size, vocab_size]
    output = F.log_softmax(out, dim=-1) # [batch_size, vocab_size]

    return output, decoder_hidden

  def evaluate(self, encoder_hidden):
    '''
    :param encoder_hidden: [1, batch_size, hidden_size] final encoder state
    :return: decoder_outputs [batch_size, max_len, vocab_size] log-probabilities,
             decoder_predict numpy array [batch_size, max_len] of predicted ids
    '''
    decoder_outputs, _, step_predictions = self._greedy_decode(encoder_hidden)
    # step_predictions is [max_len, batch_size]; transpose to [batch_size, max_len]
    decoder_predict = np.array(
        [step.detach().cpu().numpy() for step in step_predictions]
    ).transpose()
    return decoder_outputs, decoder_predict

  def _greedy_decode(self, encoder_hidden):
    '''Shared decoding loop used by forward and evaluate.

    :param encoder_hidden: [1, batch_size, hidden_size]
    :return: (decoder_outputs [batch_size, max_len, vocab_size],
              decoder_hidden [1, batch_size, hidden_size],
              list of max_len id tensors, each [batch_size])
    '''
    # 1. The decoder starts from the encoder's final hidden state
    decoder_hidden = encoder_hidden
    batch_size = encoder_hidden.size(1)
    # Allocate on the device the hidden state lives on, so the decoder works
    # wherever the module and its inputs were placed.
    run_device = encoder_hidden.device
    # 2. The first input is SOS for every sample: [batch_size, 1]
    decoder_input = torch.LongTensor([[num_sequence.SOS]] * batch_size).to(run_device)

    # Buffer for the per-step predictions: [batch_size, max_len, vocab_size]
    decoder_outputs = torch.zeros([batch_size, max_len, len(num_sequence)]).to(run_device)
    step_predictions = []

    # 3. Feed each step's most probable token and hidden state into the next step
    for t in range(max_len):
      decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
      # decoder_output_t: [batch_size, vocab_size]
      decoder_outputs[:, t, :] = decoder_output_t

      _, index = decoder_output_t.max(dim=-1)  # [batch_size]
      step_predictions.append(index)
      decoder_input = index.unsqueeze(-1)  # [batch_size, 1]

    return decoder_outputs, decoder_hidden, step_predictions
    

In [None]:
# Smoke test: run one batch through a fresh (untrained) encoder and decoder.
# Modules and inputs are moved to `device` so they sit on the same device as
# the tensors the decoder creates.
encoder = Encoder().to(device)
decoder = Decoder().to(device)
print(encoder)
print(decoder)
for input, target, input_length, target_length in train_data_loader:
  input, target = input.to(device), target.to(device)
  out, encoder_hidden = encoder(input, input_length)
  decoder(target, encoder_hidden)
  print(out.size())
  # Print this cell's own hidden state, not the `hidden` left over from an earlier cell
  print(encoder_hidden.size())
  break

Encoder(
  (embedding): Embedding(14, 100, padding_idx=1)
  (gru): GRU(100, 64, batch_first=True)
)
Decoder(
  (embedding): Embedding(14, 100, padding_idx=1)
  (gru): GRU(100, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=14, bias=True)
)
torch.Size([256, 8, 64])
torch.Size([1, 256, 64])


Combine encoder and decoder to get seq2seq

In [None]:
class Seq2Seq(nn.Module):
  '''Encoder-decoder pair: the encoder's final hidden state seeds the decoder.'''
  def __init__(self):
    super().__init__()
    self.encoder = Encoder()
    self.decoder = Decoder()

  def forward(self, input, target, input_length, target_length):
    '''
    Training pass.
    :return: whatever self.decoder returns for (target, encoder hidden state):
             (decoder_outputs, decoder_hidden)
    '''
    # Only the hidden state is passed on; the per-step encoder outputs are unused
    _, context = self.encoder(input, input_length)
    outputs, final_hidden = self.decoder(target, context)
    return outputs, final_hidden

  def evaluate(self, input, input_length):
    '''
    Inference pass.
    :return: whatever self.decoder.evaluate returns for the encoder hidden
             state: (decoder_outputs, decoder_predict)
    '''
    _, context = self.encoder(input, input_length)
    outputs, predictions = self.decoder.evaluate(context)
    return outputs, predictions
    

1. Initialize the model, optimizer and loss.  
2. Traverse dataloader.
3. Produce output from the model
4. Calculate the loss
5. Save and Load the model


In [None]:
from torch.optim import Adam
import torch.nn.functional as F

In [None]:
# Model to train (moved to the selected device) and its Adam optimizer
seq2seq = Seq2Seq().to(device)
optimizer = Adam(seq2seq.parameters(), lr=0.001)

**Training Loop**

In [None]:
import os
import pickle

In [None]:
# Create the directory
if not os.path.exists("./models"):
  os.mkdir("./models")

In [None]:
# Per-batch training losses, in order
loss_list = []

for epoch in range(epochs):
  for index, (input, target, input_length, target_length) in enumerate(train_data_loader):
    input, target, input_length, target_length = input.to(device), target.to(device), input_length.to(device), target_length.to(device)
    decoder_outputs, _ = seq2seq(input, target, input_length, target_length)
    # Flatten so every time step is one classification sample
    decoder_outputs = decoder_outputs.view(-1, len(num_sequence)) # [batch_size*seq_len, -1]
    target = target.view(-1)  # [batch_size*seq_len]
    # NOTE(review): PAD positions count towards this loss, while the
    # evaluation cell passes ignore_index=PAD - confirm this is intended.
    loss = F.nll_loss(decoder_outputs, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_list.append(loss.item())

    # Record the Training Phase: checkpoint and log every 300 batches
    if index % 300 == 0:
      torch.save(seq2seq.state_dict(), "./models/model.pkl")
      torch.save(optimizer.state_dict(), "./models/optimizer.pkl")
      # Context manager so the file is flushed and closed after each dump
      with open("./models/loss_list.pkl", "wb") as loss_file:
        pickle.dump(loss_list, loss_file)
      print("epoch: {}\t idx:{} \t loss:{: .6f}".format(epoch, index, loss.item()))

epoch: 0	 idx:0 	 loss: 2.656001
epoch: 0	 idx:300 	 loss: 1.269240
epoch: 0	 idx:600 	 loss: 0.955149
epoch: 0	 idx:900 	 loss: 0.736737
epoch: 0	 idx:1200 	 loss: 0.551190
epoch: 0	 idx:1500 	 loss: 0.379251
epoch: 1	 idx:0 	 loss: 0.346296
epoch: 1	 idx:300 	 loss: 0.260460
epoch: 1	 idx:600 	 loss: 0.170203
epoch: 1	 idx:900 	 loss: 0.124829
epoch: 1	 idx:1200 	 loss: 0.087696
epoch: 1	 idx:1500 	 loss: 0.070656


**Prediction**

In [None]:
# Digit string to run through the trained model
_input = "123456"

In [None]:
# Restore the trained weights into a fresh model. map_location lets a
# checkpoint saved on the GPU be loaded on a CPU-only machine.
model = Seq2Seq().to(device)
model.load_state_dict(torch.load("./models/model.pkl", map_location=device))
model.eval()  # inference mode
# Encode the single input as a batch of one (no padding: max_len is not passed)
input = list(str(_input))
input_length = torch.LongTensor([len(input)]) # [1]
input = torch.LongTensor([num_sequence.transform(input)])

In [None]:
# Inference only, so gradient tracking is switched off
with torch.no_grad():
  input = input.to(device)
  input_length = input_length.to(device)
  _, decoder_predict = model.evaluate(input, input_length)  # decoder_predict: [batch_size, max_len] predicted ids
  # Decode each row of ids back to a string (cut at the first EOS)
  pred = [num_sequence.inverse_transform(i) for i in decoder_predict]

In [None]:
# Show the input next to the model's prediction for it
print(_input, "---->", pred[0])

123456 ----> 1234560


**Evaluation**

In [None]:
# Prepare Data
test_data_set = NumDataset(train=False)
test_data_loader = DataLoader(test_data_set, batch_size=test_batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Get Prediction
test_model = Seq2Seq().to(device)
test_model.load_state_dict(torch.load("./models/model.pkl"))

<All keys matched successfully>

In [None]:
# Inverse Transform
loss_list = []
acc_list = []
with torch.no_grad():
  for idx, (input, target, input_len, target_len) in enumerate(test_data_loader):
    input, target, input_len, target_len = input.to(device), target.to(device), input_length.to(device), target_length.to(device)
    decoder_outputs, decoder_predict = test_model.evaluate(input, input_len) # [batch_size, max_len, vocab_size]
    loss = F.nll_loss(decoder_outputs.view(-1, len(num_sequence)), target.view(-1), ignore_index = num_sequence.PAD)

    target_inverse_transformed = [num_sequence.inverse_transform(i) for i in target.numpy()]
    predict_inverse_transformed = [num_sequence.inverse_transform(i) for i in decoder_predict]
    cur_eq = [1 if target_inverse_transformed[i] == predict_inverse_transformed[i] else 0 for i in range(len(target_inverse_transformed))]
    acc_list.extend(cur_eq)

In [None]:
# Report the mean of acc_list (1 = exact string match, 0 = mismatch) and the mean of loss_list
print("Mean acc: {} Mean loss:{:.6f}".format(np.mean(acc_list), np.mean(loss_list)))