## Transforming dates to similar format with encoder-decoder models
Props: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html <br>
Data: 

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam

In [22]:
class Config():
    learning_rate = 1e-3
    encoder_hidden = 256
    decoder_hidden = 256
    encoder_n_layers = 2
    decoder_n_layers = 2
    batch_size = 32
    n_epochs = 15
cfg = Config()

In [7]:
train_dataset = pd.read_csv('data/train.csv').values
test_dataset = pd.read_csv('data/test.csv',sep=',')
MAX_LENGTH = max(map(lambda x: len(x[0]), train_dataset)) + 1

In [8]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {
            'SOS': 0,
            'EOS': 1
        }
        self.index2word = {
            0: 'SOS',
            1: 'EOS'
        }

    @property
    def n_words(self) -> int:
        return len(self.index2word)

    def add_sentence(self, sentence):
        for word in list(sentence):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word

In [9]:
input_lang = Lang('human')
output_lang = Lang('iso')

for pair in train_dataset:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)

human 82
iso 13


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
class Encoder(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers)

    def forward(self, x, hidden,cell):
        embedded = self.embedding(x).view(1, 1, -1)
        output = embedded
        output, (hidden, cell) = self.rnn(output, (hidden,cell))
        return output,hidden,cell

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [18]:
class Decoder(nn.Module):

    def __init__(self, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden,cell):
        output = self.embedding(x).view(1, 1, -1)
        output = self.relu(output)
        output, (hidden, cell)= self.rnn(output, (hidden,cell))
        output = self.softmax(self.out(output[-1]))
        return output, hidden, cell

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [19]:
def sentence2idx(lang, sentence):
    return [lang.word2index[word] for word in list(sentence)]


def sentence2tensor(lang, sentence):
    indexes = sentence2idx(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def pair2tensor(x):
    input_tensor = sentence2tensor(input_lang, x[0])
    target_tensor = sentence2tensor(output_lang, x[1])
    return input_tensor, target_tensor

In [20]:
teacher_forcing_ratio = 0.5


def train_single(
        input_tensor, target_tensor,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer,
        criterion
):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    encoder_hidden = encoder.init_hidden()
    encoder_cell = encoder.init_hidden()

    for elem in input_tensor:
        encoder_output, encoder_hidden, encoder_cell = encoder(elem, encoder_hidden,encoder_cell)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    decoder_cell = decoder.init_hidden()

    use_teacher_forcing = True if np.random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        for elem in target_tensor:
            decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
            loss += criterion(decoder_output, elem)
            decoder_input = elem
    else:
        for elem in target_tensor:
            decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
            _, topi = decoder_output.data.topk(1)
            decoder_input = topi.squeeze().detach()

            loss += criterion(decoder_output, elem)
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / len(target_tensor)

In [15]:
def train(encoder, decoder, n_epochs=cfg.num_epochs, print_every=cfg.print_every):
    encoder.train()
    decoder.train()

    encoder_optimizer = Adam(encoder.parameters(), lr=cfg.learning_rate)
    decoder_optimizer = Adam(decoder.parameters(), lr=cfg.learning_rate)

    criterion = nn.NLLLoss()

    for epoch in range(n_epochs):
        print_loss_total = 0
        
        print(f'Epoch [{epoch + 1:02d}/{n_epochs:02d}]')
        training_pairs = [
            pair2tensor(x) for x in train_dataset[np.random.randint(len(train_dataset), size=len(train_dataset))]
        ]

        for i, training_pair in enumerate(training_pairs):
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]

            loss = train_single(
                input_tensor, target_tensor,
                encoder, decoder,
                encoder_optimizer, decoder_optimizer,
                criterion
            )
            print_loss_total += loss

            if (i + 1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print(f'Training ({i / len(training_pairs) * 100:.1f}%) loss: {print_loss_avg:.4f}')

In [21]:
encoder_model = Encoder(input_lang.n_words, cfg.encoder_hidden, cfg.encoder_n_layers).to(device)
decoder_model = Decoder(cfg.decoder_hidden, output_lang.n_words, cfg.decoder_n_layers).to(device)

train(encoder_model, decoder_model)

Epoch [01/15]


AttributeError: 'tuple' object has no attribute 'size'

In [20]:
@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    encoder.eval()
    decoder.eval()

    input_tensor = sentence2tensor(input_lang, sentence)
    encoder_hidden = encoder.init_hidden()
    encoder_cell = encoder.init_hidden()

    for elem in input_tensor:
        encoder_output, encoder_hidden, encoder_cell = encoder(elem, encoder_hidden, encoder_cell)

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell

    decoded_words = []

    for di in range(max_length):
        decoder_output, decoder_hidden, encoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
        _, topi = decoder_output.data.topk(1)

        decoded_words.append(output_lang.index2word[topi.item()])

        if topi.item() == EOS_token:
            break

        decoder_input = topi.squeeze().detach()

    return decoded_words


def predict_(encoder, decoder, dataset):
    result = []

    for _ in dataset:
        result.append(evaluate(encoder, decoder, _)[:10])

    return result

In [21]:
test_prediction = predict_(encoder_model, decoder_model, test_dataset['data'])
test_dataset['label'] = [''.join(x) for x in test_prediction]
test_dataset[['id', 'label']].to_csv('submission.csv', index=None)