In [1]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torchtext
from torchtext.vocab import build_vocab_from_iterator
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split


import itertools


# Data

In [2]:
device = torch.device("cpu")

In [3]:
data = pd.read_csv('csv/train.csv')
data

Unnamed: 0.1,Unnamed: 0,word,lemma,pos_tag,freq,word_length,lemma_length,ratio,lemma_length_category,word_length_category,ratio_category,reported_speech,freq_category
0,1104841,მონოზონისა,მონოზონი,N,1,10,8,1.250000,medium,medium,greater,False,low
1,1260770,სამხრეთიდგან,სამხრეთი,N,1,12,8,1.500000,medium,high,greater,False,low
2,637615,მოფიქრებაშია,მოფიქრება,N,1,12,9,1.333333,medium,high,greater,False,low
3,79017,სიდიადეს,სიდიადე,N,209,8,7,1.142857,low,low,greater,False,high
4,1107010,განგსტერებისათვის,განგსტერი,N,2,17,9,1.888889,medium,high,greater,False,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...
956889,485527,ტონგას,ტონგა,N,102,6,5,1.200000,low,low,greater,False,high
956890,113442,ადენა,ადენა,N,5,5,5,1.000000,low,low,equal,False,medium
956891,131905,ვრაცხ,*რაცხვა,V,46,5,7,0.714286,low,low,less,False,medium
956892,1049224,ამოდგომაც,ამოდგომა,N,1,9,8,1.125000,medium,low,greater,False,low


In [4]:
MAX_LENGTH = data.word.str.len().max()
MAX_LENGTH

32

In [5]:
def get_tokens(t):
    yield from list(t)

UNK_TOKEN = '<unk>'
EOW_TOKEN = '<end>'
PAD_TOKEN = '<pad>'
SOW_TOKEN = '<start>'

vocab = build_vocab_from_iterator(iterator=get_tokens(itertools.chain(data['word'], data['lemma'])),
                                  specials=[EOW_TOKEN, UNK_TOKEN, PAD_TOKEN, SOW_TOKEN],
                                  special_first=False
                                  )
vocab.set_default_index(vocab[UNK_TOKEN])

def word_to_seq(word: str):
    return torch.tensor(vocab(list(word)) + [vocab[EOW_TOKEN]])

In [6]:
BATCH_SIZE = 64
RANDOM_STATE = 42

In [7]:
class LemmaDataSet(Dataset):
    def __init__(self, data_frame: pd.DataFrame):
        self.data = data_frame

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> (str, str):
        row = self.data.iloc[idx]
        return row['word'], row['lemma']


In [8]:
def coallate_words(batch):
    word_list, lemma_list = [], []
    for word, lemma in batch:
        word_list.append(torch.tensor(word_to_seq(word)))
        lemma_list.append(torch.tensor(word_to_seq(word)))


    return torch.tensor(pad_sequence(word_list, batch_first=True, padding_value=vocab[PAD_TOKEN])), torch.tensor(pad_sequence(lemma_list, batch_first=True, padding_value=vocab[PAD_TOKEN]))

In [9]:
train_data, val_data = train_test_split(data, random_state=RANDOM_STATE, train_size=0.8, shuffle=True)
train_loader = DataLoader(LemmaDataSet(train_data), batch_size=BATCH_SIZE, shuffle=True, collate_fn=coallate_words)
val_loader = DataLoader(LemmaDataSet(val_data), batch_size=BATCH_SIZE, shuffle=True, collate_fn=coallate_words)

# Model

In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_dim, emb_dim,  hidden_dim, num_layers=1):
        super(EncoderRNN, self).__init__()

        # set dimensions
        self.hidden_size = hidden_dim
        self.embedding_size = emb_dim
        self.vocab_size = vocab_dim
        self.num_layers = num_layers

        # initialize layers
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.gru = nn.GRU(self.embedding_size, self.hidden_size, num_layers=self.num_layers, batch_first=True)

    def forward(self, input):
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded)
        return output, hidden


In [11]:
class DecoderRNN(nn.Module):
    def __init__(self, emb_dim, hidden_dim, vocab_dim, num_layers=1):
        super(DecoderRNN, self).__init__()
        # set dimensions
        self.hidden_size = hidden_dim
        self.embedding_size = emb_dim
        self.output_size = vocab_dim
        self.num_layers = num_layers

        #initialize layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size,self.hidden_size, batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        embedding = F.relu(self.embedding(input))
        output, hidden = self.gru(embedding, hidden)
        pred = self.softmax(self.out(output))
        return pred, hidden



In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device='cpu'):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target=None, teacher_forcing_ratio=0.5):
        input_length = source.size(1) #get the input length (number of words in sentence)
        batch_size = source.size(0)
        vocab_size = self.encoder.vocab_size

        #initialize a variable to hold the predicted outputs
        outputs = []

        #encode every word in a sentence
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[:, i].unsqueeze(1))

        #use the encoder’s hidden layer as the decoder hidden
        decoder_hidden = encoder_hidden.to(device)

        #add a token before the first predicted word
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(vocab[SOW_TOKEN])

        #topk is used to get the top K value over a list
        #predict the output word from the current target word. If we enable the teaching force,  then the #next decoder input is the next word, else, use the decoder output highest value.

        for t in range(target.size(1) if target is not None else MAX_LENGTH):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs.append(decoder_output)
            _, topi = decoder_output.topk(1)
            if target is not None and target.size(1) > t:
                teacher_force = random.random() < teacher_forcing_ratio
                target_input = target[:, t].unsqueeze(1)
                decoder_input = (target_input if teacher_force else topi.squeeze(-1).detach())
            else:
                decoder_input = topi.squeeze(-1).detach()
        return torch.cat(outputs, dim=1)


In [13]:
model = Seq2Seq(EncoderRNN(vocab_dim=len(vocab), emb_dim=64 ,hidden_dim=128).to(device), DecoderRNN(vocab_dim=len(vocab), emb_dim=64, hidden_dim=128).to(device), device=device)
source, target = next(iter(train_loader))
model(source.to(device), target.to(device)).argmax(dim=1)

  word_list.append(torch.tensor(word_to_seq(word)))
  lemma_list.append(torch.tensor(word_to_seq(word)))
  return torch.tensor(pad_sequence(word_list, batch_first=True, padding_value=vocab[PAD_TOKEN])), torch.tensor(pad_sequence(lemma_list, batch_first=True, padding_value=vocab[PAD_TOKEN]))


tensor([[18, 14, 17,  ...,  0, 16, 18],
        [18,  6, 17,  ...,  0,  4, 18],
        [ 2,  6,  8,  ...,  0, 16,  1],
        ...,
        [18, 14,  1,  ...,  0, 16, 18],
        [ 8,  9,  3,  ...,  0, 16, 18],
        [ 1,  6,  7,  ...,  0, 16, 18]])

# Train

In [14]:
def train_epoch(dataloader, model, optimizer, criterion):

    total_loss = 0
    i = 0
    for data in dataloader:
        i += 1
        if i == 200:
            break
        input_tensor, target_tensor = data
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)

        optimizer.zero_grad()

        decoder_outputs = model(input_tensor, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [16]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [17]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [18]:
from torch import optim


def train(train_dataloader, model, n_epochs, learning_rate=0.001,
          print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, model, optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                         epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 1:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
    return plot_losses

In [19]:
train_epoch(train_loader, model.to(device))
train(train_loader, model.to(device), 1)