In [41]:
import pandas as pd
import numpy as np
from nltk.tokenize import regexp_tokenize 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


from torchtext.data import Field, BucketIterator, TabularDataset

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import dill

import random
import math
import time

In [42]:
def my_tokenizer(text):
    text=text.replace("**","^")
    return regexp_tokenize(text, "[\d]{1,9}|\(|\)|\+|\-|\*|\^|[a-z]{3}|[a-z]{1}")

In [43]:
SEED=1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [44]:
with open("src.Field","rb")as f:
     src=dill.load(f)
with open("trg.Field","rb")as f:
     trg=dill.load(f)


In [64]:
val_td, test_td = TabularDataset.splits(
    path='', validation='val_df.csv', test='test_df.csv',
    format='csv', skip_header=False, fields=[('x', src), ('y', trg)])

In [66]:
[len(src.vocab),len(trg.vocab)]

[71, 625]

In [67]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [68]:
BATCH_SIZE = 128
test_iterator = BucketIterator(
    test_td, 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: len(x.x),
    device = device)

In [69]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):                
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)        
        return hidden, cell

In [70]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [71]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1) 
            input = trg[t] if teacher_force else top1
        
        return outputs

In [72]:
INPUT_DIM = len(src.vocab)
OUTPUT_DIM = len(trg.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [73]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,145,585 trainable parameters


In [74]:
optimizer = optim.Adam(model.parameters())


In [75]:
TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [76]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.x
            trg = batch.y
            output = model(src, trg, 0) #turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [77]:
model.load_state_dict(torch.load('LSTM_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 0.491 | Test PPL:   1.634 |


In [78]:
def expanded_form(tokens, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
    
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    src_len = torch.LongTensor([len(src_indexes)]).to(device)
    
    with torch.no_grad():
         hidden,cell = model.encoder(src_tensor)
            
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
       
    for i in range(max_len):
        
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
                    
        pred_token = output.argmax(1).item()
        
        trg_indexes.append(pred_token)
        
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break


    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:]

In [79]:
i=2
src1 = vars(test_td.examples[i])['x']
trg1 = vars(test_td.examples[i])['y']
print(f'src = {src1}')
print(f'trg = {trg1}')
expansion = expanded_form(src1, src, trg, model, device)
expansion=[x for x in expansion if x!='<eos>']
print(f'predicted trg = {expansion}')

src = ['(', 'k', '-', '12', ')', '*', '(', 'k', '-', '10', ')']
trg = ['k', '^', '2', '-', '22', '*', 'k', '+', '120']
predicted trg = ['i', '^', '2', '-', '22', '*', 's', '+', '120']


In [39]:
correct=0
for i in range(len(test_td)):
    if i%10000==0:
        print(i)
    src1 = vars(test_td.examples[i])['x']
    trg1 = vars(test_td.examples[i])['y']

    expansion = expanded_form(src1, src, trg, model, device)
    expansion=[x for x in expansion if x!='<eos>']

    if expansion==trg1:
        correct+=1
    if correct%10000==0:
        print('correct = '+str(correct)+', num obs.= '+str(i))
accuracy=correct/len(test_td)
print(f'accuracy = {accuracy}')


0
correct = 0, num obs.= 0
correct = 0, num obs.= 1
correct = 0, num obs.= 2
correct = 0, num obs.= 3
correct = 0, num obs.= 4
correct = 0, num obs.= 5
correct = 0, num obs.= 6
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
correct = 10000, num obs.= 156391
correct = 10000, num obs.= 156392
correct = 10000, num obs.= 156393
correct = 10000, num obs.= 156394
correct = 10000, num obs.= 156395
correct = 10000, num obs.= 156396
correct = 10000, num obs.= 156397
160000
170000
180000
190000
200000
accuracy = 0.06396968015159925
