In [33]:
import pandas as pd
import numpy as np
from nltk.tokenize import regexp_tokenize 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


from torchtext.data import Field, BucketIterator, TabularDataset

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import dill
import random
import math
import time

In [34]:
def my_tokenizer(text):
    text=text.replace("**","^")
    return regexp_tokenize(text, "[\d]{1,9}|\(|\)|\+|\-|\*|\^|[a-z]{3}|[a-z]{1}")

In [35]:
SEED=1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [36]:
with open("src.Field","rb")as f:
     src=dill.load(f)
with open("trg.Field","rb")as f:
     trg=dill.load(f)


In [37]:
val_td, test_td = TabularDataset.splits(
    path='', validation='val_df.csv', test='test_df.csv',
    format='csv', skip_header=False, fields=[('x', src), ('y', trg)])

In [38]:
[len(src.vocab),len(trg.vocab)]

[71, 626]

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [40]:
BATCH_SIZE = 128
valid_iterator, test_iterator = BucketIterator.splits(
    (val_td, test_td), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: len(x.x),
    device = device)

In [41]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
                
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        return outputs, hidden

In [42]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        attention = self.v(energy).squeeze(2)
        
        return F.softmax(attention, dim=1)

In [43]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention        
        self.embedding = nn.Embedding(output_dim, emb_dim)        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)        
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):            
        input = input.unsqueeze(0)        
        embedded = self.dropout(self.embedding(input))        
        a = self.attention(hidden, encoder_outputs)                
        a = a.unsqueeze(1)        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)        
        weighted = torch.bmm(a, encoder_outputs)        
        weighted = weighted.permute(1, 0, 2)        
        rnn_input = torch.cat((embedded, weighted), dim = 2)        
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))  
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))        
        return prediction, hidden.squeeze(0)

In [58]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
                
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1) 
            input = trg[t] if teacher_force else top1

        return outputs

In [59]:
INPUT_DIM = len(src.vocab)
OUTPUT_DIM = len(trg.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 256
DEC_HID_DIM = 256
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [60]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,261,746 trainable parameters


In [61]:
optimizer = optim.Adam(model.parameters())


In [62]:
TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [63]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.x
            trg = batch.y
            output = model(src, trg, 0) #turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [64]:
model.load_state_dict(torch.load('GRUModel.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 0.019 | Test PPL:   1.019 |


In [65]:
def expanded_form(tokens, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
    
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    
    src_len = torch.LongTensor([len(src_indexes)]).to(device)   
   
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)
        
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)
       
    for i in range(max_len):

        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
                    
        pred_token = output.argmax(1).item()
        
        trg_indexes.append(pred_token)
        
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break


    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attentions[:len(trg_tokens)-1]

In [67]:
i=2
src1 = vars(test_td.examples[i])['x']
trg1 = vars(test_td.examples[i])['y']
print(f'src = {src1}')
print(f'trg = {trg1}')
expansion, attention = expanded_form(src1, src, trg, model, device)
expansion=[x for x in expansion if x!='<eos>']
print(f'predicted trg = {expansion}')

src = ['(', 'a', '-', '30', ')', '*', '(', '8', '*', 'a', '-', '4', ')']
trg = ['8', '*', 'a', '^', '2', '-', '244', '*', 'a', '+', '120']
predicted trg = ['8', '*', 'a', '^', '2', '-', '244', '*', 'a', '+', '120']


In [39]:
correct=0
for i in range(len(test_td)):
    if i%10000==0:
        print(i)
    src1 = vars(test_td.examples[i])['x']
    trg1 = vars(test_td.examples[i])['y']

    expansion, attention = expanded_form(src1, src, trg, model, device)
    expansion=[x for x in expansion if x!='<eos>']

    if expansion==trg1:
        correct+=1
    if correct%10000==0:
        print('correct = '+str(correct)+', num obs.= '+str(i))
accuracy=correct/len(test_td)
print(f'accuracy = {accuracy}')


0
correct = 0, num obs.= 0
10000
correct = 10000, num obs.= 10876
20000
correct = 20000, num obs.= 21757
30000
correct = 30000, num obs.= 32662
40000
correct = 40000, num obs.= 43536
50000
correct = 50000, num obs.= 54394
60000
correct = 60000, num obs.= 65288
70000
correct = 70000, num obs.= 76152
80000
correct = 80000, num obs.= 87080
correct = 80000, num obs.= 87081
90000
correct = 90000, num obs.= 97943
100000
correct = 100000, num obs.= 108861
correct = 100000, num obs.= 108862
110000
correct = 110000, num obs.= 119802
120000
130000
correct = 120000, num obs.= 130719
140000
correct = 130000, num obs.= 141650
150000
correct = 140000, num obs.= 152539
160000
correct = 150000, num obs.= 163372
170000
correct = 160000, num obs.= 174292
180000
correct = 170000, num obs.= 185205
190000
correct = 180000, num obs.= 196165
200000
accuracy = 0.9176454117729411


In [24]:
correct=0
for i in range(len(test_td)-199950):
    if i%10000==0:
        print(i)
    src1 = vars(test_td.examples[i])['x']
    trg1 = vars(test_td.examples[i])['y']
    
    print("src = " + ''.join(src1))
    print("trg = " + ''.join(trg1))

    expansion, attention = expanded_form(src1, src, trg, model, device)
    expansion=[x for x in expansion if x!='<eos>']
    print("predicted trg = " +''.join(expansion))

    if expansion==trg1:
        correct+=1
    
accuracy=correct/(len(test_td)-199950)
print(f'accuracy = {accuracy}')

0
src = x
trg = y
predicted trg = x^2
src = (5-8*i)*(8*i-3)
trg = -64*i^2+64*i-15
predicted trg = -64*i^2+64*i-15
src = 3*z*(z-18)
trg = 3*z^2-54*z
predicted trg = 3*z^2-54*z
src = (x-2)*(7*x+27)
trg = 7*x^2+13*x-54
predicted trg = 7*x^2+13*x-54
src = -6*n*(n+10)
trg = -6*n^2-60*n
predicted trg = -6*n^2-60*n
src = (-2*s-28)*(4*s+7)
trg = -8*s^2-126*s-196
predicted trg = -8*s^2-126*s-196
src = (-6*n-10)*(n-30)
trg = -6*n^2+170*n+300
predicted trg = -6*n^2+170*n+300
src = -7*tan(j)^2
trg = -7*tan(j)^2
predicted trg = -7*tan(j)^2
src = -9*j*(4*j+11)
trg = -36*j^2-99*j
predicted trg = -36*j^2-99*j
src = (10-3*j)*(7*j+21)
trg = -21*j^2+7*j+210
predicted trg = -21*j^2+7*j+210
src = (-6*y-23)*(y-19)
trg = -6*y^2+91*y+437
predicted trg = -6*y^2+95*y+437
src = (-5*j-4)*(j+12)
trg = -5*j^2-64*j-48
predicted trg = -5*j^2-64*j-48
src = (i-31)*(4*i-32)
trg = 4*i^2-156*i+992
predicted trg = 4*i^2-156*i+992
src = -x*(-9*x-9)
trg = 9*x^2+9*x
predicted trg = 9*x^2+9*x
src = -6*h*(h-17)
trg = -6*h^2+102