<a href="https://colab.research.google.com/github/MoonRiyadh/Neural-Symbolic-Computing/blob/master/NSC_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The cell below only needs to be run on google colab


In [None]:
!pip install torchtext==0.7
from google.colab import drive
drive.mount('/content/drive')

Below should be the path to the data

In [None]:
# directory to data
dirname = "/content/drive/My Drive/Colab Notebooks/NSC_Project/Project.zip (Unzipped Files)"

imports

In [None]:
# imports
import torch 
import torchtext
import os
from nltk import regexp_tokenize
from torch import nn, functional



BATCH_SIZE = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


# Data preparation

In [None]:
SAMPLE_LINES = 4

# The torchtext.data.Dataset class is not needed but is useful
class CustomDataset(torchtext.data.Dataset):
    def __init__(self, path, fields, **kwargs):
        """
        paths:
            path to data
        fields:
            tuple of Field objects (see torchtext.data.Field)
        """

        if not isinstance(fields[0], (tuple, list)):
            fields = [('input', fields[0]), ('output', fields[1])]

        with open(path) as f:
            lines = f.readlines()

        if len(lines) % 4 != 0:
            raise Exception(f":(, incomplete sample in {path}")
        
        
        examples = []
        for i in range(0, len(lines), SAMPLE_LINES):
            y = list(lines[i + SAMPLE_LINES - 1].strip())
            x = "".join(lines[i:i+SAMPLE_LINES - 1])

            examples.append(torchtext.data.Example.fromlist(
                        [x, y], fields))
        
        super().__init__(examples, fields, **kwargs)

    @classmethod
    def create_datasets(cls, train, test, validation, fields, **kwargs):
        """
        train:
            path to train data
        test:
            path to test data
        validation:
            path to validation data
        fields:
            tuple of Field objects (see torchtext.data.Field)
        """
        train_set = cls(path=train, fields=fields, **kwargs)
        test_set = cls(path=test, fields=fields, **kwargs)
        validation_set = cls(path=validation, fields=fields, **kwargs)


        return train_set, test_set, validation_set 

In [None]:
def custom_tokenizer(s):
    ret = regexp_tokenize(s, r'(var|=|[a-z]+\d*|\d|[+-])')
    return ret

INPUT = torchtext.data.Field(tokenize = custom_tokenizer,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)
OUTPUT = torchtext.data.Field(tokenize = custom_tokenizer,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# get datasets
train_data, test_data, validation_data = CustomDataset.create_datasets(
    *(os.path.join(dirname, f"{name}_Data.txt") for name in "Training Test Validation".split()),
    fields=(INPUT, OUTPUT)
)



In [None]:
# create vocabs
INPUT.build_vocab(train_data)
OUTPUT.build_vocab(train_data)

## Iterators/Dataloader

In [None]:
# get iters
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
    sort_key=lambda x: len(x.input + x.output))



# Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Seq2Seq(nn.Module):
    def __init__(self, embedding_dim, in_vocab_size, out_vocab_size):
        super().__init__()
        self.embedding_in = nn.Embedding(in_vocab_size, embedding_dim).to(device)
        self.embedding_out = nn.Embedding(out_vocab_size, embedding_dim).to(device)
        self.transformer = nn.Transformer(d_model=embedding_dim).to(device)
    
    def forward(self, src, tgt):
        in_embeds = self.embedding_in(src)
        out_embeds = self.embedding_out(tgt)
        return self.transformer(in_embeds, out_embeds)

# Training

In [None]:
model = Seq2Seq(128, in_vocab_size=len(INPUT.vocab.stoi), out_vocab_size=len(OUTPUT.vocab.stoi))

# ignore padding (we removed but just in case)
PAD_IDX = OUTPUT.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0001)

In [None]:
import time
import math


def train(model,
          iterator,
          optimizer,
          criterion,
          clip):

    model.train()

    epoch_loss = 0

    for _, batch in enumerate(iterator):

        src = batch.input
        trg = batch.output

        optimizer.zero_grad()

        output = model(src, trg)

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model,
             iterator,
             criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for _, batch in enumerate(iterator):

            src = batch.input
            trg = batch.output

            output = model(src, trg) 

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time,
               end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 5
CLIP = 1

best_valid_loss = float('inf')
losses = []

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    losses.append((train_loss, valid_loss))

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f}')



Epoch: 01 | Time: 6m 34s
	Train Loss: 0.079
	 Val. Loss: 0.000
Epoch: 02 | Time: 6m 33s
	Train Loss: 0.001
	 Val. Loss: 0.000
Epoch: 03 | Time: 6m 33s
	Train Loss: 0.000
	 Val. Loss: 0.000
Epoch: 04 | Time: 6m 33s
	Train Loss: 0.000
	 Val. Loss: 0.000
Epoch: 05 | Time: 6m 34s
	Train Loss: 0.000
	 Val. Loss: 0.000
| Test Loss: 0.000


In [None]:
# save
torch.save(model.state_dict(), os.path.join(dirname, "model_params.pyt"))

## Loss progression 

In [None]:
import plotly.graph_objects as go
from plotly.offline import iplot
import pandas as pd

df = pd.DataFrame(losses, columns=['training_loss', 'validation_loss'])

training_trace = go.Scatter(mode="lines+markers",
                  x = df.index,
                  y = df.training_loss,
                  name = "training loss")
validation_trace = go.Scatter(mode="lines+markers",
                        x = df.index,
                        y = df.validation_loss,
                        name = "validation loss")

data = [training_trace, validation_trace]
layout = go.Layout(
    title="Loss progression",
    yaxis=dict(
        title="loss"
    ),
    xaxis=dict(
        title="epoch #"
    )
)

fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(dtick=1)
iplot(fig)

# Test
Use the directly following cell to load the model and to test it

In [None]:
# set to True to load model
LOAD = True
# set to file containing parameters
PATH_TO_PARAMETERS = os.path.join(dirname, "model_params.pyt")

def download_params():
    import requests

    url = "https://dl.dropboxusercontent.com/s/vgdfeopm1ybmwvx/model_params.pyt?dl=0"

    content = requests.get(url).content

    with open(PATH_TO_PARAMETERS, "wb") as file:
        file.write(content)

if LOAD:
    model = Seq2Seq(128, 20, 15)
    if not os.path.exists(PATH_TO_PARAMETERS):
        download_params()
    model.load_state_dict(torch.load(PATH_TO_PARAMETERS))
    model.eval()

In [None]:
def output_to_sentence(output):
    return ''.join((OUTPUT.vocab.itos[torch.argmax(n).item()] for n in output))

def test(x, y, add, complete=False):
    """
    x,y : int
        numbers
    add : boolean
        whether to add or to subtract
    complete : boolean
        give out complete sentence (with sos and eos tokens)
    """
    if add:
        res = x+y
        op = '+'
    else:
        res = x-y
        op = '-'
    
    input_str =  f"var x = {x}\nvar y = {y}\nx{op}y\n"
    output_str = f"{res}\n"

    print(res)

    example = torchtext.data.Example.fromlist([input_str, output_str], [('input', INPUT), ('output', OUTPUT)])

    b = torchtext.data.Batch((example,), test_data, device)

    model.eval()
    output = model(b.input, b.output)

    # reorganize and cutoff sos
    output = output[0 if complete else 1:].view(-1, output.shape[-1])
    y = b.output[0 if complete else 1:].view(-1)

    # cutoff eos (potentially)
    output = output[:-1] if not complete else output
    y = y[:-1] if not complete else y

    exp = ''.join((OUTPUT.vocab.itos[x] for x in y))
    got = output_to_sentence(output)

    # set green if same else red
    OKGREEN = '\033[92m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    color = OKGREEN if exp == got else FAIL

    print("Expected:", exp)
    print("Got:    ", color, got, ENDC)

In [None]:
test(95012, 100, add=False)

94912
Expected: 94912
Got:     [92m 94912 [0m


