# Implementation of the ConvSeq2Seq architecture

### This is the notebook used to train and test the SPARQL NMT ConvSeq2Seq model for the Knowledge Base-aware SPARQL Query Translation from Natural Language article

Here are some interesting resources that helped us in our implementation:
- https://github.com/bentrevett/pytorch-seq2seq
- https://huggingface.co/spaces/gradio/HuBERT/blob/main/fairseq/models/transformer.py
- https://github.com/pytorch/fairseq/blob/main/fairseq/models/fconv.py

## Setup

Please note that using [wandb](https://wandb.ai/site) is not required, but it is suggested, as it provides a great way to track model performance during training. Install the package and set the constant `USE_WANDBAI` to `True` if you wish to use it!

In [None]:
!pip install transformers
!pip install --upgrade spacy
!python -m spacy download en_core_web_sm
!pip install SPARQLWrapper
!pip install torchtext==0.11.0
!pip install wandb

In [None]:
!wandb login

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from google.colab import files
import wandb

import torchtext
from torchtext.legacy.data import Field, BucketIterator
from torchtext.legacy.data.dataset import TabularDataset
from torchtext.vocab import Vocab

import math
import time

import pandas as pd
from sklearn.model_selection import train_test_split

from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import unicodedata
import re

from torchtext.data.metrics import bleu_score
from torchtext.data.utils import ngrams_iterator

import json
import random
from google.colab import files, drive
import numpy as np

from torch.autograd import Variable
import torch.nn.functional as F

from SPARQLWrapper import SPARQLWrapper, JSON
from collections import Counter, defaultdict

from typing import List, Dict, Tuple, DefaultDict, Union, Optional
import copy

In [None]:
# Connect to Google Drive: mount it at /content/gdrive.
# OUT_DRIVE_FOLDER_BASE (Consts section) points to a folder inside this mount.
drive.mount('/content/gdrive')

## Consts

In [None]:
# --- Model hyper-parameters -------------------------------------------------
# NOTE(review): these are presumably passed to Encoder/Decoder as emb_dim / hid_dim /
# dropout / output_dim; the instantiation is not in this part of the notebook.
EMB_DIM = 768
HID_DIM = 512            # each conv. layer has 2 * hid_dim filters
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2
BATCH_SIZE = 128
DECODER_OUT_DIM = 512

# --- Experiment switches ----------------------------------------------------
DATASET = 'dataset.json' # Name of the dataset to use, found in the Data repo under out_data
USE_WANDBAI = True       # Use wandb (Weights & Biases) as a logging tool
USE_COPY = False          # Train the models using a copy layer (default of every `predict_with_copy` / `use_copy` argument)

RANDOM = False           # Randomize the order of the entries
LOWERCASE = False        # Force lowercase for queries and questions -
                         # not recommended because it makes it very hard to go back to working SPARQL queries

N_EPOCHS = 150          # Number of training epochs
MAX_LENGTH = 100        # Default `max_len` of the translation helpers (they generate at most max_len - 2 tokens
                        # before giving up on <eos>) and default size of the position-embedding tables

# For Training Gradients
CLIP = 0.1

# Model Train Parameters
LEARNING_RATE = 3.5     # We found that a learning rate up to 3.5 could significantly
                        # speed up the training without losing performance for the base versions of LC-QuAD

# Prefixes identifying knowledge-base elements: a token is treated as a KB element
# when it starts with one of these (see hide_KB_elems / extract_KB_elems).
TAGS = ['dbr:', 'dbo:', 'dbp:', 'dbc:', 'dct:', 'geo:', 'georss:']

# Alias used in type annotations throughout the notebook.
DatasetIterator = torchtext.legacy.data.Dataset

# Components of the output folder path on Google Drive.
MODEL_TYPE = "cnns2s"
COPY_FLAG = "copy" if USE_COPY else "no_copy"
DATASET_FAMILY = "LC-QuAD"
DATASET_NAME = "intermediary_question_original_data_2" # DON'T FORGET TO SET

OUT_DRIVE_FOLDER_BASE = f"/content/gdrive/MyDrive/PRETRAINED/{MODEL_TYPE}/{COPY_FLAG}/{DATASET_FAMILY}/{DATASET_NAME}/"

## Utils

### Metrics

In [None]:
# Translate a question into a sparql query
def translate_sentence(tokens: Union[str, List[str]],
                       src_field: Field,
                       trg_field: Field,
                       model: nn.Module,
                       device: torch.device,
                       max_len=MAX_LENGTH,
                       predict_with_copy=USE_COPY) -> Tuple[List[str], Optional[torch.Tensor]]:
    """Greedily decode a single question into a list of SPARQL tokens.

    Args:
        tokens: the question, either as a raw string (tokenized with
            `tokenize_en`) or as an already tokenized list.
        src_field / trg_field: fields whose vocabularies index the question and
            the query. With `predict_with_copy` they are extended IN PLACE with
            the KB elements found in the question.
        model: model exposing `encoder`, `decoder` and (when copying) `copy_layer`.
        device: device on which the index tensors are created.
        max_len: at most `max_len - 2` tokens are generated.
        predict_with_copy: route the decoder output through `model.copy_layer`.

    Returns:
        `(predicted_tokens, attention)`. The tokens exclude the leading <sos>
        and include <eos> when it was predicted. `attention` is the attention
        of the last decoding step, or None when no step was run
        (`max_len <= 2`).
    """
    model.eval()

    # format as a list of strs
    if isinstance(tokens, str):
        tokens = tokenize_en(tokens)
    if LOWERCASE:
        tokens = [token.lower() for token in tokens]

    # extend vocab with KB elems
    if predict_with_copy:
        kb_vocab = VocabDup(extract_KB_elems(tokens), padding=0)
        src_field = extend_vocabulary(src_field, kb_vocab)
        trg_field = extend_vocabulary(trg_field, kb_vocab)

    # add <sos> and <eos> delimiters
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    # index sentence with extended src vocab
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # Indices beyond the embedding tables (extended-vocab entries) are replaced
    # by 0 before being embedded.
    enc_vocab_size = model.encoder.tok_embedding.num_embeddings
    dec_vocab_size = model.decoder.tok_embedding.num_embeddings
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]

    # init empty query sentence with only <sos> token
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    attention = None  # stays None if the generation loop never runs

    with torch.no_grad():
        encoder_conved, encoder_combined, encoder_padding_mask = model.encoder(
            src_tensor.masked_fill(src_tensor >= enc_vocab_size, 0))

        # generate words
        for _ in range(max_len - 2):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

            output, attention = model.decoder(
                trg_tensor.masked_fill(trg_tensor >= dec_vocab_size, 0),
                encoder_conved, encoder_combined, encoder_padding_mask)
            if predict_with_copy:
                output, attention = model.copy_layer(src_tensor, output, attention)

            pred_token = output.argmax(2)[:, -1].item()
            trg_indexes.append(pred_token)

            if pred_token == eos_idx:
                break

    trg_vocab_size = len(trg_field.vocab)
    trg_tokens = [trg_field.vocab.itos[i] if i < trg_vocab_size else '<unk>' for i in trg_indexes]
    return trg_tokens[1:], attention

In [None]:
# Translate a batch of questions into sparql queries
def batch_translate(batch: torch.Tensor,
                    trg_field: Field,
                    model: nn.Module,
                    device: torch.device,
                    max_len=MAX_LENGTH,
                    predict_with_copy=USE_COPY) -> Tuple[List[List[str]], Optional[torch.Tensor]]:
    """Greedily decode every question of a batch.

    Args:
        batch: batch object exposing the source indices as `batch.English`.
        trg_field: field whose vocabulary maps indices back to query tokens.
        model: model exposing `encoder`, `decoder` and (when copying) `copy_layer`.
        device: device on which the target tensor is created.
        max_len: exactly `max_len - 2` decoding steps are run for the batch.
        predict_with_copy: route the decoder output through `model.copy_layer`.

    Returns:
        `(sentences, attention)`: one token list per batch row, without <sos>
        and cut right before the first <eos>; `attention` is the attention of
        the last decoding step, or None when no step was run.
    """
    model.eval()

    src_tensor = batch.English
    max_batch_len = max_len - 2

    # Look the delimiters up in the vocab instead of hard-coding their indices,
    # as translate_sentence does.
    sos_idx = trg_field.vocab.stoi[trg_field.init_token]
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]

    enc_vocab_size = model.encoder.tok_embedding.num_embeddings
    dec_vocab_size = model.decoder.tok_embedding.num_embeddings

    attention = None  # stays None if the generation loop never runs

    with torch.no_grad():
        encoder_conved, encoder_combined, encoder_padding_mask = model.encoder(
            src_tensor.masked_fill(src_tensor >= enc_vocab_size, 0))

        # init every query with only the <sos> token
        trg_tensor = torch.full((src_tensor.shape[0], 1), sos_idx, dtype=torch.long, device=device)

        # generate words
        for _ in range(max_batch_len):
            output, attention = model.decoder(
                trg_tensor.masked_fill(trg_tensor >= dec_vocab_size, 0),
                encoder_conved, encoder_combined, encoder_padding_mask)
            if predict_with_copy:
                output, attention = model.copy_layer(src_tensor, output, attention)

            # keep only the prediction for the newest position
            pred_token = output[:, -1, :].argmax(1).unsqueeze(1)
            trg_tensor = torch.cat((trg_tensor, pred_token), dim=1)

    # remove everything from the first <eos> on, then drop the leading <sos>
    itos = trg_field.vocab.itos
    vocab_size = len(trg_field.vocab)
    out = []
    for sent in trg_tensor.cpu().numpy():
        eos_positions = np.flatnonzero(sent == eos_idx)
        if eos_positions.size:
            sent = sent[:eos_positions[0]]
        out.append([itos[i] if i < vocab_size else '<unk>' for i in sent[1:]])

    return out, attention

In [None]:
# Convert a batch of token ids to a batch of tokens (by Samuel)
def get_batch_tokens(batch: torch.Tensor, field: Field) -> List[List[str]]:
    """Map each row of `batch` to its tokens, cut right before the first <eos>.

    Args:
        batch: 2-D tensor of token indices, one sentence per row.
        field: field providing `vocab.stoi`, `vocab.itos` and `eos_token`.

    Returns:
        One list of tokens per row. Indices outside the vocabulary
        (`idx >= len(field.vocab)`) are rendered as '<unk>'.
    """
    eos_idx = field.vocab.stoi[field.eos_token]
    itos = field.vocab.itos
    vocab_size = len(field.vocab)

    output_tokens = []
    for pred_trg in batch:
        eos_ids = (pred_trg == eos_idx).nonzero(as_tuple=True)[0]
        if eos_ids.nelement():
            pred_trg = pred_trg[:eos_ids[0]]

        # Strict `<`: an index equal to the vocab size is already out of range.
        output_tokens.append([itos[tok] if tok < vocab_size else '<unk>' for tok in pred_trg.tolist()])
    return output_tokens

In [None]:
# Calculate the BLEU score of our test set
def calculate_bleu(data: DatasetIterator,
                   src_field: Field,
                   trg_field: Field,
                   model: nn.Module,
                   device: torch.device,
                   predict_with_copy=USE_COPY,
                   max_len=MAX_LENGTH) -> float:
    """Translate every example of `data` one by one and return the corpus BLEU.

    Each example must expose its tokenized question and query under the
    'English' and 'SPARQL' attributes. A trailing <eos> is removed from a
    prediction before scoring.
    """
    print("Calculating BLEU score...")
    expected_trgs = []
    pred_trgs = []

    for datum in data:
        src = vars(datum)['English']
        trg = vars(datum)['SPARQL']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len, predict_with_copy=predict_with_copy)
        # Compare by value (not identity) and tolerate an empty prediction.
        if pred_trg and pred_trg[-1] == trg_field.eos_token:
            pred_trg = pred_trg[:-1]
        pred_trgs.append(pred_trg)
        expected_trgs.append([trg])  # bleu_score expects a list of references per candidate

    return bleu_score(pred_trgs, expected_trgs)

In [None]:
# Calculate the BLEU score of our test set by batch
def batch_bleu(iterator: DatasetIterator, 
               trg_field: Field, # use BASE_TRG for syntax!
               model: nn.Module,
               device: torch.device,
               use_copy=USE_COPY) -> float:
    """Translate `iterator` batch by batch and return the corpus BLEU score.

    Candidates come from `batch_translate`; references are the target
    sequences of each batch without their first and last columns, converted
    back to tokens with `get_batch_tokens`.
    """
    candidates = []
    references = []

    for batch in iterator:
        predicted, _ = batch_translate(batch, trg_field, model, device, predict_with_copy=use_copy)
        candidates += predicted

        gold = get_batch_tokens(batch.SPARQL[:, 1:-1], trg_field)
        references += gold

    # bleu_score expects a list of references for every candidate
    wrapped_references = [[sentence] for sentence in references]
    return bleu_score(candidates, wrapped_references)

In [None]:
# Calculate the BLEU score of our test set - when using copy we want syntax
def calculate_bleu_syntax(data: DatasetIterator,
                          src_field: Field,
                          trg_field: Field,
                          model: nn.Module,
                          device: torch.device,
                          max_len=MAX_LENGTH) -> float:
    """Return the BLEU score of the predicted query *syntax*.

    Predictions are always made with the copy layer. In the references every
    KB element (token starting with one of `TAGS`) is replaced by '<unk>', so
    only the query structure is scored. A trailing <eos> is removed from a
    prediction before scoring.
    """
    print("Calculating BLEU score of syntax...")
    expected_syntax = []
    pred_trgs_syntax = []

    for datum in data:
        src = vars(datum)['English']
        trg = vars(datum)['SPARQL']

        pred_syntax, _ = translate_sentence(src, src_field, trg_field, model, device, max_len, predict_with_copy=True)
        # Compare by value (not identity) and tolerate an empty prediction.
        if pred_syntax and pred_syntax[-1] == trg_field.eos_token:
            pred_syntax = pred_syntax[:-1]
        pred_trgs_syntax.append(pred_syntax)

        # bleu_score expects a list of references per candidate
        expected_syntax.append([hide_KB_elems(trg)])

    return bleu_score(pred_trgs_syntax, expected_syntax)

In [None]:
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# calculates some metrics on the test set
def get_metrics(data: DatasetIterator,
                test_entries: List[Dict],
                src_field: Field,
                trg_field: Field,
                model: nn.Module,
                device: torch.device,
                max_len=MAX_LENGTH,
                predict_with_copy=USE_COPY) -> Dict[str, float]:
    """Translate every example and compute BLEU, accuracy, recall, precision and F1.

    Args:
        data: examples exposing tokenized 'English' and 'SPARQL' attributes.
        test_entries: raw entries aligned with `data`; the '_id' and
            'template_id' keys of entry i are copied into the error report.
        src_field / trg_field / model / device / max_len / predict_with_copy:
            forwarded to `translate_sentence`.

    Returns:
        Dict with the keys 'bleu', 'accuracy', 'macro recall',
        'macro precision' and 'f1 score'. Ratios whose denominator is zero
        (empty prediction, empty data set, ...) are reported as 0.0.

    Side effects:
        Writes one record per example to 'out/error_report.json' (the 'out'
        directory is created when missing).
    """
    print("Computing evaluation metrics...")
    expected_trgs = []
    pred_trgs = []
    error_report = []

    for i, datum in enumerate(data):
        src = vars(datum)['English']
        trg = vars(datum)['SPARQL']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len, predict_with_copy)

        # Only drop the last token when it really is <eos>; a prediction that
        # hit the length limit has no <eos> to remove.
        if pred_trg and pred_trg[-1] == trg_field.eos_token:
            pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        expected_trgs.append([trg])

        error_report.append({
            'id': test_entries[i]['_id'],
            'template_id': test_entries[i]['template_id'],
            'src': ' '.join(src),
            'trg': ' '.join(trg),
            'predicted': ' '.join(pred_trg),
            'correct': trg == pred_trg
        })

    metrics = {}
    nb_examples = len(expected_trgs)
    metrics['bleu'] = bleu_score(pred_trgs, expected_trgs)

    exact_matches = sum(int(pred == exp[0]) for pred, exp in zip(pred_trgs, expected_trgs))
    metrics['accuracy'] = _safe_ratio(exact_matches, nb_examples)

    # all n-grams (n = 1 .. sentence length) of every prediction / reference
    pred_ngrams = [list(ngrams_iterator(pred, len(pred))) for pred in pred_trgs]
    exp_ngrams = [list(ngrams_iterator(exp[0], len(exp[0]))) for exp in expected_trgs]

    #https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460
    n_commons = [len(set(pred) & set(exp)) for pred, exp in zip(pred_ngrams, exp_ngrams)]

    recalls = [_safe_ratio(common, len(exp)) for common, exp in zip(n_commons, exp_ngrams)]
    metrics['macro recall'] = _safe_ratio(sum(recalls), len(recalls))

    precisions = [_safe_ratio(common, len(pred)) for common, pred in zip(n_commons, pred_ngrams)]
    metrics['macro precision'] = _safe_ratio(sum(precisions), len(precisions))

    metrics['f1 score'] = _safe_ratio(
        2 * (metrics['macro precision'] * metrics['macro recall']),
        metrics['macro precision'] + metrics['macro recall'])

    os.makedirs('out', exist_ok=True)
    with open('out/error_report.json', 'w', encoding='utf-8') as f:
        json.dump(error_report, f, indent=4)

    return metrics

### Vocab

In [None]:
# imitation of a torchtext.vocab.Vocab, basic structure needed to extend a torchtext Vocab
class VocabDup:
    """Minimal vocabulary exposing `itos`, `stoi` and `freq`.

    Attributes (set by the `make_vocab_from_*` methods):
        itos: list mapping an index to its token.
        stoi: defaultdict mapping a token to its index; unknown tokens map to 0.
        freq: Counter of the tokens.

    Raises:
        ValueError: when `vocab` is neither a list nor a dict.
    """

    def __init__(self, vocab: Union[Dict[int, str], List[str]], padding=0, base_vocab_size=0):
        if isinstance(vocab, list):
            self.make_vocab_from_list(vocab, padding)
        elif isinstance(vocab, dict):
            self.make_vocab_from_dict(vocab, base_vocab_size)
        else:
            raise ValueError("Could not make a vocab from this structure")

    # Make vocab from a list (usually KB elem list) to use it to extend base vocabs
    def make_vocab_from_list(self, word_list: List[str], padding=0) -> None:
        """Index the distinct words of `word_list` after `padding` filler entries.

        Words are indexed in order of first appearance, so the same input always
        yields the same indices. `freq` holds the number of occurrences of each
        word in `word_list`.
        """
        # Counter keeps first-seen order and the real number of occurrences.
        word_counter = Counter(word_list)

        # pad if necessary, then add KB elems
        fillers = [f'not_a_resource_{i}' for i in range(padding)]
        itos = fillers + list(word_counter)

        stoi = defaultdict(int)
        for idx, word in enumerate(itos):
            stoi[word] = idx

        self.freq = word_counter
        self.itos = itos
        self.stoi = stoi

    # Make vocab from a dict (usually when loading the vocab files)
    def make_vocab_from_dict(self, word_dict: Dict[int, str], base_vocab_size: int=0) -> None:
        """Build the vocab from an `index -> token` mapping.

        Only entries whose index is below `base_vocab_size` are kept; when
        `base_vocab_size` is smaller than 1 the number of entries of
        `word_dict` is used instead. Indices without an entry stay `None` in
        `itos`.
        """
        if base_vocab_size < 1:
            base_vocab_size = len(word_dict)

        stoi = defaultdict(int)
        itos = [None] * base_vocab_size

        for idx, word in word_dict.items():
            if idx < base_vocab_size:
                stoi[word] = idx
                itos[idx] = word

        self.freq = Counter(itos)
        self.itos = itos
        self.stoi = stoi

In [None]:
# by samuel
def hide_KB_elems(tokens: List[str], unk_token = '<unk>') -> List[str]:
    """Return a copy of `tokens` where every KB element is replaced by `unk_token`.

    A token is a KB element when it starts with one of the prefixes in `TAGS`.
    """
    kb_prefixes = tuple(TAGS)
    hidden = []
    for token in tokens:
        if token.startswith(kb_prefixes):
            hidden.append(unk_token)
        else:
            hidden.append(token)
    return hidden

In [None]:
# Extract KB elements from a tokenized sentence
def extract_KB_elems(tokens: List[str]) -> List[str]:
    """Return, in order, the tokens that start with one of the `TAGS` prefixes."""
    kb_prefixes = tuple(TAGS)
    return list(filter(lambda token: token.startswith(kb_prefixes), tokens))

In [None]:
# Split vocabularies: keep only the base words of questions and queries, and save all removed KB elems in another list
def abstract_KB_elems(data) -> Tuple[Dict, Dict]:
    """Separate base words from KB elements for every example of `data`.

    Each example must expose tokenized `English` and `SPARQL` attributes.

    Returns:
        `(base_vocabs, kb_vocabs)`, both keyed by 'English' and 'SPARQL':
        `base_vocabs` holds one list of non-KB tokens per example, while
        `kb_vocabs` holds a single flat list of all KB elements.
    """
    kb_prefixes = tuple(TAGS)
    columns = ('English', 'SPARQL')
    base_vocabs = {column: [] for column in columns}
    kb_vocabs = {column: [] for column in columns}

    for example in data:
        sentences = {'English': example.English, 'SPARQL': example.SPARQL}

        for column in columns:
            plain_tokens = []
            kb_tokens = []
            for token in sentences[column]:
                if token.startswith(kb_prefixes):
                    kb_tokens.append(token)
                else:
                    plain_tokens.append(token)

            # base words stay grouped by sentence, KB elems go in one flat list
            base_vocabs[column].append(plain_tokens)
            kb_vocabs[column].extend(kb_tokens)

    return base_vocabs, kb_vocabs

In [None]:
# This function acts exactly like the PyTorch version, but using the PyTorch version Field.vocab.extend_vocabulary causes
# some seriously weird bugs. Our best guess was that it caused collisions in the dict keys, but it is highly unlikely
def extend_vocabulary(field: Field, extension: VocabDup) -> Field:
    """Append the words of `extension.itos` that `field.vocab` does not know yet.

    The field's vocabulary is modified in place: each new word is appended to
    `itos` and registered in `stoi` under its new index. Words already present
    in `itos` are left untouched.

    Returns:
        The same `field` object, for convenience.
    """
    itos = field.vocab.itos
    stoi = field.vocab.stoi

    # Membership is decided on `itos` (not `stoi`, which the original notes
    # "does not work"); a set built once avoids rescanning the list per word.
    known = set(itos)
    for w in extension.itos:
        if w not in known:
            stoi[w] = len(itos)
            itos.append(w)
            known.add(w)

    return field

In [None]:
# It is possible that a query contains a KB elem that is in the KB vocab but not in the question (for example, LC-QuAD template ID 7)
# In that case, we should replace KB elems that are not in BOTH the query and the question by unknown tokens (0)
def fix_extended_vocab(src: List[List[int]], trg: List[List[int]], base_voc_limit_trg: int, unk_token = 0) -> List[List[int]]:
    """Replace extended-vocab target tokens that are absent from their source sentence.

    A target index is replaced by `unk_token` when it is
    `>= base_voc_limit_trg` and does not occur in the source sentence of the
    same row. `trg` is modified in place and returned.

    Args:
        src: source indices, one sentence per row (nested lists or 2-D tensor).
        trg: target indices, one sentence per row (nested lists or 2-D tensor).
        base_voc_limit_trg: first index that belongs to the extended vocab.
        unk_token: index written over the offending positions.
    """
    # Fast path for batch tensors: one vectorised comparison instead of a
    # Python loop over every element.
    if torch.is_tensor(src) and torch.is_tensor(trg) and src.dim() == 2 and trg.dim() == 2:
        is_extended = trg >= base_voc_limit_trg
        # (rows, trg_len, 1) == (rows, 1, src_len) -> is each target token in its source row?
        in_source = (trg.unsqueeze(-1) == src.unsqueeze(1)).any(dim=-1)
        trg[is_extended & ~in_source] = unk_token
        return trg

    for i_s, trg_sentence in enumerate(trg):
        src_sentence = src[i_s]
        for i_t, token_idx in enumerate(trg_sentence):
            if token_idx >= base_voc_limit_trg and token_idx not in src_sentence:
                trg_sentence[i_t] = unk_token

    return trg

In [None]:
# Save vocab to reuse for inference
def save_vocab(vocab: torchtext.vocab.Vocab, path: str) -> None:
    """Write `vocab.stoi` to `path`, one `index<TAB>token` line per entry.

    Entries are written in the iteration order of `vocab.stoi`.
    NOTE(review): `read_vocab` ignores the stored index and numbers tokens by
    line position, so the two only round-trip when the lines are in index order.
    """
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            f'{index}\t{token}\n' for token, index in vocab.stoi.items()
        )

In [None]:
# Read vocab files
def read_vocab(path: str) -> Dict[int, str]:
    """Load a vocab file made of `index<TAB>token` lines.

    Tokens are numbered by their line position (starting at 0); the index
    stored in the file is parsed but not used. A line that does not contain
    exactly one tab raises ValueError.
    """
    with open(path, 'r', encoding='utf-8') as vocab_file:
        lines = vocab_file.read().splitlines()

    voc = {}
    for position, line in enumerate(lines):
        _stored_index, token = line.split('\t')
        voc[position] = token
    return voc

### Data

In [None]:
# Tokenize a question by splitting at spaces
def tokenize_en(text: str) -> List[str]:
    """Split a question on whitespace, with every '?' as a token of its own."""
    # str.split() without arguments never yields empty strings
    spaced = ' ? '.join(text.split('?'))
    return spaced.split()

In [None]:
# Tokenize a query by splitting at spaces
def tokenize_sparql(text: str) -> List[str]:
    """Split a query on runs of whitespace and return the resulting tokens."""
    # str.split() without arguments never yields empty strings
    return text.split()

In [None]:
# Generate train, val and test sets
def gen_train_test_val_sets(train_examples: List[str], 
                            valid_examples: List[str], 
                            test_examples: List[str], 
                            data_fields: List[Tuple[str, Field]]) -> Tuple[TabularDataset, TabularDataset, TabularDataset]:
    """Dump the three splits to CSV and load them back as TabularDatasets.

    Each split is turned into a two-column ("English", "SPARQL") frame and
    written without index or header to train.csv / valid.csv / test.csv in the
    current directory; the files are then read with `data_fields`.

    Returns:
        `(train_data, valid_data, test_data)`.
    """
    column_names = ["English", "SPARQL"]
    csv_names = ("train.csv", "valid.csv", "test.csv")

    # build every frame first, then write them out
    frames = [
        pd.DataFrame(examples, columns=column_names)
        for examples in (train_examples, valid_examples, test_examples)
    ]
    for csv_name, frame in zip(csv_names, frames):
        frame.to_csv(csv_name, index=False, header=None)

    train_data, valid_data, test_data = torchtext.legacy.data.TabularDataset.splits(
        path='./', train='train.csv', validation='valid.csv', test='test.csv', format='csv', fields=data_fields)

    return train_data, valid_data, test_data

In [None]:
# Generate the data fields used to encode the question-query pairs
def gen_data_field() -> Tuple[Field, Field]:
    """Create the source (question) and target (query) fields.

    Both fields share the same <sos>/<eos> delimiters, the `LOWERCASE` setting
    and batch-first layout; they only differ by their tokenizer.

    Returns:
        `(SRC, TRG)`.
    """
    shared_options = dict(
        init_token='<sos>',
        eos_token='<eos>',
        lower=LOWERCASE,
        batch_first=True,
    )

    SRC = Field(tokenize=tokenize_en, **shared_options)
    TRG = Field(tokenize=tokenize_sparql, **shared_options)

    return SRC, TRG

### Training

In [None]:
# Initialize model weights
def initialize_weights(m) -> None:
    """Xavier-uniform initialize the `weight` of `m` when it has 2+ dimensions.

    Meant to be used with `model.apply(initialize_weights)`. Modules without a
    `weight`, with a `weight` set to None, or with a 1-D `weight` are left
    untouched.
    """
    weight = getattr(m, 'weight', None)
    # Some modules register `weight` as None (e.g. normalization layers without
    # affine parameters), so check the type before calling `.dim()`.
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

In [None]:
# Count number of parameters in the model
def count_parameters(model: nn.Module) -> int:
    """Return the total number of elements of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
# Calculate epoch duration
def epoch_time(start_time: float, end_time: float) -> Tuple[float, float]:
    """Split the time elapsed between two timestamps (seconds) into minutes and seconds.

    Both returned values are truncated to whole numbers (ints).
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    remaining_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, remaining_seconds

In [None]:
# Training function of the model
def train(model: nn.Module, 
          iterator: DatasetIterator, 
          optimizer: torch.optim.Optimizer, 
          criterion: nn.Module, clip: float, 
          use_copy=USE_COPY) -> float:
    """Run one training pass over `iterator`.

    For every batch: teacher-forced forward pass (the target without its last
    token as decoder input, the target without its first token as labels),
    backward pass, gradient-norm clipping to `clip`, optimizer step. With
    `use_copy`, targets are first cleaned with `fix_extended_vocab`.

    Returns:
        The loss of the LAST batch.
        NOTE(review): every batch loss is collected but only the last one is
        returned, not the epoch mean - confirm this is intended.
    """
    model.train()
    batch_losses = []

    for batch in iterator:
        src, trg = batch.English, batch.SPARQL

        if use_copy:
            trg = fix_extended_vocab(src, trg, OUT_TRG_DIM)

        optimizer.zero_grad()

        logits, _ = model(src, trg[:, :-1])

        # flatten to (tokens, vocab) and (tokens,) for the criterion
        flat_logits = logits.contiguous().view(-1, logits.shape[-1])
        flat_targets = trg[:, 1:].contiguous().view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())

    return batch_losses[-1]

In [None]:
# Eval function of the model
def evaluate(model: nn.Module, 
             iterator: DatasetIterator, 
             criterion: nn.Module, 
             use_copy=USE_COPY) -> float:
    """Compute the loss over `iterator` in eval mode, without gradients.

    Uses the same teacher-forced input/label shift as `train`. With
    `use_copy`, targets are first cleaned with `fix_extended_vocab`.

    Returns:
        The loss of the LAST batch.
        NOTE(review): every batch loss is collected but only the last one is
        returned, not the mean - confirm this is intended.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.English, batch.SPARQL

            if use_copy:
                trg = fix_extended_vocab(src, trg, OUT_TRG_DIM)

            logits, _ = model(src, trg[:, :-1])

            # flatten to (tokens, vocab) and (tokens,) for the criterion
            flat_logits = logits.contiguous().view(-1, logits.shape[-1])
            flat_targets = trg[:, 1:].contiguous().view(-1)

            batch_losses.append(criterion(flat_logits, flat_targets).item())

    return batch_losses[-1]

## Model

In [None]:
class Encoder(nn.Module):
    """Convolutional encoder of the ConvSeq2Seq model.

    Token and position embeddings are summed, projected to the convolution
    width, passed through a stack of gated (GLU) 1-D convolutions with residual
    connections, and projected back to the embedding size.
    """
    def __init__(self,
                 input_dim: int, # SV (base source vocab length)
                 emb_dim: int,   # E (embed)
                 hid_dim: int,   # C (conv)
                 dropout: float,
                 device: torch.device,
                 padding_idx: int,
                 max_length=MAX_LENGTH):
        super().__init__()

        self.padding_idx = padding_idx

        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, emb_dim, self.padding_idx)
        # Position table has max_length rows; forward() indexes it with values up to
        # src_len + padding_idx, which therefore has to stay below max_length.
        self.pos_embedding = nn.Embedding(max_length, emb_dim, self.padding_idx)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        in_channels = hid_dim

        # One entry per conv layer; projections[i] is None when the residual
        # already has the layer's output width.
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.residuals = []

        # Hardcoded for our purposes, but definitely could be changed/passed as a parameter
        # Each tuple is (out_channels, kernel_size, residual), where `residual` is how many
        # layers back the residual connection reaches.
        convolutions = [(hid_dim, 3, 1)] * 9 + [(2 * hid_dim, 3, 1)] * 4 + [(4 * hid_dim, 1, 1)] * 2
        layer_in_channels = [in_channels]
        
        for (out_channels, kernel_size, residual) in convolutions:
            
            residual_dim = layer_in_channels[-residual]
            self.projections.append(
                nn.Linear(residual_dim, out_channels)
                if residual_dim != out_channels
                else None
            )

            # The conv outputs 2 * out_channels; the GLU in forward() halves it back.
            self.convolutions.append(
                nn.Conv1d(
                    in_channels,
                    out_channels * 2,
                    kernel_size,
                    padding = kernel_size // 2
                )
            )
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)
        
        self.hid2emb = nn.Linear(in_channels, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode a batch of source token indices.

        Returns `(conved, combined, encoder_padding_mask)`; the mask is None
        when the batch contains no padding index.
        """
        # src = B x S (token indices)
        batch_size = src.shape[0] # B
        src_len = src.shape[1] # S (longest sentence in src batch)

        pos = torch.arange(self.padding_idx + 1, src_len + 1 + self.padding_idx).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # pos = [padding_idx + 1, ..., padding_idx + src_len] for every row -> B x S
        encoder_padding_mask = src.eq(self.padding_idx)  # B x S

        if not encoder_padding_mask.any():
          encoder_padding_mask = None
        else:
          # padded positions get the padding index as their position
          pos.masked_fill_(encoder_padding_mask, self.padding_idx)

        # embed tokens and positions
        tok_embedded = self.tok_embedding(src) # B x S x E
        pos_embedded = self.pos_embedding(pos) # B x S x E

        # combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded) # B x S x E

        # pass embedded through linear layer to convert from emb dim (E) to hid dim (C)
        x = self.emb2hid(embedded) # B x S x C

        # permute for convolutional layer
        x = x.permute(0, 2, 1) # B x C x S

        # convolutions
        residuals = [x]
        for proj, conv, res_layer in zip(
            self.projections, self.convolutions, self.residuals
        ):
            if res_layer > 0:
                residual = residuals[-res_layer]

                # no-op self-assignment
                residual = residual
                if proj is not None:
                  # nn.Linear works on the last dim, so move channels last and back
                  residual = proj(residual.permute(0,2,1))
                  residual = residual.permute(0,2,1) # B x C x S
            else:
                residual = None

            # zero the padded positions before every convolution
            if encoder_padding_mask is not None:
              x = x.masked_fill(encoder_padding_mask.unsqueeze(1), 0)

            x = conv(self.dropout(x)) # B x 2C x S (C = this layer's out_channels)
            x = F.glu(x, dim=1) # B x C x S

            if residual is not None:
              x = (x + residual) * math.sqrt(0.5)

            residuals.append(x)

        # permute and convert back to emb dim
        x = self.hid2emb(x.permute(0, 2, 1)) # B x S x E

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
        
        # Element-wise sum output (conved) and input (embedded) to be used for attention
        combined = (x + embedded) * math.sqrt(0.5)

        # x -> B x S x E
        # combined -> B x S x E
        # encoder_padding_mask -> B x S (or None)

        return x, combined, encoder_padding_mask

In [None]:
class AttentionLayer(nn.Module):
    """Dot-product attention of one decoder layer over the encoder outputs."""
    def __init__(self, conv_channels: int, embed_dim: int, bmm=None):
        super().__init__()
        self.in_projection = nn.Linear(conv_channels, embed_dim) # C to E
        self.out_projection = nn.Linear(embed_dim, conv_channels) # E to C
        # batched matrix product; torch.bmm unless a replacement is supplied
        self.bmm = bmm if bmm is not None else torch.bmm

    def forward(self, 
                x: torch.Tensor, 
                target_embedding: torch.Tensor, 
                encoder_out: torch.Tensor, 
                encoder_padding_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Attend over the encoder outputs.

        `encoder_out` is indexed as a pair: `encoder_out[0]` is multiplied with
        the query to get the scores, `encoder_out[1]` is averaged with the
        resulting weights. `encoder_padding_mask` may be None.

        Returns `(x, attn_scores)`: the attended decoder state and the softmax
        attention weights.
        """
        residual = x

        # x -> B x T x C
        # target_embedding -> # B x T x E
        # encoder_out[0] -> B x E x S
        # encoder_out[1] -> B x S x E
        # encoder_padding_mask -> B x S

        x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5) # B x T x E
        x = self.bmm(x, encoder_out[0]) # B x T x S

        # padded source positions get -inf so the softmax gives them zero weight
        if encoder_padding_mask is not None:
            x = (
                x.float()
                .masked_fill(encoder_padding_mask.unsqueeze(1), float("-inf"))
                .type_as(x)
            ) # B x T x S

        # softmax over last dim
        sz = x.size()
        x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
        x = x.view(sz)
        attn_scores = x # B x T x S

        x = self.bmm(x, encoder_out[1]) # B x T x E

        # scale attention output (respecting potentially different lengths)
        s = encoder_out[1].size(1) # S

        if encoder_padding_mask is None:
            # s * sqrt(1 / s) == sqrt(s)
            x = x * (s * math.sqrt(1.0 / s)) # B x T x E
        else:
            s = s - encoder_padding_mask.type_as(x).sum(
                dim=1, keepdim=True
            )  # exclude padding -> B x 1 (number of real source tokens per row)
            s = s.unsqueeze(-1) # B x 1 x 1
            x = x * (s * s.rsqrt()) # B x T x E

        # project back
        x = (self.out_projection(x) + residual) * math.sqrt(0.5) # B x T x C

        # attn_scores -> B x T x S
        return x, attn_scores

In [None]:
class Decoder(nn.Module):
    """Convolutional decoder of the ConvSeq2Seq model.

    Every layer is a left-padded (causal) gated convolution followed by an
    AttentionLayer over the encoder outputs and a residual connection. The
    last state is projected to `output_dim` and then to vocabulary logits.
    """
    def __init__(self,
                 num_embeddings: int, # TV (base target vocab length)
                 output_dim: int,     # O 
                 emb_dim: int,        # E
                 hid_dim: int,
                 dropout: float,
                 device: torch.device,
                 padding_idx: int,
                 max_length=MAX_LENGTH):
        super().__init__()

        self.padding_idx = padding_idx

        self.device = device

        self.tok_embedding = nn.Embedding(num_embeddings, emb_dim, self.padding_idx)
        # Position table has max_length rows; forward() indexes it with values up to
        # trg_len + padding_idx, which therefore has to stay below max_length.
        self.pos_embedding = nn.Embedding(max_length, emb_dim, self.padding_idx)

        # One entry per conv layer; projections[i] is None when the residual
        # already has the layer's output width.
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.residuals = []

        self.dropout = nn.Dropout(dropout)

        in_channels = hid_dim
        layer_in_channels = [in_channels]

        # Should be the same as in the encoder
        # Each tuple is (out_channels, kernel_size, residual)
        convolutions = [(hid_dim, 3, 1)] * 9 + [(2*hid_dim, 3, 1)] * 4 + [(4*hid_dim, 1, 1)] * 2

        for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
            residual_dim = layer_in_channels[-residual] # width of the layer the residual comes from
            self.projections.append(
                nn.Linear(residual_dim, out_channels) 
                if residual_dim != out_channels
                else None
            )

            # No built-in padding here: forward() pads on the left only.
            # The conv outputs 2 * out_channels; the GLU in forward() halves it back.
            self.convolutions.append(
                nn.Conv1d(
                    in_channels,
                    out_channels * 2,
                    kernel_size
                )
            )           

            self.attention.append(
                AttentionLayer(out_channels, emb_dim)
            )

            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        # Despite its name, hid2emb maps to output_dim (O), not to emb_dim.
        self.hid2emb = nn.Linear(in_channels, output_dim)
        self.fc_out = nn.Linear(output_dim, num_embeddings)


    def forward(self, 
                trg: torch.Tensor, 
                encoder_conved: torch.Tensor, 
                encoder_combined: torch.Tensor, 
                src_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Decode a batch of target prefixes.

        `src_mask` is the encoder padding mask (may be None). Returns
        `(output, attn_scores)`: vocabulary logits for every target position
        and the attention weights of the last layer.
        """

        # encoder_conved -> B x S x E
        # encoder_combined -> B x S x E
        # src_mask -> # B x S

        batch_size = trg.shape[0] # B
        trg_len = trg.shape[1] # T (longest sentence in target batch)

        encoder_conved = encoder_conved.permute(0, 2, 1) # B x E x S

        # create position tensor
        pos = torch.arange(self.padding_idx + 1, trg_len + 1 + self.padding_idx).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # pos = [padding_idx + 1, ..., padding_idx + trg_len] for every row -> B x T

        decoder_padding_mask = trg.eq(self.padding_idx)  # -> B x T
        if not decoder_padding_mask.any():
          decoder_padding_mask = None
        else:
          # padded positions get the padding index as their position
          pos.masked_fill_(decoder_padding_mask, self.padding_idx)

        # embed tokens and positions
        tok_embedded = self.tok_embedding(trg) # B x T x E
        pos_embedded = self.pos_embedding(pos) # B x T x E

        # combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded) # B x T x E

        # pass embedded through linear layer to go through emb dim (E) -> hid dim (C)
        x = self.emb2hid(embedded) # B x T x C

        batch_size = x.shape[0]
        residuals = [x]
        for proj, conv, attention, res_layer in zip(
            self.projections, self.convolutions, self.attention, self.residuals
        ):
            if res_layer > 0:
                residual = residuals[-res_layer] # B x T x C
                residual = residual if proj is None else proj(residual)  # B x T x C
            else:
                residual = None

            hid_dim = x.shape[2] # C (width of this layer's input)
            x = self.dropout(x) # B x T x C
            # we permute it here as opposed to before the conv in the encoder because of the attn layer
            x = x.permute(0, 2, 1) # B x C x T

            # K is kernel size
            # Left-pad with K - 1 columns so that position t only sees positions <= t.
            # NOTE(review): the pad is filled with the padding *index* value, not with zeros.
            padding = torch.zeros(batch_size, hid_dim, conv.kernel_size[0] - 1) # B x C x (K - 1)
            padding = padding.fill_(self.padding_idx).to(self.device) # B x C x (K - 1)
            padded_x = torch.cat((padding, x), dim = 2) # B x C x (K - 1 + T)

            x = conv(padded_x)     # B x 2C x T (C = this layer's out_channels)
            x = F.glu(x, dim=1)    # B x C x T
            x = x.permute(0, 2, 1) # B x T x C

            if attention is not None:
                attn, attn_scores = attention(
                    x, embedded, (encoder_conved, encoder_combined), src_mask

                )
                
            # NOTE(review): the attention output only reaches x through this branch;
            # with residual None, x would stay the plain conv output.
            if residual is not None:
              x = (attn + residual) * math.sqrt(0.5) # B x T x C

            residuals.append(x)

        x = self.hid2emb(x) # B x T x O
        output = self.fc_out(self.dropout(x)) # B x T x TV
        # attn_scores -> B x T x S (from the last layer)
        return output, attn_scores


In [None]:
class CopyLayerVocabExtend(nn.Module):
    """Copy (pointer) layer that mixes a generation distribution with a copy distribution.

    A learned scalar gate ``p_pointer`` decides, per target position, how much
    probability mass goes to copying source tokens (weighted by the attention
    scores) versus generating from the decoder output. When the source holds
    token ids beyond the decoder output width, the output is widened with
    zero logits so those ids can receive copied probability mass.
    """

    def __init__(self, decoder: Decoder):
        super().__init__()
        # The gate reads the decoder output, so that output's last dimension
        # must equal the decoder's number of token embeddings.
        self.switch = nn.Linear(decoder.tok_embedding.num_embeddings, 1)

    def forward(self,
                src: torch.Tensor,
                output: torch.Tensor,
                attention: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return log-probabilities over the (possibly extended) vocabulary.

        Args:
            src: source token ids, indexed as a 2-D tensor (batch x source length).
            output: decoder scores, a 3-D tensor whose last dimension is the
                base vocabulary size.
            attention: attention scores scattered onto the source ids; must
                cover batch x target length x source length.

        Returns:
            A tuple ``(log_probs, attention)``; ``attention`` is returned unchanged.
        """
        # probability of copying vs. generating
        p_pointer = torch.sigmoid(self.switch(output))

        # Does the source contain out-of-vocabulary ids? E.g. an OOV id of 1490
        # with a base size of 1000 requires 490 extra output slots.
        # The maximum is read once (one device sync instead of two).
        n_extra = int(src.max()) + 1 - output.shape[-1]
        if n_extra > 0:
            # new_zeros allocates directly on output's device with its dtype
            extended = output.new_zeros((output.shape[0], output.shape[1], n_extra))
            output = torch.cat((output, extended), dim=2)

        # every target position copies from the same source ids
        copy_index = src.unsqueeze(1).repeat(1, output.shape[1], 1)
        generated = (1 - p_pointer) * F.softmax(output, dim=2)
        # 1e-10 keeps the logarithm finite where the mixture is exactly zero
        output = generated.scatter_add(2, copy_index, p_pointer * attention) + 1e-10
        return torch.log(output), attention

In [None]:
class CNNSeq2Seq(nn.Module):
    """Encoder/decoder wrapper with an optional copy layer on top of the decoder."""

    def __init__(self,
                 encoder: Encoder,
                 decoder: Decoder,
                 copy_layer: Optional[CopyLayerVocabExtend] = None):
        super().__init__()

        # Sub-modules are registered in this exact order so that parameter
        # ordering and state_dict keys stay the same.
        self.encoder = encoder
        self.decoder = decoder
        # Not read by forward(); it still contributes weights to the state_dict.
        self.switch = nn.Linear(decoder.tok_embedding.num_embeddings, 1)
        self.copy_layer = copy_layer

    def forward(self, src: torch.Tensor, trg: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run encoder and decoder, then the copy layer when one is configured.

        With a copy layer, ids at or beyond an embedding table's size are
        replaced by index 0 before the lookup, while the copy layer itself
        receives the original, unmasked ``src``.
        """
        copying = self.copy_layer is not None

        enc_input, dec_input = src, trg
        if copying:
            src_limit = self.encoder.tok_embedding.num_embeddings
            trg_limit = self.decoder.tok_embedding.num_embeddings
            enc_input = src.masked_fill(src >= src_limit, 0)
            dec_input = trg.masked_fill(trg >= trg_limit, 0)

        encoder_conved, encoder_combined, encoder_padding_mask = self.encoder(enc_input)
        output, attention = self.decoder(
            dec_input, encoder_conved, encoder_combined, encoder_padding_mask
        )

        if copying:
            output, attention = self.copy_layer(src, output, attention)
        return output, attention
      

## Main

In [None]:
# Run counter: used below to build the per-run output folder and run name,
# and incremented by the last cell once the outputs have been moved.
iteration = 1

In [None]:
# START RERUN HERE ===================================================================================================================================================================================================
# Names derived from the current value of `iteration`, so every rerun gets its own output folder and run name.
OUT_DRIVE_FOLDER = f'{OUT_DRIVE_FOLDER_BASE}/{iteration}'
RUN_NAME = f'{MODEL_TYPE}_{DATASET_NAME}_{iteration}'
GROUP_NAME = f'{MODEL_TYPE}_{DATASET_NAME}'
WANDBAI_TAGS = [MODEL_TYPE, COPY_FLAG, DATASET_FAMILY]

In [None]:
# Display the current run counter
iteration

In [None]:
# Local output folder for this run (config, vocabs, checkpoints, reports).
# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs('out', exist_ok=True)

In [None]:
# Load data
# DATASET is read as a UTF-8 JSON file; the cells below iterate over it as a list of entry dicts.
with open(DATASET, 'r', encoding='utf-8') as f:
  dataset = json.load(f)
  
# Optional in-place shuffle of the entries
if RANDOM:
  random.shuffle(dataset)

In [None]:
# Split into sets
def _to_example(entry):
  """Return the (question, query) pair of one dataset entry.

  The question is lower-cased and stripped of '<' and '>' characters.
  """
  # You will have to manually change the dict keys depending on the version of LC-QuAD you want
  question = entry['original_data']['lcquad']['intermediary_question']
  question = question.lower().replace('<', '').replace('>', '')
  return (question, entry['query']['interm_sparql'])


def _examples_for(split_name):
  """Collect the example pairs of every entry whose 'set' field equals split_name."""
  return [_to_example(entry) for entry in dataset if entry['set'] == split_name]


# raw test entries are kept as-is for the metrics computed after training
test_entries = [entry for entry in dataset if entry['set'] == 'test']

train_examples = _examples_for('train')
valid_examples = _examples_for('valid')
test_examples = _examples_for('test')

In [None]:
# Init Fields
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SRC, TRG = gen_data_field()
# Field names: 'English' is the source side, 'SPARQL' the target side
data_fields = [('English', SRC), ('SPARQL', TRG)]
train_data, valid_data, test_data = gen_train_test_val_sets(train_examples, valid_examples, test_examples, data_fields)

In [None]:
# build base vocabs
if not USE_COPY:
  # without the copy mechanism both vocabularies come from the training set only
  SRC.build_vocab(train_data, min_freq=1)
  TRG.build_vocab(train_data, min_freq=1)

else:
  # abstract_KB_elems returns, per split, the vocabulary data and the removed resources
  vocab_data_train, removed_resources_train = abstract_KB_elems(train_data)
  vocab_data_valid, removed_resources_valid = abstract_KB_elems(valid_data)
  vocab_data_test, removed_resources_test = abstract_KB_elems(test_data)

  # base vocabularies are built from the three splits together
  vocab_data = {
      'English': vocab_data_train['English'] + vocab_data_valid['English'] + vocab_data_test['English'],
      'SPARQL': vocab_data_train['SPARQL'] + vocab_data_valid['SPARQL'] + vocab_data_test['SPARQL'],
  }

  SRC.build_vocab(vocab_data['English'], min_freq=1, max_size=None)
  TRG.build_vocab(vocab_data['SPARQL'], min_freq=1, max_size=None)

  # de-duplicated removed resources of each side ...
  removed_resources_src = set(
      removed_resources_train['English'] + removed_resources_valid['English'] + removed_resources_test['English']
  )
  removed_resources_trg = set(
      removed_resources_train['SPARQL'] + removed_resources_valid['SPARQL'] + removed_resources_test['SPARQL']
  )

  # ... merged into the single set later used to extend both vocabularies
  removed_resources = removed_resources_src | removed_resources_trg

# snapshots of the fields before any vocabulary extension
BASE_SRC = copy.deepcopy(SRC)
BASE_TRG = copy.deepcopy(TRG)

In [None]:
# Sizes of the vocabularies at this point (before the optional extension in the next cell)
IN_SRC_DIM = len(SRC.vocab)
OUT_TRG_DIM = len(TRG.vocab)
print('BASE LENGTH SRC', IN_SRC_DIM)
print('BASE LENGTH TRG', OUT_TRG_DIM)

In [None]:
# extend vocabs if use copy
if USE_COPY:
  # The smaller base vocab gets `padding` set to the size difference
  # (presumably filler entries -- confirm in VocabDup) so that the KB tokens
  # start at the same index on both sides.
  KB_vocab_src_extension = VocabDup(list(removed_resources), padding=max(0, OUT_TRG_DIM - IN_SRC_DIM))
  KB_vocab_trg_extension = VocabDup(list(removed_resources), padding=max(0, IN_SRC_DIM - OUT_TRG_DIM))

  SRC.vocab.extend(KB_vocab_src_extension)   
  TRG.vocab.extend(KB_vocab_trg_extension)

  # very important!!!
  # Lengths are checked first so the index-wise comparison below cannot run past the end of TRG.
  assert len(SRC.vocab) == len(TRG.vocab), 'extended SRC and TRG vocabs must have the same size'

  # The shared region starts after the larger base vocab (same offset as the
  # 'KB VOCAB' count printed below); below it the two sides hold different entries.
  kb_start = max(IN_SRC_DIM, OUT_TRG_DIM)
  for i in range(kb_start, len(SRC.vocab)):
    assert SRC.vocab.itos[i] == TRG.vocab.itos[i], f'SRC/TRG vocab mismatch at index {i}'


  print('BASE SRC VOCAB', len(BASE_SRC.vocab))
  print('BASE TRG VOCAB', len(BASE_TRG.vocab))
  print('EXTENDED SRC VOCAB', len(SRC.vocab))
  print('EXTENDED TRG VOCAB', len(TRG.vocab))
  print('KB VOCAB', len(SRC.vocab) - max(IN_SRC_DIM, OUT_TRG_DIM))
else:
  print("NOT USING COPY")

In [None]:
# Make data into iterators
# One iterator per split, all with the same batch size and device;
# sort_key is the length of each example's SPARQL field.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.SPARQL)
)

In [None]:
# Start a wandb run (only when enabled) and record this run's hyper-parameters
if USE_WANDBAI:
  # NOTE(review): GROUP_NAME is only stored in the config under the key "groupe";
  # it is not passed to wandb.init as a separate argument -- confirm this is intended.
  wandb_config = {
    "dataset_family": DATASET_FAMILY,
    "dataset": DATASET_NAME,
    "use_copy": USE_COPY,
    "batch_size": BATCH_SIZE,
    "hidden_dims": HID_DIM,
    "encoder_dropout": ENC_DROPOUT,
    "decoder_dropout": DEC_DROPOUT,
    "learning_rate": LEARNING_RATE,
    "n_epochs": N_EPOCHS,
    "clip": CLIP,
    "groupe": GROUP_NAME
  }

  wandb.init(project="final", entity="rooose", config=wandb_config, name=RUN_NAME, tags=WANDBAI_TAGS)

In [None]:
# generate and save config and vocabs files for inference
# Model dimensions are the base vocabulary sizes measured before any extension.
INPUT_DIM, NUM_EMBEDDINGS = IN_SRC_DIM, OUT_TRG_DIM

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Everything the inference code needs to rebuild the model
encoder_config = {'ENC_DROPOUT': ENC_DROPOUT}
decoder_config = {
    'DEC_DROPOUT': DEC_DROPOUT,
    'OUT_DIM': DECODER_OUT_DIM,
}
config = {
    'INPUT_DIM': INPUT_DIM,
    'NUM_EMBEDDINGS': NUM_EMBEDDINGS,
    'EMB_DIM': EMB_DIM,
    'HID_DIM': HID_DIM,
    'USE_COPY': USE_COPY,
    'ENCODER': encoder_config,
    'DECODER': decoder_config,
    'SRC_PAD_IDX': SRC_PAD_IDX,
    'TRG_PAD_IDX': TRG_PAD_IDX,
}

with open('out/config.json', 'w') as f:
  json.dump(config, f)

# saves the extended vocab, could be optimized
for vocab, vocab_path in ((SRC.vocab, 'out/src_vocab.field'), (TRG.vocab, 'out/trg_vocab.field')):
  save_vocab(vocab, vocab_path)

In [None]:
# Define the model
# Encoder and decoder each receive the padding index of their own vocabulary
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_DROPOUT, device, SRC.vocab.stoi[SRC.pad_token])
dec = Decoder(NUM_EMBEDDINGS, DECODER_OUT_DIM, EMB_DIM, HID_DIM, DEC_DROPOUT, device, TRG.vocab.stoi[TRG.pad_token])
# The copy layer is only created when the copy mechanism is enabled
copy_layer = CopyLayerVocabExtend(dec) if USE_COPY else None

model = CNNSeq2Seq(enc, dec, copy_layer).to(device)
model.apply(initialize_weights)

optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9) 
# Padding positions of the target are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX, label_smoothing=0.1)

if USE_WANDBAI:
  wandb.watch(model, log_freq = 20)

# Training state read and updated by the training loop
best_valid_loss = float('inf')
train_loss_list = []
val_loss_list = []

# Last computed BLEU values; they are logged every epoch but only refreshed every 10th epoch
bleu = 0
bleu_syntax = 0
bleu_no_copy_layer = 0

In [None]:
# Train
# Each epoch: one training pass, one validation pass, optional BLEU evaluation,
# logging, and a checkpoint whenever the validation loss improves.
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    log_dict = {"train_loss": train_loss, "valid_loss": valid_loss}

    # BLEU is computed on the test iterator every 10th epoch (including epoch 0)
    if epoch % 10 == 0:# and epoch > 0:
      bleu = batch_bleu(test_iterator, TRG, model, device)
      print('bleu:', bleu)

      if USE_COPY:
        bleu_syntax = batch_bleu(test_iterator, BASE_TRG, model, device) # syntax
        print('bleu syntax:', bleu_syntax)
        bleu_no_copy_layer = batch_bleu(test_iterator, BASE_TRG, model, device, use_copy=False) # not using the copy layer
        print('bleu no copy layer:', bleu_no_copy_layer)
    
    # On the other epochs the most recently computed BLEU values are logged again
    log_dict['bleu'] = bleu

    if USE_COPY:
      log_dict['bleu_syntax'] = bleu_syntax
      log_dict['bleu_no_copy_layer'] = bleu_no_copy_layer

    if USE_WANDBAI:
      wandb.log(log_dict)

    train_loss_list.append(train_loss)
    val_loss_list.append(valid_loss)

    # The measured epoch time includes the BLEU evaluation and the logging above
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # save checkpoints like so:
    # torch.save(model.state_dict(), f'./checkpoint-{epoch+1}.pt')

    # Keep the weights of the epoch with the lowest validation loss so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'out/best-model-state-dict.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # if there is a range error with the loss, make sure you are using SGD optimizer!
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


In [None]:
# Save the weights as they are after the last epoch (the best-validation weights are saved separately)
torch.save(model.state_dict(), 'out/current-state-dict.pt')

In [None]:
# Restore the weights with the lowest validation loss; switch the two lines to use the last-epoch weights instead
# model.load_state_dict(torch.load(f'out/current-state-dict.pt'))
model.load_state_dict(torch.load(f'out/best-model-state-dict.pt'))

In [None]:
# Finish wandbai logging
if USE_WANDBAI:
  # positional arguments -- presumably exit_code=0 and quiet=True; confirm against the installed wandb version
  wandb.finish(0, True)

In [None]:
# Plot train and validation loss curves
plt.plot(train_loss_list, label="Train loss",  marker='o')
plt.plot(val_loss_list, label = "Validation loss",  marker='o')
plt.xlabel('Epochs')
plt.title('Loss curves')
# NOTE(review): locs and labels are not used afterwards in this cell
locs, labels = plt.xticks()
# one tick per recorded epoch
plt.xticks(np.arange(0, len(train_loss_list)))
plt.legend()
plt.show()

In [None]:
# Calculate metrics from the best model (lowest loss, not always highest BLEU)

model.load_state_dict(torch.load(f'out/best-model-state-dict.pt'))
metrics = get_metrics(test_data, test_entries, SRC, TRG, model, device)

# Each metric value is printed as a percentage with two decimals
for m in metrics:
    print(f'{m} = {metrics[m]*100:.2f} %')

# INFERENCE

In [None]:
# Files written during training and read back by the Translator class
CONFIG_PATH = 'out/config.json'
SRC_VOCAB_PATH = 'out/src_vocab.field'
TRG_VOCAB_PATH = 'out/trg_vocab.field'
MODEL_PATH = 'out/best-model-state-dict.pt'
# Dataset scored with Translator.calculate_bleu in the inference cell
OOV_DATASET = 'oov_dataset.json'

In [None]:
# Translator class to facilitate inference, easier portability
# If you use it as a standalone script, make sure to also import the following utils elements:
# VocabDup, gen_data_field(), read_vocab(), extend_vocabulary(), translate_sentence()
# As well as the model architecture:
# Encoder, Decoder, CopyLayerVocabExtend and CNNSeq2Seq

class Translator:
    """Rebuild the trained model from the saved config, vocabularies and weights.

    The files are read from CONFIG_PATH, SRC_VOCAB_PATH, TRG_VOCAB_PATH and
    MODEL_PATH. The model is put in eval mode and can then translate single
    sentences or score a whole dataset with BLEU.
    """

    def __init__(self):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        with open(CONFIG_PATH, 'r') as f:
            config = json.load(f)

        self.src_pad_idx = config['SRC_PAD_IDX']
        base_vocab_size = max(config['INPUT_DIM'], config['NUM_EMBEDDINGS'])

        # load vocab: start from empty fields, then extend them with the saved vocabularies
        self.SRC, self.TRG = gen_data_field()
        self.SRC.build_vocab([], min_freq=1, max_size=None)
        self.TRG.build_vocab([], min_freq=1, max_size=None)

        src_vocab = VocabDup(read_vocab(SRC_VOCAB_PATH), base_vocab_size)
        trg_vocab = VocabDup(read_vocab(TRG_VOCAB_PATH), base_vocab_size)

        self.SRC = extend_vocabulary(self.SRC, src_vocab)
        self.TRG = extend_vocabulary(self.TRG, trg_vocab)

        # safeguard bcs of backwards compat issues:
        # configs without 'OUT_DIM' fall back to the global DECODER_OUT_DIM
        dec_out_dim = config['DECODER']['OUT_DIM'] if 'OUT_DIM' in config['DECODER'] else DECODER_OUT_DIM

        # define model
        self.enc = Encoder(config['INPUT_DIM'], config['EMB_DIM'], config['HID_DIM'], config['ENCODER']['ENC_DROPOUT'], self.device, config['SRC_PAD_IDX'])
        self.dec = Decoder(config['NUM_EMBEDDINGS'], dec_out_dim, config['EMB_DIM'], config['HID_DIM'], config['DECODER']['DEC_DROPOUT'], self.device, config['TRG_PAD_IDX'])
        self.copy_layer = CopyLayerVocabExtend(self.dec) if config['USE_COPY'] else None
        self.model = CNNSeq2Seq(self.enc, self.dec, self.copy_layer).to(self.device)

        # load pretrained model; map_location lets weights saved on a GPU
        # be loaded on a CPU-only machine
        loaded = torch.load(MODEL_PATH, map_location=self.device)
        self.model.load_state_dict(loaded)
        self.model.eval()

    def translate(self, sentence: str) -> List[str]:
        """Translate a whitespace-tokenised sentence and return the predicted tokens.

        The copy mechanism is used for prediction whenever the model has a copy layer.
        """
        translation, _ = translate_sentence(
            sentence.split(), self.SRC, self.TRG, self.model, self.device,
            predict_with_copy=self.model.copy_layer is not None)

        return translation

    def calculate_bleu(self, test_data: List[Dict]) -> float:
        """Translate every entry, write an error report and return the corpus BLEU.

        Args:
            test_data: entries providing ['question']['interm_question'],
                ['query']['interm_sparql'], '_id' and 'template_id'.

        Returns:
            The BLEU score of the predictions against the expected queries.
            A per-entry report is written to 'out/error_report_oov.json'.
        """
        print("Calculating BLEU score...")

        expected_trgs = []
        pred_trgs = []
        error_report = []

        for entry in test_data:
            src_sentence = entry['question']['interm_question'].lower()
            trg_sentence = entry['query']['interm_sparql']
            trg_tokens = trg_sentence.split()

            pred_trg = self.translate(src_sentence)
            # drop the last predicted token (presumably the end-of-sequence
            # marker -- confirm against translate_sentence)
            pred_trg = pred_trg[:-1]
            pred_trgs.append(pred_trg)
            expected_trgs.append([trg_tokens])

            error_entry = {
                'id': entry['_id'],
                'template_id': entry['template_id'],
                'src': src_sentence,
                'trg': trg_sentence,
                'predicted': ' '.join(pred_trg),
                # compare token lists: comparing the raw string with the
                # predicted token list would always be False
                'correct': trg_tokens == pred_trg
            }
            error_report.append(error_entry)

        bleu = bleu_score(pred_trgs, expected_trgs)

        with open('out/error_report_oov.json', 'w', encoding='utf-8') as f:
            json.dump(error_report, f, indent=4)

        return bleu

In [None]:
# Rebuild the model from the saved files and score it on the OOV dataset
translator = Translator()

with open(OOV_DATASET, 'r', encoding='utf-8') as f:
  dataset = json.load(f)
  
# The returned BLEU score is the cell's displayed value
translator.calculate_bleu(dataset)

# SAVE ALL

In [None]:
import shutil
# Move the local out/ folder to this run's Drive folder, then bump the run counter
# so that the next rerun derives a new output folder and run name.
shutil.move("out", OUT_DRIVE_FOLDER)
iteration += 1

In [None]:
# DONE :)