In [2]:
import json
import re

def tokenize(s):
    """Split a bracketed string into a flat list of tokens.

    Each '(' and each ')' becomes its own token; every other maximal run of
    characters that are neither whitespace nor parentheses becomes one token.
    """
    return re.findall(r'\(|\)|[^\s()]+', s)

def parse_tokens(tokens):
    """Build a nested-list tree from a flat token stream.

    '(' opens a new sub-list, ')' closes the innermost open sub-list, and any
    other token is appended to the innermost open list.  The returned value is
    the outermost list, so "(A b)" yields [['A', 'b']].

    Unbalanced input is tolerated rather than raising, because the token
    stream may come from a model prediction:
      * a ')' with nothing open is ignored;
      * sub-lists still open at the end of the stream are kept in place, as if
        the missing ')' tokens had been appended.
    """
    root = []
    # Stack of the lists currently being filled; the last entry is the innermost.
    open_lists = [root]
    for token in tokens:
        if token == '(':
            child = []
            # Attach the child to its parent right away so it is never lost,
            # even when its closing ')' is missing.
            open_lists[-1].append(child)
            open_lists.append(child)
        elif token == ')':
            # Never pop the root: a stray ')' has nothing to close.
            if len(open_lists) > 1:
                open_lists.pop()
        else:
            open_lists[-1].append(token)
    return root

# Labels whose node carries nothing but a text value.
_TEXT_LABELS = ("NUMBER", "SIZE", "STYLE", "VOLUME", "DRINKTYPE", "CONTAINERTYPE", "QUANTITY")
# Every label that is recognised as the head of a node.
_KNOWN_LABELS = ("ORDER", "PIZZAORDER", "DRINKORDER", "TOPPING", "COMPLEX_TOPPING", "NOT") + _TEXT_LABELS


def _words(items):
    """Join the string members of ``items`` with single spaces (sub-lists are skipped)."""
    return " ".join([item for item in items if isinstance(item, str)]).strip()


def _dict_children(items):
    """Normalise each item in turn and yield only the results that are dicts."""
    for item in items:
        normalised = normalize_structure(item)
        if isinstance(normalised, dict):
            yield normalised


def _wrap_order(pizzas, drinks):
    """Return {"ORDER": {...}} holding the non-empty lists, or {} when both are empty."""
    body = {}
    if pizzas:
        body["PIZZAORDER"] = pizzas
    if drinks:
        body["DRINKORDER"] = drinks
    return {"ORDER": body} if body else {}


def _order_node(children):
    """Collect the pizza and drink orders found among the children of an ORDER node."""
    pizzas, drinks = [], []
    for node in _dict_children(children):
        # A child may already carry grouped orders under these keys...
        for label, bucket in (("PIZZAORDER", pizzas), ("DRINKORDER", drinks)):
            if label in node:
                payload = node[label]
                if isinstance(payload, list):
                    bucket.extend(payload)
                else:
                    bucket.append(payload)
        # ...or be a single order tagged through its internal TYPE marker.
        kind = node.get("TYPE")
        if kind == "PIZZAORDER":
            pizzas.append(node)
        if kind == "DRINKORDER":
            drinks.append(node)
    return _wrap_order(pizzas, drinks)


def _fields_node(children, field_labels, type_name, with_toppings):
    """Build a PIZZAORDER/DRINKORDER dict from the text-valued children.

    When a field appears more than once the last occurrence wins.  Toppings are
    gathered under "AllTopping" only when ``with_toppings`` is true.
    """
    fields = dict.fromkeys(field_labels)
    toppings = []
    for node in _dict_children(children):
        kind = node.get("TYPE")
        if kind in fields:
            fields[kind] = node["VALUE"]
        elif with_toppings and kind == "TOPPING":
            toppings.append(node)
    built = {label: value for label, value in fields.items() if value is not None}
    if toppings:
        built["AllTopping"] = toppings
    # Internal marker; remove_type_keys strips it afterwards.
    built["TYPE"] = type_name
    return built


def _pizza_node(children):
    """Normalise the children of a PIZZAORDER node."""
    return _fields_node(children, ("NUMBER", "SIZE", "STYLE"), "PIZZAORDER", True)


def _drink_node(children):
    """Normalise the children of a DRINKORDER node."""
    return _fields_node(children, ("NUMBER", "VOLUME", "DRINKTYPE", "CONTAINERTYPE"), "DRINKORDER", False)


def _topping_node(children):
    """Normalise a plain TOPPING node: not negated, no quantity."""
    return {
        "TYPE": "TOPPING",
        "NOT": False,
        "Quantity": None,
        "Topping": _words(children)
    }


def _complex_topping_node(children):
    """Normalise a COMPLEX_TOPPING node into a topping dict that carries its quantity."""
    quantity = topping = None
    for node in _dict_children(children):
        kind = node.get("TYPE")
        if kind == "QUANTITY":
            quantity = node["VALUE"]
        elif kind == "TOPPING":
            topping = node["Topping"]
    return {
        "TYPE": "TOPPING",
        "NOT": False,
        "Quantity": quantity,
        "Topping": topping
    }


def _negated_node(children):
    """Return the first topping child flagged as negated, or None when there is none."""
    for node in _dict_children(children):
        if node.get("TYPE") == "TOPPING":
            node["NOT"] = True
            node.setdefault("Quantity", None)
            return node
    return None


def _merge_unlabelled(items):
    """Merge every order found among the members of a list that has no recognised label."""
    pizzas, drinks = [], []
    seen_order = False
    for node in _dict_children(items):
        if "ORDER" in node:
            inner = node["ORDER"]
            pizzas.extend(inner.get("PIZZAORDER", []))
            drinks.extend(inner.get("DRINKORDER", []))
        elif node.get("TYPE") == "PIZZAORDER":
            pizzas.append(node)
        elif node.get("TYPE") == "DRINKORDER":
            drinks.append(node)
        else:
            continue
        seen_order = True
    return _wrap_order(pizzas, drinks) if seen_order else None


# Builder for every structural (non text-valued) label.
_BUILDERS = {
    "ORDER": _order_node,
    "PIZZAORDER": _pizza_node,
    "DRINKORDER": _drink_node,
    "TOPPING": _topping_node,
    "COMPLEX_TOPPING": _complex_topping_node,
    "NOT": _negated_node,
}


def normalize_structure(tree):
    """Convert a parsed nested-list tree into nested dictionaries.

    A list whose first element is a recognised label is turned into the dict
    for that label; any other list is searched for orders, which are merged
    under a single "ORDER" key.  Intermediate dicts carry a "TYPE" entry that
    identifies them while the tree is assembled.

    Returns None for anything that is not a list, for an unlabelled list that
    contains no order, and for a NOT node without a topping; returns {} for an
    ORDER node that contains no pizza or drink order.
    """
    if not isinstance(tree, list):
        return None

    head = tree[0] if tree else None
    if not (isinstance(head, str) and head in _KNOWN_LABELS):
        return _merge_unlabelled(tree)

    children = tree[1:]
    if head in _TEXT_LABELS:
        return {
            "TYPE": head,
            "VALUE": _words(children)
        }
    return _BUILDERS[head](children)

def remove_type_keys(obj):
    """Strip every "TYPE" entry from a nested dict/list structure, in place.

    Dictionaries lose their "TYPE" key and have their remaining values
    visited; lists have each element visited; anything else is left alone.
    Nothing is returned.
    """
    if isinstance(obj, dict):
        obj.pop("TYPE", None)
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return
    for child in children:
        remove_type_keys(child)


def preprocess(text):
    """Turn a bracketed string into its normalised dictionary form.

    The text is tokenised, parsed into nested lists and normalised; the
    internal "TYPE" markers are then stripped from the result.  The return
    value is whatever normalize_structure produced, which may be None.
    """
    structure = normalize_structure(parse_tokens(tokenize(text)))
    remove_type_keys(structure)
    return structure


In [3]:
import json
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class Encoder(nn.Module):
    """Embedding + LSTM encoder that returns only the final recurrent state."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=1, dropout=0.5):
        """
        input_dim: size of the embedding table (number of source token ids)
        emb_dim:   embedding width
        hid_dim:   LSTM hidden size
        n_layers:  number of stacked LSTM layers
        dropout:   dropout probability applied to the embeddings; it is also
                   handed to the LSTM, but only when there is more than one layer
        """
        super().__init__()
        lstm_dropout = dropout if n_layers > 1 else 0
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            dropout=lstm_dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode src ([batch, src_len] token ids) and return (hidden, cell)."""
        token_vectors = self.embedding(src)  # [batch, src_len, emb_dim]
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Embedding + LSTM decoder that advances one token per call."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=1, dropout=0.5):
        """
        output_dim: size of the embedding table and of the output projection
        emb_dim:    embedding width
        hid_dim:    LSTM hidden size
        n_layers:   number of stacked LSTM layers
        dropout:    dropout probability applied to the embeddings; it is also
                    handed to the LSTM, but only when there is more than one layer
        """
        super().__init__()
        lstm_dropout = dropout if n_layers > 1 else 0
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            dropout=lstm_dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        input is a [batch] tensor holding one token id per sequence; hidden and
        cell are the recurrent state to continue from.  Returns
        (prediction, hidden, cell) where prediction is [batch, output_dim].
        """
        step_tokens = input.unsqueeze(1)  # [batch, 1]: a sequence of length one
        step_vectors = self.dropout(self.embedding(step_tokens))  # [batch, 1, emb_dim]
        step_output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))  # [batch, 1, hid_dim]
        prediction = self.fc_out(step_output.squeeze(1))  # [batch, output_dim]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Couples an encoder and a step-wise decoder into one trainable module."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Decode a full target sequence.

        src: [batch, src_len] token ids, tgt: [batch, tgt_len] token ids whose
        first column is fed to the decoder as the start token.  At every step
        one random draw decides, for the whole batch, whether the next decoder
        input is the reference token (probability teacher_forcing_ratio) or the
        decoder's own arg-max prediction.

        Returns a [batch, tgt_len, vocab] tensor of decoder outputs; position 0
        is never written and stays zero.
        """
        n_rows, n_steps = src.shape[0], tgt.shape[1]
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(n_rows, n_steps, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        step_input = tgt[:, 0]  # first token <sos>

        for t in range(1, n_steps):
            step_logits, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[:, t, :] = step_logits
            if random.random() < teacher_forcing_ratio:
                step_input = tgt[:, t]
            else:
                step_input = step_logits.argmax(1)

        return outputs
    

In [4]:
def build_vocab(texts, min_freq=2, special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"]):
    """Build token <-> index mappings from whitespace-split texts.

    The special tokens take the first indices in the order given.  They are
    followed by every other token that occurs at least min_freq times, most
    frequent first; tokens with equal counts keep first-seen order.

    Returns (word2idx, idx2word).
    """
    counts = {}
    for sentence in texts:
        for word in sentence.split():
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1

    # sorted() is stable, so equal counts stay in first-seen order
    by_frequency = sorted(counts, key=counts.get, reverse=True)

    vocab = special_tokens[:]
    vocab.extend(
        word for word in by_frequency
        if counts[word] >= min_freq and word not in special_tokens
    )

    word2idx = dict(zip(vocab, range(len(vocab))))
    idx2word = {index: word for word, index in word2idx.items()}
    return word2idx, idx2word

def numericalize(text, word2idx, max_len, sos_token="<sos>", eos_token="<eos>", pad_token="<pad>", unk_token="<unk>"):
    """Convert a whitespace-tokenised text into exactly max_len vocabulary indices.

    The text is wrapped in sos_token / eos_token.  A sequence longer than
    max_len is cut to max_len and its last position overwritten with eos_token;
    a shorter one is right-padded with pad_token.  Tokens missing from word2idx
    map to the index of unk_token.

    Parameters:
        text:      source or target string, split on whitespace
        word2idx:  token -> index mapping; must contain unk_token
        max_len:   length of the returned list
        sos_token, eos_token, pad_token, unk_token: special token strings

    Raises:
        KeyError: if unk_token is not in word2idx.
    """
    tokens = [sos_token] + text.split() + [eos_token]
    # truncate or pad
    if len(tokens) > max_len:
        tokens = tokens[:max_len]
        tokens[-1] = eos_token  # ensure ends with EOS
    else:
        tokens = tokens + [pad_token] * (max_len - len(tokens))
    # Resolve the fallback index once instead of once per token.
    unk_idx = word2idx[unk_token]
    return [word2idx.get(token, unk_idx) for token in tokens]

def collate_fn(batch, src_word2idx, tgt_word2idx, max_src_len, max_tgt_len):
    """Turn a batch of (source text, target text) pairs into two index tensors.

    Each text is converted with numericalize using its own vocabulary and
    fixed length, so the result is a pair of long tensors shaped
    [batch, max_src_len] and [batch, max_tgt_len].
    """
    src_texts, tgt_texts = zip(*batch)

    def encode(texts, vocab, length):
        rows = [numericalize(text, vocab, length) for text in texts]
        return torch.tensor(rows, dtype=torch.long)

    return (
        encode(src_texts, src_word2idx, max_src_len),
        encode(tgt_texts, tgt_word2idx, max_tgt_len),
    )

In [5]:
def train_model(model, dataloader, optimizer, criterion, clip=1):
    """Run one training pass over dataloader and return the mean batch loss.

    Every batch is moved to model.device and run through the model with its
    default teacher-forcing ratio.  The first target position (the start
    token) is dropped before the loss is computed, and the gradient norm is
    clipped to clip before the optimizer step.
    """
    model.train()
    running_loss = 0
    batches = tqdm(dataloader, desc="Training", leave=False)
    for src, tgt in batches:
        src, tgt = src.to(model.device), tgt.to(model.device)

        optimizer.zero_grad()
        logits = model(src, tgt)

        # Flatten to [batch * (tgt_len - 1), vocab] vs [batch * (tgt_len - 1)]
        vocab_size = logits.shape[-1]
        flat_logits = logits[:,1:].contiguous().view(-1, vocab_size)
        flat_targets = tgt[:,1:].contiguous().view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
        batches.set_postfix({"loss": f"{loss.item():.3f}"})

    return running_loss / len(dataloader)


def evaluate_loss(model, dataloader, criterion):
    """Return the mean batch loss over dataloader without updating the model.

    The model is put in eval mode and run with teacher forcing switched off
    (ratio 0), so each decoder step is fed the model's own previous
    prediction.  As in training, the first target position is excluded from
    the loss.
    """
    model.eval()
    running_loss = 0
    batches = tqdm(dataloader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for src, tgt in batches:
            src, tgt = src.to(model.device), tgt.to(model.device)
            logits = model(src, tgt, teacher_forcing_ratio=0)

            vocab_size = logits.shape[-1]
            flat_logits = logits[:,1:].contiguous().view(-1, vocab_size)
            flat_targets = tgt[:,1:].contiguous().view(-1)

            loss = criterion(flat_logits, flat_targets)
            running_loss += loss.item()
            batches.set_postfix({"val_loss": f"{loss.item():.3f}"})
    return running_loss / len(dataloader)


def generate_output(model, src_tokens, tgt_word2idx, idx2tgt, max_len=None):
    """Greedily decode one source sequence into a space-joined target string.

    Parameters:
        model:        object exposing .encoder, .decoder, .device and .eval()
        src_tokens:   [1, src_len] tensor of source indices; only a single
                      sequence is supported because each step reads the
                      prediction with .item()
        tgt_word2idx: target vocabulary, must contain "<sos>" and "<eos>"
        idx2tgt:      index -> target token mapping
        max_len:      maximum number of tokens to generate; when None the
                      limit is twice the (padded) source length, which was
                      the only behaviour before this parameter existed

    Decoding stops at the first "<eos>" prediction (not included in the
    result) or after max_len steps, whichever comes first.
    """
    model.eval()
    sos_idx = tgt_word2idx["<sos>"]
    eos_idx = tgt_word2idx["<eos>"]
    if max_len is None:
        max_len = src_tokens.shape[1] * 2  # heuristic limit

    decoded_tokens = []
    with torch.no_grad():
        hidden, cell = model.encoder(src_tokens)
        input_token = torch.tensor([sos_idx], device=model.device)

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)
            # The arg-max prediction becomes the next decoder input.
            input_token = output.argmax(1)
            next_idx = input_token.item()
            if next_idx == eos_idx:
                break
            decoded_tokens.append(next_idx)

    return " ".join(idx2tgt[i] for i in decoded_tokens)

def exact_match_accuracy(model, dev_dataset, src_word2idx, tgt_word2idx, idx2tgt, max_src_len, batch_size=32):
    """Fraction of dev samples whose decoded output matches the reference.

    Each source text is numericalized, decoded one sample at a time with
    generate_output, and both the prediction and the reference string are
    passed through preprocess; a sample counts as correct when the two
    resulting structures compare equal.

    A prediction that preprocess cannot parse (unbalanced parentheses make it
    raise IndexError) is scored as a miss instead of aborting the evaluation.

    Returns 0.0 when the dataset yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
    for srcs, tgts in dev_loader:
        # Convert srcs to tensors
        src_indices = [numericalize(s, src_word2idx, max_src_len) for s in srcs]
        src_tensor = torch.tensor(src_indices, dtype=torch.long, device=model.device)

        # Decode each sample in the batch
        for src_row, reference in zip(src_tensor, tgts):
            single_src = src_row.unsqueeze(0)  # [1, max_src_len]
            predicted_text = generate_output(model, single_src, tgt_word2idx, idx2tgt)
            # Convert both predicted and reference TOP into JSON using preprocess
            ref_json = preprocess(reference)
            try:
                is_match = preprocess(predicted_text) == ref_json
            except IndexError:
                # Malformed bracket structure in the prediction: not a match.
                is_match = False
            if is_match:
                correct += 1
            total += 1

    return correct / total if total > 0 else 0.0

In [6]:
class PizzaDataset(Dataset):
    """Pairs of (source text, target text) read from a JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object.  The values
    stored under SRC_NAME and TOP_NAME are stripped, and the pair is kept only
    when both are non-empty.
    """

    def __init__(self, jsonl_path, SRC_NAME, TOP_NAME, max_samples=None, max_src_len=128, max_tgt_len=256):
        """
        jsonl_path:   path of the JSON-lines file
        SRC_NAME:     key of the source text in each JSON object
        TOP_NAME:     key of the target text in each JSON object
        max_samples:  stop after this many *lines* of the file (None = read all);
                      lines that are skipped still count towards the limit
        max_src_len, max_tgt_len: stored on the instance; not used by this class
        """
        self.src_texts = []
        self.tgt_texts = []
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if max_samples is not None and i >= max_samples:
                    break
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) would make json.loads raise.
                    continue
                entry = json.loads(line)
                src = entry.get(SRC_NAME, "").strip()
                tgt = entry.get(TOP_NAME, "").strip()
                if src and tgt:
                    self.src_texts.append(src)
                    self.tgt_texts.append(tgt)

        # Shuffle is not applied here for dev datasets.

    def __len__(self):
        """Number of kept (source, target) pairs."""
        return len(self.src_texts)

    def __getitem__(self, idx):
        """Return the (source text, target text) pair at position idx."""
        return self.src_texts[idx], self.tgt_texts[idx]

In [7]:
# Locations of the JSON-lines files, relative to the notebook's working directory.
train_jsonl_path = "../dataset/PIZZA_train.json"  
dev_jsonl_path = "../dataset/PIZZA_dev.json"      

# The second and third arguments are the JSON keys of the source and target text.
# Training targets come from "train.TOP-DECOUPLED" while dev references come from "dev.TOP".
# max_samples limits how many lines of each file are read.
train_dataset = PizzaDataset(train_jsonl_path,"train.SRC", "train.TOP-DECOUPLED", max_samples=2500000) 
dev_dataset = PizzaDataset(dev_jsonl_path,"dev.SRC", "dev.TOP", max_samples=10000)

In [8]:
# Peek at one (source, target) training pair; indexing calls __getitem__ for us.
train_dataset[1]

('large pie with green pepper and with extra peperonni',
 '(ORDER (PIZZAORDER (SIZE large ) (TOPPING green pepper ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) )')

In [9]:
# Number of (source, target) pairs kept in each dataset.
len(train_dataset), len(dev_dataset)

(2456446, 348)

In [10]:
# Build vocabularies from train data
# min_freq=1 keeps every token that occurs in the training texts.
src_word2idx, src_idx2word = build_vocab(train_dataset.src_texts, min_freq=1)
tgt_word2idx, tgt_idx2word = build_vocab(train_dataset.tgt_texts, min_freq=1)

# Index of the padding token in the target vocabulary (later passed to the loss as ignore_index).
pad_idx = tgt_word2idx["<pad>"]

In [11]:
# Display both source-vocabulary mappings in full.
src_word2idx, src_idx2word

({'<pad>': 0,
  '<sos>': 1,
  '<eos>': 2,
  '<unk>': 3,
  'and': 4,
  'with': 5,
  'a': 6,
  'three': 7,
  'pizzas': 8,
  'pizza': 9,
  "i'd": 10,
  'like': 11,
  'cheese': 12,
  'four': 13,
  'pies': 14,
  'party': 15,
  'five': 16,
  'american': 17,
  'sized': 18,
  'one': 19,
  'no': 20,
  'of': 21,
  'two': 22,
  'i': 23,
  'size': 24,
  'sprite': 25,
  'pepper': 26,
  'glaze': 27,
  'without': 28,
  'ice': 29,
  '-': 30,
  'large': 31,
  'balsamic': 32,
  'peppers': 33,
  'ounce': 34,
  'pie': 35,
  'crust': 36,
  'tea': 37,
  'thin': 38,
  'sauce': 39,
  'ups': 40,
  'extra': 41,
  'diet': 42,
  'green': 43,
  'seven': 44,
  'medium': 45,
  'also': 46,
  'personal': 47,
  'roasted': 48,
  'red': 49,
  'teas': 50,
  'ginger': 51,
  'pecorino': 52,
  'peperonni': 53,
  'cans': 54,
  'chicken': 55,
  'banana': 56,
  'need': 57,
  'fantas': 58,
  'little': 59,
  'ale': 60,
  'lunch': 61,
  'bottle': 62,
  'any': 63,
  '500': 64,
  'sprites': 65,
  '20': 66,
  'coke': 67,
  'can': 68,

In [12]:
# Display both target-vocabulary mappings in full.
tgt_word2idx, tgt_idx2word

({'<pad>': 0,
  '<sos>': 1,
  '<eos>': 2,
  '<unk>': 3,
  ')': 4,
  '(NUMBER': 5,
  '(TOPPING': 6,
  '(PIZZAORDER': 7,
  '(ORDER': 8,
  '(DRINKORDER': 9,
  '(DRINKTYPE': 10,
  'a': 11,
  '(SIZE': 12,
  'three': 13,
  '(NOT': 14,
  'cheese': 15,
  '(VOLUME': 16,
  'four': 17,
  'party': 18,
  'five': 19,
  'american': 20,
  'sized': 21,
  '(COMPLEX_TOPPING': 22,
  '(QUANTITY': 23,
  'one': 24,
  '(STYLE': 25,
  'two': 26,
  'size': 27,
  'sprite': 28,
  'pepper': 29,
  'glaze': 30,
  'ice': 31,
  '-': 32,
  'large': 33,
  'balsamic': 34,
  'peppers': 35,
  '(CONTAINERTYPE': 36,
  'ounce': 37,
  'crust': 38,
  'tea': 39,
  'thin': 40,
  'sauce': 41,
  'ups': 42,
  'extra': 43,
  'diet': 44,
  'green': 45,
  'seven': 46,
  'medium': 47,
  'personal': 48,
  'roasted': 49,
  'red': 50,
  'teas': 51,
  'of': 52,
  'ginger': 53,
  'pecorino': 54,
  'peperonni': 55,
  'cans': 56,
  'chicken': 57,
  'banana': 58,
  'fantas': 59,
  'little': 60,
  'ale': 61,
  'lunch': 62,
  'bottle': 63,
  '500

In [None]:
## Max lengths without preprocessing: (src, tgt) = (133, 335)

In [14]:
batch_size = 64
# Fixed lengths every source / target sequence is padded or truncated to.
max_src_len = 133
max_tgt_len = 335

# Split train into train/val (90% / 10%).
# The split is driven by a seeded generator so that re-running the notebook
# yields the same partition instead of a different random one each time.
split_seed = 42
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_data, val_data = torch.utils.data.random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

In [15]:

def _collate_pairs(batch):
    """Collate a batch of text pairs using the notebook's vocabularies and fixed lengths."""
    return collate_fn(batch, src_word2idx, tgt_word2idx, max_src_len, max_tgt_len)


# Training batches are reshuffled on each pass; validation batches keep dataset order.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=_collate_pairs,
)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=_collate_pairs,
)


In [16]:
# Model hyperparameters
# Embedding-table sizes follow the vocabularies built from the training data.
input_dim = len(src_word2idx)
output_dim = len(tgt_word2idx)
emb_dim = 256
hid_dim = 512
n_layers = 2
dropout = 0.5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Encoder and decoder share hid_dim and n_layers: the decoder is started from the encoder's final state.
encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.0005)
# Target positions equal to pad_idx are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [17]:

# Train for a fixed number of epochs, reporting losses and dev accuracy after each one.
n_epochs = 5
for epoch in range(n_epochs):
    # One pass over the training split, then the loss on the held-out validation split.
    train_loss = train_model(model, train_loader, optimizer, criterion)
    val_loss = evaluate_loss(model, val_loader, criterion)

    # Compute exact match accuracy on dev set
    exact_match = exact_match_accuracy(model, dev_dataset, src_word2idx, tgt_word2idx, tgt_idx2word, max_src_len)
    
    print(f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} | Dev Exact Match: {exact_match:.2%}")



                                                                           

KeyboardInterrupt: 

In [1]:
# Final evaluation on dev
# NOTE: relies on names defined in earlier cells (exact_match_accuracy, model, dev_dataset
# and the vocabularies); run those cells first in the same kernel session.
exact_match_final = exact_match_accuracy(model, dev_dataset, src_word2idx, tgt_word2idx, tgt_idx2word, max_src_len)
print("Final Dev Exact Match Accuracy:", exact_match_final)

NameError: name 'exact_match_accuracy' is not defined