In [1]:
import json
import re
from collections import defaultdict

ENTITY_KEYS = {
    "NUMBER", "SIZE", "STYLE", "TOPPING", "COMPLEX_TOPPING", "QUANTITY",
    "VOLUME", "DRINKTYPE", "CONTAINERTYPE"
}
ORDER_KEYS = {"PIZZAORDER", "DRINKORDER"}

def tokenize(s):
    tokens = re.findall(r'\(|\)|[^\s()]+', s)
    return tokens

def parse_tokens(tokens):
    stack = []
    current_list = []
    for token in tokens:
        if token == '(':
            stack.append(current_list)
            current_list = []
        elif token == ')':
            finished = current_list
            current_list = stack.pop()
            current_list.append(finished)
        else:
            current_list.append(token)
    return current_list
def extract_orders(structure, order_index=1):

    results = []

    if not isinstance(structure, list) or len(structure) == 0:
        return results, order_index

    first = structure[0]
    if isinstance(first, list):
        for elem in structure:
            sub_results, order_index = extract_orders(elem, order_index)
            results.extend(sub_results)
        return results, order_index

    if isinstance(first, str) and first in ORDER_KEYS:
        order_type = "PIZZAORDER" if first == "PIZZAORDER" else "DRINKORDER"
        current_order_sequence = order_index
        order_index += 1
        content_tokens = []
        for elem in structure[1:]:
            content_tokens.extend(collect_tokens(elem))
        for tok in content_tokens:
            results.append((tok, order_type, current_order_sequence))

        return results, order_index
    else:
        for elem in structure:
            sub_results, order_index = extract_orders(elem, order_index)
            results.extend(sub_results)
        return results, order_index
def collect_tokens(node):
    collected = []
    if isinstance(node, list):
        for sub in node:
            sub_tokens = collect_tokens(sub)
            collected.extend(sub_tokens)
    else:
        if node not in ["(", ")"] and not is_structural_key(node):
            collected.append(node)
    return collected

def is_structural_key(token):
    return token in [
        "ORDER","PIZZAORDER","DRINKORDER","NUMBER","SIZE","STYLE","TOPPING",
        "COMPLEX_TOPPING","QUANTITY","VOLUME","DRINKTYPE","CONTAINERTYPE","NOT"
    ]


[('a', 'PIZZAORDER', 1), ('medium', 'PIZZAORDER', 1), ('ham', 'PIZZAORDER', 1), ('and', 'PIZZAORDER', 1), ('pineapple', 'PIZZAORDER', 1), ('pizza', 'PIZZAORDER', 1), ('a', 'DRINKORDER', 2), ('small', 'DRINKORDER', 2), ('iced', 'DRINKORDER', 2), ('tea', 'DRINKORDER', 2)]


In [4]:

def label_input(input_text, top):
    tokens = tokenize(top)
    parsed = parse_tokens(tokens)

    order_info, _ = extract_orders(parsed)
    label_dict = defaultdict(list)
    for tok, lbl, num in order_info:
        label_dict[tok.lower()].append((lbl, num))

    input_tokens = input_text.split()
    labeled_input = []

    used_labels = defaultdict(int)

    for token in input_tokens:
        token_lower = token.lower()
        if token_lower in label_dict:

            label_index = used_labels[token_lower]
            if label_index < len(label_dict[token_lower]):
                token_label, sequence_number = label_dict[token_lower][label_index]
                used_labels[token_lower] += 1 
            else:
                token_label, sequence_number = 'O', None  
        else:
            token_label, sequence_number = 'O', None  

        labeled_input.append((token, token_label, sequence_number))
    return labeled_input

top = "(ORDER i need (PIZZAORDER (NUMBER a ) (SIZE medium ) (TOPPING ham ) and (TOPPING pineapple ) pizza ) and (DRINKORDER (NUMBER a ) (VOLUME small ) (DRINKTYPE iced tea ) ) )"

input_text = "i need a medium ham and pineapple pizza and a small iced tea"
input_label_sequence = label_input(input_text, top)
print(input_label_sequence)

[('i', 'O', None), ('need', 'O', None), ('a', 'PIZZAORDER', 1), ('medium', 'PIZZAORDER', 1), ('ham', 'PIZZAORDER', 1), ('and', 'PIZZAORDER', 1), ('pineapple', 'PIZZAORDER', 1), ('pizza', 'PIZZAORDER', 1), ('and', 'O', None), ('a', 'DRINKORDER', 2), ('small', 'DRINKORDER', 2), ('iced', 'DRINKORDER', 2), ('tea', 'DRINKORDER', 2)]


In [7]:

def transform_to_labels(input_array):
    labeled_numbers = []

    for _, label, sequence in input_array:
        # Compute the numerical label
        if label == 'O' and sequence is None:
            numerical_label = 0  # Neutral/irrelevant
        elif label == 'PIZZAORDER':
            numerical_label = 10 + sequence  # Unique range for pizza orders
        elif label == 'DRINKORDER':
            numerical_label = 20 + sequence  # Unique range for drink orders
        else:
            numerical_label = 0  # Default fallback

        labeled_numbers.append(numerical_label)

    return labeled_numbers

transform_to_labels(input_label_sequence)

[0, 0, 11, 11, 11, 11, 11, 11, 0, 22, 22, 22, 22]

In [6]:
class PizzaDataset(Dataset):
    def __init__(self, jsonl_path,SRC_NAME, TOP_NAME, max_samples=None, max_src_len=128, max_tgt_len=256):
        self.src_texts = []
        self.tgt_texts = []
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len
        
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if max_samples is not None and i >= max_samples:
                    break
                entry = json.loads(line.strip())
                src = entry.get(SRC_NAME, "").strip()
                tgt = entry.get(TOP_NAME, "").strip()
                if src and tgt:
                    self.src_texts.append(src)
                    self.tgt_texts.append(tgt)
                    
        # Shuffle is not applied here for dev datasets.
    
    def __len__(self):
        return len(self.src_texts)
    
    def __getitem__(self, idx):
        return self.src_texts[idx], self.tgt_texts[idx]

In [7]:
train_jsonl_path = "../dataset/PIZZA_train.json"  
dev_jsonl_path = "../dataset/PIZZA_dev.json"      

train_dataset = PizzaDataset(train_jsonl_path,"train.SRC", "train.TOP-DECOUPLED", max_samples=2500000) 
dev_dataset = PizzaDataset(dev_jsonl_path,"dev.SRC", "dev.TOP", max_samples=10000)

In [8]:
train_dataset.__getitem__(1)

('large pie with green pepper and with extra peperonni',
 '(ORDER (PIZZAORDER (SIZE large ) (TOPPING green pepper ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) )')

In [9]:
len(train_dataset), len(dev_dataset)

(2456446, 348)

In [10]:
# Build vocabularies from train data
src_word2idx, src_idx2word = build_vocab(train_dataset.src_texts, min_freq=1)
tgt_word2idx, tgt_idx2word = build_vocab(train_dataset.tgt_texts, min_freq=1)

pad_idx = tgt_word2idx["<pad>"]

In [11]:
src_word2idx, src_idx2word

({'<pad>': 0,
  '<sos>': 1,
  '<eos>': 2,
  '<unk>': 3,
  'and': 4,
  'with': 5,
  'a': 6,
  'three': 7,
  'pizzas': 8,
  'pizza': 9,
  "i'd": 10,
  'like': 11,
  'cheese': 12,
  'four': 13,
  'pies': 14,
  'party': 15,
  'five': 16,
  'american': 17,
  'sized': 18,
  'one': 19,
  'no': 20,
  'of': 21,
  'two': 22,
  'i': 23,
  'size': 24,
  'sprite': 25,
  'pepper': 26,
  'glaze': 27,
  'without': 28,
  'ice': 29,
  '-': 30,
  'large': 31,
  'balsamic': 32,
  'peppers': 33,
  'ounce': 34,
  'pie': 35,
  'crust': 36,
  'tea': 37,
  'thin': 38,
  'sauce': 39,
  'ups': 40,
  'extra': 41,
  'diet': 42,
  'green': 43,
  'seven': 44,
  'medium': 45,
  'also': 46,
  'personal': 47,
  'roasted': 48,
  'red': 49,
  'teas': 50,
  'ginger': 51,
  'pecorino': 52,
  'peperonni': 53,
  'cans': 54,
  'chicken': 55,
  'banana': 56,
  'need': 57,
  'fantas': 58,
  'little': 59,
  'ale': 60,
  'lunch': 61,
  'bottle': 62,
  'any': 63,
  '500': 64,
  'sprites': 65,
  '20': 66,
  'coke': 67,
  'can': 68,

In [12]:
tgt_word2idx, tgt_idx2word

({'<pad>': 0,
  '<sos>': 1,
  '<eos>': 2,
  '<unk>': 3,
  ')': 4,
  '(NUMBER': 5,
  '(TOPPING': 6,
  '(PIZZAORDER': 7,
  '(ORDER': 8,
  '(DRINKORDER': 9,
  '(DRINKTYPE': 10,
  'a': 11,
  '(SIZE': 12,
  'three': 13,
  '(NOT': 14,
  'cheese': 15,
  '(VOLUME': 16,
  'four': 17,
  'party': 18,
  'five': 19,
  'american': 20,
  'sized': 21,
  '(COMPLEX_TOPPING': 22,
  '(QUANTITY': 23,
  'one': 24,
  '(STYLE': 25,
  'two': 26,
  'size': 27,
  'sprite': 28,
  'pepper': 29,
  'glaze': 30,
  'ice': 31,
  '-': 32,
  'large': 33,
  'balsamic': 34,
  'peppers': 35,
  '(CONTAINERTYPE': 36,
  'ounce': 37,
  'crust': 38,
  'tea': 39,
  'thin': 40,
  'sauce': 41,
  'ups': 42,
  'extra': 43,
  'diet': 44,
  'green': 45,
  'seven': 46,
  'medium': 47,
  'personal': 48,
  'roasted': 49,
  'red': 50,
  'teas': 51,
  'of': 52,
  'ginger': 53,
  'pecorino': 54,
  'peperonni': 55,
  'cans': 56,
  'chicken': 57,
  'banana': 58,
  'fantas': 59,
  'little': 60,
  'ale': 61,
  'lunch': 62,
  'bottle': 63,
  '500

In [None]:
## max lenght without preprocessing (133, 335)

In [14]:
batch_size = 64
max_src_len = 133
max_tgt_len = 335

# Split train into train/val
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_data, val_data = torch.utils.data.random_split(train_dataset, [train_size, val_size])

In [15]:

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, 
                            collate_fn=lambda b: collate_fn(b, src_word2idx, tgt_word2idx, max_src_len, max_tgt_len))
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, 
                        collate_fn=lambda b: collate_fn(b, src_word2idx, tgt_word2idx, max_src_len, max_tgt_len))


In [16]:
# Model hyperparameters
input_dim = len(src_word2idx)
output_dim = len(tgt_word2idx)
emb_dim = 256
hid_dim = 512
n_layers = 2
dropout = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [17]:

n_epochs = 5
for epoch in range(n_epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    val_loss = evaluate_loss(model, val_loader, criterion)

    # Compute exact match accuracy on dev set
    exact_match = exact_match_accuracy(model, dev_dataset, src_word2idx, tgt_word2idx, tgt_idx2word, max_src_len)
    
    print(f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} | Dev Exact Match: {exact_match:.2%}")



                                                                           

KeyboardInterrupt: 

In [1]:
# Final evaluation on dev
exact_match_final = exact_match_accuracy(model, dev_dataset, src_word2idx, tgt_word2idx, tgt_idx2word, max_src_len)
print("Final Dev Exact Match Accuracy:", exact_match_final)

NameError: name 'exact_match_accuracy' is not defined