In [4]:
import json
import re
from collections import defaultdict

# Tag names used for entity-level nodes in the bracketed annotation.
# NOTE(review): not referenced by any function visible in this notebook.
ENTITY_KEYS = {
    "NUMBER", "SIZE", "STYLE", "TOPPING", "COMPLEX_TOPPING", "QUANTITY",
    "VOLUME", "DRINKTYPE", "CONTAINERTYPE"
}
# Tag names that open an order sub-tree; extract_orders numbers these nodes.
ORDER_KEYS = {"PIZZAORDER", "DRINKORDER"}

def tokenize(s):
    """Split a bracketed annotation string into a flat list of tokens.

    Every "(" and ")" becomes its own token; any other maximal run of
    characters that is neither whitespace nor a parenthesis is one token.
    """
    token_pattern = re.compile(r'\(|\)|[^\s()]+')
    return token_pattern.findall(s)

def parse_tokens(tokens):
    """Turn a flat token stream into a nested-list tree.

    Each "(" opens a new child list and each ")" closes it and appends it to
    its parent; every other token is appended to the list currently open.

    Groups that are still open when the tokens run out are closed
    implicitly, so the enclosing content is kept instead of being silently
    discarded.

    Args:
        tokens: iterable of string tokens, e.g. the result of `tokenize`.

    Returns:
        The top-level list (strings and nested lists).

    Raises:
        IndexError: if a ")" appears without a matching "(".
    """
    stack = []
    current_list = []
    for position, token in enumerate(tokens):
        if token == '(':
            stack.append(current_list)
            current_list = []
        elif token == ')':
            if not stack:
                # Same exception type as the bare pop() would raise, but
                # with a message that points at the offending token.
                raise IndexError(f"unbalanced ')' at token {position}")
            finished = current_list
            current_list = stack.pop()
            current_list.append(finished)
        else:
            current_list.append(token)

    # Close any groups left open by truncated input.
    while stack:
        finished = current_list
        current_list = stack.pop()
        current_list.append(finished)
    return current_list
def extract_orders(structure, order_index=1):
    """Collect the words inside every order node of a parsed tree.

    Walks `structure` depth-first. A list whose first element is one of
    ORDER_KEYS is an order node: all words below it (as returned by
    `collect_tokens`) are emitted as `(word, order_type, sequence)` and the
    running order counter is advanced by one. Any other list is only
    searched for order nodes; non-list or empty input yields nothing.

    Returns:
        (list of (word, order_type, sequence) tuples, next unused order index)
    """
    if not isinstance(structure, list) or not structure:
        return [], order_index

    head = structure[0]
    if isinstance(head, str) and head in ORDER_KEYS:
        kind = "PIZZAORDER" if head == "PIZZAORDER" else "DRINKORDER"
        sequence = order_index
        labelled = [
            (word, kind, sequence)
            for child in structure[1:]
            for word in collect_tokens(child)
        ]
        return labelled, order_index + 1

    # Not an order node itself: look for orders in every element.
    gathered = []
    for child in structure:
        found, order_index = extract_orders(child, order_index)
        gathered += found
    return gathered, order_index
def collect_tokens(node):
    """Flatten a parsed sub-tree into its plain words, in order.

    Parentheses and tokens for which `is_structural_key` is true are left
    out; nested lists are flattened recursively.
    """
    if not isinstance(node, list):
        is_word = node not in ["(", ")"] and not is_structural_key(node)
        return [node] if is_word else []

    words = []
    for child in node:
        words += collect_tokens(child)
    return words

def is_structural_key(token):
    """Return True when `token` is one of the annotation tag names below."""
    structural_tags = (
        "ORDER", "PIZZAORDER", "DRINKORDER",
        "NUMBER", "SIZE", "STYLE", "TOPPING", "COMPLEX_TOPPING", "QUANTITY",
        "VOLUME", "DRINKTYPE", "CONTAINERTYPE",
        "NOT",
    )
    return token in structural_tags


In [5]:
def map_tokens_to_labels(input_text, reference_list):
    """Align whitespace-split words of `input_text` with labelled references.

    `reference_list` holds `(word, label, sequence)` tuples in order. The
    input is scanned left to right with a single cursor into that list: a
    word that equals the reference under the cursor (case-insensitively)
    takes its label and sequence and advances the cursor; any other word is
    emitted as `(word, 'O', None)`.
    """
    cursor = 0
    labelled = []
    for word in input_text.split():
        if cursor < len(reference_list):
            ref_word, ref_label, ref_seq = reference_list[cursor]
            if word.lower() == ref_word.lower():
                labelled.append((word, ref_label, ref_seq))
                cursor += 1
                continue
        # No match, or references exhausted.
        labelled.append((word, 'O', None))
    return labelled

def label_input(input_text, top):
    """Label each word of `input_text` using the bracketed annotation `top`.

    The annotation is tokenized and parsed, its order nodes are extracted,
    and the resulting references are aligned with the words of `input_text`.
    Returns a list of `(word, label, sequence)` tuples.
    """
    tree = parse_tokens(tokenize(top))
    references = extract_orders(tree)[0]
    return map_tokens_to_labels(input_text, references)

# Worked example: a bracketed annotation ...
top = "(ORDER i need (PIZZAORDER (NUMBER a ) (SIZE medium ) (TOPPING ham ) and (TOPPING pineapple ) pizza ) and (DRINKORDER (NUMBER a ) (VOLUME small ) (DRINKTYPE iced tea ) ) )"

# ... and the plain utterance it annotates.
input_text = "i need a medium ham and pineapple pizza and a small iced tea"
# One (word, label, order-sequence) tuple per word of input_text.
input_label_sequence = label_input(input_text, top)
print(input_label_sequence)

[('i', 'O', None), ('need', 'O', None), ('a', 'PIZZAORDER', 1), ('medium', 'PIZZAORDER', 1), ('ham', 'PIZZAORDER', 1), ('and', 'PIZZAORDER', 1), ('pineapple', 'PIZZAORDER', 1), ('pizza', 'PIZZAORDER', 1), ('and', 'O', None), ('a', 'DRINKORDER', 2), ('small', 'DRINKORDER', 2), ('iced', 'DRINKORDER', 2), ('tea', 'DRINKORDER', 2)]


In [6]:

def transform_to_labels(input_array):
    """Convert `(word, label, sequence)` tuples into integer label ids.

    'PIZZAORDER' maps to `10 + sequence`, 'DRINKORDER' to `20 + sequence`,
    and every other label (including 'O') to 0.

    NOTE: the two ranges overlap once a pizza order has sequence >= 10.
    """
    return [
        10 + sequence if label == 'PIZZAORDER'
        else 20 + sequence if label == 'DRINKORDER'
        else 0
        for _, label, sequence in input_array
    ]

transform_to_labels(input_label_sequence)

[0, 0, 11, 11, 11, 11, 11, 11, 0, 22, 22, 22, 22]

In [10]:
def create_training_data(input_file: str, output_file: str,
                         src_key: str = "train.SRC", top_key: str = "train.TOP"):
    """Convert a JSON-lines annotation file into JSON-lines training data.

    Each input line is a JSON object holding the utterance under `src_key`
    and its bracketed annotation under `top_key`. For every record one line
    `{"text": <utterance>, "labels": [<int>, ...]}` is written, where the
    labels come from `label_input` followed by `transform_to_labels`.

    Args:
        input_file: path of the JSON-lines file to read.
        output_file: path of the JSON-lines file to write (overwritten).
        src_key: record key holding the utterance text.
        top_key: record key holding the bracketed annotation.
            The key defaults keep the original training-split behaviour;
            pass other keys to process a split that names its fields
            differently.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            record = json.loads(line)
            src = record[src_key]
            top = record[top_key]

            labeled_input = label_input(src, top)
            numerical_labels = transform_to_labels(labeled_input)

            training_instance = {
                "text": src,
                "labels": numerical_labels
            }

            outfile.write(json.dumps(training_instance) + "\n")

# File paths
input_file = "../dataset/PIZZA_train.json"
# Must be the file the loading cell reads back as `train_path`; the previous
# name ("train_data_model1.json") was never read by this notebook.
output_file = "../dataset/training_data_model1.json"

# Generate the training data
create_training_data(input_file, output_file)

In [1]:
import pandas as pd
# JSON-lines files with one {"text": ..., "labels": [...]} record per line.
train_path = '../dataset/training_data_model1.json'
# NOTE(review): no visible cell writes dev_path; presumably produced by a
# separate run of create_training_data on the dev split - confirm.
dev_path = "../dataset/dev_data_model1.json"
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(dev_path, lines=True)
df.describe()

Unnamed: 0,text,labels
count,2456446,2456446
unique,2456446,1929
top,can i have a large bbq pulled pork,"[0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,..."
freq,1,110156


In [2]:
dev.describe()

Unnamed: 0,text,labels
count,348,348
unique,348,176
top,i want to order two medium pizzas with sausage...,"[0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11]"
freq,1,24


In [3]:
# Inputs are the raw utterance strings; targets are the per-word label id lists.
# The dev frame is used as the held-out ("test") set throughout the notebook.
X_train = df['text']
y_train = df['labels']
X_test = dev['text']
y_test = dev['labels']

In [4]:
X_train[0], y_train[0]

('can i have a large bbq pulled pork', [0, 0, 0, 11, 11, 11, 11, 11])

In [5]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Module-level token -> id table, extended in place by build_vocab.
# NOTE(review): ids start at 30, presumably to stay clear of the label ids
# produced by transform_to_labels - confirm.
vocab = {"<PAD>": 30, "<SOS>": 31, "<EOS>": 32, "<UNK>": 33}  # Special tokens

def tokenize_output(output):
    """
    Splits a string into tokens.

    Each parenthesis is its own token, each run of word characters is a
    token, and any other run of characters that is neither whitespace nor a
    parenthesis is a token.

    Example:
        Input: "(ORDER (NUMBER a ) )"
        Output: ["(", "ORDER", "(", "NUMBER", "a", ")", ")"]
        Input: "i'd like"
        Output: ["i", "'d", "like"]
    """
    token_pattern = re.compile(r"\(|\)|\w+|[^\s()]+")
    return token_pattern.findall(output)

def build_vocab(outputs, index, target_vocab=None):
    """
    Adds every unseen token of `outputs` to a vocabulary and returns it.

    Args:
        outputs: iterable of strings; each is split with `tokenize_output`.
        index: first id the caller wants to hand out to new tokens.
        target_vocab: token -> id dict to extend in place. When omitted the
            module-level `vocab` is extended, as before.

    Returns:
        (vocabulary, next_free_id)

    New ids never start below `max(existing ids) + 1`, so calling this again
    (for example when a notebook cell is re-run) with the same `index`
    cannot assign an id that is already in use.
    """
    table = vocab if target_vocab is None else target_vocab
    next_id = index
    if table:
        next_id = max(next_id, max(table.values()) + 1)
    for output in outputs:
        for token in tokenize_output(output):
            if token not in table:
                table[token] = next_id
                next_id += 1
    return table, next_id
def encode_outputs(outputs, vocab):
    """Encode each string of `outputs` as a list of vocabulary ids.

    Strings are split with `tokenize_output`; a token missing from `vocab`
    is encoded as the id of "<UNK>" (or 0 when "<UNK>" is missing too).
    No start/end markers are added.
    """
    encoded = []
    for text in outputs:
        ids = []
        for token in tokenize_output(text):
            ids.append(vocab.get(token, vocab.get("<UNK>", 0)))
        encoded.append(ids)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads every sequence at the end up to `max_len` items.

    The fill value is the "<PAD>" id of the module-level `vocab`.
    NOTE(review): `truncating` is not passed, so the Keras default applies to
    sequences longer than `max_len` (documented default is 'pre', i.e. the
    leading items are dropped) - confirm that is intended.
    """
    pad_id = vocab["<PAD>"]
    padded = pad_sequences(sequences, maxlen=max_len, padding="post", value=pad_id)
    return padded

def decode_sequence(sequence, vocab):
    """
    Turns a sequence of ids back into a space-joined token string.

    Ids missing from `vocab` and the ids of "<SOS>", "<EOS>" and "<PAD>" are
    skipped. After joining, the space that follows an opening parenthesis
    is removed.
    """
    id_to_token = {idx: token for token, idx in vocab.items()}
    special_ids = None
    words = []
    for idx in sequence:
        if idx not in id_to_token:
            continue
        if special_ids is None:
            # Looked up lazily, only once a known id has been seen.
            special_ids = {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"]}
        if idx in special_ids:
            continue
        words.append(id_to_token[idx])
    text = " ".join(words)
    text = text.replace(" ( ", " (")
    return text.replace("( ", "(")

def decode_sequence_2(sequence, vocab):
    """
    Concatenates, without separators, the tokens of all positive ids.

    Ids that are zero or negative are skipped; positive ids missing from
    `vocab` contribute an empty string.
    """
    id_to_token = {idx: token for token, idx in vocab.items()}
    pieces = []
    for idx in sequence:
        if idx > 0:
            pieces.append(id_to_token.get(idx, ""))
    return "".join(pieces)



In [6]:


def prepare_data(
    X_train, y_train, X_test, y_test, max_len_1=20, max_len_2=20
):
    """Encode and pad the texts, and pad the label sequences.

    The vocabulary is built from `X_train` only, handing out ids from 34
    upwards. Both text sets are encoded with it and padded to `max_len_1`;
    both label sets are padded to `max_len_2`. Labels are padded by the same
    helper as the texts, so they receive the same fill value.

    Returns:
        (X_train_processed, X_test_processed,
         y_train_processed, y_test_processed, X_vocab)
    """
    first_new_id = 34
    X_vocab, _ = build_vocab(X_train, first_new_id)

    def _encode_and_pad(texts):
        return pad_sequences_to_fixed_length(encode_outputs(texts, X_vocab), max_len_1)

    train_inputs = _encode_and_pad(X_train)
    test_inputs = _encode_and_pad(X_test)
    train_labels = pad_sequences_to_fixed_length(y_train, max_len_2)
    test_labels = pad_sequences_to_fixed_length(y_test, max_len_2)

    return train_inputs, test_inputs, train_labels, test_labels, X_vocab


In [7]:
# Texts and labels are both padded to 40 positions. The returned vocabulary is
# bound to the notebook-level name `vocab` again.
X_train_processed, X_test_processed, y_train_processed, y_test_processed, vocab  = prepare_data( X_train, y_train, X_test, y_test, max_len_1=40, max_len_2=40)

In [8]:
X_train_processed

array([[34, 35, 36, ..., 30, 30, 30],
       [38, 42, 43, ..., 30, 30, 30],
       [35, 49, 50, ..., 30, 30, 30],
       ...,
       [35, 49, 50, ..., 30, 30, 30],
       [35, 49, 50, ..., 30, 30, 30],
       [35, 49, 50, ..., 30, 30, 30]])

In [9]:
y_train_processed[1]

array([11, 11, 11, 11, 11, 11, 11, 11, 11, 30, 30, 30, 30, 30, 30, 30, 30,
       30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
       30, 30, 30, 30, 30, 30])

In [10]:
vocab

{'<PAD>': 30,
 '<SOS>': 31,
 '<EOS>': 32,
 '<UNK>': 33,
 'can': 34,
 'i': 35,
 'have': 36,
 'a': 37,
 'large': 38,
 'bbq': 39,
 'pulled': 40,
 'pork': 41,
 'pie': 42,
 'with': 43,
 'green': 44,
 'pepper': 45,
 'and': 46,
 'extra': 47,
 'peperonni': 48,
 "'d": 49,
 'like': 50,
 'vegetarian': 51,
 'pizza': 52,
 'party': 53,
 'size': 54,
 'stuffed': 55,
 'crust': 56,
 'american': 57,
 'cheese': 58,
 'mushroom': 59,
 'one': 60,
 'personal': 61,
 'sized': 62,
 'artichoke': 63,
 'banana': 64,
 'peppperonis': 65,
 'low': 66,
 'fat': 67,
 'want': 68,
 'regular': 69,
 'without': 70,
 'any': 71,
 'fried': 72,
 'onions': 73,
 'little': 74,
 'bit': 75,
 'of': 76,
 'high': 77,
 'rise': 78,
 'dough': 79,
 'lot': 80,
 'olive': 81,
 'pesto': 82,
 'sauce': 83,
 'peperonis': 84,
 'yellow': 85,
 'meatball': 86,
 '-': 87,
 'bean': 88,
 'big': 89,
 'meat': 90,
 'mushrooms': 91,
 'pecorino': 92,
 'balsamic': 93,
 'glaze': 94,
 'black': 95,
 'chicken': 96,
 'mozzarella': 97,
 'italian': 98,
 'sausage': 99,
 

In [11]:
X_test_processed

array([[ 35,  68,  33, ...,  30,  30,  30],
       [244, 124, 237, ...,  30,  30,  30],
       [ 35, 104,  33, ...,  30,  30,  30],
       ...,
       [ 33,  37,  52, ...,  30,  30,  30],
       [ 33,  37,  33, ...,  30,  30,  30],
       [ 33,  35, 104, ...,  30,  30,  30]])

In [12]:
y_test_processed

array([[ 0,  0,  0, ..., 30, 30, 30],
       [11, 11, 11, ..., 30, 30, 30],
       [ 0,  0,  0, ..., 30, 30, 30],
       ...,
       [ 0, 11, 11, ..., 30, 30, 30],
       [ 0,  0,  0, ..., 30, 30, 30],
       [ 0, 11,  0, ..., 30, 30, 30]])

In [13]:
X_test_processed.shape, y_test_processed.shape


((348, 40), (348, 40))

In [14]:
X_train_processed.shape, y_train_processed.shape

((2456446, 40), (2456446, 40))

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    """Dataset pairing input id sequences with target label sequences.

    Both arguments are converted to `torch.long` tensors once at
    construction; items are dicts with the keys "src_input_ids" and
    "tgt_input_ids".
    """

    def __init__(self, inputs, targets):
        as_long = torch.long
        self.inputs = torch.tensor(inputs, dtype=as_long)
        self.targets = torch.tensor(targets, dtype=as_long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return dict(
            src_input_ids=self.inputs[idx],
            tgt_input_ids=self.targets[idx],
        )



# Wrap the padded arrays; the dev-derived arrays serve as the test set.
train_dataset = SequenceDataset(X_train_processed, y_train_processed)
test_dataset = SequenceDataset(X_test_processed, y_test_processed)

batch_size = 128  # Adjust to the available GPU memory
# Only the training data is shuffled; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BatchNorm1d
class BiLSTMModel(nn.Module):
    """Embedding -> stacked bidirectional LSTM -> per-position linear layer.

    Args:
        input_dim: size of the embedding table (largest token id + 1).
        embedding_dim: embedding width.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of scores produced for every position.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers. It is only passed on when
            `num_layers > 1`; PyTorch applies it between layers only and
            warns when it is non-zero for a single layer.
        padding_idx: token id whose embedding stays zero. When omitted, the
            "<PAD>" id of the module-level `vocab` is used, as before.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 num_layers=3, dropout=0.5, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = vocab["<PAD>"]
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=padding_idx)
        self.bilstm_1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        # Forward and backward hidden states are concatenated.
        self.fc2 = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return unnormalised scores with `output_dim` entries per position."""
        embedded = self.embedding(x)

        lstm_out, _ = self.bilstm_1(embedded)

        output = self.fc2(lstm_out)
        return output


In [17]:
len(vocab)

307

In [18]:
max(vocab.values())

336

In [31]:
# The embedding table must cover every id in the vocabulary (largest id + 1).
input_dim = max(vocab.values()) + 1
embedding_dim = 128
hidden_dim = 128
# Number of classes = largest label id in the padded targets + 1.
# It was previously set to the padded sequence length, which only worked
# because that length happened to exceed the largest label id.
# Checkpoints saved with the old head size will not load into this model.
output_dim = int(max(y_train_processed.max(), y_test_processed.max())) + 1
num_layers = 2
dropout = 0

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)

In [32]:
def evaluate_model_with_accuracy(model, dataloader, criterion, device):
    """Compute the mean batch loss and token-level accuracy over a dataloader.

    Args:
        model: module mapping a batch of token ids to per-position scores.
        dataloader: iterable of dicts with "src_input_ids" and "tgt_input_ids".
        criterion: loss taking (scores, targets). If it has an `ignore_index`
            attribute (as `nn.CrossEntropyLoss` does), target positions equal
            to it are excluded from the accuracy too, so loss and accuracy
            are measured on the same positions. With the default
            `ignore_index` no label matches and every position counts.
        device: device the batches are moved to.

    Returns:
        (average loss per evaluated batch, fraction of correct positions).
        Both are 0 when nothing could be evaluated.
    """
    model.eval()
    ignore_index = getattr(criterion, "ignore_index", None)
    loss_sum = 0.0
    evaluated_batches = 0
    total_tokens = 0
    correct_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            scores = model(src)
            # Flatten to (batch_size * seq_len, n_classes) / (batch_size * seq_len,)
            scores = scores.reshape(-1, scores.shape[-1])
            tgt = tgt.reshape(-1)

            if ignore_index is None:
                counted = torch.ones_like(tgt, dtype=torch.bool)
            else:
                counted = tgt != ignore_index
            n_counted = int(counted.sum().item())
            if n_counted == 0:
                # Every target is ignored: the loss would be undefined.
                continue

            loss_sum += criterion(scores, tgt).item()
            evaluated_batches += 1

            predictions = scores.argmax(dim=1)
            correct_tokens += int(((predictions == tgt) & counted).sum().item())
            total_tokens += n_counted

    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    average_loss = loss_sum / evaluated_batches if evaluated_batches > 0 else 0.0
    return average_loss, accuracy


In [33]:
import torch.optim as optim
from tqdm import tqdm

# Loss is averaged over every position of every sequence; no ignore_index is
# set, so positions holding the padding label contribute as well.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(20):  # Number of epochs
    model.train()
    epoch_loss = 0
    total_batches = len(train_dataloader)
    progress_bar = tqdm(train_dataloader, desc="Training Progress", unit="batch", leave=True)

    for batch_idx, batch in enumerate(progress_bar):
        src = batch["src_input_ids"].to(device)  # Input tokens
        tgt = batch["tgt_input_ids"].to(device)  # Target tokens

        optimizer.zero_grad()
        output = model(src)  # Forward pass
        # This runs at notebook level, so it rebinds the global `output_dim`
        # that the model-construction cells also read.
        output_dim = output.shape[-1]

        # Flatten outputs and targets for loss computation
        output = output.view(-1, output_dim)  # Shape: (batch_size * seq_len, output_dim)
        tgt = tgt.view(-1)  # Shape: (batch_size * seq_len)

        # NOTE(review): padded positions are not masked out here. A mask of
        # `tgt != 0` would be wrong: 0 is the label of words outside any
        # order, while padding uses the "<PAD>" id.

        # Compute loss
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        # Running mean of the batch losses seen so far in this epoch.
        epoch_loss += loss.item()
        avg_loss = epoch_loss / (batch_idx + 1)
        progress_bar.set_description(f"Training Progress: Batch {batch_idx + 1}/{total_batches}, Avg Loss: {avg_loss:.8f}")

    # Validate on the held-out loader after every epoch.
    val_loss, accuracy = evaluate_model_with_accuracy(model, test_dataloader, criterion, device)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader):.8f}, Val Loss: {val_loss:.8f}, Accuracy: {accuracy * 100:.4f}%")


Training Progress: Batch 19191/19191, Avg Loss: 0.00476997: 100%|██████████| 19191/19191 [04:30<00:00, 70.93batch/s]


Epoch 1, Loss: 0.00476997, Val Loss: 4.53896872, Accuracy: 25.9339%


Training Progress: Batch 19191/19191, Avg Loss: 0.00001605: 100%|██████████| 19191/19191 [04:30<00:00, 70.99batch/s]


Epoch 2, Loss: 0.00001605, Val Loss: 3.95911344, Accuracy: 27.1480%


Training Progress: Batch 862/19191, Avg Loss: 0.00000054:   4%|▍         | 862/19191 [00:12<04:26, 68.73batch/s]


KeyboardInterrupt: 

In [38]:
def save_model(model, path):
    """Write the model's state_dict to `path` and print a confirmation."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")

def load_model(model, path):
    """Load a saved state_dict from `path` into `model` and return the model.

    The stored tensors are mapped onto the device of the model's first
    parameter, so a checkpoint written in a GPU session can also be restored
    into a model that lives on the CPU. A model without parameters falls
    back to torch.load's default placement.
    """
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(path, map_location=map_location)
    model.load_state_dict(state_dict)
    print(f"Model loaded from {path}")
    return model

In [39]:
save_model(model, "../weights/Bilstm_order_sequence.pt")

Model saved to ../weights/Bilstm_order_sequence.pt


In [37]:
# Build a fresh model with the notebook-level hyper-parameters, then restore
# the saved weights into it. The architecture must match the checkpoint.
model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)
model = load_model(model,"../weights/Bilstm_order_sequence.pt")

Model loaded from ../weights/Bilstm_order_sequence.pt


In [46]:
# Decode the *input* ids of the first test example back to text.
# Despite its name, `pred_sequence` holds the input text, not a prediction.
sequence = test_dataset.__getitem__(0)["src_input_ids"].cpu().tolist()
pred_sequence= decode_sequence( sequence,vocab)
pred_sequence

'i want <UNK> <UNK> two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage'

In [47]:

# Predict the label sequence of the first test example.
model.eval()  # inference mode for dropout layers
with torch.no_grad():
    example_ids = test_dataset[0]["src_input_ids"].to(device)
    # Add an explicit batch dimension for the batch_first LSTM and remove it
    # again, so `output` keeps its (seq_len, n_classes) shape.
    output = model(example_ids.unsqueeze(0)).squeeze(0)
    predictions = output.argmax(dim=1)
predictions

tensor([ 0,  0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12,  0, 12,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')

In [48]:
# Padded target labels of the first test example, for comparison.
tgt = test_dataset[0]["tgt_input_ids"].to(device)
tgt

tensor([ 0,  0,  0,  0, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
        13, 12, 12,  0, 13, 13, 13, 13, 13,  0, 13,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')

In [43]:
predictions == tgt

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True], device='cuda:0')

In [44]:
def evaluate_model_with_sequence_accuracy(model, dataloader, device, verbose=True):
    """Return the percentage of sequences predicted correctly at every position.

    A sequence counts as correct only when the arg-max prediction equals the
    target at all positions of the padded row.

    Args:
        model: module mapping a batch of token ids to per-position scores.
        dataloader: iterable of dicts with "src_input_ids" and "tgt_input_ids".
        device: device the batches are moved to.
        verbose: when True (the default, as before) each mismatching
            prediction/target pair is printed. Pass False on large sets to
            keep the output readable.

    Returns:
        Sequence accuracy in percent (0 when the dataloader is empty).
    """
    model.eval()
    total_sequences = 0
    correct_sequences = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            predicted = model(src).argmax(dim=-1)

            # One boolean per sequence: True when every position matches.
            row_matches = (predicted == tgt).reshape(tgt.shape[0], -1).all(dim=1)
            correct_sequences += int(row_matches.sum().item())
            total_sequences += int(row_matches.numel())

            if verbose:
                for pred, tgt_seq, is_match in zip(predicted, tgt, row_matches.tolist()):
                    if not is_match:
                        print("Predicted:", pred)
                        print("Target:", tgt_seq)

    print(f"Correct {correct_sequences}, Total {total_sequences}")
    sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
    return sequence_accuracy * 100

In [45]:
evaluate_model_with_sequence_accuracy(model, test_dataloader, device)

Predicted: tensor([ 0,  0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12,  0, 12,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')
Target: tensor([ 0,  0,  0,  0, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
        13, 12, 12,  0, 13, 13, 13, 13, 13,  0, 13,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')
Predicted: tensor([ 0,  0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')
Target: tensor([ 0,  0,  0,  0, 11, 11, 11, 11, 11, 11, 11, 11,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')
Predicted: tensor([ 0,  0, 11, 11, 11, 11, 11, 11, 11, 11,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

21.839080459770116