In [69]:
import re

ORDER_KEYS = {"PIZZAORDER", "DRINKORDER"}

def tokenize(s):
    tokens = re.findall(r'\(|\)|[^\s()]+', s)
    return tokens

def tokenize2(s):
    # just split by space
    return s.split()


In [70]:
import pandas as pd
train_path = '../dataset3/PIZZA_train_model2.json'
dev_path = "../dataset2/PIZZA_dev_model2.json"
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(dev_path, lines=True)
df.describe()

Unnamed: 0,text,labels
count,2404558,2404558
unique,2330939,32433
top,two party sized pies with not many green peppers,"[21, 21, 5, 21, 21, 11, 21, 5, 21, 21, 11]"
freq,6,17495


In [71]:
dev.describe()

Unnamed: 0,text,labels
count,1357,1357
unique,1357,1000
top,i want a pizza with pesto and mushrooms but no...,"[21, 21, 5, 3, 11, 21, 11, 21, 21, 21, 13]"
freq,1,33


In [72]:
X_train = df['text']
y_train = df['labels']
X_test = dev['text']
y_test = dev['labels']

In [73]:
X_train[0], y_train[0]

('large pie with salami and with extra chorizo',
 [3, 21, 21, 11, 21, 21, 19, 11])

In [74]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define a mapping for entity keys to numerical labels
LABEL_MAP = {
    'B-DRINKTYPE': 1, 'I-DRINKTYPE': 2,
    'B-SIZE': 3, 'I-SIZE': 4,  # Treats SIZE and VOLUME as the same
    'B-NUMBER': 5, 'I-NUMBER': 6,
    'B-CONTAINERTYPE': 7, 'I-CONTAINERTYPE': 8,
    'B-COMPLEX_TOPPING': 9, 'I-COMPLEX_TOPPING': 10,
    'B-TOPPING': 11, 'I-TOPPING': 12,
    'B-NEG_TOPPING': 13, 'I-NEG_TOPPING': 14,
    'B-NEG_STYLE': 15, 'I-NEG_STYLE': 16,
    'B-STYLE': 17, 'I-STYLE': 18,
    'B-QUANTITY': 19, 'I-QUANTITY': 20,
    'O': 21
}
vocab = {"<PAD>": 0, "<UNK>": 1}

def tokenize_output(output):
    """
    Tokenizes the structured output into meaningful tokens.
    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )"
        Output: ["(ORDER", "(PIZZAORDER", "(NUMBER", "a", "(SIZE", "large", "(TOPPING", "bbq", "pulled", "pork", ")", ")", ")", ")"]
    """
    tokens = re.findall(r"\(|\)|\w+|[^\s()]+", output)
    return tokens

def build_vocab(input_strings, index):
    """
    Builds a vocabulary from tokenized strings.
    """
    i = index
    for input in input_strings:
        tokens = tokenize2(input)
        for token in tokens:
            if token not in vocab:
                vocab[token] = i
                i += 1
    return vocab, i
def encode_outputs(outputs, vocab):
    encoded = []
    for output in outputs:
        tokens = tokenize_output(output)  # Tokenize the output
        sequence = [vocab.get(token, vocab.get("<UNK>", 0)) for token in tokens] 
        encoded.append(sequence)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads sequences to a fixed length.
    """
    return pad_sequences(sequences, maxlen=max_len, padding="post", value=0)

def vocab_decode_sequence(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    tokens = [inv_vocab[idx] for idx in sequence if idx in inv_vocab and idx not in {vocab["<PAD>"]} ] 
    output = " ".join(tokens)
    output = output.replace(" ( ", " (").replace("( ", "(") #.replace(" )", ")")
    print(output)
    return output

def decode_sequence(sequence):
    """
    Decodes a sequence of integers back into their ENTITY_KEYS.
    """
    output = []
    # remove all padding tokens 
    sequence = [i for i in sequence if i != 0]
    for i in sequence:
        # use the LABEL_MAP to decode the integer
        output.append(list(LABEL_MAP.keys())[list(LABEL_MAP.values()).index(i)])
    return output

decode_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])



['B-DRINKTYPE',
 'I-DRINKTYPE',
 'B-SIZE',
 'I-SIZE',
 'B-NUMBER',
 'I-NUMBER',
 'B-CONTAINERTYPE',
 'I-CONTAINERTYPE',
 'B-COMPLEX_TOPPING',
 'I-COMPLEX_TOPPING',
 'B-TOPPING',
 'I-TOPPING']

In [75]:
def load_vocab():
    # loads the vocab from the text file "vocab.txt" and then swaps the key value pairs
    with open("../dataset2/vocab.txt", "r") as f:
        vocab = f.readlines()
    
    # Process the vocab lines
    processed_vocab = []
    for v in vocab:
        # Remove single quotes if there are no double quotes
        if '"' not in v:
            v = v.replace("'", "")
        # Remove double quotes and commas
        v = v.replace('"', "").replace(",", "")
        processed_vocab.append(v)
    
    # Convert to dictionary
    vocab_dict = {v.split(":")[0].strip(): int(v.split(":")[1].strip()) for v in processed_vocab}
    return vocab_dict

def prepare_data(
    X_train, y_train, X_test, y_test, max_len_1=20, max_len_2 = 20, rebuild_vocab=False
):

    index = 2
    if rebuild_vocab:
        X_vocab, index = build_vocab(X_train,index)  # Build vocabulary from training outputs
    else:
        # load the vocab from the file
        X_vocab = load_vocab()
    X_train_encoded = encode_outputs(X_train, X_vocab)  # Encode training outputs
    X_test_encoded = encode_outputs(X_test, X_vocab)  # Encode testing outputs
    X_train_processed = pad_sequences_to_fixed_length(X_train_encoded, max_len_1)
    X_test_processed = pad_sequences_to_fixed_length(X_test_encoded, max_len_1)

    y_train_processed = pad_sequences_to_fixed_length(y_train, max_len_2)
    y_test_processed = pad_sequences_to_fixed_length(y_test, max_len_2)


    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        X_vocab,
    )


In [76]:
X_train_processed, X_test_processed, y_train_processed, y_test_processed, vocab  = prepare_data( X_train, y_train, X_test, y_test, max_len_1=40, max_len_2=40)

In [77]:
print(vocab_decode_sequence(X_train_processed[3], vocab))
print(decode_sequence(y_train_processed[3]))

i want one regular pizza without any salami
i want one regular pizza without any salami
['O', 'O', 'B-NUMBER', 'B-SIZE', 'O', 'O', 'O', 'B-NEG_TOPPING']


In [78]:
y_train_processed[1]

array([ 3,  4, 17, 18, 21, 21, 11, 21, 21, 11,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0])

In [79]:
vocab

{'<PAD>': 0,
 '<UNK>': 1,
 'large': 2,
 'pie': 3,
 'with': 4,
 'green': 5,
 'pepper': 6,
 'and': 7,
 'extra': 8,
 'ham': 9,
 'party': 10,
 'size': 11,
 'stuffed': 12,
 'crust': 13,
 'artichokes': 14,
 'mushroom': 15,
 'i': 16,
 'want': 17,
 'one': 18,
 'regular': 19,
 'pizza': 20,
 'without': 21,
 'any': 22,
 'basil': 23,
 'a': 24,
 'american': 25,
 'cheese': 26,
 'little': 27,
 'bit': 28,
 'of': 29,
 'sausage': 30,
 "i'd": 31,
 'like': 32,
 'sized': 33,
 'high': 34,
 'rise': 35,
 'dough': 36,
 'lot': 37,
 'banana': 38,
 'chicken': 39,
 'black': 40,
 'olives': 41,
 'sauce': 42,
 'broccoli': 43,
 'peperonni': 44,
 'italian': 45,
 'can': 46,
 'have': 47,
 'flatbread': 48,
 'style': 49,
 'lunch': 50,
 '-': 51,
 'blue': 52,
 'need': 53,
 'caramelized': 54,
 'onions': 55,
 'combination': 56,
 'eggplant': 57,
 'pecorino': 58,
 'New': 59,
 'York': 60,
 'artichoke': 61,
 'spinach': 62,
 'Neapolitan': 63,
 'bacon': 64,
 'tofu': 65,
 'grilled': 66,
 'mozzarella': 67,
 'vegan': 68,
 'pepperoni': 

In [80]:
X_test_processed

array([[ 16,  17,  24, ...,   0,   0,   0],
       [ 16, 512,  32, ...,   0,   0,   0],
       [ 46,  16, 511, ...,   0,   0,   0],
       ...,
       [ 16,   1, 518, ...,   0,   0,   0],
       [ 16, 596,  18, ...,   0,   0,   0],
       [ 16,   1,  32, ...,   0,   0,   0]])

In [81]:
y_test_processed

array([[21, 21,  5, ...,  0,  0,  0],
       [21, 21, 21, ...,  0,  0,  0],
       [21, 21, 21, ...,  0,  0,  0],
       ...,
       [21, 21, 21, ...,  0,  0,  0],
       [21, 21,  5, ...,  0,  0,  0],
       [21, 21, 21, ...,  0,  0,  0]])

In [82]:
X_test_processed.shape, y_test_processed.shape


((1357, 40), (1357, 40))

In [83]:
X_train_processed.shape, y_train_processed.shape

((2404558, 40), (2404558, 40))

In [84]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = torch.tensor(inputs, dtype=torch.long) 
        self.targets = torch.tensor(targets, dtype=torch.long) 

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "src_input_ids": self.inputs[idx],
            "tgt_input_ids": self.targets[idx],
        }



train_dataset = SequenceDataset(X_train_processed, y_train_processed)
test_dataset = SequenceDataset(X_test_processed, y_test_processed)

batch_size = 512  # Adjust based GPU ;-;  memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [85]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BatchNorm1d
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers=3, dropout=0.3):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=0)
        self.lstm_dropout = nn.Dropout(p=0.25)
        # Bidirectional LSTM
        self.bilstm_1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )

        # Batch normalization
        #self.batchnorm_1 = BatchNorm1d(hidden_dim * 2)

        # Fully connected layers
        self.fc2 = nn.Linear(hidden_dim * 2, output_dim)


    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)

        # BiLSTM layer
        lstm_out, _ = self.bilstm_1(embedded)

        # # # Batch normalization
        # lstm_out = lstm_out.permute(0, 2, 1)
        # lstm_out = self.batchnorm_1(lstm_out)
        # lstm_out = lstm_out.permute(0, 2, 1)
        lstm_out = self.lstm_dropout(lstm_out)
        output = self.fc2(lstm_out)
        return output


In [86]:
len(vocab)

607

In [103]:

input_dim = len(vocab)
embedding_dim = 128
hidden_dim = 128  
output_dim = 22
num_layers = 2  
dropout = 0.3

device = torch.device("cuda") #"cuda" if torch.cuda.is_available() else
model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)

In [104]:
def evaluate_model_with_accuracy(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    total_tokens = 0
    correct_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            output = model(src)
            output_dim = output.shape[-1]

            # Flatten outputs and targets
            output = output.view(-1, output_dim)  # Shape: (batch_size * seq_len, output_dim)
            tgt = tgt.view(-1)  # Shape: (batch_size * seq_len)

            # Compute loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

            # Calculate accuracy
            predictions = output.argmax(dim=1)  # Get the index of the max log-probability
            valid_indices = tgt != 0
            correct_tokens += (predictions[valid_indices] == tgt[valid_indices]).sum().item()
            total_tokens += valid_indices.sum().item()

    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    return epoch_loss / len(dataloader), accuracy


In [105]:
import torch.optim as optim
from tqdm import tqdm

criterion = nn.CrossEntropyLoss(ignore_index=0)  # Use for multi-class classification ignore_index=0 for padding
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

for epoch in range(20):  # Number of epochs
    model.train()
    epoch_loss = 0
    total_batches = len(train_dataloader)
    progress_bar = tqdm(train_dataloader, desc="Training Progress", unit="batch", leave=True)

    for batch_idx, batch in enumerate(progress_bar):  # Assuming a DataLoader is used
        src = batch["src_input_ids"].to(device)  # Input tokens
        tgt = batch["tgt_input_ids"].to(device)  # Target tokens

        optimizer.zero_grad()
        output = model(src)  # Forward pass
        output_dim = output.shape[-1]

        # Flatten outputs and targets for loss computation
        output = output.view(-1, output_dim)  # Shape: (batch_size * seq_len, output_dim)
        tgt = tgt.view(-1)  # Shape: (batch_size * seq_len)

        # Compute loss
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        avg_loss = epoch_loss / (batch_idx + 1)
        progress_bar.set_description(f"Training Progress: Batch {batch_idx + 1}/{total_batches}, Avg Loss: {avg_loss:.8f}")

    val_loss, accuracy = evaluate_model_with_accuracy(model, test_dataloader, criterion, device)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader):.8f}, Val Loss: {val_loss:.8f}, Accuracy: {accuracy * 100:.4f}%")


Training Progress: Batch 4697/4697, Avg Loss: 0.02659801: 100%|██████████| 4697/4697 [11:39<00:00,  6.71batch/s]  


Epoch 1, Loss: 0.02659801, Val Loss: 0.74913645, Accuracy: 82.2337%


Training Progress: Batch 4697/4697, Avg Loss: 0.01092344: 100%|██████████| 4697/4697 [12:37<00:00,  6.20batch/s]


Epoch 2, Loss: 0.01092344, Val Loss: 0.63611684, Accuracy: 85.0417%


Training Progress: Batch 4697/4697, Avg Loss: 0.01036332: 100%|██████████| 4697/4697 [13:11<00:00,  5.93batch/s]


Epoch 3, Loss: 0.01036332, Val Loss: 0.63127462, Accuracy: 85.1842%


Training Progress: Batch 4697/4697, Avg Loss: 0.01009512: 100%|██████████| 4697/4697 [12:31<00:00,  6.25batch/s]


Epoch 4, Loss: 0.01009512, Val Loss: 0.69960964, Accuracy: 84.6247%


Training Progress: Batch 821/4697, Avg Loss: 0.00980073:  17%|█▋        | 821/4697 [02:01<09:35,  6.74batch/s]


KeyboardInterrupt: 

In [106]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    print(f"Model loaded from {path}")
    return model

In [107]:
save_model(model, "../weights/Bilstm_model2.pt")

Model saved to ../weights/Bilstm_model2.pt


### Testing Real Output sequence

In [108]:
test_dataset.__getitem__(1)["src_input_ids"]

tensor([ 16, 512,  32, 506, 513, 253, 123, 164, 247,   4,   8,  26,   7,  70,
        135,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [109]:

with torch.no_grad():

    output = model( test_dataset.__getitem__(1)["src_input_ids"].to(device))
    predictions = output.argmax(dim=1)
    # print the sentence from the test set
    print(X_test[1])
predictions

i would like to try two medium tuna pizzas with extra cheese and no pesto


tensor([21, 21, 21, 21, 21,  5,  3, 17, 21, 21, 11, 12, 21, 21, 13, 14, 14, 14,
        14, 14, 14, 14, 14, 14, 14, 14, 14, 21, 14, 14, 14, 14, 14, 14, 21, 14,
        14, 14, 14, 21], device='cuda:0')

In [110]:
tgt= test_dataset.__getitem__(1)["tgt_input_ids"].to(device)
tgt

tensor([21, 21, 21, 21, 21,  5,  3, 11, 21, 21, 19, 11, 21, 21, 13,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], device='cuda:0')

In [111]:

sequence = predictions.cpu().tolist()
pred_sequence= decode_sequence(sequence)
pred_sequence

['O',
 'O',
 'O',
 'O',
 'O',
 'B-NUMBER',
 'B-SIZE',
 'B-STYLE',
 'O',
 'O',
 'B-TOPPING',
 'I-TOPPING',
 'O',
 'O',
 'B-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'O',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'O',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'I-NEG_TOPPING',
 'O']

In [112]:
sequence = tgt.cpu().tolist()
tgt_sequence = decode_sequence(sequence)
tgt_sequence

['O',
 'O',
 'O',
 'O',
 'O',
 'B-NUMBER',
 'B-SIZE',
 'B-TOPPING',
 'O',
 'O',
 'B-QUANTITY',
 'B-TOPPING',
 'O',
 'O',
 'B-NEG_TOPPING']

In [113]:
pred_sequence == tgt_sequence

False

In [114]:
def evaluate_model_with_sequence_accuracy(model, dataloader, device):
    model.eval()
    total_sequences = 0
    correct_sequences = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            # Get model predictions
            output = model(src)
            predictions = output.argmax(dim=-1)  # Shape: (batch_size, seq_len)

            # Compare sequences, token-by-token
            for pred, tgt_seq in zip(predictions, tgt):
                valid_mask = tgt_seq != 0  # Ignore padding in comparison
                pred = pred[valid_mask]
                tgt_seq = tgt_seq[valid_mask]

                if torch.equal(pred, tgt_seq):  # Compare sequences
                    correct_sequences += 1
                else:
                    print(X_test[total_sequences])
                    print(f"Predicted: {pred}")
                    print(f"Target: {tgt_seq}")

                total_sequences += 1

    print(f"Correct {correct_sequences}, Total {total_sequences}")
    sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
    return sequence_accuracy * 100


In [115]:
evaluate_model_with_sequence_accuracy(model, test_dataloader, device)

i would like to try two medium tuna pizzas with extra cheese and no pesto
Predicted: tensor([21, 21, 21, 21, 21,  5,  3, 17, 21, 21, 11, 12, 21, 21, 13],
       device='cuda:0')
Target: tensor([21, 21, 21, 21, 21,  5,  3, 11, 21, 21, 19, 11, 21, 21, 13],
       device='cuda:0')
get me two pepsis a coke and five large fantas
Predicted: tensor([21, 21,  5,  1, 21,  1, 21,  5,  3,  1], device='cuda:0')
Target: tensor([21, 21,  5,  1,  5,  1, 21,  5,  3,  1], device='cuda:0')
i want one medium pizza along with sausage mushrooms but hold ham please
Predicted: tensor([21, 21,  5,  3, 21, 21, 21, 11, 12, 12, 21, 21, 21], device='cuda:0')
Target: tensor([21, 21,  5,  3, 21, 21, 21, 11, 11, 21, 21, 13, 21], device='cuda:0')
i'll go for one pepsi six large diet cokes and a medium fanta
Predicted: tensor([21, 21, 21, 21,  5, 21, 21,  1,  2, 21,  5,  3,  1], device='cuda:0')
Target: tensor([21, 21, 21,  5,  1,  5,  3,  1,  2, 21,  5,  3,  1], device='cuda:0')
i would like one pie with sausage oliv

26.160648489314664