In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BatchNorm1d
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers=3, dropout=0.5):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=0)
        
        # Bidirectional LSTM
        self.bilstm_1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout,
        )

        # Batch normalization
        #self.batchnorm_1 = BatchNorm1d(hidden_dim * 2)

        # Fully connected layers
        self.fc2 = nn.Linear(hidden_dim * 2, output_dim)


    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)

        # BiLSTM layer
        lstm_out, _ = self.bilstm_1(embedded)

        # # # Batch normalization
        # lstm_out = lstm_out.permute(0, 2, 1)
        # lstm_out = self.batchnorm_1(lstm_out)
        # lstm_out = lstm_out.permute(0, 2, 1)

        output = self.fc2(lstm_out)
        return F.log_softmax(output, dim=-1)



In [2]:
import pandas as pd
train_path = '../dataset/PIZZA_train.json'
test_path = '../dataset/PIZZA_dev.json'
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(test_path, lines=True)

In [3]:
df.describe()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
count,2456446,2456446,2456446,2456446
unique,2456446,694346,2456446,1425035
top,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER three ) (NOT (TOPPI...
freq,1,1999,1,167


In [4]:
unique_exr = df['train.TOP-DECOUPLED'].value_counts()[df['train.TOP-DECOUPLED'].value_counts() == 1].index

# Filter rows where 'train.EXR' is unique
unique_related_dataset = df[df['train.TOP-DECOUPLED'].isin(unique_exr)].reset_index(drop=True)
unique_related_dataset

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have one pie with banana pepper and peco...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING BANANA...,(ORDER can i have (PIZZAORDER (NUMBER one ) pi...,(ORDER (PIZZAORDER (NUMBER one ) (TOPPING bana...
1,party sized high rise dough pie with artichoke...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...
2,high rise dough pie with american cheese and w...,(ORDER (PIZZAORDER (NUMBER 1 ) (STYLE THICK_CR...,(ORDER (PIZZAORDER (STYLE high rise dough ) pi...,(ORDER (PIZZAORDER (STYLE high rise dough ) (T...
3,a party sized meatlover pie with extra banana ...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE party siz...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE party siz...
4,a meatlover pie with american cheese and with ...,(ORDER (PIZZAORDER (NUMBER 1 ) (STYLE MEAT_LOV...,(ORDER (PIZZAORDER (NUMBER a ) (STYLE meatlove...,(ORDER (PIZZAORDER (NUMBER a ) (STYLE meatlove...
...,...,...,...,...
1087402,i'd like a pizza with arugula ricotta cheese a...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING ARUGUL...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING arugul...
1087403,i'd like a pizza with yellow peppers fried oni...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING YELLOW...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING yellow...
1087404,i'd like a pizza with olives roasted tomatoes ...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING OLIVES...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING olives...
1087405,i'd like a pizza with mozzarella jalapeno and ...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING MOZZAR...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING mozzar...


In [5]:
#unique_patterns = df['train.EXR'].value_counts()
# # Subset the DataFrame for rows with unique values
# unique_related_dataset = df[df['train.EXR'].isin(unique_patterns.index[:5000])] # reduce to 5000 patterns

# # Describe the resulting dataset
# unique_related_dataset.describe()

In [6]:
# unique_related_dataset =unique_related_dataset.reset_index(drop=True)
# unique_related_dataset

In [7]:
X_train = unique_related_dataset['train.SRC']
y_train = unique_related_dataset['train.TOP-DECOUPLED']
X_test = dev['dev.SRC']
y_test = dev['dev.TOP']
print(len(unique_related_dataset))
print(X_train[476368])
print(y_train[476368])
print(dev['dev.SRC'][0])

1087407
two 20 fl ounce diet sprites in cans and five 500 milliliter ice teas and three 500-ml san pellegrinos
(ORDER (DRINKORDER (NUMBER two ) (VOLUME 20 fl ounce ) (DRINKTYPE diet sprites ) (CONTAINERTYPE in cans ) ) (DRINKORDER (NUMBER five ) (VOLUME 500 milliliter ) (DRINKTYPE ice teas ) ) (DRINKORDER (NUMBER three ) (VOLUME 500-ml ) (DRINKTYPE san pellegrinos ) ) )
i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage


In [8]:
import sys
sys.path.append("..")
from utils.data_preprocessing import preprocess_text
from utils.feature_extraction import bag_of_words, tfidf_features, extract_embeddings
X_train = [" ".join(preprocess_text(text)) for text in X_train]
X_test = [" ".join(preprocess_text(text)) for text in X_test]
y_train = [" ".join(preprocess_text(text)) for text in y_train]
# y_test = [" ".join(preprocess_text(text)) for text in y_test]

In [9]:
del df
del dev

In [10]:
X_train

['one pie banana pepper pecorino cheese spiced sausage',
 'party sized high rise dough pie artichoke pecorino cheese',
 'high rise dough pie american cheese NOT much olive',
 'a party sized meatlover pie banana pepper pesto',
 'a meatlover pie american cheese a little bit peperoni',
 'personal sized stuffed crust pie american cheese pesto',
 'personal sized every meat pizza american cheese',
 'party sized pie american cheese peperronni a lot vegan pepperoni',
 'a party sized pie banana pepper mozzarella tomato',
 'stuffed crust pie balzamic glaze pecorino cheese',
 'one party sized pie banana pepper peperonni red pepper flake',
 'one high rise dough pie green pepper pickle',
 'one pie only a little american cheese',
 'pie balsamic glaze meatball a lot roasted red pepper',
 'one party sized pie a lot green olive pecorino cheese',
 'a party size pie green olive peperroni yellow pepper',
 'a party size pie a little american cheese pecorino cheese',
 'meatlover pie american cheese NOT much

In [11]:
y_train[0]

'ORDER PIZZAORDER NUMBER one TOPPING banana pepper TOPPING pecorino cheese TOPPING spiced sausage'

SRC

one pie banana pepper pecorino cheese spiced sausage

TOP decoupled

ORDER PIZZAORDER NUMBER one TOPPING banana pepper TOPPING pecorino cheese TOPPING spiced sausage


In [12]:
max_str_1 = len(max(X_train, key=len))
max_str_2 = len(max(y_train, key=len))
max_str_1, max_str_2

(111, 251)

In [13]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens

def tokenize_output(output):
    """
    Tokenizes the structured output into meaningful tokens.
    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )"
        Output: ["(ORDER", "(PIZZAORDER", "(NUMBER", "a", "(SIZE", "large", "(TOPPING", "bbq", "pulled", "pork", ")", ")", ")", ")"]
    """
    tokens = re.findall(r"\(|\)|\w+|[^\s()]+", output)
    return tokens

def build_vocab(outputs, index):
    """
    Builds a vocabulary from tokenized outputs.
    """
    i = index
    for output in outputs:
        tokens = tokenize_output(output)
        for token in tokens:
            if token not in vocab:
                vocab[token] = i
                i += 1
    return vocab, i
def encode_outputs(outputs, vocab):
    """
    Encodes tokenized outputs into sequences of integers.
    """
    encoded = []
    for output in outputs:
        tokens = tokenize_output(output)
        sequence = [vocab["<SOS>"]] + [vocab[token] for token in tokens if token in vocab] + [vocab["<EOS>"]]
        encoded.append(sequence)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads sequences to a fixed length.
    """
    return pad_sequences(sequences, maxlen=max_len, padding="post", value=0)

def decode_sequence(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    tokens = [inv_vocab[idx] for idx in sequence if idx in inv_vocab and idx not in { vocab["<SOS>"], vocab["<EOS>"],vocab["<PAD>"]} ] 
    output = " ".join(tokens)
    output = output.replace(" ( ", " (").replace("( ", "(") #.replace(" )", ")")
    return output

def decode_sequence_2(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    # sequence = sequence.cpu().tolist()  # Convert tensor to a list of integers
    tokens = [inv_vocab.get(idx, "") for idx in sequence if idx > 0]  # Ignore unknown and put empty char
    return "".join(tokens)



In [14]:
# import gensim.downloader as api
# glove_vectors = api.load("glove-wiki-gigaword-100")  # 100-dimension GloVe
# glove_vectors

In [15]:


def prepare_data(
    X_train, y_train, X_test, y_test, feature_type="bow", glove_vectors=None, max_len_1=20, max_len_2 = 20
):

    vectorizer = None

    # # Feature Extraction for X_train and X_test
    # if feature_type == "bow":
    #     X_train_processed, vectorizer = bag_of_words(X_train)
    #     X_train_processed = X_train_processed.toarray()
    #     X_test_processed = vectorizer.transform(X_test).toarray()
    # elif feature_type == "tfidf":
    #     X_train_processed, vectorizer = tfidf_features(X_train)
    #     X_train_processed = X_train_processed.toarray()
    #     X_test_processed = vectorizer.transform(X_test).toarray()
    # elif feature_type == "embeddings":
    #     if not glove_vectors:
    #         raise ValueError("GloVe vectors must be provided for embeddings.")
    #     X_train_tokenized = [sentence.split() for sentence in X_train]
    #     X_test_tokenized = [sentence.split() for sentence in X_test]
    #     X_train_processed = extract_embeddings(X_train_tokenized)
    #     X_test_processed = extract_embeddings(X_test_tokenized)
    # else:
    #     raise ValueError("Invalid feature type. Choose 'bow', 'tfidf', or 'embeddings'.")
    index = 3
    X_vocab, index = build_vocab(X_train,index)  # Build vocabulary from training outputs
    X_train_encoded = encode_outputs(X_train, X_vocab)  # Encode training outputs
    X_test_encoded = encode_outputs(X_test, X_vocab)  # Encode testing outputs
    X_train_processed = pad_sequences_to_fixed_length(X_train_encoded, max_len_1)
    X_test_processed = pad_sequences_to_fixed_length(X_test_encoded, max_len_1)

    y_vocab, index = build_vocab(y_train,index)  # Build vocabulary from training outputs
    y_train_encoded = encode_outputs(y_train, vocab)  # Encode training outputs
    y_test_encoded = encode_outputs(y_test, vocab)  # Encode testing outputs
    y_train_processed = pad_sequences_to_fixed_length(y_train_encoded, max_len_2)
    y_test_processed = pad_sequences_to_fixed_length(y_test_encoded, max_len_2)


    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        X_vocab,
        y_vocab,  # Return vocabulary for decoding
    )


In [16]:
X_train_processed, X_test_processed, y_train_processed, y_test_processed, X_vocab, y_vocab = prepare_data( X_train, y_train, X_test, y_test, feature_type="embeddings", max_len_1=253, max_len_2=253)

In [45]:
del X_train
del X_test
del y_train
del y_test

In [17]:
X_vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'one': 3,
 'pie': 4,
 'banana': 5,
 'pepper': 6,
 'pecorino': 7,
 'cheese': 8,
 'spiced': 9,
 'sausage': 10,
 'party': 11,
 'sized': 12,
 'high': 13,
 'rise': 14,
 'dough': 15,
 'artichoke': 16,
 'american': 17,
 'NOT': 18,
 'much': 19,
 'olive': 20,
 'a': 21,
 'meatlover': 22,
 'pesto': 23,
 'little': 24,
 'bit': 25,
 'peperoni': 26,
 'personal': 27,
 'stuffed': 28,
 'crust': 29,
 'every': 30,
 'meat': 31,
 'pizza': 32,
 'peperronni': 33,
 'lot': 34,
 'vegan': 35,
 'pepperoni': 36,
 'mozzarella': 37,
 'tomato': 38,
 'balzamic': 39,
 'glaze': 40,
 'peperonni': 41,
 'red': 42,
 'flake': 43,
 'green': 44,
 'pickle': 45,
 'only': 46,
 'balsamic': 47,
 'meatball': 48,
 'roasted': 49,
 'size': 50,
 'peperroni': 51,
 'yellow': 52,
 'large': 53,
 'neapolitan': 54,
 'black': 55,
 'combination': 56,
 'everything': 57,
 'regular': 58,
 'cherry': 59,
 'medium': 60,
 'big': 61,
 'jalapeno': 62,
 'peppperoni': 63,
 'low': 64,
 'fat': 65,
 'chicken': 66,
 'sau

In [18]:
vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'one': 3,
 'pie': 4,
 'banana': 5,
 'pepper': 6,
 'pecorino': 7,
 'cheese': 8,
 'spiced': 9,
 'sausage': 10,
 'party': 11,
 'sized': 12,
 'high': 13,
 'rise': 14,
 'dough': 15,
 'artichoke': 16,
 'american': 17,
 'NOT': 18,
 'much': 19,
 'olive': 20,
 'a': 21,
 'meatlover': 22,
 'pesto': 23,
 'little': 24,
 'bit': 25,
 'peperoni': 26,
 'personal': 27,
 'stuffed': 28,
 'crust': 29,
 'every': 30,
 'meat': 31,
 'pizza': 32,
 'peperronni': 33,
 'lot': 34,
 'vegan': 35,
 'pepperoni': 36,
 'mozzarella': 37,
 'tomato': 38,
 'balzamic': 39,
 'glaze': 40,
 'peperonni': 41,
 'red': 42,
 'flake': 43,
 'green': 44,
 'pickle': 45,
 'only': 46,
 'balsamic': 47,
 'meatball': 48,
 'roasted': 49,
 'size': 50,
 'peperroni': 51,
 'yellow': 52,
 'large': 53,
 'neapolitan': 54,
 'black': 55,
 'combination': 56,
 'everything': 57,
 'regular': 58,
 'cherry': 59,
 'medium': 60,
 'big': 61,
 'jalapeno': 62,
 'peppperoni': 63,
 'low': 64,
 'fat': 65,
 'chicken': 66,
 'sau

In [19]:
X_train_processed.shape, y_train_processed.shape

((1087407, 253), (1087407, 253))

In [20]:
X_train_processed[1]

array([ 1, 11, 12, 13, 14, 15,  4, 16,  7,  8,  2,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [21]:
y_train_processed[1]

array([  1, 241, 242, 245,  11,  12, 246,  13,  14,  15, 244,  16, 244,
         7,   8,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [22]:

ou= decode_sequence(y_train_processed[1],vocab)
ou

'ORDER PIZZAORDER SIZE party sized STYLE high rise dough TOPPING artichoke TOPPING pecorino cheese'

In [23]:
#five medium pizzas with tomatoes and ham
ou= decode_sequence(y_test_processed[1],vocab)
ou

'ORDER PIZZAORDER NUMBER five SIZE medium TOPPING TOPPING ham'

### Ensure Decoding works right

In [24]:
total_sequences = 0
correct_sequences = 0

for src,tgt in zip(y_test_processed, y_test):
    #print(src)

    predicted_sequences = decode_sequence(src, vocab) 
    #print(predicted_sequences)
    # print(predicted_sequences)
    # print(target_sequences)
    # Calculate sequence accuracy
    if predicted_sequences == tgt:
        # print(pred)
        # print(tgt)
        correct_sequences += 1
    total_sequences += 1
print(f"Correct {correct_sequences}, Total {total_sequences}")
sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
sequence_accuracy * 100

Correct 0, Total 348


0.0

In [56]:
len(vocab)

253

In [57]:
input_dim = X_train_processed.shape[1]
input_dim

253

In [58]:
output_dim = y_train_processed.shape[1] 
output_dim

253

In [59]:
len(vocab)

253

In [60]:
input_dim =  X_train_processed.shape[1]  # Vocabulary size
embedding_dim = len(vocab)  # Dimension of embedding vectors    vocabs  260,158       max seq lens  (99, 265)
hidden_dim = 256  # Hidden state size for LSTM          first was 256 for batch norm 150
output_dim = y_train_processed.shape[1]  # Number of output classes
num_layers = 2  # Number of BiLSTM layers
dropout = 0.2  # Dropout probability

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)


In [64]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = torch.tensor(inputs, dtype=torch.long) 
        self.targets = torch.tensor(targets, dtype=torch.long) 

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "src_input_ids": self.inputs[idx],
            "tgt_input_ids": self.targets[idx],
        }

train_dataset = SequenceDataset(X_train_processed, y_train_processed)
test_dataset = SequenceDataset(X_test_processed, y_test_processed)

batch_size = 128  # Adjust based GPU ;-;  memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [62]:
train_dataset.__getitem__(3)

{'src_input_ids': tensor([ 1, 21, 11, 12, 22,  4,  5,  6, 23,  2,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0

In [None]:
import json
import re

def tokenize(s):
    # Extract tokens: parentheses or sequences of non-whitespace, non-parenthesis characters.
    tokens = re.findall(r'\(|\)|[^\s()]+', s)
    return tokens

def parse_tokens(tokens):
    # Parse tokens into a nested list structure
    stack = []
    current_list = []
    for token in tokens:
        if token == '(':
            stack.append(current_list)
            current_list = []
        elif token == ')':
            finished = current_list
            current_list = stack.pop()
            current_list.append(finished)
        else:
            current_list.append(token)
    return current_list

def normalize_structure(tree):
    if not isinstance(tree, list):
        return None

    def is_key(token):
        return token in [
            "ORDER", "PIZZAORDER", "DRINKORDER", "NUMBER", "SIZE", "STYLE", "TOPPING",
            "COMPLEX_TOPPING", "QUANTITY", "VOLUME", "DRINKTYPE", "CONTAINERTYPE", "NOT"
        ]

    # Clean the list by keeping sublists and tokens as-is for further analysis
    cleaned = []
    for el in tree:
        cleaned.append(el)

    if len(cleaned) > 0 and isinstance(cleaned[0], str) and is_key(cleaned[0]):
        key = cleaned[0]
        if key == "ORDER":
            pizzaorders = []
            drinkorders = []
            for sub in cleaned[1:]:
                node = normalize_structure(sub)
                if isinstance(node, dict):
                    if "PIZZAORDER" in node:
                        if isinstance(node["PIZZAORDER"], list):
                            pizzaorders.extend(node["PIZZAORDER"])
                        else:
                            pizzaorders.append(node["PIZZAORDER"])
                    if "DRINKORDER" in node:
                        if isinstance(node["DRINKORDER"], list):
                            drinkorders.extend(node["DRINKORDER"])
                        else:
                            drinkorders.append(node["DRINKORDER"])
                    if node.get("TYPE") == "PIZZAORDER":
                        pizzaorders.append(node)
                    if node.get("TYPE") == "DRINKORDER":
                        drinkorders.append(node)
            result = {}
            if pizzaorders:
                result["PIZZAORDER"] = pizzaorders
            if drinkorders:
                result["DRINKORDER"] = drinkorders
            if result:
                return {"ORDER": result}
            else:
                return {}

        elif key == "PIZZAORDER":
            number = None
            size = None
            style = None
            toppings = []
            for sub in cleaned[1:]:
                node = normalize_structure(sub)
                if isinstance(node, dict):
                    t = node.get("TYPE")
                    if t == "NUMBER":
                        number = node["VALUE"]
                    elif t == "SIZE":
                        size = node["VALUE"]
                    elif t == "STYLE":
                        style = node["VALUE"]
                    elif t == "TOPPING":
                        toppings.append(node)
            result = {}
            if number is not None:
                result["NUMBER"] = number
            if size is not None:
                result["SIZE"] = size
            if style is not None:
                result["STYLE"] = style
            if toppings:
                result["AllTopping"] = toppings
            # Mark type internally, will remove later
            result["TYPE"] = "PIZZAORDER"
            return result

        elif key == "DRINKORDER":
            number = None
            volume = None
            drinktype = None
            containertype = None
            for sub in cleaned[1:]:
                node = normalize_structure(sub)
                if isinstance(node, dict):
                    t = node.get("TYPE")
                    if t == "NUMBER":
                        number = node["VALUE"]
                    elif t == "VOLUME":
                        volume = node["VALUE"]
                    elif t == "DRINKTYPE":
                        drinktype = node["VALUE"]
                    elif t == "CONTAINERTYPE":
                        containertype = node["VALUE"]
            result = {}
            if number is not None:
                result["NUMBER"] = number
            if volume is not None:
                result["VOLUME"] = volume
            if drinktype is not None:
                result["DRINKTYPE"] = drinktype
            if containertype is not None:
                result["CONTAINERTYPE"] = containertype
            result["TYPE"] = "DRINKORDER"
            return result

        elif key in ["NUMBER","SIZE","STYLE","VOLUME","DRINKTYPE","CONTAINERTYPE","QUANTITY"]:
            values = []
            for el in cleaned[1:]:
                if isinstance(el, str):
                    values.append(el)
            value_str = " ".join(values).strip()
            return {
                "TYPE": key,
                "VALUE": value_str
            }

        elif key == "TOPPING":
            values = []
            for el in cleaned[1:]:
                if isinstance(el, str):
                    values.append(el)
            topping_str = " ".join(values).strip()
            return {
                "TYPE": "TOPPING",
                "NOT": False,
                "Quantity": None,
                "Topping": topping_str
            }

        elif key == "COMPLEX_TOPPING":
            quantity = None
            topping = None
            for sub in cleaned[1:]:
                node = normalize_structure(sub)
                if isinstance(node, dict):
                    t = node.get("TYPE")
                    if t == "QUANTITY":
                        quantity = node["VALUE"]
                    elif t == "TOPPING":
                        topping = node["Topping"]
            return {
                "TYPE": "TOPPING",
                "NOT": False,
                "Quantity": quantity,
                "Topping": topping
            }

        elif key == "NOT":
            for sub in cleaned[1:]:
                node = normalize_structure(sub)
                if isinstance(node, dict) and node.get("TYPE") == "TOPPING":
                    node["NOT"] = True
                    if "Quantity" not in node:
                        node["Quantity"] = None
                    return node
            return None

    else:
        # Try to parse sublists and combine orders found
        combined_order = {"PIZZAORDER": [], "DRINKORDER": []}
        found_order = False

        for el in cleaned:
            node = normalize_structure(el)
            if isinstance(node, dict):
                if "ORDER" in node:
                    found_order = True
                    order_node = node["ORDER"]
                    if "PIZZAORDER" in order_node:
                        combined_order["PIZZAORDER"].extend(order_node["PIZZAORDER"])
                    if "DRINKORDER" in order_node:
                        combined_order["DRINKORDER"].extend(order_node["DRINKORDER"])
                elif node.get("TYPE") == "PIZZAORDER":
                    found_order = True
                    combined_order["PIZZAORDER"].append(node)
                elif node.get("TYPE") == "DRINKORDER":
                    found_order = True
                    combined_order["DRINKORDER"].append(node)

        if found_order:
            final = {}
            if combined_order["PIZZAORDER"]:
                final["PIZZAORDER"] = combined_order["PIZZAORDER"]
            if combined_order["DRINKORDER"]:
                final["DRINKORDER"] = combined_order["DRINKORDER"]
            return {"ORDER": final} if final else {}

        return None

def remove_type_keys(obj):
    # Recursively remove "TYPE" keys from all dictionaries
    if isinstance(obj, dict):
        obj.pop("TYPE", None)
        for k, v in obj.items():
            remove_type_keys(v)
    elif isinstance(obj, list):
        for item in obj:
            remove_type_keys(item)


def preprocess(text):
    tokens = tokenize(text)
    parsed = parse_tokens(tokens)
    result = normalize_structure(parsed)
    remove_type_keys(result)
    return result

input_str = "(ORDER potato potato junior (PIZZAORDER (NUMBER one) (SIZE large) (STYLE thin crust) (TOPPING cheese) (TOPPING pepperoni) ) (PIZZAORDER (NUMBER two) (SIZE medium) (STYLE deep dish) (NOT (TOPPING mushrooms) ) (NOT (COMPLEX_TOPPING (QUANTITY extra) (TOPPING olives) ) ) ) (DRINKORDER (NUMBER five) (VOLUME one liter) (DRINKTYPE lemon ice tea) (CONTAINERTYPE bottles)) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans)) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) )"

tokens = tokenize(input_str)
parsed = parse_tokens(tokens)
result = normalize_structure(parsed)
remove_type_keys(result)

print(json.dumps(result, indent=2))


In [63]:
def evaluate_model_with_accuracy(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    total_tokens = 0
    correct_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            output = model(src)
            output_dim = output.shape[-1]

            # Flatten outputs and targets
            output = output.view(-1, output_dim)  # Shape: (batch_size * seq_len, output_dim)
            tgt = tgt.view(-1)  # Shape: (batch_size * seq_len)

            # # Apply mask to remove padding tokens
            # mask = tgt != 0  # Mask to ignore padding indices
            # output = output[mask]  # Filter model outputs
            # tgt = tgt[mask]  # Filter targets

            # Compute loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

            # Calculate accuracy
            predictions = output.argmax(dim=1)  # Get the index of the max log-probability
            correct_tokens += (predictions == tgt).sum().item()
            total_tokens += tgt.size(0)

    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    return epoch_loss / len(dataloader), accuracy


In [77]:
import torch.optim as optim
from tqdm import tqdm

criterion = nn.CrossEntropyLoss()  # Use for multi-class classification ignore_index=0 for padding
optimizer = optim.SGD(model.parameters(), lr=0.0071, weight_decay=1e-5)

for epoch in range(20):  # Number of epochs
    model.train()
    epoch_loss = 0
    total_batches = len(train_dataloader)
    progress_bar = tqdm(train_dataloader, desc="Training Progress", unit="batch", leave=True)

    for batch_idx, batch in enumerate(progress_bar):  # Assuming a DataLoader is used
        src = batch["src_input_ids"].to(device)  # Input tokens
        tgt = batch["tgt_input_ids"].to(device)  # Target tokens

        optimizer.zero_grad()
        output = model(src)  # Forward pass
        output_dim = output.shape[-1]

        # Flatten outputs and targets for loss computation
        output = output.view(-1, output_dim)  # Shape: (batch_size * seq_len, output_dim)
        tgt = tgt.view(-1)  # Shape: (batch_size * seq_len)

        # Apply mask to remove padding tokens
        # mask = tgt != 0  # Mask to ignore padding indices
        # output = output[mask]  # Filter model outputs
        # tgt = tgt[mask]  # Filter targets
        # print(output.shape)
        # print(tgt.shape)

        # Compute loss
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        avg_loss = epoch_loss / (batch_idx + 1)
        progress_bar.set_description(f"Training Progress: Batch {batch_idx + 1}/{total_batches}, Avg Loss: {avg_loss:.8f}")

    val_loss, accuracy = evaluate_model_with_accuracy(model, test_dataloader, criterion, device)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader):.8f}, Val Loss: {val_loss:.8f}, Accuracy: {accuracy * 100:.4f}%")


Training Progress: Batch 8496/8496, Avg Loss: 0.34789363: 100%|██████████| 8496/8496 [19:22<00:00,  7.31batch/s]


Epoch 1, Loss: 0.34789363, Val Loss: 0.28977745, Accuracy: 94.1302%


Training Progress: Batch 8496/8496, Avg Loss: 0.31821771: 100%|██████████| 8496/8496 [19:30<00:00,  7.26batch/s]


Epoch 2, Loss: 0.31821771, Val Loss: 0.28012431, Accuracy: 94.9196%


Training Progress: Batch 8496/8496, Avg Loss: 0.30149070: 100%|██████████| 8496/8496 [19:18<00:00,  7.34batch/s]


Epoch 3, Loss: 0.30149070, Val Loss: 0.27114896, Accuracy: 95.1047%


Training Progress: Batch 1709/8496, Avg Loss: 0.29252738:  20%|██        | 1709/8496 [03:54<15:30,  7.30batch/s]


KeyboardInterrupt: 

### Saving the model

In [67]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    print(f"Model loaded from {path}")
    return model

In [68]:
save_model(model, "../weights/Bilstm_topdecoupled.pt")

Model saved to ../weights/Bilstm_topdecoupled.pt


In [None]:
model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)
model = load_model(model,"../weights/Bilstm.pt")

Model loaded from ../weights/Bilstm.pt


### Testing Real Output sequence

In [69]:
test_dataset.__getitem__(1)["src_input_ids"]

tensor([  1, 163,  60,  32,  38,  81,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

In [70]:

with torch.no_grad():

    output = model( test_dataset.__getitem__(1)["src_input_ids"].to(device))
    predictions = output.argmax(dim=1)
predictions

tensor([  0,   0, 243, 243, 243, 243, 243, 243, 243,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

In [71]:
tgt= test_dataset.__getitem__(1)["tgt_input_ids"].to(device)
tgt

tensor([  1, 241, 242, 243, 163, 245,  60, 244, 244,  81,   2,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

In [72]:

sequence = predictions.cpu().tolist()
pred_sequence= decode_sequence(sequence,vocab)
pred_sequence

'NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER'

In [73]:
sequence = tgt.cpu().tolist()
tgt_sequence= decode_sequence(sequence,vocab)
tgt_sequence

'ORDER PIZZAORDER NUMBER five SIZE medium TOPPING TOPPING ham'

In [74]:
pred_sequence == tgt_sequence

False

In [75]:
def evaluate_model_with_sequence_accuracy(model, dataloader, device):
    model.eval()
    total_sequences = 0
    correct_sequences = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            output = model(src)
            output = output.argmax(dim=-1)

            # Decode sequences for comparison

            predicted_sequences = [decode_sequence(seq.cpu().tolist(), vocab) for seq in output]
            target_sequences = [decode_sequence(seq.cpu().tolist(), vocab) for seq in tgt]
            # print(predicted_sequences)
            # print(target_sequences)
            # Calculate sequence accuracy
            for pred, tgt in zip(predicted_sequences, target_sequences):
                if pred == tgt:
                    # print(pred)
                    # print(tgt)
                    correct_sequences += 1
                else:
                    print(pred)
                    print(tgt)
                total_sequences += 1

    print(f"Correct {correct_sequences}, Total {total_sequences}")
    sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
    return sequence_accuracy * 100

In [76]:
evaluate_model_with_sequence_accuracy(model, test_dataloader, device)

NUMBER NUMBER NUMBER NUMBER NUMBER
ORDER PIZZAORDER NUMBER two SIZE medium TOPPING sausage TOPPING black PIZZAORDER NUMBER two SIZE medium TOPPING pepperoni COMPLEX_TOPPING QUANTITY TOPPING cheese PIZZAORDER NUMBER three SIZE large TOPPING pepperoni TOPPING sausage
NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER
ORDER PIZZAORDER NUMBER five SIZE medium TOPPING TOPPING ham
NUMBER NUMBER
ORDER need PIZZAORDER NUMBER one SIZE large STYLE vegetarian pizza COMPLEX_TOPPING QUANTITY TOPPING banana
NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER
ORDER PIZZAORDER NUMBER a SIZE large TOPPING onion TOPPING pepper pizza

ORDER PIZZAORDER NUMBER one pie TOPPING pesto TOPPING ham NOT TOPPING

ORDER need PIZZAORDER NUMBER one SIZE large pizza TOPPING ham TOPPING bacon TOPPING TOPPING black PIZZAORDER NUMBER one SIZE medium pizza TOPPING sausage TOPPING DRINKORDER NUMBER six SIZE large DRINKTYPE
NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER NUMBER
ORDER P

0.0