In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BatchNorm1d
from tqdm import tqdm



class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim,padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)  # Make input (1, batch_size)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell



class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0, :]  # Start token

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs




In [2]:
import pandas as pd
train_path = '../dataset/PIZZA_train.json'
test_path = '../dataset/PIZZA_dev.json'
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(test_path, lines=True)

In [3]:
df.describe()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
count,2456446,2456446,2456446,2456446
unique,2456446,694346,2456446,1425035
top,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER three ) (NOT (TOPPI...
freq,1,1999,1,167


In [4]:
unique_exr = df['train.EXR'].value_counts()[df['train.EXR'].value_counts() == 1].index

# Filter rows where 'train.EXR' is unique
unique_related_dataset = df[df['train.EXR'].isin(unique_exr)].reset_index(drop=True)
unique_related_dataset

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,party sized high rise dough pie with american ...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...
1,meatlover pie with extra chicken,(ORDER (PIZZAORDER (NUMBER 1 ) (STYLE MEAT_LOV...,(ORDER (PIZZAORDER (STYLE meatlover ) pie with...,(ORDER (PIZZAORDER (STYLE meatlover ) (COMPLEX...
2,medium high rise dough pie with artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (SIZE medium ) (STYLE high ...,(ORDER (PIZZAORDER (SIZE medium ) (STYLE high ...
3,large pie with green pepper and peperonni and ...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
4,large pie with chicken and mozzarella and ranc...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING chic...
...,...,...,...,...
476365,i'd like a pizza with pesto mushrooms and gree...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING PESTO ...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING pesto ...
476366,i'd like a pizza with arugula ricotta cheese a...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING ARUGUL...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING arugul...
476367,i'd like a pizza with yellow peppers fried oni...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING YELLOW...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING yellow...
476368,i'd like a pizza with olives roasted tomatoes ...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING OLIVES...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING olives...


In [5]:
#unique_patterns = df['train.EXR'].value_counts()
# # Subset the DataFrame for rows with unique values
# unique_related_dataset = df[df['train.EXR'].isin(unique_patterns.index[:5000])] # reduce to 5000 patterns

# # Describe the resulting dataset
# unique_related_dataset.describe()

In [6]:
# unique_related_dataset =unique_related_dataset.reset_index(drop=True)
# unique_related_dataset

In [7]:
X_train = unique_related_dataset['train.SRC']
y_train = unique_related_dataset['train.EXR']
X_test = dev['dev.SRC']
y_test = dev['dev.EXR']
print(len(unique_related_dataset))
print(X_train[476368])
print(y_train[476368])
print(dev['dev.SRC'][0])

476370
i'd like a pizza with olives roasted tomatoes and broccoli without thin crust
(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING OLIVES ) (TOPPING ROASTED_TOMATOES ) (TOPPING BROCCOLI ) (NOT (STYLE THIN_CRUST ) ) ) )
i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage


In [8]:
import sys
sys.path.append("..")
from utils.data_preprocessing import preprocess_text
from utils.feature_extraction import bag_of_words, tfidf_features, extract_embeddings
X_train = [" ".join(preprocess_text(text)) for text in X_train]
X_test = [" ".join(preprocess_text(text)) for text in X_test]

In [9]:
X_train

['party sized high rise dough pie with american cheese with oregano',
 'meatlover pie with chicken',
 'medium high rise dough pie with artichoke',
 'large pie with green pepper peperonni with roasted pepper',
 'large pie with chicken mozzarella ranch sauce',
 'large pie with banana pepper meatball with italian sausage',
 'high rise dough pie with cheese',
 'regular big meat pie with green pepper',
 'party sized pie with american cheese mozzarella with tomato sauce',
 'party sized pie with little bit american cheese with pickle',
 'party sized stuffed crust pie with banana pepper with meatball',
 'lunch sized pie with little bit american cheese',
 'regular big meat pie with banana pepper pecorino cheese',
 'big new yorker pie with american cheese with olive oil',
 'medium pie with banana pepper peperonni with little bit roasted green pepper',
 'lunch sized stuffed crust pie with banana pepper pecorino cheese',
 'personal pie with bbq sauce mozzarella',
 'large high rise dough pie with b

In [10]:
max_str_1 = len(max(X_train, key=len))
max_str_2 = len(y_train[y_train.str.len().idxmax()])
max_str_1, max_str_2

(114, 300)

In [11]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_output(output):
    """
    Tokenizes the structured output into meaningful tokens.
    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )"
        Output: ["(ORDER", "(PIZZAORDER", "(NUMBER", "a", "(SIZE", "large", "(TOPPING", "bbq", "pulled", "pork", ")", ")", ")", ")"]
    """
    tokens = re.findall(r"\(|\)|\w+|[^\s()]+", output)
    return tokens

def build_vocab(outputs):
    """
    Builds a vocabulary from tokenized outputs.
    """
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens
    i = 3
    for output in outputs:
        tokens = tokenize_output(output)
        for token in tokens:
            if token not in vocab:
                vocab[token] = i
                i += 1
    return vocab
def encode_outputs(outputs, vocab):
    """
    Encodes tokenized outputs into sequences of integers.
    """
    encoded = []
    for output in outputs:
        tokens = tokenize_output(output)
        sequence = [vocab["<SOS>"]] + [vocab[token] for token in tokens if token in vocab] + [vocab["<EOS>"]]
        encoded.append(sequence)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads sequences to a fixed length.
    """
    return pad_sequences(sequences, maxlen=max_len, padding="post", value=0)

def decode_sequence(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    tokens = [inv_vocab[idx] for idx in sequence if idx in inv_vocab and idx not in {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"]}]

    output = " ".join(tokens)
    output = output.replace(" ( ", " (").replace("( ", "(") #.replace(" )", ")")
    return output

def decode_sequence_2(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    # sequence = sequence.cpu().tolist()  # Convert tensor to a list of integers
    tokens = [inv_vocab.get(idx, "") for idx in sequence if idx > 0]  # Ignore unknown and put empty char
    return "".join(tokens)



In [12]:
# import gensim.downloader as api
# glove_vectors = api.load("glove-wiki-gigaword-100")  # 100-dimension GloVe
# glove_vectors

In [13]:


def prepare_data(
    X_train, y_train, X_test, y_test, feature_type="bow", glove_vectors=None, max_len_1=20, max_len_2 = 20
):

    vectorizer = None

    # # Feature Extraction for X_train and X_test
    # if feature_type == "bow":
    #     X_train_processed, vectorizer = bag_of_words(X_train)
    #     X_train_processed = X_train_processed.toarray()
    #     X_test_processed = vectorizer.transform(X_test).toarray()
    # elif feature_type == "tfidf":
    #     X_train_processed, vectorizer = tfidf_features(X_train)
    #     X_train_processed = X_train_processed.toarray()
    #     X_test_processed = vectorizer.transform(X_test).toarray()
    # elif feature_type == "embeddings":
    #     if not glove_vectors:
    #         raise ValueError("GloVe vectors must be provided for embeddings.")
    #     X_train_tokenized = [sentence.split() for sentence in X_train]
    #     X_test_tokenized = [sentence.split() for sentence in X_test]
    #     X_train_processed = extract_embeddings(X_train_tokenized)
    #     X_test_processed = extract_embeddings(X_test_tokenized)
    # else:
    #     raise ValueError("Invalid feature type. Choose 'bow', 'tfidf', or 'embeddings'.")
    
    X_vocab = build_vocab(X_train)  # Build vocabulary from training outputs
    X_train_encoded = encode_outputs(X_train, X_vocab)  # Encode training outputs
    X_test_encoded = encode_outputs(X_test, X_vocab)  # Encode testing outputs
    X_train_processed = pad_sequences_to_fixed_length(X_train_encoded, max_len_1)
    X_test_processed = pad_sequences_to_fixed_length(X_test_encoded, max_len_1)

    vocab = build_vocab(y_train)  # Build vocabulary from training outputs
    y_train_encoded = encode_outputs(y_train, vocab)  # Encode training outputs
    y_test_encoded = encode_outputs(y_test, vocab)  # Encode testing outputs
    y_train_processed = pad_sequences_to_fixed_length(y_train_encoded, max_len_2)
    y_test_processed = pad_sequences_to_fixed_length(y_test_encoded, max_len_2)


    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        X_vocab,
        vocab,  # Return vocabulary for decoding
    )


In [14]:
X_train_processed, X_test_processed, y_train_processed, y_test_processed, X_vocab, vocab = prepare_data( X_train, y_train, X_test, y_test, feature_type="embeddings", max_len_1=250, max_len_2=250)

In [15]:
X_vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'party': 3,
 'sized': 4,
 'high': 5,
 'rise': 6,
 'dough': 7,
 'pie': 8,
 'with': 9,
 'american': 10,
 'cheese': 11,
 'oregano': 12,
 'meatlover': 13,
 'chicken': 14,
 'medium': 15,
 'artichoke': 16,
 'large': 17,
 'green': 18,
 'pepper': 19,
 'peperonni': 20,
 'roasted': 21,
 'mozzarella': 22,
 'ranch': 23,
 'sauce': 24,
 'banana': 25,
 'meatball': 26,
 'italian': 27,
 'sausage': 28,
 'regular': 29,
 'big': 30,
 'meat': 31,
 'tomato': 32,
 'little': 33,
 'bit': 34,
 'pickle': 35,
 'stuffed': 36,
 'crust': 37,
 'lunch': 38,
 'pecorino': 39,
 'new': 40,
 'yorker': 41,
 'olive': 42,
 'oil': 43,
 'personal': 44,
 'bbq': 45,
 'basil': 46,
 'everything': 47,
 'mushroom': 48,
 'low': 49,
 'fat': 50,
 'pineaples': 51,
 'arugula': 52,
 'carrot': 53,
 'garlic': 54,
 'alfredo': 55,
 'napolitana': 56,
 'spinach': 57,
 'ham': 58,
 'spiced': 59,
 'bay': 60,
 'leaf': 61,
 'balsamic': 62,
 'glaze': 63,
 'yellow': 64,
 'salami': 65,
 'kalamata': 66,
 'combinatio

In [16]:
vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '(': 3,
 'ORDER': 4,
 'PIZZAORDER': 5,
 'NUMBER': 6,
 '1': 7,
 ')': 8,
 'SIZE': 9,
 'PARTY_SIZE': 10,
 'STYLE': 11,
 'THICK_CRUST': 12,
 'TOPPING': 13,
 'AMERICAN_CHEESE': 14,
 'COMPLEX_TOPPING': 15,
 'QUANTITY': 16,
 'EXTRA': 17,
 'OREGANO': 18,
 'MEAT_LOVER': 19,
 'CHICKEN': 20,
 'MEDIUM': 21,
 'ARTICHOKES': 22,
 'LARGE': 23,
 'GREEN_PEPPERS': 24,
 'PEPPERONI': 25,
 'ROASTED_PEPPERS': 26,
 'MOZZARELLA_CHEESE': 27,
 'RANCH_SAUCE': 28,
 'BANANA_PEPPERS': 29,
 'MEATBALLS': 30,
 'ITALIAN_SAUSAGE': 31,
 'CHEESE': 32,
 'REGULARSIZE': 33,
 'TOMATO_SAUCE': 34,
 'LIGHT': 35,
 'PICKLES': 36,
 'STUFFED_CRUST': 37,
 'LUNCH_SIZE': 38,
 'PECORINO_CHEESE': 39,
 'NEW_YORK_STYLE': 40,
 'OLIVE_OIL': 41,
 'ROASTED_GREEN_PEPPERS': 42,
 'PERSONAL_SIZE': 43,
 'BBQ_SAUCE': 44,
 'BASIL': 45,
 'ALL_TOPPINGS': 46,
 'GREEN_OLIVES': 47,
 'MUSHROOMS': 48,
 'LOW_FAT_CHEESE': 49,
 'ROASTED_TOMATOES': 50,
 'PINEAPPLE': 51,
 'ARUGULA': 52,
 'CARROTS': 53,
 'ROASTED_CHICKEN': 5

In [17]:
X_train_processed.shape, y_train_processed.shape

((476370, 250), (476370, 250))

In [18]:
X_train_processed[1]

array([ 1, 13,  8,  9, 14,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [19]:
y_train_processed[1]

array([ 1,  3,  4,  3,  5,  3,  6,  7,  8,  3, 11, 19,  8,  3, 15,  3, 16,
       17,  8,  3, 13, 20,  8,  8,  8,  8,  2,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [20]:

ou= decode_sequence(y_train_processed[1],vocab)
ou

'(ORDER (PIZZAORDER (NUMBER 1 ) (STYLE MEAT_LOVER ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHICKEN ) ) ) )'

In [21]:
y_train[1]

'(ORDER (PIZZAORDER (NUMBER 1 ) (STYLE MEAT_LOVER ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHICKEN ) ) ) )'

In [22]:
ou == y_train[1]

True

### Ensure Decoding works right

In [23]:
total_sequences = 0
correct_sequences = 0

for src,tgt in zip(y_test_processed, y_test):
    #print(src)

    predicted_sequences = decode_sequence(src, vocab) 
    #print(predicted_sequences)
    # print(predicted_sequences)
    # print(target_sequences)
    # Calculate sequence accuracy
    if predicted_sequences == tgt:
        # print(pred)
        # print(tgt)
        correct_sequences += 1
    total_sequences += 1
print(f"Correct {correct_sequences}, Total {total_sequences}")
sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
sequence_accuracy * 100

Correct 348, Total 348


100.0

In [24]:
len(vocab)

183

In [25]:
input_dim = X_train_processed.shape[1]
input_dim

250

In [26]:
output_dim = y_train_processed.shape[1] 
output_dim

250

In [27]:
import torch.optim as optim

INPUT_DIM = input_dim
OUTPUT_DIM = output_dim
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
HID_DIM = 128
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()


In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = torch.tensor(inputs, dtype=torch.long) 
        self.targets = torch.tensor(targets, dtype=torch.long) 

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "src_input_ids": self.inputs[idx],
            "tgt_input_ids": self.targets[idx],
        }

train_dataset = SequenceDataset(X_train_processed, y_train_processed)
test_dataset = SequenceDataset(X_test_processed, y_test_processed)

batch_size = 128  # Adjust based GPU ;-;  memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [29]:
def evaluate_model_with_accuracy(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    total_tokens = 0
    correct_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            tgt = batch["tgt_input_ids"].to(device)

            output = model(src,tgt, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)  # Reshape for token comparison
            tgt = tgt[:, 1:].reshape(-1)
            # Apply mask to remove padding tokens
            mask = tgt != 0  # Mask to ignore padding indices
            output = output[mask]  # Filter model outputs
            tgt = tgt[mask]  # Filter targets
            # Calculate loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

            # Calculate accuracy
            predictions = output.argmax(dim=1)  # Get the index of the max log-probability
            correct_tokens += (predictions == tgt).sum().item()
            total_tokens += tgt.size(0)

    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    return epoch_loss / len(dataloader), accuracy

In [30]:
import torch.optim as optim
from tqdm import tqdm

criterion = nn.CrossEntropyLoss(ignore_index=0)  # Use for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Training loop
for epoch in range(2):  # Number of epochs
    model.train()
    epoch_loss = 0
    total_batches = len(train_dataloader)
    progress_bar = tqdm(train_dataloader,desc="Training Progress", unit="batch", leave=True)
    for  batch_idx, batch in enumerate(progress_bar): # Assuming a DataLoader is used for batches
        src = batch["src_input_ids"].to(device)  # Input tokens
        tgt = batch["tgt_input_ids"].to(device)  # Target tokens

        optimizer.zero_grad()
        output = model(src,tgt)  # Forward pass


        # Flatten the model output
        output = output.view(-1, output_dim) 
        tgt = tgt.view(-1)
        
        # Apply mask to remove padding tokens
        mask = tgt != 0  # Mask to ignore padding indices
        output = output[mask]  # Filter model outputs
        tgt = tgt[mask]  # Filter targets
        
        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        epoch_loss += loss.item()
        avg_loss = epoch_loss / (batch_idx + 1)
        progress_bar.set_description(f"Training Progress: Batch {batch_idx + 1}/{total_batches}, Avg Loss: {avg_loss:.4f}")
    val_loss, accuracy = evaluate_model_with_accuracy(model, test_dataloader, criterion, device)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader):.4f}, Val Loss: {val_loss / len(test_dataloader):.4f}, Accuracy: {accuracy * 100:.4f}%")



Training Progress: Batch 504/3722, Avg Loss: 1.7514:  14%|█▎        | 504/3722 [03:09<20:09,  2.66batch/s]


KeyboardInterrupt: 

In [None]:
evaluate_model_with_accuracy(model, test_dataloader, criterion, device)

(0.6574204663435618, 0.9081516933840772)

### Saving the model

In [None]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    print(f"Model loaded from {path}")
    return model

In [None]:
save_model(model, "../weights/transformer_lstm.pt")

Model saved to ../weights/transformer_lstm.pt


### Testing Real Output sequence

In [None]:
# model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)
# model = load_model(model,"../weights/Bilstm.pt")
model.eval()
epoch_loss = 0
total_tokens = 0
correct_tokens = 0

with torch.no_grad():
    for batch in test_dataloader:
        src = batch["src_input_ids"].to(device)
        tgt = batch["tgt_input_ids"].to(device)

        output = model(src,tgt, teacher_forcing_ratio=0)
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)  # Reshape for token comparison
        tgt = tgt[:, 1:].reshape(-1)

        # Calculate loss
        loss = criterion(output, tgt)
        epoch_loss += loss.item()

        # Calculate accuracy
        predictions = output.argmax(dim=1)  # Get the index of the max log-probability
        break
len(predictions)

19136

In [None]:
tgt= test_dataset.__getitem__(0)["tgt_input_ids"].to(device)
tgt

tensor([  1,   3,   4,   3,   5,   3,   6, 105,   8,   3,   9,  21,   8,   3,
         15,   3,  16,  17,   8,   3,  13,  32,   8,   8,   3,  13,  25,   8,
          8,   3,   5,   3,   6, 105,   8,   3,   9,  21,   8,   3,  13,  57,
          8,   3,  13,  81,   8,   8,   3,   5,   3,   6, 104,   8,   3,   9,
         23,   8,   3,  13,  25,   8,   3,  13,  81,   8,   8,   8,   2,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

In [None]:

sequence = predictions[:300].cpu().tolist()
pred_sequence= decode_sequence(sequence,vocab)
pred_sequence

'('

In [None]:
sequence = tgt.cpu().tolist()
tgt_sequence= decode_sequence(sequence,vocab)
tgt_sequence

'(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ) ) (ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) (TOPPING HAM ) (TOPPING TOMATOES ) ) ) (ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING BANANA_PEPPERS ) ) ) ) (ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING ONIONS ) (TOPPING PEPPERS ) ) ) (ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (NUMBER 1 ) (TOPPING HAM ) (TOPPING PESTO ) ) ) (ORDER (DRINKORDER (DRINKTYPE COKE ) (NUMBER 6 ) (SIZE LARGE ) ) (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BACON ) (TOPPING HAM ) (TOPPING OLIVES ) (TOPPING ONIONS ) ) (PIZZAORDER (NUMBER 1 ) (SIZE MEDIUM ) (TOPPING ONIONS ) (TOPPING SAUSAGE ) ) ) (ORDER (PIZZAORDER (NOT (TOPPING PINEAPPLE ) ) (NUMBER 1 ) (SIZE ME

In [None]:
pred_sequence == tgt_sequence

False

In [None]:
def evaluate_model_with_sequence_accuracy(model, dataloader, device):
    model.eval()
    epoch_loss = 0
    total_sequences = 0
    correct_sequences = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["src_input_ids"].to(device)
            src_lengths = (batch["src_input_ids"] != 0).sum(dim=1).to(device)
            tgt = batch["tgt_input_ids"].to(device)

            output = model(src,tgt, teacher_forcing_ratio=0)  # No teacher forcing during evaluation
            output_dim = output.shape[-1]
            output = output.argmax(dim=-1)  # Get the predicted tokens

            # Decode sequences for comparison
            predicted_sequences = [decode_sequence(seq.cpu().tolist(), vocab) for seq in output]
            target_sequences = [decode_sequence(seq.cpu().tolist(), vocab) for seq in tgt]
            print(predicted_sequences)
            print(target_sequences)
            # Calculate sequence accuracy
            for pred, tgt in zip(predicted_sequences, target_sequences):
                if pred == tgt:
                    correct_sequences += 1
                total_sequences += 1

    sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
    return epoch_loss / len(dataloader), sequence_accuracy


In [None]:
evaluate_model_with_sequence_accuracy(model, test_dataloader, device)

['', '(ORDER (PIZZAORDER (NUMBER 1 ) ((PIZZAORDER ) (TOPPING () TOPPING ) (TOPPING ) ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) )', '(ORDER (PIZZAORDER (NUMBER 1 ) ((PIZZAORDER ) (TOPPING () TOPPING ) (TOPPING ) ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) )', '(ORDER (PIZZAORDER (NUMBER 1 ) ((PIZZAORDER ) (TOPPING () TOPPING ) (TOPPING ) ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) )', '(ORDER (PIZZAORDER (NUMBER 1 ) ((PIZZAORDER ) (TOPPING () TOPPING ) (TOPPING ) ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) )', '(ORDER (PIZZAORDER (NUMBER 1 ) ((PIZZAORDER ) (TOPPING () TOPPING ) (TOPPING ) ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) (((TOPPING ) () (TOPPING ) ) (TOPPING ) ) ) )', '(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING

(0.0, 0.0)