In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers=3, dropout=0.5):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim) # nn.Embedding(input_dim, embedding_dim)#
        
        # Bidirectional LSTM layers
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout,
        )
        
        # Fully connected layers
        self.fc1 = nn.Linear(hidden_dim * 2, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc_out = nn.Linear(512, output_dim)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)

        # BiLSTM layers
        lstm_out, _ = self.bilstm(embedded)  # (batch_size, seq_len, hidden_dim * 2)

        # TimeDistributed fully connected layers
        output = self.fc1(lstm_out)  # (batch_size, seq_len, 512)
        output = F.relu(output)
        output = self.dropout(output)

        output = self.fc2(output)  # (batch_size, seq_len, 512)
        output = F.relu(output)
        output = self.dropout(output)

        output = self.fc_out(output)  # (batch_size, seq_len, output_dim)
        return F.log_softmax(output, dim=-1)



In [2]:
import pandas as pd
train_path = '../dataset/PIZZA_train.json'
test_path = '../dataset/PIZZA_dev.json'
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(test_path, lines=True)

In [3]:
# Get counts of each unique value in 'train.EXR'
value_counts = df['train.EXR'].value_counts()

# Filter for unique values (occurrence count == 1)
unique_values = value_counts[value_counts == 1].index

# Subset the DataFrame for rows with unique values
unique_related_dataset = df[df['train.EXR'].isin(unique_values)].reset_index(drop=True)

# Describe the resulting dataset
unique_related_dataset.describe()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
count,476370,476370,476370,476370
unique,476370,476370,476370,476370
top,party sized high rise dough pie with american ...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...
freq,1,1,1,1


In [4]:
unique_related_dataset

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,party sized high rise dough pie with american ...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...,(ORDER (PIZZAORDER (SIZE party sized ) (STYLE ...
1,meatlover pie with extra chicken,(ORDER (PIZZAORDER (NUMBER 1 ) (STYLE MEAT_LOV...,(ORDER (PIZZAORDER (STYLE meatlover ) pie with...,(ORDER (PIZZAORDER (STYLE meatlover ) (COMPLEX...
2,medium high rise dough pie with artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (SIZE medium ) (STYLE high ...,(ORDER (PIZZAORDER (SIZE medium ) (STYLE high ...
3,large pie with green pepper and peperonni and ...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
4,large pie with chicken and mozzarella and ranc...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING chic...
...,...,...,...,...
476365,i'd like a pizza with pesto mushrooms and gree...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING PESTO ...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING pesto ...
476366,i'd like a pizza with arugula ricotta cheese a...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING ARUGUL...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING arugul...
476367,i'd like a pizza with yellow peppers fried oni...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING YELLOW...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING yellow...
476368,i'd like a pizza with olives roasted tomatoes ...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING OLIVES...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING olives...


In [5]:
X_train = unique_related_dataset['train.SRC']
y_train = unique_related_dataset['train.EXR']
X_test = dev['dev.SRC']
y_test = dev['dev.EXR']
print(len(unique_related_dataset))
print(X_train[476368])
print(y_train[476368])
print(dev['dev.SRC'][0])

476370
i'd like a pizza with olives roasted tomatoes and broccoli without thin crust
(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING OLIVES ) (TOPPING ROASTED_TOMATOES ) (TOPPING BROCCOLI ) (NOT (STYLE THIN_CRUST ) ) ) )
i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage


In [6]:
import sys
sys.path.append("..")
from utils.data_preprocessing import preprocess_text
from utils.feature_extraction import bag_of_words, tfidf_features, extract_embeddings
X_train = [" ".join(preprocess_text(text)) for text in X_train]
X_test = [" ".join(preprocess_text(text)) for text in X_test]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
X_train

['party sized high rise dough pie american cheese oregano',
 'meatlover pie chicken',
 'medium high rise dough pie artichoke',
 'large pie green pepper peperonni roasted pepper',
 'large pie chicken mozzarella ranch sauce',
 'large pie banana pepper meatball italian sausage',
 'high rise dough pie cheese',
 'regular big meat pie green pepper',
 'party sized pie american cheese mozzarella tomato sauce',
 'party sized pie little bit american cheese pickle',
 'party sized stuffed crust pie banana pepper meatball',
 'lunch sized pie little bit american cheese',
 'regular big meat pie banana pepper pecoricheese',
 'big new yorker pie american cheese olive oil',
 'medium pie banana pepper peperonni little bit roasted green pepper',
 'lunch sized stuffed crust pie banana pepper pecoricheese',
 'personal pie bbq sauce mozzarella',
 'large high rise dough pie basil',
 'large everything pie little bit american cheese',
 'party sized high rise dough pie green olive mushroom',
 'large pie american

In [8]:
max_str_1 = len(max(X_train, key=len))
max_str_2 = len(y_train[y_train.str.len().idxmax()])
max_str_1, max_str_2

(109, 300)

In [9]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_output(output):
    """
    Tokenizes the structured output into meaningful tokens.
    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )"
        Output: ["(ORDER", "(PIZZAORDER", "(NUMBER", "a", "(SIZE", "large", "(TOPPING", "bbq", "pulled", "pork", ")", ")", ")", ")"]
    """
    tokens = re.findall(r"\(|\)|\w+|[^\s()]+", output)
    return tokens

def build_vocab(outputs):
    """
    Builds a vocabulary from tokenized outputs.
    """
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens
    i = 2
    for output in outputs:
        tokens = tokenize_output(output)
        for token in tokens:
            if token not in vocab:
                vocab[token] = i
                i += 1
    return vocab
def encode_outputs(outputs, vocab):
    """
    Encodes tokenized outputs into sequences of integers.
    """
    encoded = []
    for output in outputs:
        tokens = tokenize_output(output)
        sequence = [vocab["<SOS>"]] + [vocab[token] for token in tokens if token in vocab] + [vocab["<EOS>"]]
        encoded.append(sequence)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads sequences to a fixed length.
    """
    return pad_sequences(sequences, maxlen=max_len, padding="post", value=0)

def decode_sequence(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    tokens = [inv_vocab[idx] for idx in sequence if idx > 0]  # Ignore <PAD> tokens
    return " ".join(tokens)


In [10]:
import gensim.downloader as api
glove_vectors = api.load("glove-wiki-gigaword-100")  # 100-dimension GloVe
glove_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x2662d245ad0>

In [11]:


def prepare_data(
    X_train, y_train, X_test, y_test, feature_type="bow", glove_vectors=None, max_len=20
):

    vectorizer = None

    # # Feature Extraction for X_train and X_test
    # if feature_type == "bow":
    #     X_train_processed, vectorizer = bag_of_words(X_train)
    #     X_train_processed = X_train_processed.toarray()
    #     X_test_processed = vectorizer.transform(X_test).toarray()
    # elif feature_type == "tfidf":
    #     X_train_processed, vectorizer = tfidf_features(X_train)
    #     X_train_processed = X_train_processed.toarray()
    #     X_test_processed = vectorizer.transform(X_test).toarray()
    # elif feature_type == "embeddings":
    #     if not glove_vectors:
    #         raise ValueError("GloVe vectors must be provided for embeddings.")
    #     X_train_tokenized = [sentence.split() for sentence in X_train]
    #     X_test_tokenized = [sentence.split() for sentence in X_test]
    #     X_train_processed = extract_embeddings(X_train_tokenized)
    #     X_test_processed = extract_embeddings(X_test_tokenized)
    # else:
    #     raise ValueError("Invalid feature type. Choose 'bow', 'tfidf', or 'embeddings'.")
    
    X_vocab = build_vocab(X_train)  # Build vocabulary from training outputs
    X_train_encoded = encode_outputs(X_train, X_vocab)  # Encode training outputs
    X_test_encoded = encode_outputs(X_test, X_vocab)  # Encode testing outputs
    X_train_processed = pad_sequences_to_fixed_length(X_train_encoded, max_len)
    X_test_processed = pad_sequences_to_fixed_length(X_test_encoded, max_len)

    vocab = build_vocab(y_train)  # Build vocabulary from training outputs
    y_train_encoded = encode_outputs(y_train, vocab)  # Encode training outputs
    y_test_encoded = encode_outputs(y_test, vocab)  # Encode testing outputs
    y_train_processed = pad_sequences_to_fixed_length(y_train_encoded, max_len)
    y_test_processed = pad_sequences_to_fixed_length(y_test_encoded, max_len)


    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        X_vocab,
        vocab,  # Return vocabulary for decoding
    )


In [12]:
X_train_processed, X_test_processed, y_train_processed, y_test_processed, X_vocab, vocab = prepare_data( X_train, y_train, X_test, y_test, feature_type="embeddings",glove_vectors=glove_vectors, max_len=300)

In [13]:
X_vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'party': 2,
 'sized': 3,
 'high': 4,
 'rise': 5,
 'dough': 6,
 'pie': 7,
 'american': 8,
 'cheese': 9,
 'oregano': 10,
 'meatlover': 11,
 'chicken': 12,
 'medium': 13,
 'artichoke': 14,
 'large': 15,
 'green': 16,
 'pepper': 17,
 'peperonni': 18,
 'roasted': 19,
 'mozzarella': 20,
 'ranch': 21,
 'sauce': 22,
 'banana': 23,
 'meatball': 24,
 'italian': 25,
 'sausage': 26,
 'regular': 27,
 'big': 28,
 'meat': 29,
 'tomato': 30,
 'little': 31,
 'bit': 32,
 'pickle': 33,
 'stuffed': 34,
 'crust': 35,
 'lunch': 36,
 'pecoricheese': 37,
 'new': 38,
 'yorker': 39,
 'olive': 40,
 'oil': 41,
 'personal': 42,
 'bbq': 43,
 'basil': 44,
 'everything': 45,
 'mushroom': 46,
 'low': 47,
 'fat': 48,
 'pineaples': 49,
 'arugula': 50,
 'carrot': 51,
 'garlic': 52,
 'alfredo': 53,
 'napolitana': 54,
 'spinach': 55,
 'ham': 56,
 'spiced': 57,
 'bay': 58,
 'leaf': 59,
 'balsamic': 60,
 'glaze': 61,
 'oregayellow': 62,
 'salami': 63,
 'kalamata': 64,
 'combination': 6

In [14]:
X_train_processed.shape, y_train_processed.shape

((476370, 300), (476370, 300))

In [15]:
X_train_processed[1]

array([ 1, 11,  7, 12,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [16]:
y_train_processed[1]

array([ 1,  2,  3,  2,  4,  2,  5,  6,  7,  2, 10, 18,  7,  2, 14,  2, 15,
       16,  7,  2, 12, 19,  7,  7,  7,  7,  2,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [17]:
len(vocab)

183

In [18]:
input_dim = X_train_processed.shape[1]
input_dim

300

In [19]:
output_dim = y_train_processed.shape[1] 
output_dim

300

In [20]:
input_dim = X_train_processed.shape[1]  # Vocabulary size
embedding_dim = 25  # Dimension of embedding vectors
hidden_dim = 256  # Hidden state size for LSTM
output_dim = y_train_processed.shape[1]  # Number of output classes
num_layers = 3  # Number of BiLSTM layers
dropout = 0.5  # Dropout probability

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMModel(input_dim, embedding_dim, hidden_dim, output_dim, num_layers, dropout).to(device)


In [21]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = torch.tensor(inputs, dtype=torch.int32) 
        self.targets = torch.tensor(targets, dtype=torch.int32) 

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "src_input_ids": self.inputs[idx],
            "tgt_input_ids": self.targets[idx],
        }

train_dataset = SequenceDataset(X_train_processed, y_train_processed)
test_dataset = SequenceDataset(X_test_processed, y_test_processed)

batch_size = 32  # Adjust based GPU ;-;  memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [22]:
train_dataset.__getitem__(3)

{'src_input_ids': tensor([ 1, 15,  7, 16, 17, 18, 19, 17,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0

In [23]:
import torch.optim as optim
from tqdm import tqdm

criterion = nn.CrossEntropyLoss()  # Use for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # Number of epochs
    model.train()
    epoch_loss = 0
    total_batches = len(train_dataloader)
    progress_bar = tqdm(train_dataloader,desc="Training Progress", unit="batch", leave=True)
    for  batch_idx, batch in enumerate(progress_bar): # Assuming a DataLoader is used for batches
        src = batch["src_input_ids"].to(device)  # Input tokens
        tgt = batch["tgt_input_ids"].to(device)  # Target tokens

        optimizer.zero_grad()
        output = model(src)  # Forward pass


        # Flatten the model output
        output = output.view(-1, output_dim) 

        print(output.shape)
        print(tgt.shape)

        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        avg_loss = epoch_loss / (batch_idx + 1)
        progress_bar.set_description(f"Training Progress: Batch {batch_idx + 1}/{total_batches}, Avg Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader):.4f}")




Training Progress:   0%|          | 0/14887 [00:00<?, ?batch/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model.eval()
with torch.no_grad():
    sample_input = torch.tensor([[1, 2, 3, 4, 5]], dtype=torch.long).to(device)  # Example input
    output = model(sample_input)
    predicted_classes = torch.argmax(output, dim=-1)  # Get class with highest probability
    print(predicted_classes)
