In [1]:
import sys
import pandas as pd
import json
sys.path.append("..")
from utils.data_preprocessing import preprocess_text
from utils.feature_extraction import bag_of_words, tfidf_features, extract_embeddings

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

train_path = '../dataset/PIZZA_train.json'
test_path = '../dataset/PIZZA_dev.json'
def read_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            try:
                record = json.loads(line.strip())
                data.append(record)
                
                # Process in chunks of 10,000 records
                if i > 0 and i % 10000 == 0:
                    print(f"Processed {i} records so far...")
            except json.JSONDecodeError:
                continue
    return data

# Convert remaining data to DataFrame
data = read_data(train_path)
if data:
    df = pd.DataFrame(data)
    
data = read_data(test_path)
if data:
    dev = pd.DataFrame(data)


Processed 10000 records so far...
Processed 20000 records so far...
Processed 30000 records so far...
Processed 40000 records so far...
Processed 50000 records so far...
Processed 60000 records so far...
Processed 70000 records so far...
Processed 80000 records so far...
Processed 90000 records so far...
Processed 100000 records so far...
Processed 110000 records so far...
Processed 120000 records so far...
Processed 130000 records so far...
Processed 140000 records so far...
Processed 150000 records so far...
Processed 160000 records so far...
Processed 170000 records so far...
Processed 180000 records so far...
Processed 190000 records so far...
Processed 200000 records so far...
Processed 210000 records so far...
Processed 220000 records so far...
Processed 230000 records so far...
Processed 240000 records so far...
Processed 250000 records so far...
Processed 260000 records so far...
Processed 270000 records so far...
Processed 280000 records so far...
Processed 290000 records so f

### Data

In [3]:
X_train = df['train.SRC']
y_train = df['train.EXR']
X_test = dev['dev.SRC']
y_test = dev['dev.EXR']
print(len(df))
print(X_train[2456445])
print(y_train[2456445])
print(dev['dev.SRC'][0])

2456446
i'd like a pizza with hot pepper pecorino cheese and parmesan without thin crust
(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING HOT_PEPPERS ) (TOPPING PECORINO_CHEESE ) (TOPPING PARMESAN_CHEESE ) (NOT (STYLE THIN_CRUST ) ) ) )
i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage


### Preprocessing

In [4]:
X_train = [" ".join(preprocess_text(text)) for text in X_train]
X_test = [" ".join(preprocess_text(text)) for text in X_test]
print(X_train[2456445])

pizza hot pepper pecoricheese parmesan without thin crust


### Define model constants

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Encoder

In [6]:
### not prepared encoder
# class Encoder(nn.Module):
#     def __init__(self, input_dim, embedding_dim, hidden_dim):
#         super(Encoder, self).__init__()
#         self.embedding = nn.Embedding(input_dim, embedding_dim)
#         self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
    
#     def forward(self, src):
#         embedded = self.embedding(src)  # Shape: [batch_size, seq_len, embedding_dim]
#         outputs, hidden = self.rnn(embedded)  # Shape: [batch_size, seq_len, hidden_dim]
#         return hidden  # Hidden state passed to the decoder
    


class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)  # Fully connected to map features to hidden space
        self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True)

    def forward(self, src):
        src = src.unsqueeze(1)  # Add sequence dimension (batch_size, seq_len=1, input_dim)
        embedded = self.fc(src)  # (batch_size, seq_len=1, hidden_dim)
        outputs, hidden = self.rnn(embedded)
        return hidden

### Decoder

In [7]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, input, hidden):
        input = input.unsqueeze(1)  # Add sequence dimension (for single token)
        embedded = self.embedding(input)  # Shape: [batch_size, 1, embedding_dim]
        output, hidden = self.rnn(embedded, hidden)  # Shape: [batch_size, 1, hidden_dim]
        prediction = self.fc(output.squeeze(1))  # Shape: [batch_size, output_dim]
        return prediction, hidden

### Seq2Seq model

In [8]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        hidden = self.encoder(src)

        # First input to the decoder is the <sos> token
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            # Decide whether to use teacher forcing
            teacher_force = np.random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs

### Train loop in pytorch as usual

In [9]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for batch in iterator:
        src = batch["src"].to(DEVICE)
        trg = batch["trg"].to(DEVICE)

        optimizer.zero_grad()
        output = model(src, trg)

        # trg: [batch_size, trg_len]
        # output: [batch_size, trg_len, output_dim]
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

### Eavluation

In [10]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src = batch["src"].to(DEVICE)
            trg = batch["trg"].to(DEVICE)

            output = model(src, trg, 0)  # Turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

### Process Y

In [11]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_output(output):
    """
    Tokenizes the structured output into meaningful tokens.
    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )"
        Output: ["(ORDER", "(PIZZAORDER", "(NUMBER", "a", "(SIZE", "large", "(TOPPING", "bbq", "pulled", "pork", ")", ")", ")", ")"]
    """
    tokens = re.findall(r"\(|\)|\w+|[^\s()]+", output)
    return tokens

def build_vocab(outputs):
    """
    Builds a vocabulary from tokenized outputs.
    """
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens
    i = 2
    for output in outputs:
        tokens = tokenize_output(output)
        for token in tokens:
            if token not in vocab:
                vocab[token] = i
                i += 1
    return vocab
def encode_outputs(outputs, vocab):
    """
    Encodes tokenized outputs into sequences of integers.
    """
    encoded = []
    for output in outputs:
        tokens = tokenize_output(output)
        sequence = [vocab["<SOS>"]] + [vocab[token] for token in tokens if token in vocab] + [vocab["<EOS>"]]
        encoded.append(sequence)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads sequences to a fixed length.
    """
    return pad_sequences(sequences, maxlen=max_len, padding="post", value=0)

def decode_sequence(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    tokens = [inv_vocab[idx] for idx in sequence if idx > 0]  # Ignore <PAD> tokens
    return " ".join(tokens)


### Prep Data

In [12]:

def prepare_data(
    X_train, y_train, X_test, y_test, feature_type="bow", glove_vectors=None, max_len=20
):

    vectorizer = None

    # Feature Extraction for X_train and X_test
    if feature_type == "bow":
        X_train_processed, vectorizer = bag_of_words(X_train)
        X_test_processed = vectorizer.transform(X_test).toarray()
    elif feature_type == "tfidf":
        X_train_processed, vectorizer = tfidf_features(X_train)
        X_test_processed = vectorizer.transform(X_test).toarray()
    elif feature_type == "embeddings":
        if not glove_vectors:
            raise ValueError("GloVe vectors must be provided for embeddings.")
        X_train_tokenized = [sentence.split() for sentence in X_train]
        X_test_tokenized = [sentence.split() for sentence in X_test]
        X_train_processed = extract_embeddings(X_train_tokenized)
        X_test_processed = extract_embeddings(X_test_tokenized)
    else:
        raise ValueError("Invalid feature type. Choose 'bow', 'tfidf', or 'embeddings'.")

    vocab = build_vocab(y_train)  # Build vocabulary from training outputs
    y_train_encoded = encode_outputs(y_train, vocab)  # Encode training outputs
    y_test_encoded = encode_outputs(y_test, vocab)  # Encode testing outputs
    y_train_processed = pad_sequences_to_fixed_length(y_train_encoded, max_len)
    y_test_processed = pad_sequences_to_fixed_length(y_test_encoded, max_len)


    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        vectorizer,
        vocab,  # Return vocabulary for decoding
    )


### Save and load

In [13]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

# Load the model
def load_model(model, path):
    model.load_state_dict(torch.load(path))
    print(f"Model loaded from {path}")
    return model

### Create dataset and dataloader

In [14]:
# Create dataset and dataloader
class Seq2SeqDataset(torch.utils.data.Dataset):
    def __init__(self, src_data, trg_data):
        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, index):
        return {
            "src": torch.tensor(self.src_data[index], dtype=torch.float32),
            "trg": torch.tensor(self.trg_data[index], dtype=torch.long),
        }

### Preprocess

In [15]:
X_train_processed, X_test_processed, y_train_processed, y_test_processed, vectorizer, vocab = prepare_data( X_train, y_train, X_test, y_test, feature_type="bow", max_len=50)

In [16]:
vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '(': 2,
 'ORDER': 3,
 'PIZZAORDER': 4,
 'NUMBER': 5,
 '1': 6,
 ')': 7,
 'SIZE': 8,
 'LARGE': 9,
 'TOPPING': 10,
 'BBQ_PULLED_PORK': 11,
 'GREEN_PEPPERS': 12,
 'COMPLEX_TOPPING': 13,
 'QUANTITY': 14,
 'EXTRA': 15,
 'PEPPERONI': 16,
 'STYLE': 17,
 'VEGETARIAN': 18,
 'PARTY_SIZE': 19,
 'STUFFED_CRUST': 20,
 'AMERICAN_CHEESE': 21,
 'MUSHROOMS': 22,
 'PERSONAL_SIZE': 23,
 'ARTICHOKES': 24,
 'BANANA_PEPPERS': 25,
 'LOW_FAT_CHEESE': 26,
 'REGULARSIZE': 27,
 'NOT': 28,
 'FRIED_ONIONS': 29,
 'LIGHT': 30,
 'THICK_CRUST': 31,
 'GREEN_OLIVES': 32,
 'PESTO': 33,
 'YELLOW_PEPPERS': 34,
 'MEATBALLS': 35,
 'BEANS': 36,
 'MEAT_LOVER': 37,
 'PECORINO_CHEESE': 38,
 'BALSAMIC_GLAZE': 39,
 'OLIVES': 40,
 'CHICKEN': 41,
 'MOZZARELLA_CHEESE': 42,
 'SAUCE': 43,
 'ITALIAN_SAUSAGE': 44,
 'LUNCH_SIZE': 45,
 'ALFREDO_CHICKEN': 46,
 'CHEESEBURGER': 47,
 'COMBINATION': 48,
 'SPICED_SAUSAGE': 49,
 'MEDITERRANEAN': 50,
 'CARAMELIZED_ONIONS': 51,
 'BACON': 52,
 'CHORIZO': 53,
 '

In [17]:
y_train_processed

array([[1, 2, 3, ..., 0, 0, 0],
       [1, 2, 3, ..., 0, 0, 0],
       [1, 2, 3, ..., 0, 0, 0],
       ...,
       [1, 2, 3, ..., 0, 0, 0],
       [1, 2, 3, ..., 0, 0, 0],
       [1, 2, 3, ..., 0, 0, 0]])

### Main

In [29]:
print(y_train_processed.shape)
print(X_train_processed.shape)

(2456446, 50)
(2456446, 793)


In [18]:
import scipy.sparse
import torch

# Ensure X_train_processed and X_test_processed are dense
if scipy.sparse.issparse(X_train_processed):
    X_train_processed = X_train_processed.toarray()
if scipy.sparse.issparse(X_test_processed):
    X_test_processed = X_test_processed.toarray()

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_processed, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_processed, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_processed, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_processed, dtype=torch.long)

In [20]:
X_train_processed.shape[1] ,y_train_processed.shape[1]

(793, 50)

In [19]:
INPUT_DIM = X_train_processed.shape[1]  # Vocabulary size for input
OUTPUT_DIM = y_train_processed.shape[1]  # Vocabulary size for output
EMBEDDING_DIM = 512  # Dimension of word embeddings
HIDDEN_DIM = 1024  # Hidden state size

# Create dataset and dataloader
dataset = Seq2SeqDataset(X_train_processed, y_train_processed)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Model, optimizer, and loss
encoder = Encoder(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)
decoder = Decoder(OUTPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Training
N_EPOCHS = 10
CLIP = 1
for epoch in range(N_EPOCHS):
    train_loss = train(model, dataloader, optimizer, criterion, CLIP)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}")

TypeError: Encoder.__init__() takes 3 positional arguments but 4 were given

### Save the model after training

In [None]:
# Save the model
save_model(model, "../weights/transformer.pt")

### Load the model again

In [None]:
# Load the model for inference
loaded_model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)
loaded_model = load_model(loaded_model, "../weights/transformer.pt")