In [1]:
import sys
import pandas as pd
import json
sys.path.append("..")
from utils.data_preprocessing import preprocess_text
from utils.feature_extraction import bag_of_words, tfidf_features, extract_embeddings

train_path = '../dataset/PIZZA_train.json'
test_path = '../dataset/PIZZA_dev.json'
def read_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            try:
                record = json.loads(line.strip())
                data.append(record)
                
                # Process in chunks of 10,000 records
                if i > 0 and i % 10000 == 0:
                    print(f"Processed {i} records so far...")
            except json.JSONDecodeError:
                continue
    return data

# Convert remaining data to DataFrame
data = read_data(train_path)
if data:
    df = pd.DataFrame(data)
    
data = read_data(test_path)
if data:
    dev = pd.DataFrame(data)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processed 10000 records so far...
Processed 20000 records so far...
Processed 30000 records so far...
Processed 40000 records so far...
Processed 50000 records so far...
Processed 60000 records so far...
Processed 70000 records so far...
Processed 80000 records so far...
Processed 90000 records so far...
Processed 100000 records so far...
Processed 110000 records so far...
Processed 120000 records so far...
Processed 130000 records so far...
Processed 140000 records so far...
Processed 150000 records so far...
Processed 160000 records so far...
Processed 170000 records so far...
Processed 180000 records so far...
Processed 190000 records so far...
Processed 200000 records so far...
Processed 210000 records so far...
Processed 220000 records so far...
Processed 230000 records so far...
Processed 240000 records so far...
Processed 250000 records so far...
Processed 260000 records so far...
Processed 270000 records so far...
Processed 280000 records so far...
Processed 290000 records so f

In [4]:
input_texts = df['train.SRC']
output_texts = df['train.TOP']

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Bidirectional
from tensorflow.keras.models import Model
import numpy as np


input_tokenizer = tf.keras.preprocessing.text.Tokenizer()
output_tokenizer = tf.keras.preprocessing.text.Tokenizer()

input_tokenizer.fit_on_texts(input_texts)
output_tokenizer.fit_on_texts(output_texts)

In [6]:
# Convert Text to Sequences
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
output_sequences = output_tokenizer.texts_to_sequences(output_texts)

# Add Padding
max_input_len = max(len(seq) for seq in input_sequences)
max_output_len = max(len(seq) for seq in output_sequences)

input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_input_len, padding="post")
output_sequences = tf.keras.preprocessing.sequence.pad_sequences(output_sequences, maxlen=max_output_len, padding="post")


In [11]:
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1

In [104]:
batch_size = 64
START_TOKEN = output_tokenizer.word_index['<start>'] if '<start>' in output_tokenizer.word_index else 0
def data_generator(input_sequences, output_sequences, batch_size):
    num_samples = len(input_sequences)
    for i in range(0, num_samples, batch_size):
        batch_input = input_sequences[i:i + batch_size]

        # Prepare decoder inputs
        batch_decoder_input = np.zeros((len(batch_input), max_output_len), dtype=np.int32)
        for j, seq in enumerate(output_sequences[i:i + batch_size]):
            shifted_seq = [START_TOKEN] + seq[:-1].tolist()
            batch_decoder_input[j, :len(shifted_seq)] = shifted_seq

        # Prepare target outputs (padded to match max_output_len)
        batch_output = np.zeros((len(batch_input), max_output_len), dtype=np.int32)
        for j, seq in enumerate(output_sequences[i:i + batch_size]):
            batch_output[j, :len(seq)] = seq

        yield ([batch_input, batch_decoder_input], batch_output)



In [105]:
input_sequences = tf.convert_to_tensor(input_sequences, dtype=tf.int32)
output_sequences = tf.convert_to_tensor(output_sequences, dtype=tf.int32)

In [106]:
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(input_sequences, output_sequences, batch_size),
    output_signature=(
        (  # Encoder and decoder inputs
            tf.TensorSpec(shape=(None, max_input_len), dtype=tf.int32),
            tf.TensorSpec(shape=(None, max_output_len), dtype=tf.int32),
        ),
        tf.TensorSpec(shape=(None, max_output_len - 1), dtype=tf.int32),  # Target outputs
    )
).shuffle(buffer_size=1024).prefetch(tf.data.AUTOTUNE)


In [107]:
embedding_dim = 128
lstm_units = 256

In [108]:
encoder_inputs = tf.keras.Input(shape=(max_input_len,))
encoder_embedding = tf.keras.layers.Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(lstm_units, return_sequences=False, return_state=True)
)
_, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
encoder_state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
encoder_state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

In [109]:
decoder_inputs = tf.keras.Input(shape=(max_output_len,))
decoder_embedding = tf.keras.layers.Embedding(output_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(2 * lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])
decoder_dense = tf.keras.layers.Dense(output_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [110]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [111]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [112]:
model.summary()

In [113]:
for batch in dataset.take(1):
    encoder_input, decoder_input = batch[0]
    target_output = batch[1]
    print("Encoder Input Shape:", encoder_input.shape)
    print("Decoder Input Shape:", decoder_input.shape)
    print("Target Output Shape:", target_output.shape)

    # Forward pass test
    model([encoder_input, decoder_input])


UnknownError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} AttributeError: EagerTensor object has no attribute 'tolist'. 
        If you are looking for numpy-related methods, please run the following:
        tf.experimental.numpy.experimental_enable_numpy_behavior()
      
Traceback (most recent call last):

  File "C:\Users\Hima\AppData\Roaming\Python\Python311\site-packages\tensorflow\python\ops\script_ops.py", line 270, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "C:\Users\Hima\AppData\Roaming\Python\Python311\site-packages\tensorflow\python\autograph\impl\api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "C:\Users\Hima\AppData\Roaming\Python\Python311\site-packages\tensorflow\python\data\ops\from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "C:\Users\Hima\AppData\Local\Temp\ipykernel_4180\1276729944.py", line 11, in data_generator
    shifted_seq = [START_TOKEN] + seq[:-1].tolist()
                                  ^^^^^^^^^^^^^^^

  File "C:\Users\Hima\AppData\Roaming\Python\Python311\site-packages\tensorflow\python\framework\tensor.py", line 255, in __getattr__
    raise AttributeError(

AttributeError: EagerTensor object has no attribute 'tolist'. 
        If you are looking for numpy-related methods, please run the following:
        tf.experimental.numpy.experimental_enable_numpy_behavior()
      


	 [[{{node PyFunc}}]] [Op:IteratorGetNext] name: 

In [None]:

history = model.fit(
    dataset, 
    epochs=10
)

Epoch 1/10


ValueError: Arguments `target` and `output` must have the same shape up until the last dimension: target.shape=(None, 41), output.shape=(None, 42, 308)

In [None]:
model.save("../weights/seq2seq_model.h5")