In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from tensorflow.keras.models import Model
from tqdm import tqdm
class SentenceGenerator:
    def __init__(self, max_vocab_size=10000, max_input_length=10, max_output_length=30):
        """
        Initialize the Sentence Generator Model

        Parameters:
        - max_vocab_size: Maximum number of words to keep in the vocabulary
        - max_input_length: Maximum length of input word sequence
        - max_output_length: Maximum length of generated sentence
        """
        self.max_vocab_size = max_vocab_size
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

        # Special tokens
        self.start_token = '<start>'
        self.end_token = '<end>'
        self.pad_token = '<pad>'

        # Tokenizers for input and output
        self.input_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<OOV>')
        self.output_tokenizer = Tokenizer(
            num_words=max_vocab_size,
            oov_token='<OOV>',
            filters='',  # Keep all characters
            lower=False  # Preserve case
        )

        # Model components
        self.encoder_model = None
        self.decoder_model = None
        self.model = None

    def prepare_data(self, input_sequences, output_sentences):
        """
        Prepare training data by tokenizing and padding sequences

        Parameters:
        - input_sequences: List of input word sequences
        - output_sentences: Corresponding full sentences

        Returns:
        - Prepared input and output sequences
        """
        # Add special tokens to output sentences if not already present
        processed_sentences = []
        for sent in output_sentences:
            if not sent.startswith(self.start_token):
                sent = f"{self.start_token} {sent}"
            if not sent.endswith(self.end_token):
                sent = f"{sent} {self.end_token}"
            processed_sentences.append(sent)

        # Fit tokenizers
        self.input_tokenizer.fit_on_texts(input_sequences)
        self.output_tokenizer.fit_on_texts(processed_sentences)

        # Ensure special tokens are in the word index
        if self.start_token not in self.output_tokenizer.word_index:
            self.output_tokenizer.word_index[self.start_token] = len(self.output_tokenizer.word_index) + 1
        if self.end_token not in self.output_tokenizer.word_index:
            self.output_tokenizer.word_index[self.end_token] = len(self.output_tokenizer.word_index) + 1

        # Convert to sequences
        input_seq = self.input_tokenizer.texts_to_sequences(input_sequences)
        output_seq = self.output_tokenizer.texts_to_sequences(processed_sentences)

        # Pad sequences
        input_pad = pad_sequences(input_seq, maxlen=self.max_input_length, padding='post')
        output_pad = pad_sequences(output_seq, maxlen=self.max_output_length, padding='post')

        return input_pad, output_pad

    def build_model(self, input_vocab_size, output_vocab_size, embedding_dim=256, units=512):
        """
        Build the sequence-to-sequence model with attention

        Parameters:
        - input_vocab_size: Size of input vocabulary
        - output_vocab_size: Size of output vocabulary
        - embedding_dim: Dimension of embedding layer
        - units: Number of LSTM units
        """
        # Encoder
        encoder_inputs = Input(shape=(self.max_input_length,))
        encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
        encoder = LSTM(units, return_sequences=True, return_state=True)
        encoder_outputs, state_h, state_c = encoder(encoder_embedding)
        encoder_states = [state_h, state_c]

        # Decoder
        decoder_inputs = Input(shape=(self.max_output_length,))
        decoder_embedding = Embedding(output_vocab_size, embedding_dim)(decoder_inputs)
        decoder_lstm = LSTM(units, return_sequences=True, return_state=True)

        # Attention Layer
        attention_layer = Attention()

        # Decoder outputs
        decoder_lstm_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

        # Apply attention
        context_vector = attention_layer([decoder_lstm_output, encoder_outputs])

        # Final dense layer
        decoder_dense = Dense(output_vocab_size, activation='softmax')
        decoder_outputs = decoder_dense(context_vector)

        # Compile model
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        self.model = model
        return model

    def train(self, input_sequences, output_sentences, epochs=3, batch_size=32):
        """
        Train the sequence-to-sentence model

        Parameters:
        - input_sequences: Training input word sequences
        - output_sentences: Corresponding training sentences
        - epochs: Number of training epochs
        - batch_size: Batch size for training
        """
        # Prepare data
        X_input, y_output = self.prepare_data(input_sequences, output_sentences)

        # Get vocabulary sizes
        input_vocab_size = len(self.input_tokenizer.word_index) + 1
        output_vocab_size = len(self.output_tokenizer.word_index) + 1

        # Build model
        self.build_model(input_vocab_size, output_vocab_size)

        # Prepare decoder target data (shifted by one timestep)
        decoder_target_data = np.zeros_like(y_output)
        decoder_target_data[:, :-1] = y_output[:, 1:]

        # Train model
        history = self.model.fit(
            [X_input, y_output],
            decoder_target_data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2
        )
        return history

    def generate_sentence(self, input_words):
        """
        Generate a sentence from input words

        Parameters:
        - input_words: List of input words

        Returns:
        - Generated sentence
        """
        # Convert input words to sequence
        input_seq = self.input_tokenizer.texts_to_sequences([input_words])
        input_seq = pad_sequences(input_seq, maxlen=self.max_input_length, padding='post')

        # Ensure start token is in the word index
        if self.start_token not in self.output_tokenizer.word_index:
            self.output_tokenizer.word_index[self.start_token] = len(self.output_tokenizer.word_index) + 1

        # Predict sentence
        predicted_sequence = np.zeros((1, self.max_output_length))
        predicted_sequence[0, 0] = self.output_tokenizer.word_index[self.start_token]

        for i in range(1, self.max_output_length):
            decoder_input = predicted_sequence[:, :i]
            outputs = self.model.predict([input_seq, decoder_input], verbose=0)
            predicted_word_index = np.argmax(outputs[0, -1, :])

            predicted_sequence[0, i] = predicted_word_index

            # Stop if end token or max length reached
            if (predicted_word_index == self.output_tokenizer.word_index.get(self.end_token, -1)) or (i == self.max_output_length - 1):
                break

        # Convert back to words
        generated_words = []
        for idx in predicted_sequence[0]:
            if idx > 0:
                word = self.output_tokenizer.index_word.get(idx, '')
                if word and word not in [self.start_token, self.end_token]:
                    generated_words.append(word)

        return ' '.join(generated_words)

In [8]:
import pandas as pd

path_to_data = '/kaggle/input/nlp-dataset/seq2seq_data.csv'
data = pd.read_csv(path_to_data)
input_sentences = data['base_word_text'].tolist()
output_sentences = data['original_text'].tolist()
input_sentences = [str(sentence) for sentence in input_sentences]
input_sentences = [sentence.split() for sentence in input_sentences]
output_sentences = [str(sentence) for sentence in output_sentences]

In [9]:
input_sequences = input_sentences
output_sentence = output_sentences

# Create and train model
model = SentenceGenerator()
history = model.train(input_sequences, output_sentence)

Epoch 1/3
[1m4368/4368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2352s[0m 538ms/step - accuracy: 0.6364 - loss: 2.8989 - val_accuracy: 0.8344 - val_loss: 1.1182
Epoch 2/3
[1m4368/4368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2351s[0m 538ms/step - accuracy: 0.7316 - loss: 1.7451 - val_accuracy: 0.8593 - val_loss: 0.8460
Epoch 3/3
[1m4368/4368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2352s[0m 538ms/step - accuracy: 0.7576 - loss: 1.4216 - val_accuracy: 0.8783 - val_loss: 0.6482


In [25]:
df = pd.read_csv(path_to_data)
sample_data = df.sample(n=300)

In [26]:
def compute_f1(predicted, ground_truth):
    predicted_tokens = set(predicted.lower().split())
    ground_truth_tokens = set(ground_truth.lower().split())

    common = predicted_tokens.intersection(ground_truth_tokens)
    if len(common) == 0:
        return 0, 0, 0

    precision = len(common) / len(predicted_tokens)
    recall = len(common) / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return precision, recall, f1
def exact_match(predicted, ground_truth):
    return 1 if predicted.lower() == ground_truth.lower() else 0
    
# define evaluation function
def evaluate_deep_learning(dataset, model):
  input_base_word = dataset['base_word_text']
  ground_truth = dataset['original_text']
  ground_truth = [str(gt) for gt in ground_truth]
  input_base_word = [str(sentence) for sentence in input_base_word]
  input_base_word = [sentence.split() for sentence in input_base_word]
  predicted = []

  for base_word in tqdm(input_base_word):
    predicted.append(str(model.generate_sentence(base_word)))

  exact_match_scores = [exact_match(pred, gt) for pred, gt in zip(predicted, ground_truth)]
  exact_match_result = sum(exact_match_scores) / len(exact_match_scores)

  f1_scores = [compute_f1(pred, gt) for pred, gt in zip(predicted, ground_truth)]
  precision_scores = [score[0] for score in f1_scores]
  recall_scores = [score[1] for score in f1_scores]
  f1_scores = [score[2] for score in f1_scores]

  precision_result = sum(precision_scores) / len(precision_scores)
  recall_result = sum(recall_scores) / len(recall_scores)
  f1_result = sum(f1_scores) / len(f1_scores)
  print(f"Accuracy exact match: {exact_match_result:.2%}")
  print(f"F1-Score: {f1_result:.2%}")
  print(f"Precision: {precision_result:.2%}")
  print(f"Recall: {recall_result:.2%}")    
    
evaluate_deep_learning(sample_data, model)

100%|██████████| 300/300 [03:44<00:00,  1.34it/s]

Accuracy exact match: 0.00%
F1-Score: 45.33%
Precision: 59.84%
Recall: 38.56%





In [28]:
# define evaluation function
def evaluate_deep_learning_manual(dataset, model):
  input_base_word = dataset['base_word_text']
  ground_truth = dataset['original_text']
  ground_truth = [str(gt) for gt in ground_truth]
  input_base_word = [str(sentence) for sentence in input_base_word]
  input_base_word = [sentence.split() for sentence in input_base_word]
  predicted = []

  for base_word, gt in tqdm(zip(input_base_word, ground_truth)):
    result = str(model.generate_sentence(base_word))
    predicted.append(result)
    print("input:", base_word)
    print("output:", result)
    print("ground truth:", gt)

  exact_match_scores = [exact_match(pred, gt) for pred, gt in zip(predicted, ground_truth)]
  exact_match_result = sum(exact_match_scores) / len(exact_match_scores)

  f1_scores = [compute_f1(pred, gt) for pred, gt in zip(predicted, ground_truth)]
  precision_scores = [score[0] for score in f1_scores]
  recall_scores = [score[1] for score in f1_scores]
  f1_scores = [score[2] for score in f1_scores]

  precision_result = sum(precision_scores) / len(precision_scores)
  recall_result = sum(recall_scores) / len(recall_scores)
  f1_result = sum(f1_scores) / len(f1_scores)
  print(f"Accuracy exact match: {exact_match_result:.2%}")
  print(f"F1-Score: {f1_result:.2%}")
  print(f"Precision: {precision_result:.2%}")
  print(f"Recall: {recall_result:.2%}")    
    
sample_data = df.sample(n=30, random_state=42)
evaluate_deep_learning_manual(sample_data, model)

1it [00:00,  1.31it/s]

input: ['how', 'to', 'lose', 'fat', 'fast', '1', 'aerobics', '2', 'weight', 'training', '3', 'both']
output: how can i lose a <OOV> weight weight weight weight weight <OOV>
ground truth: how to lose fat fast? 1) aerobics 2) weight training 3) both?


2it [00:01,  1.26it/s]

input: ['you', 'think', 'that', 'wojtek', 'wolski', 'on', 'colorado', 'avalanche', 'this', 'year']
output: do you think that that <OOV> <OOV> <OOV> this <OOV> this this year?
ground truth: do you think that wojtek wolski should have been on the colorado avalanche this year?


3it [00:03,  1.12s/it]

input: ['shah', 'rukh', 'khan', 'born', 'to', 'muslim', 'family', 'and', 'married', 'to', 'muslim', 'gauri', 'chibber', 'who', 'later', 'converted', 'to', 'islam']
output: declarative: to <OOV> <OOV> <OOV> <OOV> to <OOV> <OOV> to <OOV> <OOV> to <OOV> <OOV> to <OOV> <OOV> to <OOV> <OOV> to <OOV> <OOV>
ground truth:  declarative: shah rukh khan was born to a muslim family and married to a muslim, gauri chibber, who later converted to islam.


4it [00:03,  1.03s/it]

input: ['child', 'in', 'washington', 'dc', 'must', '5', 'year', 'old', 'and', 'meet', 'cutoff', 'date', 'of', 'february', '6th', 'to', 'start', 'kindergarten']
output: declarative: you can get a <OOV> <OOV> and <OOV> <OOV> to start <OOV> <OOV>
ground truth:  children in washington d.c. must be 5 years old and meet the cutoff date of february 6th to start kindergarten.


5it [00:04,  1.13it/s]

input: ['jesus', 'giving', 'you', 'free', 'gift', 'why', 'not', 'take', 'it']
output: declarative: you is you <OOV> <OOV> <OOV> not it <OOV>
ground truth: jesus is giving you a free gift, why not take it?


6it [00:05,  1.19it/s]

input: ['share', 'information', 'about', 'lowest', 'paid', 'ceo', 'in', 'world']
output: imperative: find the about information about the <OOV> <OOV> in the world!
ground truth: imperative: share information about the lowest paid ceo in the world! 


7it [00:07,  1.14s/it]

input: ['improving', 'geograph', 'and', 'history', 'requires', 'consistent', 'reading', 'and', 'exploring', 'various', 'historical', 'event']
output: declarative: the <OOV> and <OOV> requires a <OOV> <OOV> and <OOV> and many historical historical historical historical historical historical historical historical historical <OOV> historical historical historical historical historical historical
ground truth:  declarative: improving geograph and history requires consistent reading and exploring various historical events.


8it [00:07,  1.08it/s]

input: ['solve', 'trig', 'question', 'for', 'me']
output: imperative: check the <OOV> <OOV> for me!
ground truth: imperative: solve trig question for me! 


9it [00:07,  1.35it/s]

input: ['tell', 'me', 'about', 'possibility']
output: tell me about a <OOV>
ground truth: tell me about the possibility! 


10it [00:08,  1.30it/s]

input: ['this', 'energy', 'absorbed', 'in', 'each', 'cycle', 'and', 'time', 'a', 'magnetic', 'hysteresis', 'loss']
output: declarative: <OOV> <OOV> in the <OOV> <OOV> in the <OOV> <OOV> <OOV> <OOV>
ground truth:  declarative: this energy is absorbed in each cycle and time as magnetic hysteresis loss.


11it [00:09,  1.52it/s]

input: ['why', 'illegal', 'hispanic', 'hate', 'american']
output: why why why americans do <OOV>
ground truth: why do illegal hispanics hate americans?


12it [00:10,  1.30it/s]

input: ['end', 'of', 'world', 'not', 'predicted', 'to', 'occur', 'on', 'december', '21', '2012']
output: declarative: the <OOV> of the <OOV> is not not not not not due of <OOV> <OOV>
ground truth:  declarative: the end of the world was not predicted to occur on december 21, 2012


13it [00:11,  1.25it/s]

input: ['why', 'yahoo', 'nt', 'crawl', 'my', 'site', 'i', 'submited', 'it', 'one', 'week', 'now', 'url', 'http', 'wwwcamcorderbatterybankcom']
output: i have one <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
ground truth: why yahoo! haven't crawl my site? i have submited it one week now! url:http:www.camcorder-battery-bank.com


14it [00:11,  1.31it/s]

input: ['you', 'consult', 'doctor', 'immediately', 'a', 'you', 're', 'experiencing', 'excruciating', 'pain', 'in', 'your', 'jaw', 'and', 'neck']
output: declarative: you're experiencing your <OOV> <OOV> in your <OOV> and <OOV>
ground truth:  declarative: you should consult a doctor immediately as you're experiencing excruciating pain in your jaw and neck.


15it [00:12,  1.13it/s]

input: ['he', 's', 'playing', 'her', 'and', 'you', 'might', 'need', 'to', 'conversation', 'with', 'him', 'about', 'it']
output: declarative: you need to need to a <OOV> <OOV> with a <OOV> with a <OOV> about <OOV>
ground truth:  declarative: he's been playing her, and you might need to have a conversation with him about it.


16it [00:13,  1.17it/s]

input: ['who', 'can', 'come', 'up', 'with', 'most', 'word', 'fr', 'househome']
output: who can be the <OOV> with the first <OOV> <OOV> <OOV>
ground truth: who can come up with the most words fr house/home?


17it [00:14,  1.35it/s]

input: ['tell', 'me', 'history', 'of', 'english', 'language']
output: tell me the history of english language!
ground truth: tell me the history of the english language. 


18it [00:15,  1.23it/s]

input: ['horse', 'and', 'donkey', 'long', 'penis', 'due', 'to', 'their', 'evolutionary', 'history', 'and', 'mating', 'habit']
output: declarative: a <OOV> <OOV> is a <OOV> to be their <OOV> and <OOV> <OOV>
ground truth:   horses and donkeys have long penises due to their evolutionary history and mating habits.


19it [00:15,  1.30it/s]

input: ['explain', 'war', 'on', 'terror', 'to', 'me']
output: imperative: explain the war on the war to me!
ground truth: imperative: explain the war on terror to me! 


20it [00:16,  1.42it/s]

input: ['gloria', 'allred', 'contactable', 'through', 'her', 'law', 'firm']
output: declarative: <OOV> <OOV> <OOV> through <OOV> through <OOV>
ground truth:  gloria allred is contactable through her law firm.


21it [00:17,  1.22it/s]

input: ['meeting', 'planner', 'app', 'genius', 'meeting', 'gal', 'site', 'at', 'meatpacking', 'district']
output: declarative: you can purchase a <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> at <OOV> <OOV>
ground truth:  declarative: meeting planner app genius is the meeting gal site at meatpacking district.


22it [00:18,  1.01s/it]

input: ['pentagram', 'on', 'tiara', 'not', 'banish', 'evil', 'christian', 'a', 'it', 'commonly', 'symbol', 'of', 'protection', 'in', 'various', 'culture']
output: declarative: it is a <OOV> is it as it is a <OOV> of the <OOV> of various <OOV> in various cultures.
ground truth:   declarative: the pentagram on a tiara does not banish evil christians as it is commonly a symbol of protection in various cultures.


23it [00:19,  1.04it/s]

input: ['you', 'talk', 'openly', 'about', 'it', 'and', 'listen', 'with', 'open', 'mind']
output: declarative: you should talk about your <OOV> and <OOV> with your <OOV>
ground truth:  declarative: you should talk openly about it and listen with an open mind.


24it [00:20,  1.12it/s]

input: ['how', 'animal', 'kestrel', 'deal', 'with', 'problem', 'found', 'in', 'city']
output: how <OOV> deal with a problem problems with the <OOV>
ground truth: how does the animal kestrel deal with problems found in the city?


25it [00:21,  1.09it/s]

input: ['excel', 'worksheet', 'function', 'for', 'proface', 'prostudio', 'ver', '4', 'used', 'for', 'data', 'analysis', 'and', 'logging', 'operation']
output: declarative: <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> and <OOV>
ground truth:  the excel worksheet function for proface pro-studio ver 4 is used for data analysis and logging operations.


26it [00:22,  1.13it/s]

input: ['what', 'three', 'concept', 'that', 'make', 'up', 'cell', 'theory']
output: what that is the three that make a cell cell <OOV>
ground truth: what are the three concepts that make up the cell theory?


27it [00:22,  1.22it/s]

input: ['why', 'people', 'in', 'this', 'world', 'never', 'satisfied']
output: why do people in this world is this <OOV>
ground truth: why people in this world never be satisfied?


28it [00:23,  1.47it/s]

input: ['when', 'first', 'time', 'you']
output: when when when you <OOV>
ground truth: when was the first time you...?


29it [00:24,  1.26it/s]

input: ['jesus', 'taught', 'importance', 'of', 'offering', 'sacrifice', 'to', 'god', 'a', 'seen', 'in', 'book', 'of', 'malachi', 'and', 'firstcentury', 'jewish', 'practice']
output: declarative: the <OOV> <OOV> <OOV> is seen as the <OOV> of <OOV> and <OOV> <OOV>
ground truth:  declarative: jesus taught the importance of offering sacrifices to god, as seen in the book of malachi and first-century jewish practice.


30it [00:24,  1.21it/s]

input: ['there', '2', 'different', 'nlt', 'bible']
output: declarative: there is 2 <OOV> <OOV>
ground truth: are there 2 different nlt bibles?
Accuracy exact match: 0.00%
F1-Score: 44.28%
Precision: 60.11%
Recall: 37.26%



