In [47]:
# Load the headline dataset from CSV into a pandas DataFrame
import pandas as pd
data = pd.read_csv("news_headline_generator.csv")

# Show the first five rows (the head() default) to inspect the columns and sample values
data.head()

Unnamed: 0,content_text,generated_headline
0,Science: Score each cause. Quality throughout ...,NASA Discovers New Exoplanet
1,Environment: Behavior benefit suggest page. Ro...,Climate Change Effects Escalate
2,Sports: Director allow firm environment. Tree ...,Local Team Wins Championship
3,Sports: Seven medical blood personal success m...,Local Team Wins Championship
4,Environment: Yet practice just military buildi...,Climate Change Effects Escalate


### Code Explanation :

-> import pandas : importing the library pandas.
-> data = pd.read_csv("news_headline_generator.csv") : Load CSV data into a pandas DataFrame.
-> data.head() : Display the first few rows to inspect the structure of the data

In [48]:
# Count missing (NaN/None) entries per column; a count of 0 means that column is complete
data.isnull().sum()

content_text          0
generated_headline    0
dtype: int64

### Code Explanation :

-> data.isnull().sum() : Check for missing values in the dataset.
  -> If every count returned by data.isnull().sum() is 0, the dataset contains no NaN values.

In [49]:
# Split the frame into model inputs (article text) and targets (headline text)
x, y = data['content_text'], data['generated_headline']


### Code Explanation :

-> x = data['content_text'] : Extract 'content_text' as the input features

-> y = data['generated_headline'] : Extract 'generated_headline' as the target labels

In [50]:
# Preview the input article texts (pandas abbreviates the middle rows of a long Series)
print(x)

0      Science: Score each cause. Quality throughout ...
1      Environment: Behavior benefit suggest page. Ro...
2      Sports: Director allow firm environment. Tree ...
3      Sports: Seven medical blood personal success m...
4      Environment: Yet practice just military buildi...
                             ...                        
995    Health: Particularly state visit mention heart...
996    Science: Floor feeling her play new win. Prove...
997    Technology: Window foreign forward society enj...
998    Environment: Position fact democratic vote rat...
999    Environment: Board skin expect door magazine l...
Name: content_text, Length: 1000, dtype: object


In [51]:
# Preview the target headlines (pandas abbreviates the middle rows of a long Series)
print(y)


0             NASA Discovers New Exoplanet
1          Climate Change Effects Escalate
2             Local Team Wins Championship
3             Local Team Wins Championship
4          Climate Change Effects Escalate
                      ...                 
995    New Breakthrough in Cancer Research
996           NASA Discovers New Exoplanet
997           AI Revolutionizes Daily Life
998        Climate Change Effects Escalate
999        Climate Change Effects Escalate
Name: generated_headline, Length: 1000, dtype: object


In [52]:

import re

# Text normalizer shared by the article and headline columns
def clean_text(text):
    """Reduce `text` to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter or a space is turned into a
    space, runs of whitespace are collapsed, and the result is lowercased.

    Args:
        text: The raw value to clean. Anything that is not a `str`
            (for example NaN or None) is treated as missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.
    """
    # Guard: non-string cells are mapped to an empty string
    if not isinstance(text, str):
        return ""

    # Digits, punctuation and other symbols become word separators
    letters_only = re.sub('[^a-zA-Z ]', ' ', text)

    # split() with no argument drops leading/trailing/repeated whitespace
    words = letters_only.split()

    return ' '.join(words).lower()

# Normalize the article and headline columns with the shared cleaner
cleaned_content = data['content_text'].apply(clean_text)
cleaned_headline = data['generated_headline'].apply(clean_text)

# Wrap every headline in the literal marker words 'start' / 'end' so the
# decoder can learn where a headline begins and where it stops.
# NOTE(review): this cell overwrites the columns of `data`, so running it a
# second time wraps the headlines again ('start start ... end end').
data['content_text'] = cleaned_content
data['generated_headline'] = 'start ' + cleaned_headline + ' end'

# Cleaned inputs and targets used from here on
x = data['content_text']
y = data['generated_headline']

### Code Explanation :

                                   TEXT CLEANING + FORMATTING FOR SEQ2SEQ
-> import re
   -> Imports Python’s built-in Regular Expression (regex) module.
   -> Reason : Required for pattern-based text substitution and cleaning (e.g., removing non-alphabet characters).
   -> Purpose: Helps in cleaning the raw text by removing special characters, numbers, and extra whitespace

-> def clean_text(text):
   -> Defines a custom function named clean_text that takes a single argument text.
   -> Reason : Modularizes the text cleaning process, so it can be reused on multiple text fields.
   -> Purpose: To ensure all input and output text is cleaned in a consistent way before feeding it to the model.

-> if isinstance(text, str):
   -> Checks if text is a string.
   -> Reason : Prevents errors if text is NaN or another non-string data type.
   -> Purpose: Defensive programming — ensures cleaning is applied only on valid strings.

-> text = re.sub('[^a-zA-Z ]', ' ', text)
   -> Replaces everything except alphabets and spaces with a space.
   -> Reason : Removes numbers, punctuation, special characters (e.g., .,?!@).
   -> Purpose: Keeps the text simple and clean — only words. Models like LSTM/GRU perform better with cleaner data.

-> text = ' '.join(text.split())
   ->  Breaks the text into words (.split()), removes extra whitespace, and joins it back with single spaces.
   -> Reason : Handles multiple spaces or irregular spacing.
   -> Purpose: Ensures consistent word separation and formatting.

-> return text.lower()
   -> Converts all characters in the text to lowercase.
   -> Reason : To reduce vocabulary size. E.g., India and india should be treated the same.
   -> Purpose: Simplifies training and improves model generalization.

-> return ""
  ->  If the input text is not a string, return an empty string.
  -> Prevents the function from failing on None or non-text inputs.
  -> Purpose: Robustness.

-> data['content_text'] = data['content_text'].apply(clean_text) / data['generated_headline'] = data['generated_headline'].apply(clean_text)
   -> Applies clean_text() to every row of both the content_text and generated_headline columns.
   -> Reason : Prepares the encoder input (article text) and the target output (headline) for model training.
   -> Purpose: Ensures both the encoder and the decoder learn from clean data.

-> data['generated_headline'] = data['generated_headline'].apply(lambda x: 'start ' + x + ' end')
   -> Adds 'start ' at the beginning and ' end' at the end of every headline.
   -> These special tokens help the model know when to start generating and when to stop predicting further words.
   -> Purpose: This is essential for sequence-to-sequence models like LSTM/GRU with attention or greedy decoding.

In [53]:
# Tokenize input and output sequences
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

### Code Explanation :

-> import tensorflow
   -> Imports the TensorFlow library.
   -> Reason : TensorFlow is used to build and train deep learning models.
   -> Purpose: Required for using Keras layers, preprocessing tools, and models.

-> from tensorflow.keras.preprocessing.text import Tokenizer:Imports the Tokenizer class from Keras.
   -> Reason : It tokenizes (converts) text into sequences of integers.
   -> Purpose: To convert words to indices so that neural networks can process them.

In [55]:
import pickle

# Build one vocabulary shared by the articles and the headlines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x) + list(y))

# NOTE: word_index maps word -> integer id, despite this variable's name
index_word = tokenizer.word_index

# Persist the fitted tokenizer to tokenizer.pkl
with open("tokenizer.pkl", 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


### Code Explanation :

-> tokenizer = Tokenizer() : Creates an instance of the Tokenizer.
   -> Reason : Initializes an empty tokenizer object that will be fitted on your dataset.
   -> Purpose: To build a vocabulary and prepare text sequences.

-> tokenizer.fit_on_texts(x.tolist()+y.tolist()) : Fits the tokenizer on both input (x) and output (y) text.
   -> Reason : Ensures that the vocabulary covers all words in the full dataset (articles + headlines).
   -> Purpose: Builds a word_index mapping (e.g., "hello" → 5).

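-> index_word = tokenizer.word_index : Stores the word-to-index mapping dictionary from the tokenizer (note: despite the variable name, the keys are words and the values are integer indices).
   -> Reason : You may need it later to look up the index of a word (e.g., the 'start' and 'end' tokens).
   -> Purpose: For the reverse lookup needed when decoding predictions (index → word), use tokenizer.index_word or invert this dictionary.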

In [56]:
# Encode every text as a list of integer word ids using the fitted vocabulary
x_seq, y_seq = (tokenizer.texts_to_sequences(texts) for texts in (x, y))

### Code Explanation : 

-> x_seq = tokenizer.texts_to_sequences(x) : Converts all input texts (x) into sequences of integers.
   -> Reason : LSTMs and GRUs don’t process raw text — they need integer token inputs.
   -> Purpose: To numerically represent input text for training.
-> y_seq = tokenizer.texts_to_sequences(y) : Converts output texts (y, the headlines) into integer sequences.
   -> Reason : Decoder input/output also needs to be integer-encoded.
   -> Purpose: For the model to learn the mapping from input sequence to output sequence.

In [57]:
# Longest encoded article / headline; these become the padded widths
x_maxlen = max(map(len, x_seq))
y_maxlen = max(map(len, y_seq))

# One extra slot on top of the vocabulary for the padding id 0
vocab_size = 1 + len(tokenizer.word_index)


### Code Explanation :
 
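-> x_maxlen = max(len(seq) for seq in x_seq) : Finds the maximum length of input (article) sequences.
   -> Reason : All sequences must be padded to one common length before they can be batched.
   -> Purpose: Sets the fixed input length of the encoder.
-> y_maxlen = max(len(seq) for seq in y_seq) : Finds the maximum length of output (headline) sequences.
   -> Reason : Same reason as for x_maxlen — needed for padding.
   -> Purpose: Ensures decoder inputs and outputs are the same length across batches.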
-> vocab_size = len(tokenizer.word_index)+1 : Calculates the total number of unique tokens + 1.
   -> Reason : Tokenizer indices start from 1. So we add 1 to include a 0-padding token.
   -> Purpose: Needed to set dimensions for the Embedding layer.

In [58]:
# Pad every sequence to a fixed width so the lists stack into 2-D arrays
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Articles: zeros are added in front of the tokens ('pre') up to x_maxlen
x_padded = pad_sequences(x_seq, maxlen=x_maxlen, padding='pre')
# Headlines: zeros are added after the tokens ('post') up to y_maxlen
y_padded = pad_sequences(y_seq, maxlen=y_maxlen, padding='post')

### Code Explanation :

-> from tensorflow.keras.preprocessing.sequence import pad_sequences : Imports the function for padding sequences.
   -> Reason : Required to make all sequences have the same length.
   -> Purpose: Essential for batching inputs to the model.

-> x_padded = pad_sequences(x_seq, maxlen=x_maxlen, padding='pre') : Pads input sequences on the left side ('pre') with zeros to make them equal 
              in length.
   -> Reason : LSTMs work better when shorter sequences are padded from the beginning.
   -> Purpose: Converts list of sequences into a matrix that can be fed to the encoder.

-> y_padded = pad_sequences(y_seq, maxlen=y_maxlen, padding='post') : Pads target sequences (headlines) on the right side ('post').
   -> Reason : Output sequences are often padded at the end so the model learns to generate tokens until it hits <end>.
   -> Purpose: Prepares decoder input/output data.

In [387]:
#print("X shape:", x_padded.shape)
#print("Y shape:", y_padded.shape)

In [59]:
# Hold out 20% of the samples for evaluation; random_state=42 pins the shuffle
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_padded,y_padded,test_size=0.2,random_state=42)

### Code Explanation :

-> from sklearn.model_selection import train_test_split : Imports a utility to split the dataset into training and testing sets.
   -> Reason : Essential for validating model performance on unseen data.
   -> Purpose: Helps detect overfitting by evaluating on data the model was not trained on.

->  x_train, x_test, y_train, y_test = train_test_split(x_padded, y_padded, test_size=0.2, random_state=42)
    -> Splits the data into 80% training and 20% testing.
    -> Reason : test_size=0.2 ensures a fair split; random_state=42 ensures reproducibility.
    -> Purpose: Prepares inputs/targets for training and evaluation.

In [60]:
# Import models, layers and callbacks
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Embedding, LSTM,GRU,Dense,TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

In [61]:
# Teacher forcing: the decoder reads every position except the last and must
# predict every position except the first, i.e. the target is the decoder
# input shifted left by one step.

# Training split
decoder_input_train, decoder_output_train = y_train[:, :-1], y_train[:, 1:]

# Held-out split
decoder_input_test, decoder_output_test = y_test[:, :-1], y_test[:, 1:]

### Code Explanation : 

-> decoder_input_train = y_train[:, :-1] / decoder_input_test = y_test[:, :-1]
   -> Takes all training target sequences (y_train) and removes the last token of each sequence.
   -> Reason : This becomes the input to the decoder during training.The input to the decoder typically starts with the <start> token.
-> Example:
   -> If your y_train sequence is:
      [start, the, president, speaks, end]
      Then:
   -> decoder_input_train = [start, the, president, speaks]
   -> These are the tokens you feed into the decoder at each timestep to predict the next word.

-> decoder_output_train = y_train[:, 1:] / decoder_output_test = y_test[:, 1:]
   -> Takes all training target sequences (y_train) and removes the first token of each sequence.
   -> Reason : This becomes the expected output that the decoder should predict at each timestep.
               The decoder is trained to predict the next word based on previous ones.

In [62]:
# Give the integer targets a trailing axis of size 1 for
# sparse_categorical_crossentropy: (samples, timesteps) -> (samples, timesteps, 1)
decoder_output_train = decoder_output_train.reshape(*decoder_output_train.shape[:2], 1)
decoder_output_test = decoder_output_test.reshape(*decoder_output_test.shape[:2], 1)

### Code Explanation : 

-> decoder_output_train = decoder_output_train.reshape(
   decoder_output_train.shape[0], decoder_output_train.shape[1], 1)
-> Reshapes the decoder_output_train array from a 2D shape to a 3D shape.
 Before:
-> decoder_output_train has shape:(num_samples, sequence_length)
-> For example: (800, 15) → 800 sequences, each of length 15.
After:
-> It becomes:(num_samples, sequence_length, 1)
-> For example: (800, 15, 1) → adds an extra dimension for the output.
-> Reason : To make the label format compatible with the expected output shape of the model when using a sparse classification loss.

-> decoder_output_test = decoder_output_test.reshape(
    decoder_output_test.shape[0], decoder_output_test.shape[1], 1)
-> It reshapes the test target sequences in the same way.
-> So during validation (with model.fit(..., validation_data=...)) you don’t get shape mismatch.

In [63]:
# Hyperparameters shared by the LSTM and GRU models in this notebook
embedding_dim = 100  # length of each word-embedding vector
lstm_units = 128  # hidden-state size of the recurrent layers (also passed to the GRU layers)

In [64]:
# Encoder: article token ids -> embedding -> LSTM -> final (hidden, cell) states
encoder_inputs = Input(shape=(x_maxlen,))

encoder_embedding = Embedding(vocab_size, embedding_dim)
enc_emb = encoder_embedding(encoder_inputs)

# return_state=True makes the layer return its final hidden and cell states
# in addition to its output; the states seed the decoder.
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

### Code Explanation :
 
-> encoder_inputs = Input(shape=(x_maxlen,))
   -> This creates a Keras Input layer to receive the encoder input sequence (i.e., the news article text).
   -> shape=(x_maxlen,): The input is a sequence of integers (tokenized word IDs) of fixed length x_maxlen.
   -> Reason : Every model in Keras starts with an input layer.
               This is the placeholder for the source sequence (the news content you're summarizing into a headline).

-> enc_emb = Embedding(vocab_size, embedding_dim)(encoder_inputs)
   -> Applies an Embedding layer to the input sequence.
   -> Converts each word index in encoder_inputs into a dense vector of dimension embedding_dim.
   -> Parameters:
      -> vocab_size: The total number of words in your vocabulary.
      -> embedding_dim: The number of dimensions for each word embedding (e.g., 100).
   -> Reason : Neural networks don't understand raw word indices.
               Embedding translates sparse word IDs into dense, trainable vectors that capture semantic meaning.

-> encoder_outputs, state_h, state_c = LSTM(lstm_units, return_state=True)(enc_emb)
   -> Feeds the embedded sequence into an LSTM layer.
   -> Returns:
             -> encoder_outputs: The LSTM output at the last timestep only, because return_sequences is not set (not used in this basic model).
             -> state_h: Final hidden state (important).
             -> state_c: Final cell state (important).
   -> Parameters:
             -> lstm_units: Number of units (neurons) in the LSTM cell (e.g., 128).
             -> return_state=True: This tells Keras to return the internal states (state_h and state_c), which are essential for the decoder.
   -> Reason : 
             -> The LSTM encodes the entire input sequence into two vectors: state_h and state_c.
             -> These vectors carry the context of the entire input and are passed to the decoder to help it generate a relevant output (headline).

In [65]:
# Decoder: teacher-forced token ids -> embedding -> LSTM seeded with the
# encoder states -> per-timestep softmax over the vocabulary
decoder_inputs = Input(shape=(y_maxlen - 1,))

decoder_embedding = Embedding(vocab_size, embedding_dim)
dec_emb = decoder_embedding(decoder_inputs)

# The LSTM and Dense layers are bound to names because the inference decoder
# built later in the notebook calls these same layer objects again.
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_seq, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_seq)

### Code Explanation:

-> decoder_inputs = Input(shape=(y_maxlen - 1,))
   -> Creates a Keras Input layer for the target sequence input.
   -> y_maxlen - 1: Because during training, we shift the headline input and output by 1 time step.
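   -> The decoder input will be the padded headline with its last position removed (the <end> token for the longest headlines, a padding zero for shorter ones).
   -> The decoder output will be the padded headline with its first position (the <start> token) removed.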
   -> Reason : The decoder must learn to predict the next word in the headline, given the previous words.
               This is the input to the decoder during training, i.e., decoder_input_train.

-> dec_emb = Embedding(vocab_size, embedding_dim)(decoder_inputs)
   -> Applies an Embedding layer to the decoder input tokens.
   -> Converts each token (word index) into a dense embedding vector.
   -> Reason : Just like the encoder, the decoder also needs semantic-rich vector inputs to feed into the LSTM.
               To convert word IDs into embeddings so the LSTM can learn meaningful patterns and relationships between words.

-> decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
   -> Initializes the LSTM layer for the decoder.
   -> return_sequences=True: Returns the full sequence of hidden states (one per word).
   -> return_state=True: Also returns the final hidden and cell states (optional here but useful for inference).
   -> Reason : The decoder needs to process the entire input sequence and output a sequence of predictions (one per word).
               This LSTM learns to generate a sequence of output words based on the encoder’s context and previous decoder outputs.

-> decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
   -> Runs the embedded decoder inputs (dec_emb) through the LSTM decoder.
   -> initial_state=[state_h, state_c]: Initializes the LSTM with the final hidden (state_h) and cell (state_c) states from the encoder.
   -> _: We discard the final states for now (they’re not needed in training, but used during inference).
   -> Reason : Passing encoder states here allows the decoder to "know" the context of the input sequence.
               Without this, the decoder would not know what it's trying to generate a headline for.
               To generate contextual hidden states in the decoder that are influenced by the original input content.
            
-> decoder_dense = Dense(vocab_size, activation='softmax')
   -> Creates a Dense output layer to convert the decoder’s LSTM output at each timestep into a probability distribution over the vocabulary.
   -> activation='softmax': Converts raw scores into probabilities for each word in the vocabulary.
   -> Reason : You need to predict the next word in the headline from all possible words in the vocabulary.
               This layer allows the model to select the most likely next word during training or inference.

-> decoder_outputs = decoder_dense(decoder_outputs)
   -> Applies the Dense layer to the sequence output from the LSTM.
   -> Converts the decoder’s LSTM output into a sequence of predicted word probabilities.
   -> Reason : The model needs to output a word at each time step, and this gives the probability distribution over all possible words.
               Final step in the decoder: turn the hidden states into actual word predictions.



In [66]:
# Assemble the trainable LSTM seq2seq model and configure its training objective
headline_lstm_model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)
headline_lstm_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


### Code Explanation :

-> headline_lstm_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
   -> Model(...): This is a function from Keras (tensorflow.keras.models.Model) used to define a complete model.
   -> We have to give it a list of input layers and an output layer. It then builds a model that maps the inputs to the outputs.
   -> [encoder_inputs, decoder_inputs]:This is the input to the model and contains two parts:
      -> encoder_inputs:This input layer receives the input text sequence (e.g., a news article or document).
                        It’s passed through an Embedding layer and then into an LSTM encoder to generate context/state.
      -> decoder_inputs:This input layer receives the target sequence during training (e.g., headline with <start> token).
                        It also goes through an Embedding → LSTM decoder that uses the encoder’s states as initial state.
      -> decoder_outputs:This is the final output of the decoder part of the network:
                         It is a sequence of probability distributions (via softmax) over the vocabulary at each timestep.
                         It predicts the next word in the output sequence.
    -> Shape: (batch_size, y_maxlen - 1, vocab_size)
-> headline_lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
   -> Compiles model using:
      -> adam: efficient optimizer.
      -> sparse_categorical_crossentropy: suitable loss for multi-class classification when labels are integers (not one-hot).
   -> Enables the model to learn by minimizing prediction error.


In [67]:
# Stop once val_loss has gone 5 epochs without improving, then restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


### Code Explanation :

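 -> early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True): creates a callback that stops training when the validation loss has not improved for 5 consecutive epochs and restores the weights from the best epoch.
 -> It takes effect when passed to model.fit via callbacks=[early_stopping].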

In [68]:
# Fit the LSTM model; the held-out split serves as validation data and is
# what the early-stopping callback monitors.
headline_lstm_model.fit(
    x=[x_train, decoder_input_train],
    y=decoder_output_train,
    batch_size=32,
    epochs=40,
    validation_data=([x_test, decoder_input_test], decoder_output_test),
    callbacks=[early_stopping],
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40


<keras.src.callbacks.History at 0x1be47c51db0>

### Code Explanation :

-> headline_lstm_model.fit(
    [x_train, decoder_input_train],decoder_output_train,validation_data=([x_test, decoder_input_test], decoder_output_test),batch_size=32, epochs=40,callbacks=[early_stopping])
   -> Key components:
      -> x_train: input articles (tokenized, padded).
      -> decoder_input_train: shifted target sequences (with <start> token).
      -> decoder_output_train: expected output sequences (with <end> token), reshaped for training.
      -> validation_data: evaluates the model on unseen test data during training.
      -> batch_size=32: processes 32 samples at a time.
      -> epochs=40: trains for up to 40 epochs.
      -> callbacks=[early_stopping]: stops training if no improvement in validation loss.
   -> Reason : Trains the full sequence-to-sequence model so that, given input text, it can learn to generate the correct output sequence 
                 (e.g.headline).


In [69]:
from tensorflow.keras.models import load_model

# Write the trained LSTM model to headline_lstm_model.h5 in the working directory
headline_lstm_model.save("headline_lstm_model.h5")

  saving_api.save_model(


In [70]:
# GRU encoder: article token ids -> embedding -> GRU -> final hidden state
encoder_inputs_gru = Input(shape=(x_maxlen,))
encoder_embedding_gru = Embedding(vocab_size, embedding_dim)
enc_emb_gru = encoder_embedding_gru(encoder_inputs_gru)
encoder_gru = GRU(lstm_units, return_state=True)
encoder_outputs_gru, state_gru = encoder_gru(enc_emb_gru)

# GRU decoder: teacher-forced token ids -> embedding -> GRU seeded with the
# encoder state -> per-timestep softmax over the vocabulary
decoder_inputs_gru = Input(shape=(y_maxlen - 1,))
decoder_embedding_gru = Embedding(vocab_size, embedding_dim)
dec_emb_gru = decoder_embedding_gru(decoder_inputs_gru)
decoder_gru = GRU(lstm_units, return_sequences=True, return_state=True)
decoder_seq_gru, _ = decoder_gru(dec_emb_gru, initial_state=state_gru)
decoder_dense_gru = Dense(vocab_size, activation='softmax')
decoder_outputs_gru = decoder_dense_gru(decoder_seq_gru)

# Trainable GRU seq2seq model, compiled like the LSTM model
gru_model = Model(
    inputs=[encoder_inputs_gru, decoder_inputs_gru],
    outputs=decoder_outputs_gru,
)
gru_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the same arrays and with the same settings as the LSTM model
gru_model.fit(
    x=[x_train, decoder_input_train],
    y=decoder_output_train,
    batch_size=32,
    epochs=40,
    validation_data=([x_test, decoder_input_test], decoder_output_test),
    callbacks=[early_stopping],
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40


<keras.src.callbacks.History at 0x1be4249ee30>

### Code Explanation :

-> encoder_inputs_gru = Input(shape=(x_maxlen,))
   -> Creates an input layer for the encoder with shape (x_maxlen,), meaning a sequence of word indices of max length x_maxlen.
   -> Reason : The encoder takes an input sequence (e.g. cleaned article text) as a series of tokens (integers).

-> enc_emb_gru = Embedding(vocab_size, embedding_dim)(encoder_inputs_gru)
   -> Embedding layer turns the integer tokens into dense word vectors of size embedding_dim.
   -> Reason : Embeddings give semantic meaning to tokens for better GRU learning.

-> encoder_outputs_gru, state_gru = GRU(lstm_units, return_state=True)(enc_emb_gru)
   -> Passes embeddings to a GRU layer.
   -> return_state=True: We return the final hidden state state_gru, which summarizes the input sequence.
   -> Reason : This state is passed to the decoder as initial context.

-> decoder_inputs_gru = Input(shape=(y_maxlen - 1,))
   -> Input layer for the decoder, which takes target sequences shifted right (i.e., without the <end> token).
   -> Reason : Used during training; model learns to predict the next word from previous ones.

-> dec_emb_gru = Embedding(vocab_size, embedding_dim)(decoder_inputs_gru)
   -> Embedding layer for decoder inputs.
   -> Reason : Converts output tokens into embeddings for the decoder GRU to understand.

-> decoder_gru = GRU(lstm_units, return_sequences=True, return_state=True)
   -> Initializes a GRU layer for the decoder.
   -> return_sequences=True: Needed because the decoder must output a sequence (not a single output).
   -> Reason : To generate a word at each step of the target sequence.

-> decoder_outputs_gru, _ = decoder_gru(dec_emb_gru, initial_state=state_gru)
   -> Feeds embedded target sequence to the decoder GRU.
   -> initial_state=state_gru: Uses the final state from the encoder GRU to guide decoding.
   -> Reason : This provides the decoder with context about the input sequence.

-> decoder_dense_gru = Dense(vocab_size, activation='softmax')
   -> Dense layer that predicts the next word (probability distribution over vocabulary).
   -> Reason : Converts GRU output at each time step into a probability over words.

-> decoder_outputs_gru = decoder_dense_gru(decoder_outputs_gru)
   -> Applies the dense layer to GRU outputs.
   ->  Produces final predicted word sequence.

-> GRU Model Compilation
  -> gru_model = Model([encoder_inputs_gru, decoder_inputs_gru], decoder_outputs_gru)
  -> Defines the final Keras model, connecting encoder and decoder inputs to decoder outputs.
  -> Reason : This is the full model to be trained.

-> gru_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
   -> Compiles model using:
      -> adam: efficient optimizer.
      -> sparse_categorical_crossentropy: suitable loss for multi-class classification when labels are integers (not one-hot).
   -> Enables the model to learn by minimizing prediction error.

-> gru_model.fit(
    [x_train, decoder_input_train],decoder_output_train,validation_data=([x_test, decoder_input_test], decoder_output_test),batch_size=32, epochs=40,callbacks=[early_stopping])
   -> Key components:
      -> x_train: input articles (tokenized, padded).
      -> decoder_input_train: shifted target sequences (with <start> token).
      -> decoder_output_train: expected output sequences (with <end> token), reshaped for training.
      -> validation_data: evaluates the model on unseen test data during training.
      -> batch_size=32: processes 32 samples at a time.
      -> epochs=40: trains for up to 40 epochs.
      -> callbacks=[early_stopping]: stops training if no improvement in validation loss.
   -> Reason : Trains the full sequence-to-sequence model so that, given input text, it can learn to generate the correct output sequence 
                 (e.g.headline).

-> Summary Diagram:
Input (x_train) ───> Encoder GRU ───┐
                                    │
Target (decoder_input_train) ──> Decoder GRU ──> Dense (softmax) ──> Output headline
                                    │
                          (Initial state from Encoder)

In [71]:
# Write the trained GRU model to headline_gru_model.h5 in the working directory
gru_model.save("headline_gru_model.h5")

In [72]:
# importing the numpy library
import numpy as np

In [73]:
# ------------------- Inference Models -------------------
# Builds separate encoder/decoder models for step-by-step decoding. They are
# wired from tensors and layers defined in earlier cells (encoder_inputs,
# state_h, state_c, decoder_lstm, decoder_dense and their GRU counterparts).

# LSTM inference
# Encoder model: encoder input -> [state_h, state_c].
encoder_model_lstm = Model(encoder_inputs, [state_h, state_c])

# Inputs that carry the decoder's previous states into each decoding step.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
# One token id per decoding step.
decoder_hidden_inputs = Input(shape=(1,))
# NOTE(review): this instantiates a NEW Embedding layer rather than reusing the
# decoder embedding layer from the training graph. A newly created layer starts
# with freshly initialized weights, so the vectors fed to decoder_lstm here
# presumably differ from the ones it was trained on — confirm, and reuse the
# trained embedding layer (or copy its weights) if so.
decoder_emb_infer = Embedding(vocab_size, embedding_dim)(decoder_hidden_inputs)
# Reuse the decoder LSTM layer object, seeded with the externally supplied states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_emb_infer,
                                                    initial_state=[decoder_state_input_h, decoder_state_input_c])
# Reuse the decoder's output Dense layer on the LSTM output.
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Decoder model: (token, h, c) -> (dense output, new h, new c).
decoder_model_lstm = Model(
    [decoder_hidden_inputs, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2]
)

# GRU inference
# Encoder model: encoder input -> state_gru (a single state).
encoder_model_gru = Model(encoder_inputs_gru, state_gru)

# NOTE(review): the GRU state input is sized with lstm_units; this assumes the
# GRU layers were built with the same unit count as the LSTM — confirm.
decoder_state_input_gru = Input(shape=(lstm_units,))
# One token id per decoding step.
decoder_hidden_inputs_gru = Input(shape=(1,))
# NOTE(review): as in the LSTM path, this is a NEW Embedding layer, not the
# trained decoder embedding — confirm and reuse the trained layer.
decoder_emb_infer_gru = Embedding(vocab_size, embedding_dim)(decoder_hidden_inputs_gru)
# Reuse the decoder GRU layer object, seeded with the externally supplied state.
decoder_outputs_gru_inf, state_gru_inf = decoder_gru(decoder_emb_infer_gru, initial_state=decoder_state_input_gru)
# Reuse the GRU decoder's output Dense layer.
decoder_outputs_gru_inf = decoder_dense_gru(decoder_outputs_gru_inf)
# Decoder model: (token, state) -> (dense output, new state).
decoder_model_gru = Model(
    [decoder_hidden_inputs_gru, decoder_state_input_gru],
    [decoder_outputs_gru_inf, state_gru_inf]
)

### Code Explanation:

 #### Why do we need inference models separately?
-> During training, the encoder and decoder run together using full sequences.
-> During inference (prediction), we:
    -> Encode the input once,
    -> Then decode one word at a time, feeding each predicted word back to the decoder to get the next word.
    -> So, we build separate, smaller encoder and decoder models for this step. That is what this cell does.

-> encoder_model_lstm = Model(encoder_inputs, [state_h, state_c])
    -> Creates an encoder model for inference using LSTM.
    -> encoder_inputs: The input layer (tokenized padded input text).
    -> [state_h, state_c]: Outputs only the hidden and cell states (not the whole sequence).
    -> Reason : During inference, we only need the final states from the encoder to pass into the decoder.

-> decoder_state_input_h = Input(shape=(lstm_units,))
   decoder_state_input_c = Input(shape=(lstm_units,))
   -> Defines input layers for the decoder's previous timestep hidden state (h) and cell state (c).
   -> Reason : In inference, we decode one word at a time — we must feed the previous LSTM states back into the decoder.

-> decoder_hidden_inputs = Input(shape=(1,))
   -> Defines an input layer for the decoder that will take one token at a time (e.g., 'start', 'the', etc.).
   -> Reason : Unlike training (where we pass full sequences), inference works step-by-step. So, we feed just one word each time.

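-> decoder_emb_infer = Embedding(vocab_size, embedding_dim)(decoder_hidden_inputs)
   -> Embeds the one-word input into a dense vector using a new Embedding layer with the same configuration (vocab_size, embedding_dim) as in training.
   -> Reason : The decoder expects embedded vectors, not plain integers.
   -> Caution : Because this line creates a new layer, its weights are freshly initialized; they are not the word representations learned during training. To use the learned representations, the trained decoder Embedding layer must be reused here (or its weights copied into this layer).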

-> decoder_outputs2, state_h2, state_c2 = decoder_lstm(
   decoder_emb_infer, initial_state=[decoder_state_input_h, decoder_state_input_c])
   -> Runs the decoder LSTM for 1 time step, using:
   -> The embedded input word,
   -> The previous hidden and cell states.
   -> Returns:
      -> Output for the current step,
      -> Updated hidden and cell states (state_h2, state_c2).
   -> Reason : We use these updated states for the next decoding step.

-> decoder_outputs2 = decoder_dense(decoder_outputs2)
   -> Passes the decoder's output through the final Dense layer with softmax.
   -> Reason : To get the probability distribution over all vocabulary words — from which we select the most probable one.

-> decoder_model_lstm = Model(
    [decoder_hidden_inputs, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2])
   -> Creates the final LSTM decoder inference model.
      -> Inputs : Current word,Previous state_h,Previous state_c
      -> Outputs : Word prediction,Updated state_h,Updated state_c
   -> Reason : This model allows step-by-step decoding, reusing the updated states every time.

-> GRU Inference
   -> Now, I replicate the same logic using GRU, which is simpler (only one hidden state).

-> encoder_model_gru = Model(encoder_inputs_gru, state_gru)
   -> Creates the encoder inference model for GRU.
   -> Reason : The GRU encoder only outputs one state — state_gru.

-> decoder_state_input_gru = Input(shape=(lstm_units,))
   -> Defines the input for the GRU's previous hidden state.

-> decoder_hidden_inputs_gru = Input(shape=(1,))
   -> Defines an input for the one-token input word for the decoder

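-> decoder_emb_infer_gru = Embedding(vocab_size, embedding_dim)(decoder_hidden_inputs_gru)
   -> Embeds the decoder input word into dense vectors.
   -> Caution : As in the LSTM case, this creates a new Embedding layer with freshly initialized weights; reuse the trained GRU decoder Embedding layer to get the learned word representations.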

-> decoder_outputs_gru_inf, state_gru_inf = decoder_gru(
    decoder_emb_infer_gru, initial_state=decoder_state_input_gru)
   -> Feeds the input into the GRU decoder with previous state → gets:
      -> Output word prediction
      -> Updated GRU state

-> decoder_outputs_gru_inf = decoder_dense_gru(decoder_outputs_gru_inf)
   -> Passes decoder output through the final Dense layer to get vocabulary probabilities.

-> decoder_model_gru = Model(
    [decoder_hidden_inputs_gru, decoder_state_input_gru],
    [decoder_outputs_gru_inf, state_gru_inf])
   -> Creates the final GRU decoder model for inference.

In [74]:
# Function to generate headline given input text
def generate_headline(text, encoder_model, decoder_model, tokenizer, max_len, y_maxlen, is_lstm=True):
    """Greedily decode a headline for ``text`` with an encoder/decoder pair.

    Parameters:
        text: raw input paragraph; it is passed through ``clean_text`` first.
        encoder_model: inference encoder; its prediction is unpacked as two
            states when ``is_lstm`` is true and used as one state otherwise.
        decoder_model: inference decoder taking ``[token] + states`` and
            returning token scores followed by the updated state(s).
        tokenizer: object providing ``texts_to_sequences``, ``word_index`` and
            ``index_word``.
        max_len: length the encoder input is pre-padded to.
        y_maxlen: upper bound on the number of decoding steps.
        is_lstm: selects the two-state (True) or one-state (False) path.

    Returns:
        The generated words joined by single spaces, stripped.

    Raises:
        ValueError: if the tokenizer has no 'start' entry.
    """
    # Text -> cleaned text -> token ids -> fixed-length, pre-padded sequence.
    cleaned = clean_text(text)
    encoder_input = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]), maxlen=max_len, padding='pre'
    )

    # Run the encoder once; its state(s) seed the decoder.
    encoded = encoder_model.predict(encoder_input, verbose=0)
    if is_lstm:
        hidden, cell = encoded
        decoder_states = [hidden, cell]
    else:
        decoder_states = [encoded]

    # Decoding is kicked off with the 'start' token.
    start_id = tokenizer.word_index.get('start')
    if not start_id:
        raise ValueError("The tokenizer does not contain a 'start' token.")

    decoder_input = np.array([[start_id]])
    words = []

    for _step in range(y_maxlen):
        decoded = decoder_model.predict([decoder_input, *decoder_states], verbose=0)
        if is_lstm:
            scores, hidden, cell = decoded
            decoder_states = [hidden, cell]
        else:
            scores, gru_state = decoded
            decoder_states = [gru_state]

        # Greedy choice: highest-scoring id at the last time step.
        next_id = np.argmax(scores[0, -1, :])
        next_word = tokenizer.index_word.get(next_id, '')

        # Stop on the 'end' marker or on an id with no vocabulary entry.
        if next_word in ('end', ''):
            break

        words.append(next_word)
        # The token just produced becomes the next decoder input.
        decoder_input = np.array([[next_id]])

    return ' '.join(words).strip()


### Code Explanation:

-> def generate_headline(text, encoder_model, decoder_model, tokenizer, max_len, y_maxlen, is_lstm=True): Defines a user-defined function that generates a headline for a given news paragraph using a trained encoder-decoder model (LSTM or GRU).

-> Parameters:
             -> text: The raw news paragraph to summarize.

             -> encoder_model: The inference model for encoding the input sequence.

             -> decoder_model: The inference model for decoding/generating the headline.

             -> tokenizer: Maps between words and integer tokens.

             -> max_len: Maximum length for input padding.

             -> y_maxlen: Maximum length of output (i.e., the maximum number of words the headline may contain).

             -> is_lstm: Boolean flag to switch between LSTM and GRU inference models.

-> input_seq = tokenizer.texts_to_sequences([clean_text(text)]):
    -> Cleans the text using the same cleaning function used during training.
    -> Converts the cleaned text into a list of integers (tokens) using the tokenizer.
    -> Reason : Models only understand numbers. You must transform words into token IDs the model saw during training.

-> input_seq = pad_sequences(input_seq, maxlen=max_len, padding='pre'):
    -> Pads the tokenized input so it's exactly max_len long by adding zeros at the beginning (inputs longer than max_len are truncated to max_len).
    -> Reason : The encoder expects input of a fixed length. Padding ensures consistency.

-> if is_lstm:
        state_h, state_c = encoder_model.predict(input_seq, verbose=0)
        states = [state_h, state_c]
    -> If using LSTM:
    -> Pass the input to the encoder model.
    -> Receive two states: hidden (state_h) and cell (state_c) state.
    -> Store them in states.
    -> Reason : LSTM maintains both hidden and cell states for better long-term memory tracking during decoding.

-> else:
        state_gru = encoder_model.predict(input_seq, verbose=0)
        states = [state_gru]
    -> If using GRU:
    -> Pass the input to the encoder model.
    -> GRU returns only one hidden state.
    -> Reason : GRUs are simpler than LSTMs; they only return a single state.

-> start_token_id = tokenizer.word_index.get('start'):
    -> Gets the integer token that corresponds to the word 'start'.
    -> Reason : This token is used to kick off the decoding process, telling the model: "Start generating the headline now."

-> if not start_token_id:
        raise ValueError("The tokenizer does not contain a 'start' token.")
    -> Safety check to make sure 'start' token exists in the tokenizer.
    -> Reason : Without the 'start' token, the model won't know where to begin generating output.

-> target_seq = np.array([[start_token_id]])
    -> Initializes the target input for the decoder with the <start> token.
    -> Reason : This is the very first input to the decoder so it starts generating the first word.

-> result = []
    -> An empty list to store predicted words as they are generated.
    -> Reason : This will ultimately contain the generated headline, word by word.

-> for _ in range(y_maxlen):
    -> Loop up to y_maxlen times to generate each word of the headline.
    -> Reason : Headlines have a maximum length. Looping ensures we don't generate endlessly.

->  if is_lstm:
            output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)
            states = [h, c]
    -> Pass the current input word (target_seq) and states to the decoder.
    -> Get output token probabilities and updated states.
    -> Update the states to use in the next step.
    -> Reason : Each decoding step needs the previous state to generate the next word in sequence.

-> else:
            output_tokens, new_state = decoder_model.predict([target_seq] + states, verbose=0)
            states = [new_state]
    -> Same as above, but only one state is returned and updated.

-> predicted_id = np.argmax(output_tokens[0, -1, :])
    -> Finds the index (word ID) of the most probable word in the decoder output.
    -> Reason : The decoder outputs a probability distribution over all vocabulary. argmax selects the most likely next word.

-> word = tokenizer.index_word.get(predicted_id, '')
    -> Converts the predicted token ID back to its corresponding word.
    -> Reason : We need the actual word, not just the index, to append to the final result.

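-> if word == 'end' or word == '':
            break
    -> Stops generation if the model predicts the 'end' token or an ID that has no entry in tokenizer.index_word (for example, the padding index 0), in which case word is the empty string.
    -> Reason : 'end' indicates the end of the sequence. An empty word means the prediction cannot be mapped to a vocabulary word, so decoding stops.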

-> result.append(word)
    -> Adds the predicted word to the output list.
    -> Reason : To build the full sentence word by word.

-> target_seq = np.array([[predicted_id]])
     -> Prepares the predicted word as the next input to the decoder.
     -> Reason : Decoding is sequential: the output at time t becomes input at time t+1.

-> return ' '.join(result).strip()
     -> Combines all predicted words into a single string and removes leading/trailing spaces.
     -> Reason : Returns the final readable headline as a properly formatted string.

In [75]:
# Test Generation
# Ad-hoc sample paragraphs used for a qualitative check of both models.
test_paragraphs = [
    "The president addressed the media regarding the economic reforms.",
    "India launched a new satellite into orbit for communication services.",
    "The finance minister presented the annual budget in parliament.",
    "Heavy rains caused flooding in several districts across the state.",
    "The cricket team celebrated after winning the international tournament."
]

print("\nHeadline Predictions:\n")
# Number the paragraphs from 1 and decode each one with both model pairs.
for i, para in enumerate(test_paragraphs, 1):
    # x_maxlen / y_maxlen come from earlier cells (encoder padding length and
    # decoding-step limit, per generate_headline's parameters).
    lstm_headline = generate_headline(para, encoder_model_lstm, decoder_model_lstm,
                                      tokenizer, x_maxlen, y_maxlen, is_lstm=True)
    gru_headline = generate_headline(para, encoder_model_gru, decoder_model_gru,
                                     tokenizer, x_maxlen, y_maxlen, is_lstm=False)

    # Show the input next to both generated headlines, then a separator line.
    print(f"{i}. Input Text    : {para}")
    print(f"   LSTM Headline: {lstm_headline}")
    print(f"   GRU Headline : {gru_headline}")
    print("-" * 80)


Headline Predictions:

1. Input Text    : The president addressed the media regarding the economic reforms.
   LSTM Headline: nasa change wins
   GRU Headline : ai discovers effects
--------------------------------------------------------------------------------
2. Input Text    : India launched a new satellite into orbit for communication services.
   LSTM Headline: climate discovers wins championship
   GRU Headline : ai discovers effects
--------------------------------------------------------------------------------
3. Input Text    : The finance minister presented the annual budget in parliament.
   LSTM Headline: climate discovers wins championship
   GRU Headline : ai discovers effects
--------------------------------------------------------------------------------
4. Input Text    : Heavy rains caused flooding in several districts across the state.
   LSTM Headline: new team wins
   GRU Headline : ai discovers effects
-----------------------------------------------------------

### Code Explanation :

-> for i, para in enumerate(test_paragraphs, 1) : Loops over each paragraph (para) and assigns an index (i), starting from 1.
   -> Reason : So you can number the output predictions (1., 2., etc.).
   -> Purpose : Allows systematic evaluation and display of results for each test case.

-> lstm_headline = generate_headline(para,encoder_model_lstm,decoder_model_lstm,tokenizer,x_maxlen,y_maxlen,is_lstm=True)
   -> Generates a headline using the LSTM-based model for the current input paragraph.
   -> Reason : Tests the LSTM model’s ability to summarize the input.
   -> Purpose : Compares the LSTM model’s predictions with the GRU model’s for the same paragraph.

-> gru_headline = generate_headline(para,encoder_model_gru,decoder_model_gru,tokenizer,x_maxlen,y_maxlen,is_lstm=False)
   -> Generates a headline using the GRU-based model for the same paragraph.
   -> Reason : To compare how differently the GRU model performs on the same input.
   -> Purpose : Allows side-by-side evaluation of two architectures: LSTM vs GRU.

-> print(f"{i}. Input Text : {para}") : Prints the original input paragraph.
   -> Reason : Shows what the model is trying to summarize.
   -> Purpose : Useful for human evaluation — we want to see if the generated headline makes sense.

-> print(f" LSTM Headline: {lstm_headline}") : Prints the headline generated by the LSTM model.
   -> Reason : To visually display and evaluate LSTM model predictions.
   -> Purpose : Compare its summary quality with the GRU’s output.

-> print(f" GRU Headline : {gru_headline}") : Prints the headline generated by the GRU model.
   -> Reason : Completes the comparison between both models.
   -> Purpose : Helps you decide which architecture is performing better in headline generation.

-> print("-" * 80) : Prints a horizontal line separator.
   -> Reason : Visually separates results for each input paragraph.
   -> Purpose : Improves readability of the console output.

In [45]:
def generate_headline(text, encoder_model, decoder_model, tokenizer, max_len, y_maxlen, is_lstm=True):
    """Produce a headline for ``text`` by greedy, token-by-token decoding.

    The text is cleaned, tokenized and pre-padded to ``max_len``, encoded once,
    and then decoded for at most ``y_maxlen`` steps starting from the 'start'
    token. ``is_lstm`` selects between a two-state (LSTM) and a one-state (GRU)
    encoder/decoder pair.

    Returns the generated words joined by single spaces.
    Raises KeyError if the tokenizer has no 'start' entry.
    """
    # Prepare the encoder input: clean -> token ids -> fixed-length sequence.
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    padded = pad_sequences(token_ids, maxlen=max_len, padding='pre')

    # Encode once and wrap the resulting state(s) in a list for the decoder.
    encoder_out = encoder_model.predict(padded)
    if is_lstm:
        h_state, c_state = encoder_out
        carry = [h_state, c_state]
    else:
        carry = [encoder_out]

    # First decoder input is the 'start' token.
    current = np.array([[tokenizer.word_index['start']]])
    headline_words = []

    for _ in range(y_maxlen):
        step_out = decoder_model.predict([current, *carry])
        if is_lstm:
            token_scores, h_state, c_state = step_out
            carry = [h_state, c_state]
        else:
            token_scores, next_state = step_out
            carry = [next_state]

        best_id = np.argmax(token_scores[0, -1, :])
        best_word = tokenizer.index_word.get(best_id, '')

        # Stop if 'end' or empty string is predicted
        if best_word in ('end', ''):
            break

        headline_words.append(best_word)
        # Feed the chosen token back in for the next step.
        current = np.array([[best_id]])

    return ' '.join(headline_words)

In [46]:
# Ad-hoc sample paragraphs used for a qualitative check of both models.
test_paragraphs = [
    "The president addressed the media regarding the economic reforms.",
    "India launched a new satellite into orbit for communication services.",
    "The finance minister presented the annual budget in parliament.",
    "Heavy rains caused flooding in several districts across the state.",
    "The cricket team celebrated after winning the international tournament."
]

print("\nHeadline Predictions:\n")
# Decode every paragraph with both the LSTM and the GRU inference models.
for para in test_paragraphs:
    # x_maxlen / y_maxlen come from earlier cells and are passed as the
    # padding length and decoding-step limit of generate_headline.
    lstm_headline = generate_headline(
        para, encoder_model_lstm, decoder_model_lstm, tokenizer, x_maxlen, y_maxlen, is_lstm=True)
    gru_headline = generate_headline(
        para, encoder_model_gru, decoder_model_gru, tokenizer, x_maxlen, y_maxlen, is_lstm=False)
    
    # Print the input with both headlines, followed by a separator line.
    print(f"Input Text      : {para}")
    print(f"LSTM Headline   : {lstm_headline}")
    print(f"GRU Headline    : {gru_headline}")
    print("-" * 60)


Headline Predictions:

Input Text      : The president addressed the media regarding the economic reforms.
LSTM Headline   : schools change effects
GRU Headline    : ai discovers hits learning
------------------------------------------------------------
Input Text      : India launched a new satellite into orbit for communication services.
LSTM Headline   : stock change effects
GRU Headline    : climate discovers hits
------------------------------------------------------------
Input Text      : The finance minister presented the annual budget in parliament.
LSTM Headline   : new breakthrough wins
GRU Headline    : climate discovers hits
------------------------------------------------------------
Input Text      : Heavy rains caused flooding in several districts across the state.
LSTM Headline   : local discovers wins
GRU Headline    : climate discovers hits learning
------------------------------------------------------------
Input Text      : The cricket team celebrated after winni

In [35]:
# ------------------- Generate Headline Function -------------------
def generate_headline(text, encoder_model, decoder_model, tokenizer, max_len, is_lstm=True,
                      max_headline_len=None):
    """Greedily decode a headline for ``text`` with an encoder/decoder pair.

    Parameters:
        text: raw input paragraph; it is passed through ``clean_text`` first.
        encoder_model: inference encoder. Its prediction is unpacked as two
            states when ``is_lstm`` is true and used as one state otherwise.
        decoder_model: inference decoder taking ``[token] + states`` and
            returning token scores followed by the updated state(s).
        tokenizer: object providing ``texts_to_sequences``, ``word_index`` and
            ``index_word``.
        max_len: length the encoder input is pre-padded to.
        is_lstm: selects the two-state (True) or one-state (False) path.
        max_headline_len: maximum number of decoding steps. When omitted, the
            notebook-level ``y_maxlen`` is used, as this function did before
            the parameter existed.

    Returns:
        The generated words joined by single spaces. Decoding stops at the
        'end' token or at an id with no vocabulary entry, so neither appears
        in the result.

    Raises:
        KeyError: if the tokenizer has no 'start' entry.
    """
    # Keep existing call sites working: default to the kernel-level limit.
    if max_headline_len is None:
        max_headline_len = y_maxlen

    input_seq = tokenizer.texts_to_sequences([clean_text(text)])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='pre')

    # Encode once; verbose=0 keeps per-call progress bars out of the output.
    if is_lstm:
        state_h, state_c = encoder_model.predict(input_seq, verbose=0)
        states = [state_h, state_c]
    else:
        states = [encoder_model.predict(input_seq, verbose=0)]

    target_seq = np.array([[tokenizer.word_index['start']]])
    result = []

    for _ in range(max_headline_len):
        if is_lstm:
            output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)
            states = [h, c]
        else:
            output_tokens, new_state = decoder_model.predict([target_seq] + states, verbose=0)
            states = [new_state]

        # Greedy choice: highest-scoring id at the last time step.
        predicted_id = int(np.argmax(output_tokens[0, -1, :]))
        word = tokenizer.index_word.get(predicted_id, '')

        # Stop at the end marker or an unmapped id (e.g. padding) instead of
        # appending 'end' / empty strings to the headline.
        if word in ('end', ''):
            break

        result.append(word)
        # The token just produced becomes the next decoder input.
        target_seq = np.array([[predicted_id]])

    return ' '.join(result)


In [39]:

# Ad-hoc sample paragraphs used for a qualitative check of both models.
test_paragraphs = [
    "The president addressed the media regarding the economic reforms.",
    "India launched a new satellite into orbit for communication services.",
    "The finance minister presented the annual budget in parliament.",
    "Heavy rains caused flooding in several districts across the state.",
    "The cricket team celebrated after winning the international tournament."
]

print("\nHeadline Predictions:\n")
for para in test_paragraphs:
    # These calls pass no y_maxlen argument, so they only match a
    # generate_headline definition whose signature omits that parameter.
    # NOTE(review): the recorded output of this cell is a TypeError about a
    # missing 'y_maxlen' argument, which suggests it was executed while a
    # different generate_headline definition was active — re-run in order.
    lstm_headline = generate_headline(para, encoder_model_lstm, decoder_model_lstm, tokenizer, x_maxlen, is_lstm=True)
    gru_headline = generate_headline(para, encoder_model_gru, decoder_model_gru, tokenizer, x_maxlen, is_lstm=False)
    # Print the input with both headlines, followed by a separator line.
    print(f"Input Text      : {para}")
    print(f"LSTM Headline   : {lstm_headline}")
    print(f"GRU Headline    : {gru_headline}")
    print("-" * 60)


Headline Predictions:



TypeError: generate_headline() missing 1 required positional argument: 'y_maxlen'