<a href="https://colab.research.google.com/github/Karthikeyan37/Deep-Reinforcement-Learning/blob/main/notebooks/LSTM_text_summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [None]:
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail
!unzip newspaper-text-summarization-cnn-dailymail.zip


In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, AdditiveAttention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the training split, discard its 'id' column and preview the first rows.
dataset_train = pd.read_csv('cnn_dailymail/train.csv').drop(columns=['id'])
dataset_train.head()

In [None]:
# Load the validation split, discard its 'id' column and preview the first rows.
dataset_validation = pd.read_csv('cnn_dailymail/validation.csv').drop(columns=['id'])
dataset_validation.head()

In [None]:
# Load the test split, discard its 'id' column and preview the first rows.
dataset_test = pd.read_csv('cnn_dailymail/test.csv').drop(columns=['id'])
dataset_test.head()

In [None]:
# Keep a 10% random sample of every split; the fixed random_state makes the
# selection repeatable across runs.
sampling_options = {'frac': 0.1, 'random_state': 42}
train_sampled = dataset_train.sample(**sampling_options)
val_sampled = dataset_validation.sample(**sampling_options)
test_sampled = dataset_test.sample(**sampling_options)

# Report how many rows each sampled split contains.
print(f"Train size: {len(train_sampled)}")
print(f"Validation size: {len(val_sampled)}")
print(f"Test size: {len(test_sampled)}")

In [None]:
#cleaning text function
def clean_text(text):
    """Normalise raw text for simple word-level statistics.

    Lowercases the input and removes every character that is not a lowercase
    ASCII letter, a digit or whitespace.

    Args:
        text: The raw string. Non-string values (for example ``None`` or the
            ``NaN`` float that stands in for a missing cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    # After lowercasing, anything outside a-z / 0-9 / whitespace is dropped.
    return re.sub(r'[^a-z0-9\s]', '', lowered)

In [None]:
# Run clean_text over every sampled training article, keeping the original order.
cleaned_articles = list(map(clean_text, train_sampled['article']))


In [None]:
# Position of the first cleaned article with the highest whitespace-separated word count.
article_word_counts = [len(article.split()) for article in cleaned_articles]
max_len_index = article_word_counts.index(max(article_word_counts))
print(f"Index of the article with the max length: {max_len_index}")


In [None]:
# Extract articles and highlights for each split
train_articles = train_sampled['article'].values
train_highlights = train_sampled['highlights'].values

val_articles = val_sampled['article'].values
val_highlights = val_sampled['highlights'].values

test_articles = test_sampled['article'].values
test_highlights = test_sampled['highlights'].values








In [None]:
# Set tokenizer parameters
vocab_size = 50000
max_article_len = 500
max_summary_len = 50

# Initialize and fit tokenizers on training data only
article_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
summary_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")

article_tokenizer.fit_on_texts(train_articles)
summary_tokenizer.fit_on_texts(train_highlights)

In [None]:
# Define a function to tokenize and pad sequences
def tokenize_and_pad(texts, tokenizer, max_len, truncating='post'):
    """Encode texts as integer sequences of one common length.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Length every returned sequence is padded or cut to.
        truncating: Which end of an over-long sequence is cut. ``'post'``
            (default) keeps the first ``max_len`` tokens, matching the
            post-padding used here; ``'pre'`` keeps the last ``max_len``
            tokens, which is the ``pad_sequences`` default and discards the
            beginning of long texts.

    Returns:
        The padded sequences as returned by ``pad_sequences``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating=truncating)

In [None]:
# Encode every split: articles with the article tokenizer, highlights with the
# summary tokenizer, each padded to its own maximum length.
article_splits = (train_articles, val_articles, test_articles)
summary_splits = (train_highlights, val_highlights, test_highlights)

train_article_padded, val_article_padded, test_article_padded = [
    tokenize_and_pad(texts, article_tokenizer, max_article_len) for texts in article_splits
]
train_summary_padded, val_summary_padded, test_summary_padded = [
    tokenize_and_pad(texts, summary_tokenizer, max_summary_len) for texts in summary_splits
]

In [None]:
# The decoder input is the summary without its last token; the target is the
# same summary without its first token, i.e. shifted one step ahead.
train_decoder_input, train_decoder_output = train_summary_padded[:, :-1], train_summary_padded[:, 1:]
val_decoder_input, val_decoder_output = val_summary_padded[:, :-1], val_summary_padded[:, 1:]

In [None]:
# Build the encoder-decoder model with attention
embedding_dim = 256  # size of each token embedding vector
lstm_units = 512  # number of units in the encoder and decoder LSTMs

# Encoder
# Input: one padded article per row, max_article_len token ids long.
encoder_inputs = Input(shape=(max_article_len,))
enc_emb = Embedding(vocab_size, embedding_dim, trainable=True)(encoder_inputs)
# return_sequences=True exposes the output at every time step (consumed by the
# attention layer); return_state=True also returns the final hidden and cell states.
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# The final encoder states are passed to the decoder LSTM as its initial state.
encoder_states = [state_h, state_c]

In [None]:
# Attention Layer
# One AdditiveAttention instance is created here and reused by both the
# training graph and the inference decoder built later in the notebook.
attention = AdditiveAttention()

In [None]:
# Decoder
# The time dimension is left unspecified (None) instead of being fixed to
# max_summary_len-1. Training batches still arrive with max_summary_len-1
# tokens, but the inference decoder reuses this same input tensor and is fed
# one token per step, which a fixed-length input would reject.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
# Start decoding from the encoder's final hidden and cell states; the decoder's
# own final states are not needed for training.
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

In [None]:
# Apply attention between encoder and decoder
# AdditiveAttention takes [query, value] and returns one context vector per
# *query* step. The decoder outputs are the query and the encoder outputs the
# value, so the context has the decoder's time length and lines up step by
# step with decoder_outputs for the concatenation below.
context_vector = attention([decoder_outputs, encoder_outputs])
decoder_combined_context = tf.keras.layers.Concatenate(axis=-1)([context_vector, decoder_outputs])

# Dense layer for predictions
dense = Dense(vocab_size, activation='softmax')
# NOTE: this rebinds decoder_outputs from the LSTM outputs to the softmax
# output; re-run the decoder cell before re-running this cell.
decoder_outputs = dense(decoder_combined_context)

# Define the final model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# NOTE(review): padding positions (token id 0) are not masked, so the reported
# loss and accuracy include them.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the model on the training split and monitor it on the validation split.
batch_size = 64
epochs = 20

train_inputs = [train_article_padded, train_decoder_input]
validation_set = ([val_article_padded, val_decoder_input], val_decoder_output)

history = model.fit(
    x=train_inputs,
    y=train_decoder_output,
    validation_data=validation_set,
    epochs=epochs,
    batch_size=batch_size,
)


In [None]:
# Score the trained model on the held-out test split, using the same
# shifted-by-one decoder input/target layout as in training.
test_decoder_input = test_summary_padded[:, :-1]
test_decoder_target = test_summary_padded[:, 1:]
test_loss, test_accuracy = model.evaluate([test_article_padded, test_decoder_input], test_decoder_target)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

In [None]:
# Define inference models
# Encoder model
# Maps a padded article to the per-step encoder outputs plus the final hidden
# and cell states.
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Decoder model for inference
# At inference time the decoder's recurrent state and the encoder outputs are
# supplied explicitly as inputs instead of being wired to the encoder graph.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_hidden_state_input = Input(shape=(max_article_len, lstm_units))

# Reuse the same embedding and LSTM layer objects as the training model, so the
# inference decoder shares their weights. The updated states are kept so they
# can be returned by the decoder model.
dec_emb_inf = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

In [None]:
# Apply attention for inference
# AdditiveAttention takes [query, value] and returns one context vector per
# query step: the decoder output is the query and the supplied encoder outputs
# are the value, so the context matches the decoder's time length.
context_vector_inf = attention([decoder_outputs2, decoder_hidden_state_input])
decoder_combined_context2 = tf.keras.layers.Concatenate(axis=-1)([context_vector_inf, decoder_outputs2])
decoder_outputs2 = dense(decoder_combined_context2)

# Inputs: current token(s), encoder outputs, previous hidden and cell state.
# Outputs: next-token probabilities plus the updated hidden and cell state.
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2]
)

In [None]:
# Generate summary using the trained model
def decode_sequence(input_seq):
    """Greedily decode a summary for a single encoded article.

    The article is run through ``encoder_model`` once; ``decoder_model`` is
    then called one token at a time, always feeding back the most probable
    token, until ``'<end>'`` is produced, a token id with no vocabulary entry
    (such as the padding id 0) is produced, or ``max_summary_len`` words have
    been generated.

    Args:
        input_seq: Padded article token ids for exactly one article
            (a batch of size 1).

    Returns:
        The generated words joined by single spaces, without the start/end
        markers.

    Raises:
        KeyError: If ``'<start>'`` is not in the summary tokenizer vocabulary.
    """
    start_index = summary_tokenizer.word_index.get('<start>')
    if start_index is None:
        raise KeyError(
            "'<start>' is not in the summary tokenizer vocabulary; the summaries "
            "must contain the start/end markers when the tokenizer is fitted"
        )

    # verbose=0 keeps the per-step predict calls from flooding the cell output.
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    # Explicit (batch=1, time=1) shape instead of relying on a 1-D array being expanded.
    target_seq = np.array([[start_index]])

    decoded_words = []
    # Bounded loop: generation always terminates even if '<end>' is never predicted.
    for _ in range(max_summary_len):
        output_tokens, h, c = decoder_model.predict([target_seq, enc_outs, h, c], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # .get: ids without a vocabulary entry (e.g. padding id 0) must not raise.
        sampled_word = summary_tokenizer.index_word.get(sampled_token_index)

        if sampled_word is None or sampled_word == '<end>':
            break

        decoded_words.append(sampled_word)
        # The token just produced becomes the decoder input for the next step.
        target_seq = np.array([[sampled_token_index]])

    return ' '.join(decoded_words)

In [None]:
# Show the first five test articles next to the generated and reference summaries.
for sample_idx in range(5):
    # Slice (rather than index) so the article keeps its leading batch dimension.
    generated_summary = decode_sequence(test_article_padded[sample_idx:sample_idx + 1])
    print("Original:", test_articles[sample_idx])
    print("Generated Summary:", generated_summary)
    print("Actual Summary:", test_highlights[sample_idx])
    print("-----")
