## Importing the relevant libraries

In [None]:
!pip install textstat contractions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk, warnings, re, contractions
warnings.filterwarnings('ignore')
%matplotlib inline
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input, Concatenate, LayerNormalization, Attention, LSTM, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard
from tensorflow.keras.optimizers import Adam
from wordcloud import WordCloud
from tensorflow.keras.utils import plot_model
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from sklearn.feature_extraction.text import TfidfVectorizer
from textstat import flesch_reading_ease, flesch_kincaid_grade
from tqdm.notebook import tqdm
from collections import Counter
tqdm.pandas()
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
from tensorflow.keras.models import Model
from numpy import gradient

In [None]:
# Display floats in fixed-point notation with 2 decimal places (instead of scientific notation)
pd.set_option('display.float_format', '{:.2f}'.format) 

## Loading the dataset

In [None]:
# Load the raw news-summarization data from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/news-summarization/data.csv')
# Drop the leftover CSV index column; tolerate exports that do not contain it.
df.drop(columns='Unnamed: 0',errors='ignore',inplace=True)
# Work on a 15% random subsample. The seed is fixed so that re-running the
# notebook selects the same rows (the train/test split is seeded as well).
df = df.sample(frac=0.15,random_state=42).reset_index(drop=True)
df.head()

In [None]:
# (rows, columns) of the sampled frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Keep only the article text and its reference summary, then discard rows
# where either of the two is missing.
df = df.loc[:, ['Content','Summary']].dropna()
df.shape

In [None]:
df.rename({'Content': 'news', 'Summary': 'summary'},axis=1,inplace=True)

In [None]:
df.head()

## Text Preprocessing

In [None]:
def convert_text_to_lowercase(text):
    """Return `text` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
def remove_url(text):
    """Delete `http://`, `https://` and `www.` links from `text`.

    A link runs up to the next whitespace character; the whitespace around
    it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
def remove_html_tags(text):
    """Remove HTML tags and every remaining non-alphanumeric character.

    Tags (`<p>`, `</div>`, `<br/>`, `<a href="...">`) are replaced by a space
    first; otherwise only their angle brackets would be deleted and the tag
    names would be glued onto the neighbouring words. A `<` that is not
    directly followed by a letter or `/letter` (e.g. in "a < b") is not
    treated as a tag. Afterwards everything except ASCII letters, digits and
    whitespace is dropped.
    """
    without_tags = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)
    return re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)

In [None]:
def remove_emoji(text):
    """Delete every run of characters that falls in the ranges listed below.

    Note that the last range spans U+24C2 to U+1F251, so any character in
    that span is removed, not only pictographs.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

In [None]:
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

In [None]:
df.news = df.news.progress_apply(convert_text_to_lowercase)
df.news = df.news.progress_apply(remove_url)
df.news = df.news.progress_apply(remove_emoji)
df.news = df.news.progress_apply(remove_html_tags)
df.news = df.news.progress_apply(expand_contractions)

In [None]:
df.summary = df.summary.progress_apply(convert_text_to_lowercase)
df.summary = df.summary.progress_apply(remove_url)
df.summary = df.summary.progress_apply(remove_emoji)
df.summary = df.summary.progress_apply(remove_html_tags)
df.summary = df.summary.progress_apply(expand_contractions)

## Text Tokenization and Padding

In [None]:
# Wrap every summary in start/end markers for the decoder.
# NOTE(review): the inference code looks these up as 'start' and 'end', so the
# tokenizer presumably strips the angle brackets - the markers are then
# indistinguishable from the ordinary words "start" and "end".
df['summary'] = df['summary'].map(lambda text: '<start> ' + text.strip() + ' <end>')

In [None]:
# Vocabulary cap: passed to the Keras Tokenizer as `num_words` and used as the
# vocabulary size of both embeddings and of the output layer when the model
# is built.
MAX_VOCAB_SIZE = 30000

In [None]:
def create_tokenizer(texts):
    """Return a Keras `Tokenizer` (`num_words=MAX_VOCAB_SIZE`) fitted on `texts`."""
    fitted_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    fitted_tokenizer.fit_on_texts(texts)
    return fitted_tokenizer

In [None]:
def get_max_sequence_length(sentences):
    """Return the largest whitespace-delimited word count among `sentences`.

    Parameters:
    - sentences: iterable of strings

    Returns:
    - The word count of the longest sentence, or 0 when `sentences` is empty
      (a bare `max()` over an empty sequence would raise ValueError).
    """
    return max((len(line.split()) for line in sentences), default=0)

In [None]:
# Corpus statistics for the articles alone (tokenizer fitted on the news column only).
news_tokenizer = create_tokenizer(df.news)
news_vocab_size = 1 + len(news_tokenizer.word_index)
max_news_len = get_max_sequence_length(df.news)
print("Maximum length of news:", max_news_len)
print("Number of unique words in news:", news_vocab_size)

In [None]:
# Corpus statistics for the summaries alone (tokenizer fitted on the summary column only).
summary_tokenizer = create_tokenizer(df.summary)
summary_vocab_size = 1 + len(summary_tokenizer.word_index)
max_summary_len = get_max_sequence_length(df.summary)
print("Maximum length of summary:", max_summary_len)
print("Number of unique words in summary:", summary_vocab_size)

In [None]:
# One tokenizer fitted on articles and summaries together, so both sides of
# the model share the same word-to-id mapping.
combined_texts = df.news.tolist() + df.summary.tolist()
tokenizer = create_tokenizer(combined_texts)
len(tokenizer.word_index) + 1

In [None]:
# Replace the raw text in both columns with lists of integer token ids.
for text_column in ('news', 'summary'):
    df[text_column] = tokenizer.texts_to_sequences(df[text_column])

In [None]:
# Sanity check: decode one randomly chosen tokenized article back into words.
random_news_row = np.random.randint(0,df.news.shape[0])
for token_id in df.news.iloc[random_news_row]:
    print(tokenizer.index_word.get(token_id,'UNKNOWN'),end=' ')

In [None]:
# Sanity check: decode one randomly chosen tokenized summary back into words.
random_summary_row = np.random.randint(0,df.summary.shape[0])
for token_id in df.summary.iloc[random_summary_row]:
    print(tokenizer.index_word.get(token_id,'UNKNOWN'),end=' ')

In [None]:
# Fixed sequence lengths used for padding/truncation and for the model's
# input shapes. These overwrite the corpus maxima computed earlier.
max_news_len = 1000
max_summary_len = 100

In [None]:
X = pad_sequences(df.news,maxlen=max_news_len,padding='post',truncating='post')
y = pad_sequences(df.summary,maxlen=max_summary_len,padding='post',truncating='post')

In [None]:
X.shape, y.shape

## Performing train-test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42,shuffle=True)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Defining the model architecture

In [None]:
def attention_model(src_vocab_size,dest_vocab_size,num_lstm_units,num_dense_units,emb_dim,src_timesteps,dest_timesteps):
    """
        Builds an encoder-decoder based Seq2Seq model with attention for text summarization.

        Parameters:
        - src_vocab_size: `input_dim` of the encoder embedding
        - dest_vocab_size: `input_dim` of the decoder embedding and number of output classes
        - num_lstm_units: Units in each of the four LSTM layers
        - num_dense_units: Units in the ReLU layer placed before the output layer
        - emb_dim: Output dimension of both embedding layers
        - src_timesteps: Length of the encoder input sequences
        - dest_timesteps: Length of the decoder input sequences

        Returns:
        - An uncompiled Keras `Model` that takes [encoder_inputs, decoder_inputs]
          and outputs, per decoder timestep, a softmax over `dest_vocab_size` classes.
    """
    # Encoder: embedding followed by two stacked LSTMs
    encoder_inputs = Input(shape=(src_timesteps,),name='encoder_inputs')
    encoder_embeddings = Embedding(input_dim=src_vocab_size,output_dim=emb_dim,name='encoder_embeddings')(encoder_inputs)

    encoder_lstm1 = LSTM(units=num_lstm_units,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.1,name='encoder_lstm1')
    encoder_outputs1, _, _ = encoder_lstm1(encoder_embeddings)

    encoder_lstm2 = LSTM(units=num_lstm_units,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.1,name='encoder_lstm2')
    encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm2(encoder_outputs1)

    # Final states of the second encoder LSTM initialise the first decoder LSTM
    encoder_states = [encoder_state_h, encoder_state_c]

    # Decoder: embedding followed by two stacked LSTMs
    decoder_inputs = Input(shape=(dest_timesteps,),name='decoder_inputs')
    decoder_embeddings = Embedding(input_dim=dest_vocab_size,output_dim=emb_dim,name='decoder_embeddings')(decoder_inputs)

    decoder_lstm1 = LSTM(units=num_lstm_units,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.1,name='decoder_lstm1')
    decoder_outputs1, _, _ = decoder_lstm1(decoder_embeddings,initial_state=encoder_states)

    decoder_lstm2 = LSTM(units=num_lstm_units,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.1,name='decoder_lstm2')
    decoder_outputs, _, _ = decoder_lstm2(decoder_outputs1)

    # Attention: decoder outputs are the query, encoder outputs the value
    context_vector = Attention(name='attention_layer')([decoder_outputs,encoder_outputs])

    # Concatenate context vector with decoder outputs
    decoder_concat_output = Concatenate(axis=-1,name='decoder_concat_output')([decoder_outputs,context_vector])

    # Apply layer normalization for smooth training with stable gradient flow
    decoder_concat_output = LayerNormalization(name='layer_normalization')(decoder_concat_output)

    # Full connected layer
    decoder_dense = TimeDistributed(Dense(units=num_dense_units, activation='relu', name='fc_dense_layer1'))(decoder_concat_output)

    # Output layer. The notebook enables the global 'mixed_float16' policy; the
    # softmax over the vocabulary is pinned to float32 so the probabilities
    # (and the loss computed from them) do not underflow in half precision.
    decoder_final_outputs = TimeDistributed(
        Dense(units=dest_vocab_size, activation='softmax', name='output_layer', dtype='float32'),
        dtype='float32'
    )(decoder_dense)

    # Build the final model
    model = Model(inputs=[encoder_inputs,decoder_inputs],outputs=decoder_final_outputs)
    return model

In [None]:
model = attention_model(
    src_vocab_size=MAX_VOCAB_SIZE,
    dest_vocab_size=MAX_VOCAB_SIZE,
    num_lstm_units=128,
    num_dense_units=64,
    emb_dim=300,
    src_timesteps=max_news_len,
    dest_timesteps=max_summary_len
)
model.summary()

In [None]:
plot_model(model,to_file='model.png',show_shapes=True,dpi=200,show_layer_names=True)

## Model Training & Evaluation

In [None]:
# Adam with gradient-norm clipping at 1.0.
adam = Adam(learning_rate=1e-3,clipnorm=1.0)

# Targets are integer token ids, hence the sparse variant of the loss.
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
decay_rate = 0.05

# Learning rate observed at epoch 0; every later epoch decays from this value.
_initial_lr = None

def time_based_decay(epoch, lr):
    """
        Time-based learning-rate schedule: initial_lr / (1 + decay_rate * epoch).

        Keras' LearningRateScheduler calls the schedule with the CURRENT
        (already decayed) learning rate. Dividing that value again each epoch
        would compound the decay, so the rate seen at epoch 0 is remembered
        and used as the base for all later epochs.

        Parameters:
        - epoch: Zero-based epoch index
        - lr: Learning rate currently set on the optimizer

        Returns:
        - The learning rate to use for this epoch
    """
    global _initial_lr
    if epoch == 0 or _initial_lr is None:
        _initial_lr = lr
    return _initial_lr / (1 + decay_rate * epoch)

In [None]:
callbacks = [
    EarlyStopping(monitor='val_accuracy',patience=5,mode='max',restore_best_weights=True,verbose=1,start_from_epoch=5),
    LearningRateScheduler(schedule=time_based_decay,verbose=1),
    TensorBoard(log_dir='./',histogram_freq=1,write_graph=True,update_freq='epoch',embeddings_freq=1),
    ModelCheckpoint(filepath='news_summarizer.keras',monitor='val_accuracy',mode='max',save_best_only=True,verbose=1,save_freq='epoch')
]

def _next_token_targets(decoder_inputs):
    """Return `decoder_inputs` shifted one step to the left, zero-padded at the end."""
    targets = np.zeros_like(decoder_inputs)
    targets[:, :-1] = decoder_inputs[:, 1:]
    return targets

# Teacher forcing: at step t the decoder receives token t and must predict
# token t+1. Using the same array as decoder input and as target would let the
# model reach a high score simply by copying its input.
y_train_targets = _next_token_targets(y_train)
y_test_targets = _next_token_targets(y_test)

# NOTE(review): the held-out split doubles as validation data for early
# stopping and checkpoint selection, so scores on it are optimistic.
r = model.fit(
    [X_train,y_train],
    y_train_targets,
    epochs=20,
    batch_size=256,
    validation_data=([X_test,y_test],y_test_targets),
    callbacks=callbacks,
    verbose=True
)

## Model Performance Visualization

In [None]:
# Training vs. validation loss per epoch.
loss_fig, loss_ax = plt.subplots(figsize=(9,6))
loss_ax.plot(r.history['loss'],'r',label='train loss',marker='o')
loss_ax.plot(r.history['val_loss'],'b',label='validation loss',marker='o')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Sparse Categorical Crossentropy Loss')
loss_ax.set_title('Loss Graph')
loss_ax.legend()
loss_fig.tight_layout()
plt.show();

In [None]:
# Training vs. validation accuracy per epoch.
acc_fig, acc_ax = plt.subplots(figsize=(9,6))
acc_ax.plot(r.history['accuracy'],'r',label='train accuracy',marker='o')
acc_ax.plot(r.history['val_accuracy'],'b',label='validation accuracy',marker='o')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Accuracy Graph')
acc_ax.legend()
acc_fig.tight_layout()
plt.show();

## Model Inference

In [None]:
def summarize_news(model, tokenizer, X_test, y_test, num_samples=10):
    """
        Displays predicted vs actual summaries for a few test samples.

        The reference summaries are passed to the model as decoder inputs, so
        the printed predictions are teacher-forced, not free-running generations.

        Parameters:
        - model: Trained seq2seq model
        - tokenizer: Tokenizer used for both input and output
        - X_test: Padded input news articles
        - y_test: Padded output summaries
        - num_samples: Number of examples to display (capped at the number of
          available test rows)
    """
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    # Ids hidden from summaries: padding plus the start/end markers
    summary_skip_ids = {0, tokenizer.word_index.get('start'), tokenizer.word_index.get('end')}
    # Articles carry no markers; only padding is hidden so that the ordinary
    # words "start" and "end" stay visible
    news_skip_ids = {0}

    def decode(token_ids, skip_ids):
        return ' '.join(index_to_word.get(idx,'UNKNOWN') for idx in token_ids if idx not in skip_ids)

    X_test_subset = X_test[:num_samples]
    y_test_subset = y_test[:num_samples]
    if len(X_test_subset) == 0:
        return

    # Predict on test samples
    predictions = model.predict([X_test_subset,y_test_subset],verbose=0,batch_size=256)
    predicted_token_ids = np.argmax(predictions,axis=-1)

    # Iterate over the rows actually present, which may be fewer than num_samples
    rows = zip(X_test_subset, y_test_subset, predicted_token_ids)
    for i, (news_ids, actual_ids, predicted_ids) in enumerate(rows):
        print(f"\n News {i+1}:\n{decode(news_ids, news_skip_ids)}")
        print(f"\n Actual Summary:\n{decode(actual_ids, summary_skip_ids)}")
        print(f"\n Predicted Summary:\n{decode(predicted_ids, summary_skip_ids)}")
        print("-" * 90)

In [None]:
# Print article, reference summary and predicted summary for the first 10 test rows.
summarize_news(model,tokenizer,X_test,y_test,num_samples=10)

In [None]:
# id -> word lookups (both built from the shared tokenizer)
reverse_tokenizer_news = {idx: word for word, idx in tokenizer.word_index.items()}
reverse_tokenizer_summary = dict(reverse_tokenizer_news)

batch_size = 256

predicted_summaries = []
reference_summaries = []

# Ids left out of the decoded text: padding and the start/end markers
ignored_token_ids = [0,tokenizer.word_index.get('start'),tokenizer.word_index.get('end')]

def _ids_to_text(token_ids):
    """Join the words for `token_ids`, skipping the ignored ids."""
    return " ".join([reverse_tokenizer_summary.get(idx,'UNKNOWN') for idx in token_ids if idx not in ignored_token_ids])

# Make predictions on test data in batches.
# The reference summaries are fed as decoder inputs (teacher forcing).
for batch_start in tqdm(range(0, len(X_test), batch_size)):
    batch_stop = min(batch_start + batch_size,len(X_test))
    X_batch = X_test[batch_start:batch_stop]
    y_batch = y_test[batch_start:batch_stop]

    # Predict on the batch and keep the most probable token at every step
    predictions = model.predict([X_batch,y_batch],verbose=0,batch_size=batch_size)
    predicted_tokens = np.argmax(predictions,axis=-1)

    # Predicted summaries are stored as strings
    predicted_summaries.extend(_ids_to_text(token_ids) for token_ids in predicted_tokens)

    # References are stored as a one-element list of token lists per sample
    reference_summaries.extend([_ids_to_text(token_ids).split()] for token_ids in y_batch)

In [None]:
# Evaluate BLEU score for the predicted summaries
# Corpus-level BLEU: each sample contributes its list of tokenized references
# and one tokenized predicted summary.
tokenized_predictions = [summary.split() for summary in predicted_summaries]
bleu_score = corpus_bleu(reference_summaries,tokenized_predictions)
print(f"BLEU Score: {bleu_score:.4f}")