In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
# Fetch the NLTK 'punkt' data package (reported as already up-to-date when it is present locally).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Colab-only: mount Google Drive at /content/drive, where the dataset CSV is read from.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the news CSV, keep only the article body and headline columns, drop
# incomplete rows, and rename the columns to `text` / `summary`.
df = (
    pd.read_csv("/content/drive/MyDrive/archive (5)/news_summary.csv", encoding='latin-1')
    [['ctext', 'headlines']]
    .dropna()
    .rename(columns={'ctext': 'text', 'headlines': 'summary'})
)

In [None]:
# Keep a reproducible random subset of 200 rows and renumber them from 0.
df = (
    df.sample(n=200, random_state=42)
      .reset_index(drop=True)
)

In [None]:
def clean_text(text):
    """Normalise raw text into lowercase, letters-only, single-spaced words.

    Steps, in order: lowercase; drop parenthesised spans; drop double quotes;
    drop a possessive ``'s``; replace every non-letter with a space; collapse
    whitespace runs; trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (empty if nothing but non-letters was supplied).
    """
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)  # remove content in parentheses
    text = text.replace('"', '')
    text = re.sub(r"'s\b", "", text)  # possessive suffix
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Raw string on purpose: "\s" inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Normalise the article bodies with clean_text.
df['clean_text'] = df['text'].map(clean_text)

# Summaries are cleaned first and only then wrapped in the start/end markers,
# so the markers' underscores are not stripped by clean_text.
df['clean_summary'] = '_start_ ' + df['summary'].map(clean_text) + ' _end_'

In [None]:
# Length caps used as `maxlen` when padding article and summary sequences.
max_text_len, max_summary_len = 100, 15

# Source-side vocabulary, fitted on the cleaned articles.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(df['clean_text'])
x_vocab_size = 1 + len(x_tokenizer.word_index)

In [None]:
# Articles -> integer id sequences, padded at the end up to max_text_len.
x_seq = pad_sequences(
    x_tokenizer.texts_to_sequences(df['clean_text']),
    maxlen=max_text_len,
    padding='post',
)

In [None]:
# Target-side vocabulary. filters='' disables the tokenizer's punctuation
# stripping so the underscores of the `_start_` / `_end_` markers survive.
y_tokenizer = Tokenizer(filters='')
y_tokenizer.fit_on_texts(df['clean_summary'])
y_vocab_size = len(y_tokenizer.word_index) + 1

y_token_ids = y_tokenizer.texts_to_sequences(df['clean_summary'])

# pad_sequences truncates from the FRONT by default, which would cut the
# `_start_` marker off every summary longer than max_summary_len. Truncate
# from the end instead so position 0 is always `_start_`.
y_seq = pad_sequences(
    y_token_ids,
    maxlen=max_summary_len,
    padding='post',
    truncating='post',
)

# Tail truncation removes `_end_` from over-long summaries; put it back in the
# last slot so every target sequence still terminates explicitly.
end_token_id = y_tokenizer.word_index['_end_']
for row, token_ids in enumerate(y_token_ids):
    if len(token_ids) > max_summary_len:
        y_seq[row, -1] = end_token_id


In [None]:
# Hold out 10% of the (article, summary) pairs for validation; fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x_seq,
    y_seq,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Width shared by the Embedding and LSTM layers below. This must be a live
# assignment: the model-building cells read `latent_dim`, and with the line
# commented out a fresh kernel fails with NameError.
latent_dim = 300

In [None]:
# Encoder: embed the padded article token ids and run them through an LSTM,
# keeping only the LSTM's returned states.
encoder_inputs = Input(shape=(max_text_len,))
enc_emb = Embedding(x_vocab_size, latent_dim, trainable=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)  # the LSTM's own output is discarded
encoder_states = [state_h, state_c]  # passed to the decoder LSTM as its initial state

In [None]:
# Decoder: variable-length summary token ids -> embedding -> LSTM seeded with
# the encoder states -> per-timestep softmax over the summary vocabulary.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(y_vocab_size, latent_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# The layer objects are kept in their own variables because the inference
# decoder built later calls the same instances again.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)  # returned states unused here
decoder_dense = Dense(y_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (article ids, shifted summary ids) -> next-token distribution.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)
# Sparse loss: targets are integer token ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.summary()

In [None]:
# Decoder input drops the last token of each summary; the target drops the
# first token and gains a trailing axis, so step t is trained to emit token t+1.
y_train_input, y_train_target = y_train[:, :-1], y_train[:, 1:, np.newaxis]
y_val_input, y_val_target = y_val[:, :-1], y_val[:, 1:, np.newaxis]

In [None]:
# Train for 20 epochs in batches of 32, reporting loss on the held-out split.
model.fit(
    x=[x_train, y_train_input],
    y=y_train_target,
    validation_data=([x_val, y_val_input], y_val_target),
    batch_size=32,
    epochs=20,
)


Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 3.4860 - val_loss: 5.8168
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 3.4825 - val_loss: 5.8076
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 3.4585 - val_loss: 5.8156
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 3.4721 - val_loss: 5.8211
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 3.4770 - val_loss: 5.8307
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 3.4580 - val_loss: 5.8152
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 3.4616 - val_loss: 5.8147
Epoch 8/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 3.4366 - val_loss: 5.8129
Epoch 9/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x7834a04ff510>

In [None]:
# Inference-time encoder: article ids in, the two LSTM state tensors out.
encoder_model_inf = Model(inputs=encoder_inputs, outputs=encoder_states)

In [None]:
# Placeholders through which the previous step's LSTM states are fed back
# into the inference decoder (hidden state first, then cell state).
decoder_states_inputs = [
    Input(shape=(latent_dim,)),
    Input(shape=(latent_dim,)),
]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs


In [None]:
# Inference decoder graph: call the SAME embedding / LSTM / dense layer objects
# used by the training model, but seed the LSTM from the externally supplied
# state inputs and expose the updated states as outputs.
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]  # returned so the caller can feed them into the next step
decoder_outputs2 = decoder_dense(decoder_outputs2)

In [None]:
# Inference-time decoder: (token ids, h, c) -> (token distribution, new h, new c).
decoder_model_inf = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs2, *decoder_states2],
)

In [None]:
# Lookup tables taken from the fitted tokenizers.
target_word_index = y_tokenizer.word_index          # summary word -> id
reverse_target_word_index = y_tokenizer.index_word  # summary id -> word
reverse_source_word_index = x_tokenizer.index_word  # article id -> word

In [None]:
# Sanity check that the `_start_` marker made it into the target vocabulary;
# decode_sequence looks it up to seed the decoder.
print('_start_' in target_word_index)  # Should print: True
print(target_word_index['_start_'])   # Should print an integer > 0


True
1


In [None]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded article.

    The article is encoded once; the decoder is then run one token at a time,
    feeding back the arg-max token and the updated LSTM states, until it emits
    `_end_` or the step budget is used up.

    Args:
        input_seq: Array of article token ids passed straight to
            `encoder_model_inf.predict` (callers in this notebook pass shape
            (1, max_text_len)).

    Returns:
        The decoded words joined by single spaces, without the start/end
        markers. May be an empty string.
    """
    # verbose=0: otherwise every single-token predict call prints a progress bar.
    states_value = list(encoder_model_inf.predict(input_seq, verbose=0))

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['_start_']

    decoded_tokens = []
    # Bound the loop by decoder STEPS, not by emitted words: a predicted id with
    # no vocabulary entry (e.g. the padding id 0) yields no word, so a
    # word-count stop condition could spin forever.
    for _ in range(max_summary_len - 1):
        output_tokens, h, c = decoder_model_inf.predict(
            [target_seq] + states_value, verbose=0
        )

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = reverse_target_word_index.get(sampled_token_index, '')

        if sampled_token == '_end_':
            break
        if sampled_token:
            decoded_tokens.append(sampled_token)

        # Feed the chosen token and the new states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_tokens)

In [None]:
# train_test_split shuffles the rows, so row i of `df` is NOT the article behind
# x_val[i]. Rebuild the article and reference summary from the validation
# sequences themselves (id 0 is padding and is skipped). The text shown is
# therefore exactly what the model was fed, after padding/truncation.
for i in range(min(5, len(x_val))):
    input_seq = x_val[i].reshape(1, max_text_len)
    source_text = ' '.join(
        reverse_source_word_index[token_id] for token_id in x_val[i] if token_id != 0
    )
    reference_summary = ' '.join(
        reverse_target_word_index[token_id] for token_id in y_val[i] if token_id != 0
    )
    print("Original:", source_text)
    print("Actual Summary:", reference_summary)
    print("Predicted Summary:", decode_sequence(input_seq))
    print("-" * 100)

Original: washington mar a year old woman in the us who was apprehended twice for allegedly trying to jump the white house fence last week has been arrested for scaling a fence at the treasury building marci anderson wahl of everett washington was arrested after an alarm sounded at about am yesterday when she scaled a fence at the treasury building next to the white house police said wahl has told them she was there to speak to us president donald trump the cnn reported she was charged with unlawful entry and contempt of court wahl was first arrested on march last week for trying to jump the white house fence once in custody it was determined that wahl had been issued a stay away order for the white house complex after the incident on march officers saw wahl walking and staring at the white house complex before discovering she had jumped a fence on the south side but got stuck officers found her hanging from the inside of the fence by her shoelaces which were caught on top of the fence

In [None]:
!pip install rouge-score nltk




In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothie = SmoothingFunction().method4

rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
bleu_scores = []

print("\nEvaluation on Validation Set:\n")

for i in range(len(x_val)):
    input_seq = x_val[i].reshape(1, max_text_len)
    predicted_summary = decode_sequence(input_seq)

    # The reference must come from y_val[i]: the split shuffled the rows, so
    # df.iloc[i] belongs to a different article than x_val[i]. Padding (id 0)
    # and the start/end markers are left out of the reference text.
    ref_words = [
        reverse_target_word_index[token_id] for token_id in y_val[i] if token_id != 0
    ]
    ref_summary = ' '.join(
        word for word in ref_words if word not in ('_start_', '_end_')
    )

    # ROUGE — RougeScorer.score expects (target, prediction) in that order.
    scores = rouge.score(ref_summary, predicted_summary)
    for k in rouge_scores:
        rouge_scores[k].append(scores[k].fmeasure)

    # BLEU — sentence_bleu takes a list of tokenised references and one
    # tokenised hypothesis.
    ref_tokens = [ref_summary.split()]
    pred_tokens = predicted_summary.split()
    bleu = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)
    bleu_scores.append(bleu)

# Average Scores
print("ROUGE-1 F1:", np.mean(rouge_scores['rouge1']))
print("ROUGE-2 F1:", np.mean(rouge_scores['rouge2']))
print("ROUGE-L F1:", np.mean(rouge_scores['rougeL']))
print("BLEU Score:", np.mean(bleu_scores))



Evaluation on Validation Set:

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━