<a href="https://colab.research.google.com/github/GirishNaik711/Abstractive_text_summarizer/blob/main/Abstractive_Text_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install torch sacrebleu transformers[sentencepiece] rouge_score py7zr datasets -q

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import tensorflow_datasets as tfds
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing import sequence
from datasets import load_dataset, load_metric

In [None]:
import string

In [None]:
# Load CNN/DailyMail (version 3.0.0) through Hugging Face `datasets`.
# `data` and `dataset` are two names for the same loaded object.
data = dataset = load_dataset("cnn_dailymail", version="3.0.0")

In [None]:
# Keep the first 100,000 training examples. The slice is indexed by column
# name ('article', 'highlights') in the cells that follow.
data_train = data['train'][:100000]

In [None]:
# Preview only a few articles: displaying the whole column would write
# 100,000 full-length articles into the cell output.
data_train['article'][:3]

In [None]:
# Shared clean-up for articles and highlights: collapse every whitespace run
# to a single space, delete ASCII punctuation, then lower-case.
_strip_punctuation = str.maketrans('', '', string.punctuation)


def _clean_text(raw_text):
    """Return `raw_text` whitespace-collapsed, punctuation-free and lower-cased."""
    collapsed = ' '.join(raw_text.split())
    return collapsed.translate(_strip_punctuation).lower()


input_texts = list(map(_clean_text, data_train['article']))
target_texts = list(map(_clean_text, data_train['highlights']))

In [None]:
# Fixed sequence lengths (in tokens) for articles and summaries.
max_input_len = 500
max_output_len = 100

# One word-level vocabulary per side, each fitted on its own cleaned texts.
input_tokenizer, target_tokenizer = Tokenizer(), Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

# Vocabulary size is the number of known words plus one extra id.
input_vocab_size = 1 + len(input_tokenizer.word_index)
output_vocab_size = 1 + len(target_tokenizer.word_index)

# Texts -> lists of integer word ids.
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Pad at the end up to the fixed lengths.
# NOTE(review): `truncating` is left at its default, so over-long sequences
# are presumably cut from the front — confirm that dropping the start of an
# article is intended.
encoder_input_data = pad_sequences(input_sequences, padding='post', maxlen=max_input_len)
decoder_input_data = pad_sequences(target_sequences, padding='post', maxlen=max_output_len)

In [None]:
# Teacher forcing: the target at step t is the token the decoder should emit
# next, i.e. the decoder input at step t + 1. The last step has no successor
# and stays 0.
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]


In [None]:
# Layer sizes.
embedding_dim = 100
hidden_units = 256

# Encoder: embed the article ids and keep only the final LSTM state (h, c).
encoder_input = Input(shape=(max_input_len,))
encoder_embedding = Embedding(
    input_dim=input_vocab_size, output_dim=embedding_dim, mask_zero=True
)(encoder_input)
encoder_lstm = LSTM(units=hidden_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder: embed the summary ids, start the LSTM from the encoder state and
# emit a softmax over the summary vocabulary at every time step.
decoder_input = Input(shape=(max_output_len,))
decoder_embedding = Embedding(
    input_dim=output_vocab_size, output_dim=embedding_dim, mask_zero=True
)(decoder_input)
decoder_lstm = LSTM(units=hidden_units, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_dense = Dense(units=output_vocab_size, activation='softmax')
output = decoder_dense(decoder_output)

In [None]:
from keras.models import Model

# Training model: (article ids, summary ids) -> per-step word distribution.
model = Model([encoder_input, decoder_input], output)

# `learning_rate` is not a compile() argument; the step size is configured on
# the optimizer instance.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

batch_size = 32
epochs = 10

# The targets get a trailing axis of size 1 before being passed to fit().
model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size, epochs=epochs)


In [None]:
def summarize(input_text):
    """Greedily decode a summary for one article with the trained model.

    The article is tokenised and padded to `max_input_len`. The decoder input
    starts as all zeros and is filled one position per step with the argmax
    prediction, until the word 'end' is predicted or `max_output_len` is
    reached.

    Args:
        input_text: Article text, cleaned the same way as the training texts.

    Returns:
        The decoded summary as a single string.
    """
    input_sequence = input_tokenizer.texts_to_sequences([input_text])
    encoder_tokens = pad_sequences(input_sequence, maxlen=max_input_len, padding='post')
    # NOTE(review): decoding starts from an all-zero decoder input; no start
    # token is ever written into position 0.
    decoder_tokens = np.zeros((1, max_output_len))

    # None when 'end' is not in the vocabulary; then the loop simply runs to
    # the length limit instead of raising KeyError.
    end_id = target_tokenizer.word_index.get('end')

    for i in range(max_output_len - 1):
        # verbose=0: no progress output for each of the per-token predict calls.
        predictions = model.predict([encoder_tokens, decoder_tokens], verbose=0)
        predicted_id = np.argmax(predictions[0, i, :])

        if predicted_id == end_id:
            break
        decoder_tokens[0, i + 1] = predicted_id

    return target_tokenizer.sequences_to_texts(decoder_tokens)[0]


In [None]:
# Summarise one cleaned training article and print it next to the result.
input_text = input_texts[84]
summary = summarize(input_text)
for label, text in (('Input:', input_text), ('Summary:', summary)):
    print(label, text)

# BART Summarizer

In [None]:
! pip install torch sacrebleu transformers[sentencepiece] rouge_score py7zr datasets -q

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline, set_seed
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
# Download the "punkt" NLTK resource; sent_tokenize is used in later cells.
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead —
# confirm against the installed version.
nltk.download("punkt")

In [None]:
# Load CNN/DailyMail 3.0.0 and list the columns of the training split.
dataset = load_dataset("cnn_dailymail", version="3.0.0")
train_columns = dataset['train'].column_names
print(f"Feature of the CNN_DailyMail: {train_columns}")

In [None]:
# Look at one training example: character counts plus an excerpt of the
# article and the full reference summary.
sample = dataset["train"][1]
article, highlights = sample['article'], sample['highlights']

print(f"Article total length: {len(article)} \n")
print(f"""Article"s excerpt of 500 characters: {article[:500]} \n""")

print(f"Summary Length: {len(highlights)} \n")
print(f"SUMMARY: {highlights}")


Article total length: 4051 

Article"s excerpt of 500 characters: Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s 

Summary Length: 281 

SUMMARY: Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


In [None]:
# First 1,000 characters of training article 1, used as the shared input for
# the summarisers compared below.
sample_article = dataset["train"][1]["article"]
sample_text = sample_article[:1000]

# Maps a model name to the summary it produced for `sample_text`.
summaries = dict()

# Baseline Model

In [None]:
def baseline_summary_three_sent(text):
  """Return the first three sentences of `text`, joined by newlines.

  Sentences are split with NLTK's `sent_tokenize`; texts with fewer than
  three sentences are returned in full.
  """
  leading_sentences = sent_tokenize(text)[:3]
  return "\n".join(leading_sentences)

# Store the three-sentence baseline for the sample text and display it.
baseline_text = baseline_summary_three_sent(sample_text)
summaries["Baseline"] = baseline_text
baseline_text

'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."'

# BART

In [None]:
# Summarise the sample with the facebook/bart-large-cnn checkpoint and store
# the result one sentence per line, like the baseline.
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
bart_sentences = sent_tokenize(pipe_out[0]["summary_text"])
summaries["Bart"] = "\n".join(bart_sentences)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Display the BART summary stored for the sample text.
summaries['Bart']

'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated.\nMost often, they face drug charges or charges of assaulting an officer.\nJudge Steven Leifman says the arrests often result from confrontations with police.'

# ROUGE

In [None]:
# Score every stored summary against the reference highlights of the same
# article and collect the mid F-measure of each ROUGE variant.
rouge_metric = load_metric("rouge")
rouge_names = ("rouge1", "rouge2", "rougeL", "rougeLsum")

reference = dataset["train"][1]["highlights"]

records = []
for model_name, candidate in summaries.items():
  rouge_metric.add(prediction=candidate, reference=reference)
  score = rouge_metric.compute()
  rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
  print("rouge_dict ", rouge_dict )
  records.append(rouge_dict)

# One row per model, one column per ROUGE variant.
pd.DataFrame.from_records(records, index=list(summaries))

  rouge_metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

rouge_dict  {'rouge1': 0.365079365079365, 'rouge2': 0.14516129032258066, 'rougeL': 0.20634920634920634, 'rougeLsum': 0.2857142857142857}
rouge_dict  {'rouge1': 0.3655913978494624, 'rouge2': 0.13186813186813184, 'rougeL': 0.2150537634408602, 'rougeLsum': 0.3225806451612903}


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
Baseline,0.365079,0.145161,0.206349,0.285714
Bart,0.365591,0.131868,0.215054,0.322581


# Test Set Evaluation

Baseline Model

In [None]:
def calculate_metric_on_test_ds(dataset, metric, column_text = "article", column_summary = "highlights"):
  summaries = [baseline_summary_three_sent(text) for text in dataset[column_text]]
  metric.add_batch(predictions = summaries, references = dataset[column_summary])

  score = metric.compute()

  return score


In [None]:
# Evaluate the baseline on a fixed random sample of 1,000 examples.
# NOTE(review): the sample is drawn from dataset["train"], not from a held-out
# split — confirm this is intended for a "test set" evaluation.
shuffled_train = dataset["train"].shuffle(seed=42)
test_sample = shuffled_train.select(range(1000))

score = calculate_metric_on_test_ds(test_sample, rouge_metric)
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

# Single row labelled "Baseline", one column per ROUGE variant.
pd.DataFrame([rouge_dict], index=["Baseline"])


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
Baseline,0.253772,0.100706,0.165549,0.232214


BART

In [None]:
from tqdm import tqdm
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

def generate_batch_sized_chunks(list_of_elements, batch_size):
  """Yield consecutive slices of `list_of_elements`, each at most `batch_size` long.

  The last slice is shorter when the length is not a multiple of
  `batch_size`; an empty input yields nothing.
  """
  chunk_starts = range(0, len(list_of_elements), batch_size)
  yield from (list_of_elements[start : start + batch_size] for start in chunk_starts)

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size = 8, device = device, column_text="article", column_summary="highlights"):
  """Generate summaries with `model` in batches and score them with `metric`.

  Args:
    dataset: Mapping-like object indexable by column name.
    metric: Object exposing `add_batch(predictions=..., references=...)`
      and `compute()`.
    model: Seq2seq model exposing `generate(...)`.
    tokenizer: Tokenizer matching `model`; used to encode and decode.
    batch_size: Number of articles generated per step.
    device: Device the encoded inputs are moved to before generation.
    column_text: Column holding the source texts.
    column_summary: Column holding the reference summaries.

  Returns:
    Whatever `metric.compute()` returns once every batch has been added.
  """
  article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
  target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

  for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total = len(article_batches)):

      # Encode the batch to fixed-length tensors of 512 tokens.
      inputs = tokenizer(article_batch, max_length=512, truncation = True, padding = "max_length", return_tensors = "pt" )

      # Beam search with 8 beams, at most 128 generated tokens.
      generated_ids = model.generate(input_ids = inputs["input_ids"].to(device),
                                     attention_mask = inputs["attention_mask"].to(device),
                                     length_penalty = 0.8, num_beams = 8, max_length = 128)

      decoded_summaries = [tokenizer.decode(s, skip_special_tokens = True, clean_up_tokenization_spaces = True) for s in generated_ids]
      # Any literal "<n>" left in the decoded text is replaced by a space.
      decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

      # The keyword is `references` (plural): the singular `reference` is not
      # an add_batch() argument, so the targets were never registered.
      metric.add_batch(predictions = decoded_summaries, references = target_batch)

  score = metric.compute()

  return score

In [None]:
# Load the BART checkpoint and its tokenizer, then move the model to `device`.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Same fixed 1,000-example sample as the baseline evaluation.
# NOTE(review): the sample is drawn from dataset["train"], not from a held-out
# split — confirm this is intended for a "test set" evaluation.
shuffled_train = dataset["train"].shuffle(seed=42)
test_sample = shuffled_train.select(range(1000))

score = calculate_metric_on_test_ds(test_sample, rouge_metric, model, tokenizer)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

# Single row labelled "Bart", one column per ROUGE variant.
pd.DataFrame([rouge_dict], index=["Bart"])
