In [None]:
# Importing the required libraries

%%capture
!pip install transformers
!pip install datasets
!pip install sentencepiece

In [None]:
# Install transformers together with its `torch` extra, then upgrade
# accelerate to the newest available release (-U).
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
# Fetch the spaCy model that is loaded later with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg    # download en_core_web_lg model

2023-12-01 06:41:53.836937: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 06:41:53.836995: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 06:41:53.837044: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 06:41:53.845145: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-01 06:41:58.422150: I tensorflow/c

In [None]:
# Mount Google Drive at /content/drive; the saved model archive is read from
# /content/drive/MyDrive further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Same as before

import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

import re
import spacy

# Load the en_core_web_lg spaCy pipeline once; the entity-based helper
# functions in this notebook call `nlp(...)` on summaries.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Install a global 'ignore' filter so that warnings raised through the
# `warnings` module are not displayed in the cells that follow.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Same as before

def entity_based_filtered_sentences(text,summary):
    """Drop summary sentences that mention an entity absent from the source.

    The summary is parsed with the module-level spaCy pipeline ``nlp``. A
    sentence is discarded when it contains at least one entity of a tracked
    type whose text does not occur in ``text`` (case-insensitive substring
    check). The remaining sentences are joined with single spaces, in their
    original order.

    Args:
        text: Source document the summary is checked against.
        summary: Summary whose sentences are filtered.

    Returns:
        The filtered summary as a single string (empty if no sentence survives).
    """
    tracked_types = ('PERSON', 'FAC', 'GPE', 'ORG', 'NORP', 'LOC', 'EVENT')
    parsed = nlp(summary)

    # Texts of the sentences that contain an entity missing from the source.
    rejected = set()
    for ent in parsed.ents:
        # The entity type is read from the first token of the entity span.
        if ent[0].ent_type_ not in tracked_types:
            continue
        if ent.text.lower() in text.lower():
            continue
        rejected.add(ent.sent.text)

    return " ".join(
        sentence.text for sentence in parsed.sents if sentence.text not in rejected
    )

def create_ent_augmented_target(text,summary):
    """Prefix a summary with the entities it shares with the source text.

    The summary is parsed with the module-level spaCy pipeline ``nlp``. Every
    entity of a tracked type whose text also occurs in ``text``
    (case-insensitive substring check) is collected in order of appearance,
    duplicates included. The collected entity texts are joined with spaces and
    placed in front of the unchanged summary, separated by one space.

    Args:
        text: Source document used to validate the entities.
        summary: Summary to augment.

    Returns:
        ``"<entities> <summary>"``; when no entity qualifies the result is the
        summary preceded by a single space.
    """
    tracked_types = ('PERSON', 'FAC', 'GPE', 'ORG', 'NORP', 'LOC', 'EVENT')
    parsed = nlp(summary)
    shared_entities = [
        ent.text
        for ent in parsed.ents
        # Type is taken from the first token of the span; text must be in the source.
        if ent[0].ent_type_ in tracked_types and ent.text.lower() in text.lower()
    ]
    return " ".join(shared_entities) + " " + summary

In [None]:
# Unpack the saved model archive stored on Google Drive; -q keeps unzip quiet.
# (The archive name suggests a model fine-tuned on filtered CNN/DailyMail
# data — TODO confirm.)

!unzip -q '/content/drive/MyDrive/AML Project/finetune_cnn_dm_filtered.zip'

In [None]:
# Directory from which the summarization model and tokenizer are loaded
# (presumably where the unzip step above placed the archive contents — verify).
path = '/content/content/finetune_cnn_dm_filtered'
model_name_summ = path

# Load the seq2seq summarization model and its tokenizer from that directory.
model_summ = AutoModelForSeq2SeqLM.from_pretrained(model_name_summ)
tokenizer_summ = AutoTokenizer.from_pretrained(model_name_summ)

In [None]:
# Same as before

def generate_summary(test_samples, model):
    """Generate summaries for one or more input texts with ``model``.

    The inputs are tokenized with the module-level ``tokenizer_summ``
    (padded/truncated to 512 tokens), moved to the device of ``model``,
    passed to ``model.generate`` and decoded back to text with special
    tokens removed.

    Args:
        test_samples: Text (or batch of texts) accepted by ``tokenizer_summ``.
        model: Seq2seq model used both for device placement and generation.

    Returns:
        The decoded summaries as returned by ``tokenizer_summ.batch_decode``.
    """
    inputs = tokenizer_summ(
        test_samples,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    # Generate with the model that was passed in. Using the global model here
    # would ignore the argument and could mix tensors on `model.device` with a
    # model that lives on another device.
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    return tokenizer_summ.batch_decode(outputs, skip_special_tokens=True)

In [None]:
%%capture
import numpy as np

from transformers import T5ForConditionalGeneration, T5TokenizerFast, AutoModelForSeq2SeqLM
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name_qa = "deepset/roberta-base-squad2"                                             # Pre defined model for question and answer
model_qa = AutoModelForQuestionAnswering.from_pretrained(model_name_qa)
tokenizer_qa = AutoTokenizer.from_pretrained(model_name_qa)
nlp_qa = pipeline('question-answering', model=model_name_qa, tokenizer_qa=model_name_qa)

hfmodel = T5ForConditionalGeneration.from_pretrained("ThomasSimonini/t5-end2end-question-generation")
hftokenizer = T5TokenizerFast.from_pretrained('t5-base')

In [None]:
# Function to generate question

def gen_question(input_string, **generator_args):
  """Generate questions for a passage with the module-level T5 model.

  The passage is wrapped in the "generate questions: ... </s>" prompt,
  encoded with ``hftokenizer``, passed to ``hfmodel.generate`` and decoded
  with special tokens removed. Each decoded string is split on "<sep>".

  Args:
    input_string: Passage to generate questions from.
    **generator_args: Optional keyword arguments for ``hfmodel.generate``.
      They override the defaults below; with no overrides the defaults are
      used unchanged.

  Returns:
    A list with one entry per decoded sequence, each entry being the list of
    pieces obtained by splitting that sequence on "<sep>".
  """
  # Default generation parameters; caller-supplied values take precedence
  # instead of being silently discarded.
  generation_params = {
    "max_length": 256,
    "num_beams": 1,
    "length_penalty": 1.5,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
  }
  generation_params.update(generator_args)

  prompt = "generate questions: " + input_string + " </s>"                      # Making input string
  input_ids = hftokenizer.encode(prompt, truncation=True, return_tensors="pt")  # Input encoding
  res = hfmodel.generate(input_ids, **generation_params)                        # Generate result
  output = hftokenizer.batch_decode(res, skip_special_tokens=True)              # Get the final output
  return [item.split("<sep>") for item in output]                               # Split on the separator

In [None]:
# Function to generate answers

def gen_answers(doc, summ):
  """Answer the questions generated from ``doc`` on both ``doc`` and ``summ``.

  Questions are produced by ``gen_question(doc)``; the first piece of the
  first generated sequence is split on "? " into individual questions. Each
  question is normalised to end with exactly one "?" and asked to the
  module-level ``nlp_qa`` pipeline twice, once with the document and once
  with the summary as context.

  Args:
    doc: Source document (question source and first QA context).
    summ: Summary used as the second QA context.

  Returns:
    A tuple ``(ques, ans)`` where ``ques`` is the raw ``gen_question``
    output and ``ans`` is a list of ``(doc_answer, summary_answer)`` pairs,
    one per non-empty question.
  """
  ques = gen_question(doc)                                                      # First generate question from doc
  ans = []
  for fragment in ques[0][0].split('? '):
    # The last fragment usually still carries its own '?'; strip it so the
    # question is not sent as "...??", and skip empty fragments.
    question = fragment.strip().rstrip('?').strip()
    if not question:
      continue
    question += '?'
    doc_ans = nlp_qa({'question': question, 'context': doc})['answer']          # Answer from doc
    summ_ans = nlp_qa({'question': question, 'context': summ})['answer']        # Answer from summary
    ans.append((doc_ans, summ_ans))
  return ques, ans

In [None]:
# Demo for question answering

def Demo(text):
  """Print a generated summary of ``text`` plus the QA comparison for it.

  Summarizes ``text`` with ``generate_summary`` and the module-level
  ``model_summ``, prints the first summary, then prints the questions and
  the (document answer, summary answer) pairs returned by ``gen_answers``.

  Args:
    text: Article to summarize and question.

  Returns:
    None; all results are printed.
  """
  summaries = generate_summary(text, model_summ)
  summ = summaries[0]                       # Only the first summary is used
  print("Summary:", summ)

  questions, answers = gen_answers(text, summ)
  print("Questions:", questions)
  print("Answers:", answers)

In [None]:
# Demo for summarization

def Demo_FJ(text, summary):
  """Print the filtered, JAENS and filter+JAENS variants of a summary.

  Builds three strings from ``summary``: the entity-filtered summary, the
  entity-augmented (JAENS) summary, and the entity-augmented version of the
  filtered summary, then prints each under its own heading.

  Args:
    text: Source article the summary belongs to.
    summary: Reference summary to transform.

  Returns:
    None; all results are printed.
  """
  filtered = entity_based_filtered_sentences(text, summary)
  jaens = create_ent_augmented_target(text, summary)
  filtered_jaens = create_ent_augmented_target(text, filtered)

  sections = (
    ("Filtered summary:\n", filtered),
    ("JAENS summary:\n", jaens),
    ("Filter+JAENS:\n", filtered_jaens),
  )
  for heading, body in sections:
    print(heading, body)

In [None]:
# Results for a sample text: generated summary, generated questions, and their answers (Q&A)

# Sample article used by the demo cells.
text = "The telescreen received and transmitted simultaneously. Any sound that Winston made, above the level of a very low whisper, would be picked up by it, moreover, so long as he remained within the field of vision which the metal plaque commanded, he could be seen as well as heard. There was of course no way of knowing whether you were being watched at any given moment. How often, or on what system, the Thought Police plugged in on any individual wire was guesswork. It was even conceivable that they watched everybody all the time. But at any rate, they could plug in your wire whenever they wanted to. You had to live—did live, from habit that became instinct—in the assumption that every sound you made was overheard, and, except in darkness, every movement scrutinized."
print("Article:", text)
# Reference summary for the article; it is only printed here (Demo generates
# its own summary) and is reused by the summary-variants cell.
summary = "In Winston's world, the telescreen functioned as both a receiver and transmitter, capturing any sound above a whisper and allowing him to be seen within its field of vision. The pervasive uncertainty of being watched by the Thought Police meant living with the constant assumption that all sounds were overheard and every movement, except in darkness, was scrutinized."
print("Highlights:", summary)
# Summarize the article with the model and run the question/answer comparison.
Demo(text)

Article: The telescreen received and transmitted simultaneously. Any sound that Winston made, above the level of a very low whisper, would be picked up by it, moreover, so long as he remained within the field of vision which the metal plaque commanded, he could be seen as well as heard. There was of course no way of knowing whether you were being watched at any given moment. How often, or on what system, the Thought Police plugged in on any individual wire was guesswork. It was even conceivable that they watched everybody all the time. But at any rate, they could plug in your wire whenever they wanted to. You had to live—did live, from habit that became instinct—in the assumption that every sound you made was overheard, and, except in darkness, every movement scrutinized.
Highlights: In Winston's world, the telescreen functioned as both a receiver and transmitter, capturing any sound above a whisper and allowing him to be seen within its field of vision. The pervasive uncertainty of be

In [None]:
# Results from a sample text for different summaries by different filtering

# Print the reference summary, then its filtered, JAENS and filter+JAENS
# variants for the sample article (uses `text` and `summary` defined earlier).
print("True summary:\n", summary)
Demo_FJ(text, summary)

True summary:
 In Winston's world, the telescreen functioned as both a receiver and transmitter, capturing any sound above a whisper and allowing him to be seen within its field of vision. The pervasive uncertainty of being watched by the Thought Police meant living with the constant assumption that all sounds were overheard and every movement, except in darkness, was scrutinized.
Filtered summary:
 In Winston's world, the telescreen functioned as both a receiver and transmitter, capturing any sound above a whisper and allowing him to be seen within its field of vision. The pervasive uncertainty of being watched by the Thought Police meant living with the constant assumption that all sounds were overheard and every movement, except in darkness, was scrutinized.
JAENS summary:
 Winston the Thought Police In Winston's world, the telescreen functioned as both a receiver and transmitter, capturing any sound above a whisper and allowing him to be seen within its field of vision. The pervasi