In [1]:
import os
from sys import getsizeof
from time import sleep
import json
import pickle
import re
import inspect
from warnings import filterwarnings

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	PegasusForConditionalGeneration, PegasusTokenizerFast,
	GPT2TokenizerFast
)
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

from utils.helpers import *
from utils.encoders import *
from utils.pipelines import *
from utils.trainer_utils import *
from utils.evaluator_utils import *

In [2]:
def plot_histogram(data):
	"""
	Plot a histogram of `data`, choosing the bin count with the square-root rule.

	## Parameters
	`data`: A sized collection of numeric values.

	## Returns
	None. The figure is rendered with `plt.show()`.
	"""
	# Square-root rule; clamp to 1 because the bare formula yields 0 bins
	# for empty input, which `plt.hist` rejects
	bins = max(1, int(len(data) ** .5))
	plt.hist(data, bins=bins)
	plt.show()

# Shorthand for "no limit"; used as an upper bound by the filtering cells
inf = float("inf")
# Silence all warnings for the rest of the session
filterwarnings("ignore")
# `get_device` is not defined in this notebook (presumably from the `utils`
# star imports); the meaning of 500 is not visible here — TODO confirm
device = get_device(500)
# device = "cpu"
# Load variables from a `.env` file; being the last expression of the cell,
# its return value is what the cell displays
load_dotenv()

True

In [3]:
# Root directory holding the models and datasets. Set the `DATA_DIR`
# environment variable (e.g. in the `.env` file read by `load_dotenv()`) to
# point at another machine's copy; the fallback is the path this notebook
# was last run with.
# Previously also used: "/Users/naman/Workspace/Data/Long-Document-Summarization"
data_dir = os.environ.get("DATA_DIR", "/home/nchibbar/Data")

# Local checkpoints / tokenizers
bart_dir = f"{data_dir}/Models/BART"
t5_dir = f"{data_dir}/Models/T5"
pegasus_dir = f"{data_dir}/Models/PEGASUS"
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"

# Processed datasets
govreport_dir = f"{data_dir}/GovReport/processed"
bigpatent_dir = f"{data_dir}/BigPatent/processed"
# `os.listdir` returns entries in arbitrary order; sort so that positional
# indices into the loaded texts are stable across runs and machines
govreport_files = sorted(os.listdir(govreport_dir))
bigpatent_files = sorted(os.listdir(bigpatent_dir))

len(govreport_files), len(bigpatent_files)

(7238, 2856)

In [4]:
# Sentence transformer
# Automatically loads into gpu if available
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_encoder = SentenceTransformer(sent_dir)

name = "pegasus"

match name:

	case "bart":
		tokenizer = BartTokenizer.from_pretrained(bart_dir)
		model = BartForConditionalGeneration.from_pretrained(bart_dir)
		context_size = model.config.max_position_embeddings

	case "t5":
		tokenizer = T5Tokenizer.from_pretrained(t5_dir)
		model = T5ForConditionalGeneration.from_pretrained(t5_dir)
		context_size = model.config.n_positions

	case "pegasus":
		tokenizer = PegasusTokenizerFast.from_pretrained(pegasus_dir)
		model = PegasusForConditionalGeneration.from_pretrained(pegasus_dir)
		context_size = model.config.max_position_embeddings

# GPT 3.5 turbo tokenizer
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)
gpt_model = "gpt-3.5-turbo"
gpt_context_size = 4096

context_size

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


512

In [5]:
# Text cleaner handed to the encoders and applied to the raw texts; what
# `preprocessing=True` enables is defined in `TextProcessor`, which is not
# visible in this notebook
preprocessor = TextProcessor(preprocessing=True)
# No postprocessing of generated summaries
postprocessor = None

## BigPatent

In [None]:
# Collect `count_words` of every text stored in the BigPatent files
word_counts = []
for file in bigpatent_files:
	file_path = f"{bigpatent_dir}/{file}"
	with open(file_path) as fp:
		data = json.load(fp)
	word_counts.extend(count_words(text) for text in data["texts"])

# Reuse the notebook's histogram helper instead of repeating its body here
plot_histogram(word_counts)

In [None]:
# Maximum, minimum and mean of the collected `word_counts`
max(word_counts), min(word_counts), np.mean(word_counts)

In [None]:
# How many entries of `word_counts` exceed 40,000
len([count for count in word_counts if count > 40_000])

## Rough

In [6]:
# Keep GovReport documents whose word count lies strictly between
# `min_words` and `max_words`, stopping after `max_texts` documents
min_words = 20_000
max_words = inf
max_texts = inf
texts, summaries = [], []
num_texts = 0
for file in govreport_files:
	# Check the cap before reading another file. `>=` (rather than `==`
	# after the append) also honours `max_texts = 0` and non-integer caps,
	# which the equality test would never match.
	if num_texts >= max_texts:
		break
	file_path = f"{govreport_dir}/{file}"
	with open(file_path) as fp:
		data = json.load(fp)
	if min_words < count_words(data["text"]) < max_words:
		texts.append(data["text"])
		summaries.append(data["summary"])
		num_texts += 1

num_texts

317

In [7]:
# Minimum words per segment — how `TextSegmenter` enforces this is not
# visible here; presumably short sentences are merged, TODO confirm
segment_min_words = 20
# Segmenter built on NLTK's sentence tokenizer
text_segmenter = TextSegmenter(nltk.sent_tokenize, segment_min_words)

In [8]:
# Preprocessor used for keyword extraction (passed to `KeywordScorer`).
# Flag semantics are defined in `TextProcessor`; presumably: keep only words
# and numbers, then drop the numbers — TODO confirm
keywords_preprocessor = TextProcessor(
	only_words_nums = True,
	remove_nums = True
)
# `get_stop_words` and `STOP_WORDS` are not defined in this notebook;
# presumably they come from the `utils` star imports
stop_words = get_stop_words(extra_stop_words=STOP_WORDS)
len(stop_words)

392

In [82]:
class SummarizationPipeline(Pipeline):
	"""
	Pipeline that encodes texts with an encoder and summarizes them with a
	generative model.

	## Parameters
	`model`: The summarization model. It is kept on the CPU between calls and
	moved to `device` only while generating.
	`encoder`: The encoder that turns texts into model inputs.
	`postprocessor`: Optional callable applied to the list of generated summaries.
	`summary_min_tokens`: The minimum number of tokens in the summary.
	Defaults to `model.config.min_length`.
	`summary_max_tokens`: The maximum number of tokens in the summary.
	Defaults to `encoder.max_tokens`.
	`device`: The device to use for computation.
	"""
	def __init__(
		self,
		model,
		encoder: Encoder,
		postprocessor: Callable[[list[str]], list[str]] | None = None,
		summary_min_tokens: int | None = None,
		summary_max_tokens: int | None = None,
		device: str | torch.device = "cpu"
	) -> None:
		super().__init__(model.to("cpu"), encoder, postprocessor)
		# Compare against None so that an explicit minimum of 0 is honoured
		# instead of being replaced by the config default
		self.summary_min_tokens = (
			model.config.min_length
			if summary_min_tokens is None else summary_min_tokens
		)
		self.summary_max_tokens = summary_max_tokens or encoder.max_tokens
		self.device = device

	def __call__(
		self,
		texts: str | list[str],
		batch_size: int | None = None,
		temperature: float = 1.,
		repetition_penalty: float = 1.,
		top_p: float = .95,
		do_sample: bool | None = None
	) -> list[str]:
		"""
		Summarize one or more texts.

		## Parameters
		`texts`: A single text or a list of texts.
		`batch_size`: Number of texts per batch. Defaults to all texts in one batch.
		`temperature`: Forwarded to `model.generate`.
		`repetition_penalty`: Forwarded to `model.generate`.
		`top_p`: Forwarded to `model.generate`.
		`do_sample`: Forwarded to `model.generate` when not `None`. Per the
		transformers generation documentation, `temperature` and `top_p` only
		affect decoding when sampling is enabled; leaving this as `None` keeps
		whatever the model's own generation settings specify.

		## Returns
		list[str]: The generated summaries (postprocessed if a postprocessor
		was given).
		"""
		if isinstance(texts, str):
			texts = [texts]

		# Nothing to summarize; also avoids a batch size of zero below
		if len(texts) == 0:
			return []

		device = self.device
		encoder = self.encoder
		postprocessor = self.postprocessor
		batch_size = batch_size or len(texts)

		generate_kwargs = dict(
			min_length = self.summary_min_tokens,
			max_length = self.summary_max_tokens,
			temperature = temperature,
			repetition_penalty = repetition_penalty,
			top_p = top_p,
			early_stopping = True
		)
		# Only override the model's sampling setting when asked to
		if do_sample is not None:
			generate_kwargs["do_sample"] = do_sample

		# Generate encodings in batches
		batches = SummarizationDataset(texts, encoder, batch_size)

		# Generate summaries
		all_summaries = []
		model = self.model.to(device)
		try:
			for encodings in batches:

				# Send encodings to device
				encodings = encodings.to(device)

				# Generate summaries' encodings
				output = model.generate(**encodings, **generate_kwargs)

				# Decode summaries' encodings and collect them
				all_summaries.extend(
					encoder.tokenizer.decode(out, skip_special_tokens=True)
					for out in output
				)
		finally:
			# Remove model from device even if generation failed, so a
			# crashed call does not leave it occupying device memory
			model.to("cpu")

		# Postprocess summaries
		if postprocessor is not None:
			all_summaries = postprocessor(all_summaries)

		return all_summaries


# Tunables shared by the encoders below. Their exact semantics live in the
# encoder classes, which are not visible in this notebook.
min_tokens_frac = .5
min_summary_tokens = 300
head_size = .5
threshold = .7
boost = .03
num_keywords = 20
seed = 69
# System prompt handed to every `OpenAIPipeline` below
system_prompt = "You will be given some segments of a very long document. Your task is to summarize the entire document as a whole by extracting key information and ideas from the segments. Generate a detailed, concise, and coherent summary in 500 words. Do not refer to the document in the summary in any way."

# One encoder per input-selection strategy, all built on the local model's
# tokenizer and context size.
# NOTE(review): `min_tokens_frac * context_size` is a float (e.g. 256.0) —
# confirm the encoders accept a non-integer minimum token count
encoders = [
	TruncateMiddle(
		tokenizer, context_size, 1, preprocessor
	),
	TruncateMiddle(
		tokenizer, context_size, head_size, preprocessor, True
	),
	UniformSampler(
		tokenizer, min_tokens_frac * context_size, context_size,
		text_segmenter, preprocessor, True, seed
	),
	SegmentSampler(
		tokenizer, min_tokens_frac * context_size, context_size,
		text_segmenter, sent_encoder, preprocessor, threshold, boost, seed
	),
	RemoveRedundancy(
		tokenizer, min_tokens_frac * context_size, context_size,
		text_segmenter, sent_encoder, preprocessor, threshold, seed
	),
	KeywordScorer(
		tokenizer, context_size, text_segmenter, sent_encoder,
		preprocessor, num_keywords, keywords_preprocessor, stop_words
	)
]

# One local-model pipeline per encoder, in the same order as `encoders`
pipelines = [
	SummarizationPipeline(
		model, enc, postprocessor, min_summary_tokens,
		context_size, device
	) for enc in encoders
]

# The same strategies (minus the first TruncateMiddle variant and
# KeywordScorer) built on the GPT tokenizer and its context size
gpt_encoders = [
	TruncateMiddle(
		gpt_tokenizer, gpt_context_size, head_size, preprocessor
	),
	# NOTE(review): the UniformSampler in `encoders` passes `True` between
	# `preprocessor` and `seed`; here `seed` sits in that position — confirm
	# against the constructor signature that this is intended
	UniformSampler(
		gpt_tokenizer, min_tokens_frac * gpt_context_size, gpt_context_size,
		text_segmenter, preprocessor, seed
	),
	SegmentSampler(
		gpt_tokenizer, min_tokens_frac * gpt_context_size, gpt_context_size,
		text_segmenter, sent_encoder, preprocessor, threshold, boost, seed
	),
	RemoveRedundancy(
		gpt_tokenizer, min_tokens_frac * gpt_context_size, gpt_context_size,
		text_segmenter, sent_encoder, preprocessor, threshold, seed
	)
]

# One OpenAI pipeline per GPT encoder, all sharing `system_prompt`
gpt_pipelines = [
	OpenAIPipeline(
		gpt_model, enc, system_prompt=system_prompt
	) for enc in gpt_encoders
]

In [None]:
# Run the preprocessor over all loaded texts at once
processed_texts = preprocessor(texts)

In [None]:
# For each processed text keep its longest segment (by `count_words`), and
# compute the mean `count_words` over all segments of all texts
max_word_segments = []
avg_segment = 0
total_segments = 0
for text in processed_texts:
	segments = text_segmenter(text)
	for segment in segments:
		avg_segment += count_words(segment)
		total_segments += 1
	# `default` keeps one entry per text even when segmentation yields
	# nothing, so positions still line up with `processed_texts`
	max_word_segments.append(
		max(segments, key=count_words, default="")
	)
# Avoid a ZeroDivisionError when there were no segments at all
avg_segment = avg_segment / total_segments if total_segments else 0
avg_segment

In [None]:
# Print every per-text longest segment whose `count_words` exceeds
# `bearable_words`, prefixed with its index in `max_word_segments` and
# its count; `repr` makes embedded newlines visible
bearable_words = 150
for i, segment in enumerate(max_word_segments):
	words = count_words(segment)
	if words > bearable_words:
		print(
			f"{i} {words}: {repr(segment)}",
			end = "\n\n"
		)

In [None]:
# Show the processed and the raw version of one text, separated by a rule.
# NOTE(review): the filtering cell's saved output reports 317 texts, so 7143
# is out of range for that run — presumably left over from a run with
# different filters; adjust before re-running
ind = 7143
print(
	processed_texts[ind], texts[ind],
	# Single quotes inside the replacement field: reusing the enclosing
	# double quotes is a SyntaxError before Python 3.12
	sep = f"\n\n{'=' * 400}\n\n"
)

In [None]:
# Encode the second loaded text with the first encoder (TruncateMiddle)
inps = encoders[0](texts[1])
inps

In [None]:
# Generate directly with the model (bypassing the pipelines) from the
# encoded sample, constraining output to 400–600 tokens
out = model.generate(**inps, min_length=400, max_length=600, early_stopping=True)
out.shape

In [None]:
# Decode the first generated sequence back to text, dropping special tokens
print(tokenizer.decode(out[0], skip_special_tokens=True))

In [10]:
# `BERTScorer` is not imported explicitly in this notebook; presumably it is
# provided by the `utils` star imports
scorer = BERTScorer(lang="en", device=device)

# Summarize every loaded text with the last pipeline (the KeywordScorer
# encoder), 5 texts per batch
gen_summaries = pipelines[-1](texts, 5)
# Score the generated summaries against the reference summaries
scores = scorer.score(gen_summaries, summaries)
scores

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(tensor([0.7782, 0.7467, 0.7340, 0.7458, 0.7395, 0.7586, 0.7338, 0.7572, 0.7321,
         0.7546, 0.8187, 0.7177, 0.7543, 0.7617, 0.7572, 0.7160, 0.7432, 0.7415,
         0.7330, 0.7157, 0.7572, 0.6899, 0.7657, 0.7546, 0.7393, 0.7221, 0.6974,
         0.7578, 0.7232, 0.7456, 0.7599, 0.7722, 0.7436, 0.7351, 0.7522, 0.7635,
         0.7430, 0.7833, 0.7127, 0.7135, 0.7466, 0.7569, 0.7622, 0.7644, 0.7594,
         0.7719, 0.7159, 0.7445, 0.7852, 0.7573, 0.7383, 0.7324, 0.7888, 0.7764,
         0.7475, 0.7178, 0.7687, 0.7230, 0.7574, 0.7141, 0.7537, 0.6868, 0.7300,
         0.7424, 0.7717, 0.7423, 0.7524, 0.7206, 0.7210, 0.7494, 0.7753, 0.7559,
         0.7351, 0.7392, 0.7702, 0.7539, 0.7417, 0.7446, 0.7372, 0.7838, 0.7476,
         0.7637, 0.7264, 0.7440, 0.7496, 0.7354, 0.7212, 0.7515, 0.7434, 0.7675,
         0.7293, 0.7346, 0.7279, 0.7510, 0.7442, 0.7285, 0.7378, 0.7244, 0.7481,
         0.7745, 0.7632, 0.7422, 0.7709, 0.7517, 0.7380, 0.7659, 0.7658, 0.7647,
         0.7832, 0.7353, 0.7

In [44]:
# Convert each metric to a NumPy array. Tensors are detached and moved to
# the CPU first (`.numpy()` raises on CUDA tensors), and entries that are
# already arrays pass through unchanged, so re-running this cell is harmless.
scores = [
	metric.detach().cpu().numpy() if torch.is_tensor(metric)
	else np.asarray(metric)
	for metric in scores
]

In [54]:
# Ascending argsort of each metric, unpacked into one index array per metric
sort1, sort2, sort3 = map(np.array, map(np.argsort, scores))

In [47]:
# Values of the first metric ordered from highest to lowest
# (`sort1` is its ascending argsort, reversed here)
scores[0][sort1[::-1]]

array([0.82308435, 0.8187406 , 0.8143504 , 0.81290495, 0.8009752 ,
       0.79898643, 0.7948025 , 0.7904099 , 0.78912747, 0.78878796,
       0.78522074, 0.78485656, 0.78383386, 0.7838231 , 0.7833141 ,
       0.7831646 , 0.7818335 , 0.7813525 , 0.78083736, 0.77823   ,
       0.7767894 , 0.7763934 , 0.77639174, 0.7753353 , 0.7749982 ,
       0.7745497 , 0.7738112 , 0.7737669 , 0.7737015 , 0.7736088 ,
       0.7734233 , 0.7727956 , 0.772591  , 0.7724112 , 0.77221596,
       0.77192336, 0.7716665 , 0.77138   , 0.7709531 , 0.7708502 ,
       0.77066386, 0.7701761 , 0.7695588 , 0.7687056 , 0.7678865 ,
       0.767802  , 0.7675358 , 0.76664436, 0.7659894 , 0.7659317 ,
       0.76576406, 0.7656675 , 0.76511943, 0.7650994 , 0.7648475 ,
       0.76480514, 0.76473534, 0.76473176, 0.7643952 , 0.76421237,
       0.7641813 , 0.76416177, 0.7640749 , 0.763718  , 0.76357293,
       0.763476  , 0.763391  , 0.76319146, 0.7630727 , 0.7629554 ,
       0.76280177, 0.7627195 , 0.7623611 , 0.76223624, 0.76218

In [21]:
# The text whose generated summary has the lowest value of the first metric
problem = texts[sort1[0]]
print(problem)

Title: Interior, Environment, and Related Agencies: FY2006 Appropriations

Subsection Most Recent Developments:

On August 2, 2005, H.R. 2361 , the Interior, Environment, and Related Agencies Appropriations Act for FY2006, was enacted as P.L. 109-54 . The law contained a total of $26.20 billion for Interior, Environment, and Related Agencies. The law also contained $1.50 billion in supplemental funds to cover a shortfall in veterans health care resources.

On December 30, 2005, H.R. 2863 was signed into law as P.L. 109-148 . The law affected funding levels enacted in P.L. 109-54 , through rescissions and emergency supplemental funds, which are not reflected in this report.

Subsection Introduction:

The FY2006 Interior, Environment, and Related Agencies appropriations law included funding for agencies and programs in three separate federal departments, as well as numerous related agencies and bureaus. The law provided funding for Department of the Interior (DOI) agencies (except for th

In [26]:
# Encode the problem text with the last encoder (KeywordScorer) and decode
# the first returned sequence to see what the model actually receives.
# `return_batch=False` presumably yields raw token ids — TODO confirm
enc = encoders[-1](problem, return_batch=False)
sent = tokenizer.decode(enc[0], skip_special_tokens=True)
print(sent)

The FY2006 appropriations law contained three primary titles providing funding. This report is organized along the lines of the law. The FY2006 appropriations law provided $26.20 billion, an increase of 2% over the President's budget request for FY2006 of $25.72 billion, but a decrease of 3% below the FY2005 enacted level of $27.02 billion. The FY2006 request was $36.8 million, a decrease of $24.2 million (40%) from the FY2005 appropriation of $61.0 million. The request did not seek funds for statutory or contractual aid. The Administration has previously proposed discontinuing these programs, requesting no funds for FY2005, but Congress provided $11.2 million. For FY2006, the original House-passed bill contained $49.0 million for National Recreation and Preservation, but no funds for statutory or contractual aid. The FY2005 appropriations law did not fund these grants. The FY2006 appropriation provided that not to exceed $5.0 million could be allocated to Preserve America grants. For 

In [30]:
# Preprocess the problem text and list the segments the segmenter produces
processed_problem = preprocessor(problem)
segments = text_segmenter(processed_problem)
segments

['Title: Interior, Environment, and Related Agencies: FY2006 Appropriations Subsection Most Recent Developments:\n\nOn August 2, 2005, H.R 2361, the Interior, Environment, and Related Agencies Appropriations Act for FY2006, was enacted as P.L 109-54.',
 'The law contained a total of $26.20 billion for Interior, Environment, and Related Agencies. The law also contained $1.50 billion in supplemental funds to cover a shortfall in veterans health care resources.',
 'On December 30, 2005, H.R 2863 was signed into law as P.L 109-148. The law affected funding levels enacted in P.L 109-54, through rescissions and emergency supplemental funds, which are not reflected in this report.',
 'Subsection Introduction:\n\nThe FY2006 Interior, Environment, and Related Agencies appropriations law included funding for agencies and programs in three separate federal departments, as well as numerous related agencies and bureaus.',
 'The law provided funding for Department of the Interior (DOI) agencies (exc

In [51]:
# Generated summary for the lowest-scoring text (by the first metric)
print(gen_summaries[sort1[0]])

Public financing of elections and spending limits have long been associated, in part, by the US Supreme Court's Citizens United ruling in the Citizens United case in the US Supreme Court case in the US Supreme Court case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case in the case

In [78]:
# Re-summarize the problem text with the SegmentSampler pipeline
# (`pipelines[-3]` is index 3 of 6) using a strong repetition penalty.
# NOTE(review): `temperature` and `top_p` presumably have no effect unless
# sampling is enabled in `generate` — confirm against the transformers docs
print(pipelines[-3](problem, repetition_penalty=3., top_p=.1, temperature=2)[0])

The President's FY2006 budget request included funding for the National Zoo's construction projects, except for the Zoo's facilities capital projects (S.Rept 109-80) which had a base of $13.0 million and were approved by both houses of Congress as part of the FY2006 Omnibus Budget Reconciliation Act (OBRA) on 23 February 2006).The Senate-passed bill did not contain instructions about the National Zoo's construction projects, except to state in report language (S.Rept 109-80) that there is a base of $13.0 million for the Zoo's facilities capital projects.Moreover, the House-passed bill did not contain instructions about the National Zoo's construction projects, except to state in report language (S.Rept 109-80) that there is a base of $13.0 million for the Zoo's facilities capital projects.Additionally, the Senate-passed bill did not contain instructions about the National Zoo's construction projects, except to state in report language (S.Rept 109-80) that there is a base of $13.0 milli

In [83]:
# Same SegmentSampler pipeline on the problem text, default generation settings
print(pipelines[3](problem)[0])

A provision of the FY2003 Omnibus Appropriations Act (P.L 89-209 20 U.S.C 951) was added to prevent lawsuits from blocking snowmobile access to Yellowstone, Grand Canyon, Yosemite, and Rocky Mountain national parks in the event of a future winter storm or other emergency. Congress enacted a similar provision as part of the FY2005 Consolidated Appropriations Act (P.L 108-447) to prevent lawsuits from blocking snowmobile access to those parks last winter. The NEA and NEH authorization (P.L 89-209 20 U.S.C 951) expired at the end of FY1993, but the agencies have been operating on temporary authority through appropriations law. The NEA and NEH authorization (P.L 89-209 20 U.S.C 951) expired at the end of FY1993, but the agencies have been operating on temporary authority through appropriations law. The NEA and NEH authorization (P.L 89-209 20 U.S.C 951) expired at the end of FY1993, but the agencies have been operating on temporary authority through appropriations law. The NEA and NEH auth