In [1]:
import os
from sys import getsizeof
from time import sleep
import pickle
import json
import re
import inspect
from warnings import filterwarnings

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	PegasusForConditionalGeneration, PegasusTokenizerFast,
	GPT2TokenizerFast
)
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

from utils.helpers import *
from utils.encoders import *
from utils.pipelines import *
from utils.trainer_utils import *
from utils.evaluator_utils import *

In [2]:
def plot_histogram(data):
	"""
	Show a histogram of `data`, choosing the bin count with the
	square-root rule.

	Parameters
	----------
	data
		Collection of numeric values; it must support `len()`.

	Notes
	-----
	The bin count is `int(len(data) ** .5)`, clamped to a minimum of 1.
	Without the clamp an empty collection asks for 0 bins, which the
	histogram call rejects.
	"""
	# Square-root rule, but never request fewer than one bin
	bins = max(1, int(len(data) ** .5))
	plt.hist(data, bins=bins)
	plt.show()

# Positive infinity; used further down as the "no limit" value for
# `max_words` and `max_texts`
inf = float("inf")
# Silences every warning for the rest of the session
filterwarnings("ignore")
# NOTE(review): what the 500 argument means is decided by `get_device`
# in the utils package — confirm before changing it
device = get_device(500)
# device = "cpu"
# Last expression of the cell, so its return value is the cell's output
load_dotenv()

True

In [3]:
# Root directory holding the models and the datasets. Set the DATA_DIR
# environment variable (a .env entry works, since `load_dotenv()` runs in an
# earlier cell) to point the notebook at another machine's copy; the fallback
# is the path that was previously hard-coded here.
# Path used on another machine:
# /Users/naman/Workspace/Data/Long-Document-Summarization
data_dir = os.environ.get("DATA_DIR", "/home/nchibbar/Data")

# Model directories
bart_dir = f"{data_dir}/Models/BART"
t5_dir = f"{data_dir}/Models/T5"
pegasus_dir = f"{data_dir}/Models/PEGASUS"
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"

# Dataset directories and the file names found in each of them
govreport_dir = f"{data_dir}/GovReport/processed"
bigpatent_dir = f"{data_dir}/BigPatent/processed"
govreport_files = os.listdir(govreport_dir)
bigpatent_files = os.listdir(bigpatent_dir)

len(govreport_files), len(bigpatent_files)

(7238, 2856)

In [4]:
# Sentence transformer
# Automatically loads into gpu if available
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_encoder = SentenceTransformer(sent_dir)

name = "pegasus"

match name:

	case "bart":
		tokenizer = BartTokenizer.from_pretrained(bart_dir)
		model = BartForConditionalGeneration.from_pretrained(bart_dir)
		context_size = model.config.max_position_embeddings

	case "t5":
		tokenizer = T5Tokenizer.from_pretrained(t5_dir)
		model = T5ForConditionalGeneration.from_pretrained(t5_dir)
		context_size = model.config.n_positions

	case "pegasus":
		tokenizer = PegasusTokenizerFast.from_pretrained(pegasus_dir)
		model = PegasusForConditionalGeneration.from_pretrained(pegasus_dir)
		context_size = model.config.max_position_embeddings

	case "gpt":
		tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)
		model = "gpt-3.5-turbo"
		context_size = 4096

context_size

512

In [5]:
# Text cleaner shared by every encoder below; it is also applied directly
# to the raw texts later in the notebook
preprocessor = TextProcessor(preprocessing=True)
# No post-processing step is handed to the summarization pipelines
postprocessor = None

## BigPatent

In [None]:
# Word count of every document found in the processed BigPatent files
word_counts = []
for file in bigpatent_files:
	file_path = f"{bigpatent_dir}/{file}"
	with open(file_path) as fp:
		data = json.load(fp)
	# One count per entry stored under the file's "texts" key
	word_counts.extend(count_words(text) for text in data["texts"])

# Use the notebook's histogram helper instead of repeating its binning
# and plotting logic here
plot_histogram(word_counts)

In [None]:
# Largest, smallest and mean word count of the documents counted above
max(word_counts), min(word_counts), np.mean(word_counts)

In [None]:
# How many documents are longer than 40,000 words
len([n_words for n_words in word_counts if n_words > 40_000])

## Rough

In [6]:
# Collect the GovReport documents whose word count lies strictly between
# `min_words` and `max_words`, together with their reference summaries,
# stopping once `max_texts` documents have been kept
min_words = 20_000
max_words = inf
max_texts = inf
texts, summaries = [], []
num_texts = 0
for file in govreport_files:
	# Test the cap before opening the next file and with `>=`, so that it
	# also holds for a cap of 0 or a non-integer cap, which an equality
	# check after the append would never hit
	if num_texts >= max_texts:
		break
	file_path = f"{govreport_dir}/{file}"
	with open(file_path) as fp:
		data = json.load(fp)
	if min_words < count_words(data["text"]) < max_words:
		texts.append(data["text"])
		summaries.append(data["summary"])
		num_texts += 1

num_texts

317

In [7]:
# Word threshold handed to the segmenter
# NOTE(review): how TextSegmenter applies it is defined in the utils
# package — confirm whether it is a hard minimum per segment
segment_min_words = 20
# Segmenter built on top of NLTK's sentence tokenizer
text_segmenter = TextSegmenter(nltk.sent_tokenize, segment_min_words)

In [8]:
# Separate text processor used only for keyword extraction; it is passed to
# KeywordScorer below
keywords_preprocessor = TextProcessor(
	only_words_nums = True,
	remove_nums = True
)
# Stop-word collection extended with the project's STOP_WORDS constant
# (STOP_WORDS comes in through one of the `utils` star imports)
stop_words = get_stop_words(extra_stop_words=STOP_WORDS)
len(stop_words)

392

In [12]:
# Settings consumed by the encoders
min_tokens_frac = .5
head_size = .5
threshold = .7
boost = .03
num_keywords = 20
seed = 69

# Settings consumed by the summarization pipelines
min_summary_tokens = 300
temperature = 2.
repetition_penalty = 3.
top_p = .95

# Passed as `system_prompt` to the OpenAI pipelines
system_prompt = "You will be given some segments of a very long document. Your task is to summarize the entire document as a whole by extracting key information and ideas from the segments. Generate a detailed, concise, and coherent summary in 500 words. Do not refer to the document in the summary in any way."

# One encoder per strategy for fitting a long document into the context
# window. The order matters: later cells pick encoders and pipelines by
# position (e.g. index -1 is the keyword scorer).
# NOTE(review): arguments are positional; their meaning is defined by the
# encoder classes in the utils package
encoders = [
	TruncateMiddle(
		tokenizer,
		context_size,
		1,
		preprocessor,
	),
	TruncateMiddle(
		tokenizer,
		context_size,
		head_size,
		preprocessor,
		True,
	),
	UniformSampler(
		tokenizer,
		min_tokens_frac * context_size,
		context_size,
		text_segmenter,
		preprocessor,
		True,
		seed,
	),
	SegmentSampler(
		tokenizer,
		min_tokens_frac * context_size,
		context_size,
		text_segmenter,
		sent_encoder,
		preprocessor,
		threshold,
		boost,
		seed,
	),
	RemoveRedundancy(
		tokenizer,
		min_tokens_frac * context_size,
		context_size,
		text_segmenter,
		sent_encoder,
		preprocessor,
		threshold,
		seed,
	),
	KeywordScorer(
		tokenizer,
		context_size,
		text_segmenter,
		sent_encoder,
		preprocessor,
		num_keywords,
		keywords_preprocessor,
		stop_words,
	),
]

# Wrap every encoder in a pipeline: the OpenAI one for "gpt", the local
# model pipeline for everything else
if name == "gpt":
	gpt_pipelines = [
		OpenAIPipeline(model, encoder, system_prompt=system_prompt)
		for encoder in encoders
	]
else:
	pipelines = [
		SummarizationPipeline(
			model,
			encoder,
			postprocessor,
			min_summary_tokens,
			context_size,
			device,
			temperature,
			repetition_penalty,
			top_p,
		)
		for encoder in encoders
	]

In [None]:
# Run the shared preprocessor over the selected GovReport texts
# NOTE(review): the cells below index the result like `texts`, so it is
# presumably one processed text per input — confirm
processed_texts = preprocessor(texts)

In [None]:
# For every processed text keep its longest segment (by word count), and
# compute the mean segment length in words over all texts
max_word_segments = []
total_words = 0
total_segments = 0
for text in processed_texts:
	segments = text_segmenter(text)
	for segment in segments:
		total_words += count_words(segment)
		total_segments += 1
	# `default` keeps exactly one entry per text even when a text yields no
	# segments, so positions stay aligned with `processed_texts`
	max_word_segments.append(
		max(segments, key=count_words, default="")
	)
# An empty corpus has no segments; report 0 instead of dividing by zero
avg_segment = total_words / total_segments if total_segments else 0.
avg_segment

In [None]:
# Print "<text index> <word count>: <segment repr>" for every text whose
# longest segment is above the word budget
bearable_words = 150
for i, segment in enumerate(max_word_segments):
	words = count_words(segment)
	if words <= bearable_words:
		continue
	print(f"{i} {words}: {repr(segment)}", end="\n\n")

In [None]:
# Show one processed text next to its raw version
# NOTE(review): the filtering cell above recorded 317 selected texts, so this
# index only exists when `texts` holds (nearly) the whole corpus — confirm
ind = 7143
# Build the separator outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12
separator = "=" * 400
print(
	processed_texts[ind], texts[ind],
	sep = f"\n\n{separator}\n\n"
)

In [10]:
# Load the stored evaluation results
# NOTE: unpickling can run arbitrary code — only open result files that were
# produced by this project
with open(f"{data_dir}/pegasus-govreport.pkl", "rb") as fp:
	results = pickle.load(fp)

scores, gen_summaries = results["scores"], results["gen_summaries"]
sort1, sort2, sort3 = (
	results[key] for key in ("sort1", "sort2", "sort3")
)

# First row of scores, reordered by the first stored ordering
scores[0][sort1]

array([0.65193915, 0.6542208 , 0.65794206, 0.69529766, 0.6995131 ,
       0.70691174, 0.7073892 , 0.7171764 , 0.7178343 , 0.71939534,
       0.7217039 , 0.72373635, 0.7324604 , 0.73268557, 0.73460007,
       0.73581743, 0.7372252 , 0.7412356 , 0.7424157 , 0.7426511 ,
       0.74519473, 0.745373  , 0.7456744 , 0.74736893, 0.749151  ,
       0.74943566, 0.75022244, 0.752278  , 0.75245076, 0.75440687,
       0.75562906, 0.7567007 , 0.75692   , 0.7570452 , 0.7581953 ,
       0.7583513 , 0.7586851 , 0.758883  , 0.7592064 , 0.7594475 ,
       0.75984323, 0.76003677, 0.7606733 , 0.7612004 , 0.7618613 ,
       0.76232195, 0.7631042 , 0.763729  , 0.76401484, 0.76408005,
       0.7642471 , 0.7642544 , 0.76503026, 0.7651434 , 0.7661807 ,
       0.766662  , 0.76669765, 0.76683724, 0.76692486, 0.76692766,
       0.76713556, 0.7672833 , 0.7674605 , 0.7674811 , 0.767522  ,
       0.7676269 , 0.7679292 , 0.76831627, 0.76873004, 0.76901287,
       0.7690211 , 0.7691008 , 0.7695514 , 0.76964384, 0.76975

In [11]:
# Take the document that comes first in the `sort1` ordering (the recorded
# output of the cell above lists scores in ascending order under `sort1`)
problem_text = results["texts"][sort1[0]]
# NOTE(review): the recorded output of this print is about a different topic
# than the reference summary printed for the same index at the end of the
# notebook — check that `gen_summaries` is indexed the same way as
# `results["texts"]` / `results["summaries"]`, or whether the output is stale
print(gen_summaries[sort1[0]])

Elections are widely considered a key harbinger of the durability and extent of Afghanistan's political development and a barometer for measuring the effects of factional, political, ethnic, and sectarian rivalries. The 2009 presidential and provincial elections were the first post-Taliban elections run by the Afghan government through its Afghanistan Independent Electoral Commission (IEC) DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCatch DropCat

In [None]:
# Encode the problem document with the last encoder in `encoders` (the
# keyword scorer) and decode the result back to text, to see what the model
# would actually receive
enc = encoders[-1](problem_text, return_batch=False)
sent = tokenizer.decode(enc, skip_special_tokens=True)
print(sent)

In [None]:
# Preprocess the problem document and display the segments it is split into
# Note: this rebinds `segments`, which the segment-statistics cell also uses
processed_problem = preprocessor(problem_text)
segments = text_segmenter(processed_problem)
segments

In [13]:
# Summarize the problem document again with the last pipeline (the one built
# on the last encoder); the recorded output is a list holding the summary
print(pipelines[-1](problem_text))

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


["As a judge on the United States Court of Appeals for the Second Circuit, Justice Sotomayor has had an opportunity to write opinions that have implications in areas such as gun control, civil rights, search and seizure, habeas corpus, and post-conviction relief. In contrast to the cases described above, Judge Sotomayor has also authored several civil rights opinions in which she ruled or would have ruled against the party claiming discrimination. This report selected cases authored by Judge Sotomayor during her tenure on the Second Circuit, including majority, concurring, and dissenting opinions in areas of legal significance.This report selected cases authored by Judge Sotomayor during her tenure on the Second Circuit, including majority, concurring, and dissenting opinions in areas of legal significance. this report selected cases authored by Judge Sotomayor during her tenure on the Second Circuit, including majority, concurring, and dissenting opinions in areas of legal significanc

In [15]:
# Stored summary for the same document, read from the "summaries" entry of
# the results, for comparison with the generated ones
print(results["summaries"][sort1[0]])

In May 2009, Supreme Court Justice David Souter announced his intention to retire from the Supreme Court. Several weeks later, President Obama nominated Judge Sonia Sotomayor, who served on the U.S. Court of Appeals for the Second Circuit, to fill his seat. To fulfill its constitutional "advice and consent" function, the Senate considered Judge Sotomayor's extensive record—compiled from years as a lawyer, prosecutor, district court judge, and appellate court judge—to better understand her legal approaches and judicial philosophy. On August 6, the Senate confirmed Justice Sotomayor by a vote of 68-31, and she was sworn in on August 8. This report provides an analysis of selected opinions authored by Judge Sotomayor during her tenure as a judge on the Second Circuit. Discussions of the selected opinions are grouped according to various topics of legal significance. As a group, the opinions belie easy categorization along any ideological spectrum. However, it is possible to draw some conc