In [1]:
import os
from sys import getsizeof
from time import sleep
import pickle
import json
import re
import inspect
from warnings import filterwarnings

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	PegasusForConditionalGeneration, PegasusTokenizerFast,
	GPT2TokenizerFast
)
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

from utils.helpers import *
from utils.encoders import *
from utils.pipelines import *
from utils.trainer_utils import *
from utils.evaluator_utils import *

In [2]:
def plot_histogram(data):
	"""
	Show a histogram of `data` with the bin count chosen by the
	square-root rule, i.e. `floor(sqrt(len(data)))` bins.

	## Parameters
	`data`: Sized collection of numeric values to plot.

	## Returns
	`None`. The figure is displayed through `plt.show()`.
	"""
	# An empty collection would otherwise ask for 0 bins, which the
	# histogram call rejects; fall back to a single bin instead.
	bins = max(1, int(len(data) ** .5))
	plt.hist(data, bins=bins)
	plt.show()

# Shorthand for "no limit"; used further down as the default for
# `max_words` and `max_texts`.
inf = float("inf")
# Suppress every warning for the rest of the session.
filterwarnings("ignore")
# `get_device` comes from the project's star-imports. The meaning of the
# argument 500 is not visible in this notebook -- TODO confirm.
device = get_device(500)
# Uncomment to force CPU execution:
# device = "cpu"
# Load variables from a `.env` file into the environment. As the last
# expression of the cell, its return value is what the cell displays.
load_dotenv()

True

In [3]:
# Known dataset roots, in order of preference. The first one that exists on
# the current machine is used, so the notebook runs on either setup without
# relying on a second assignment silently overriding the first one.
# If none exists, the first entry is kept so that the `os.listdir` calls
# below fail with a path that points at the expected location.
data_dir_candidates = (
	"/home/nchibbar/Data",
	"/Users/naman/Workspace/Data/Long-Document-Summarization",
)
data_dir = next(
	(path for path in data_dir_candidates if os.path.isdir(path)),
	data_dir_candidates[0]
)

# Local model / tokenizer checkpoints
bart_dir = f"{data_dir}/Models/BART"
t5_dir = f"{data_dir}/Models/T5"
pegasus_dir = f"{data_dir}/Models/PEGASUS"
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"

# Processed datasets; the file listings drive the loading loops further down
govreport_dir = f"{data_dir}/GovReport/processed"
bigpatent_dir = f"{data_dir}/BigPatent/processed"
govreport_files = os.listdir(govreport_dir)
bigpatent_files = os.listdir(bigpatent_dir)

# Number of files found for each dataset
len(govreport_files), len(bigpatent_files)

(7238, 2856)

In [4]:
# Sentence transformer
# Automatically loads into gpu if available
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_encoder = SentenceTransformer(sent_dir)

name = "pegasus"

match name:

	case "bart":
		tokenizer = BartTokenizer.from_pretrained(bart_dir)
		model = BartForConditionalGeneration.from_pretrained(bart_dir)
		context_size = model.config.max_position_embeddings

	case "t5":
		tokenizer = T5Tokenizer.from_pretrained(t5_dir)
		model = T5ForConditionalGeneration.from_pretrained(t5_dir)
		context_size = model.config.n_positions

	case "pegasus":
		tokenizer = PegasusTokenizerFast.from_pretrained(pegasus_dir)
		model = PegasusForConditionalGeneration.from_pretrained(pegasus_dir)
		context_size = model.config.max_position_embeddings

	case "gpt":
		tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)
		model = "gpt-3.5-turbo"
		context_size = 4096

context_size

512

In [5]:
# Text processor that is handed to every encoder below and applied to the
# raw texts. What `preprocessing=True` enables is defined in the project's
# `TextProcessor` and is not visible here -- TODO confirm.
preprocessor = TextProcessor(preprocessing=True)
# No post-processing step; passed as-is to SummarizationPipeline below.
postprocessor = None

## BigPatent

In [None]:
# Word count of every text in every processed BigPatent file
word_counts = []
for file in bigpatent_files:
	file_path = f"{bigpatent_dir}/{file}"
	# Read as UTF-8 explicitly instead of depending on the locale's
	# default encoding, which differs between platforms.
	with open(file_path, encoding="utf-8") as fp:
		data = json.load(fp)
	# Each file holds several documents under the "texts" key
	word_counts.extend(count_words(text) for text in data["texts"])

# Reuse the notebook's histogram helper rather than repeating its body
plot_histogram(word_counts)

In [None]:
# Largest, smallest and mean word count across the collected texts.
max(word_counts), min(word_counts), np.mean(word_counts)

In [None]:
# Number of texts that are longer than 40,000 words
sum(1 for count in word_counts if count > 40_000)

## Rough

In [None]:
# Keep documents whose word count lies strictly between `min_words` and
# `max_words`, and stop after `max_texts` documents (`inf` means no limit).
min_words = 20_000
max_words = inf
max_texts = inf
texts, summaries = [], []
for file in govreport_files:
	# Compare with >= so that a cap of 0 or a non-integer cap still stops
	# the scan; an equality test would never fire for those values.
	if len(texts) >= max_texts:
		break
	file_path = f"{govreport_dir}/{file}"
	# Read as UTF-8 explicitly instead of depending on the locale's
	# default encoding, which differs between platforms.
	with open(file_path, encoding="utf-8") as fp:
		data = json.load(fp)
	if min_words < count_words(data["text"]) < max_words:
		texts.append(data["text"])
		summaries.append(data["summary"])

# Derived from the list itself so the count can never drift from it
num_texts = len(texts)
num_texts

In [6]:
# Segmenter built on NLTK's sentence tokenizer. `segment_min_words` is
# passed as its second argument -- presumably the minimum number of words
# per segment; confirm against `TextSegmenter`.
segment_min_words = 20
text_segmenter = TextSegmenter(nltk.sent_tokenize, segment_min_words)
# Separate text processor that is handed to KeywordScorer below.
# The exact effect of the two flags is defined in `TextProcessor`.
keywords_preprocessor = TextProcessor(
	only_words_nums = True,
	remove_nums = True
)
# Stop words from the project's helper, extended with its STOP_WORDS constant.
stop_words = get_stop_words(extra_stop_words=STOP_WORDS)
# Displayed as the cell result: how many stop words are in use.
len(stop_words)

392

In [7]:
# Encoder settings
min_tokens_frac = 0.5
head_size = 0.5
threshold = 0.7
boost = 0.03
num_keywords = 20
seed = 69

# Generation settings
min_summary_tokens = 300
temperature = 2.0
repetition_penalty = 3.0
top_p = 0.95

# Instruction sent along with the selected segments on the OpenAI route
system_prompt = "You will be given some segments of a very long document. Your task is to summarize the entire document as a whole by extracting key information and ideas from the segments. Generate a detailed, concise, and coherent summary in 500 words. Do not refer to the document in the summary in any way."

# One encoder per strategy; later cells rely on this order
# (`encoders[-1]` is the KeywordScorer).
encoders = [
	TruncateMiddle(tokenizer, context_size, 1, preprocessor),
	TruncateMiddle(tokenizer, context_size, head_size, preprocessor, True),
	UniformSampler(
		tokenizer,
		min_tokens_frac * context_size,
		context_size,
		text_segmenter,
		preprocessor,
		True,
		seed
	),
	SegmentSampler(
		tokenizer,
		min_tokens_frac * context_size,
		context_size,
		text_segmenter,
		sent_encoder,
		preprocessor,
		threshold,
		boost,
		seed
	),
	RemoveRedundancy(
		tokenizer,
		min_tokens_frac * context_size,
		context_size,
		text_segmenter,
		sent_encoder,
		preprocessor,
		threshold,
		seed
	),
	KeywordScorer(
		tokenizer,
		context_size,
		text_segmenter,
		sent_encoder,
		preprocessor,
		num_keywords,
		keywords_preprocessor,
		stop_words
	),
]

# Wrap every encoder in a pipeline. The OpenAI route defines
# `gpt_pipelines`; every other model defines `pipelines`.
if name == "gpt":
	gpt_pipelines = [
		OpenAIPipeline(model, encoder, system_prompt=system_prompt)
		for encoder in encoders
	]
else:
	pipelines = [
		SummarizationPipeline(
			model, encoder, postprocessor, min_summary_tokens,
			context_size, device, temperature, repetition_penalty, top_p
		)
		for encoder in encoders
	]

In [None]:
# For the preprocessed texts, compute the average number of words per
# segment and remember the longest segment of each text.
processed_texts = preprocessor(texts)
max_word_segments = []
total_words = 0
total_segments = 0
for text in processed_texts:
	segments = text_segmenter(text)
	for segment in segments:
		total_words += count_words(segment)
		total_segments += 1
	# `default` keeps one entry per text even when a text yields no
	# segments, so positions in `max_word_segments` still line up with
	# `texts` (assumes segments are strings, hence the empty string).
	max_word_segments.append(
		max(segments, key=count_words, default="")
	)
# Guard against dividing by zero when nothing was segmented at all
avg_segment = total_words / total_segments if total_segments else 0.0
avg_segment

In [None]:
# Print the position, word count and repr of every longest-segment entry
# that exceeds `bearable_words` words.
bearable_words = 150
for i, segment in enumerate(max_word_segments):
	words = count_words(segment)
	if not words > bearable_words:
		continue
	print(f"{i} {words}: {repr(segment)}", end="\n\n")

In [None]:
# Print the processed and the raw version of one document, separated by a
# long rule of "=" characters.
ind = 7143
# The separator is built with plain concatenation: reusing double quotes
# inside a double-quoted f-string is only valid syntax from Python 3.12 on
# and is a SyntaxError on the earlier versions this notebook can run with.
print(
	processed_texts[ind], texts[ind],
	sep = "\n\n" + "=" * 400 + "\n\n"
)

In [8]:
# NOTE(review): unpickling can execute arbitrary code while loading; only
# open result files that were produced by a trusted run of this project.
with open(f"{data_dir}/pegasus-govreport.pkl", "rb") as fp:
	results = pickle.load(fp)
# Pull the entries that the following cells use out of the results dict.
scores = results["scores"]
sort1, sort2, sort3 = results["sort1"], results["sort2"], results["sort3"]
gen_summaries = results["gen_summaries"]
# First entry of `scores`, indexed by `sort1`. The displayed values are in
# ascending order, so `sort1` looks like an argsort of `scores[0]` --
# confirm against the code that wrote the pickle.
scores[0][sort1]

array([0.675691  , 0.7162112 , 0.7574562 , 0.75933695, 0.76402986,
       0.7669531 , 0.767002  , 0.7677376 , 0.76811725, 0.7697828 ,
       0.77050906, 0.7721592 , 0.7731201 , 0.7739637 , 0.7757685 ,
       0.7759561 , 0.7759706 , 0.7761142 , 0.7779481 , 0.78167456,
       0.7828735 , 0.78393763, 0.784287  , 0.78458005, 0.78500414,
       0.78527856, 0.78571516, 0.78596365, 0.7860271 , 0.7860367 ,
       0.7872863 , 0.7875255 , 0.7876092 , 0.7876619 , 0.7879738 ,
       0.78806406, 0.7882248 , 0.7882396 , 0.7885411 , 0.7886226 ,
       0.78864753, 0.7886622 , 0.7890073 , 0.78908485, 0.78910667,
       0.78911954, 0.78919554, 0.7891998 , 0.7892448 , 0.78964186,
       0.78969586, 0.78992486, 0.79017186, 0.7904004 , 0.79045725,
       0.79065657, 0.7906741 , 0.79072493, 0.7908622 , 0.79091513,
       0.7913277 , 0.79158986, 0.79188424, 0.79211515, 0.79226184,
       0.79251784, 0.7925694 , 0.79282624, 0.79307777, 0.79335654,
       0.79359376, 0.7937025 , 0.7937305 , 0.79377246, 0.79380

In [10]:
# Take the document at the first position of `sort1` (presumably the one
# with the lowest first score -- confirm) and print the summary that was
# stored for it in the results file.
problem_text = results["texts"][sort1[0]]
print(gen_summaries[sort1[0]])

A glossary of terms: Aid to Families with Dependent Children (AFDC): Subsection Basic Eligibility: permits a state to give basic cash1-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-65561-6556


In [14]:
# Summarize the problem document again with the last pipeline, i.e. the one
# built from the last encoder in `encoders` (KeywordScorer).
# `pipelines` only exists when `name` is not "gpt".
print(pipelines[-1](problem_text))

The FBI and the FSB, Russia's Federal Security Service (FSB), have been working together on counter-terrorism issues for several years, according to a senior Obama Administration official who spoke on the condition of anonymity because he was not allowed to speak publicly about the matter. A senior Obama Administration official commented that missile defense "has never been about Russia." Former U.S Ambassador to Russia Michael McFaul stated that, in response to the Ukraine/Crimea crisis, the United States should cease cooperation with Russia on a number of fronts, including negotiations over joint missile defensive systems under the auspices of the NATO-Russia Council - but only if Moscow accepts the alliance's new European missile shield plans as part of its agreement to withdraw from the 1987 Intermediate-range Nuclear Forces Treaty (INF treaty) - which expires at the end of this year.A senior Obama Administration official commented that, in response to the Ukraine/Crimea crisis, th

In [None]:
# Encode the problem document with the last encoder (KeywordScorer) and
# decode the result back to text to see what the model is actually given.
# Assumes that with `return_batch=False` the encoder returns token ids that
# `tokenizer.decode` accepts -- TODO confirm.
enc = encoders[-1](problem_text, return_batch=False)
sent = tokenizer.decode(enc, skip_special_tokens=True)
print(sent)

In [None]:
# Preprocess the problem document and show the segments that the segmenter
# produces for it.
processed_problem = preprocessor(problem_text)
segments = text_segmenter(processed_problem)
segments

In [None]:
# Same call as in the earlier cell: summarize the problem document with the
# last pipeline (the one built from KeywordScorer).
print(pipelines[-1](problem_text))

In [None]:
# Print the entry of results["summaries"] for the same document
# (presumably its reference summary -- confirm against the results file).
print(results["summaries"][sort1[0]])