In [1]:
import os
from sys import getsizeof
from time import sleep
import json
import re
import inspect
from warnings import filterwarnings

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	PegasusForConditionalGeneration, PegasusTokenizerFast,
	GPT2TokenizerFast,
)
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from dotenv import load_dotenv
from summa.keywords import keywords
from summa.summarizer import summarize

from utils.helpers import *
from utils.encoders import *
from utils.pipelines import *
from utils.trainer_utils import *
from utils.evaluator_utils import *

In [2]:
inf = float("inf")
filterwarnings("ignore")
device = get_device()
# device = "cpu"
load_dotenv()

True

In [3]:
data_dir = "/Users/naman/Workspace/Data/Long-Document-Summarization"
data_dir = "/home/nchibbar/Data"

crs_files = os.listdir(crs_dir := f"{data_dir}/GovReport/crs")
gao_files = os.listdir(gao_dir := f"{data_dir}/GovReport/gao")

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

out_dir = f"{data_dir}/GovReport/processed"

crs files: 7238, gao files: 12228


In [4]:
# Sentence transformer
# Automatically loads into gpu if available
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_encoder = SentenceTransformer(sent_dir).to("cpu")

# BART
bart_dir = f"{data_dir}/Models/BART"
bart_tokenizer = BartTokenizer.from_pretrained(bart_dir)
bart_model = BartForConditionalGeneration.from_pretrained(bart_dir)
bart_context_size = bart_model.config.max_position_embeddings

# T5
t5_dir = f"{data_dir}/Models/T5"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_dir)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_dir)
t5_context_size = t5_model.config.n_positions

# Pegasus
pegasus_dir = f"{data_dir}/Models/PEGASUS"
pegasus_tokenizer = PegasusTokenizerFast.from_pretrained(pegasus_dir)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_dir)
pegasus_context_size = pegasus_model.config.max_position_embeddings

# GPT 3.5 turbo tokenizer
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)
gpt_model = "gpt-3.5-turbo"
gpt_context_size = 4096

bart_context_size, t5_context_size, pegasus_context_size

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


(1024, 4096, 512)

In [5]:
preprocessor = TextProcessor(preprocessing=True)
postprocessor = None

## GovReport

In [None]:
def combine_subsections(sections):
	text = ""
	for sec in sections:
		sec_text = "\n\n".join(sec["paragraphs"])
		if sec["section_title"]:
			sec_text = f"Section {sec["section_title"]}:\n\n{sec_text}"
		text = f"{text}\n\n{sec_text}" if text else sec_text
		if sec["subsections"]:
			sub_text = combine_subsections(sec["subsections"])
			text = f"{text}\n\n{sub_text}" if text else sub_text
	return text

In [None]:
num_crs_files = len(crs_files)
for i, file in enumerate(crs_files):
	full_path = os.path.join(crs_dir, file)
	with open(full_path) as fp:
		data = json.load(fp)
	clear_stdout()
	print(f"{num_crs_files - i} files left", end="")
	text = f"{data["title"]}\n\n"
	text += combine_subsections([data["reports"]])
	summary = " ".join(data["summary"])
	summary = preprocessor.process(summary)
	with open(f"{out_dir}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [None]:
for file in gao_files:
	file = os.path.join(gao_dir, file)
	with open(file) as fp:
		data = json.load(fp)
	text = combine_subsections(data["report"])
	text = preprocessor.process(text)
	print(data["highlight"])
	summary = "\n".join(data["highlight"])
	summary = preprocessor.process(summary)
	with open(f"{out_dir}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

## BigPatent

In [None]:
bigpatent_dir = f"{data_dir}/BigPatent/train/a"

bigpatent_files = os.listdir(bigpatent_dir)

word_counts = []
for file in bigpatent_files:
	with open(f"{bigpatent_dir}/{file}") as fp:
		for line in fp.readlines():
			data = json.loads(line)
			text = data["description"]
			word_counts.append(count_words(text))

bins = int(len(word_counts)**.5)
plt.hist(word_counts, bins=bins)
plt.show()

In [None]:
np.max(word_counts)

## LDA

In [None]:
vectorizer = CountVectorizer(stop_words="english")
vectorizer

In [None]:
dtm = vectorizer.fit_transform([data["description"]])
vectorizer

In [None]:
print(dtm)

In [None]:
topics = 1
lda = LatentDirichletAllocation(n_components=topics)
lda.fit(dtm)

In [None]:
topic_dist = lda.transform(dtm)
print(topic_dist)

In [None]:
def display_topics(model, feature_names, num_top_words):
	for topic_idx, topic in enumerate(model.components_):
		print(f"Topic {topic_idx}:")
		print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [None]:
num_top_words = 10
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, num_top_words)

## Rough

In [12]:
min_words = 70_000
max_words = inf
max_texts = 10
texts, summaries = [], []
num_texts = 0
for file in crs_files:
	with open(f"{out_dir}/{file}") as fp:
		data = json.load(fp)
	if min_words < count_words(data["text"]) < max_words:
		texts.append(data["text"])
		summaries.append(data["summary"])
		num_texts += 1
	if num_texts == max_texts:
		break

len(texts)

2

In [13]:
segment_min_words = 20
sent_segmenter = TextSegmenter(nltk.sent_tokenize, segment_min_words)

In [14]:
min_tokens_frac = .5
min_summary_tokens = 400
head_size = .5
threshold = .7
boost = .03
seed = 69
system_prompt = "You will be given some segments of a very long document. Your task is to summarize the entire document as a whole by extracting key information and ideas from the segments. Generate a detailed, concise, and coherent summary in 500 words. Do not refer to the document in the summary in any way."

sent_encoder.to(device)

bart_encoders = [
	TruncateMiddle(
		bart_tokenizer, bart_context_size, head_size, preprocessor, True
	),
	UniformSampler(
		bart_tokenizer, min_tokens_frac * bart_context_size, bart_context_size,
		sent_segmenter, preprocessor, True, seed
	),
	SegmentSampler(
		bart_tokenizer, min_tokens_frac * bart_context_size, bart_context_size,
		sent_segmenter, sent_encoder, preprocessor, True, threshold, boost, seed
	),
	RemoveRedundancy(
		bart_tokenizer, min_tokens_frac * bart_context_size, bart_context_size,
		sent_segmenter, sent_encoder, preprocessor, True, threshold, seed
	)
]
t5_encoders = [
	TruncateMiddle(
		t5_tokenizer, t5_context_size, head_size, preprocessor, True
	),
	UniformSampler(
		t5_tokenizer, min_tokens_frac * bart_context_size, t5_context_size,
		sent_segmenter, preprocessor, True, seed
	),
	SegmentSampler(
		t5_tokenizer, min_tokens_frac * bart_context_size, t5_context_size,
		sent_segmenter, sent_encoder, preprocessor, True, threshold, boost, seed
	),
	RemoveRedundancy(
		t5_tokenizer, min_tokens_frac * bart_context_size, t5_context_size,
		sent_segmenter, sent_encoder, preprocessor, True, threshold, seed
	)
]
gpt_encoders = [
	TruncateMiddle(
		gpt_tokenizer, gpt_context_size, head_size, preprocessor, True
	),
	UniformSampler(
		gpt_tokenizer, min_tokens_frac * gpt_context_size, gpt_context_size,
		sent_segmenter, preprocessor, True, seed
	),
	SegmentSampler(
		gpt_tokenizer, min_tokens_frac * gpt_context_size, gpt_context_size,
		sent_segmenter, sent_encoder, preprocessor, True, threshold, boost, seed
	),
	RemoveRedundancy(
		gpt_tokenizer, min_tokens_frac * gpt_context_size, gpt_context_size,
		sent_segmenter, sent_encoder, preprocessor, True, threshold, seed
	)
]
bart_pipelines = [
	SummarizationPipeline(
		bart_model, enc, postprocessor, min_summary_tokens,
		bart_context_size, device
	) for enc in bart_encoders
]
t5_pipelines = [
	SummarizationPipeline(
		t5_model, enc, postprocessor, min_summary_tokens,
		t5_context_size, device
	) for enc in t5_encoders
]
gpt_pipelines = [
	OpenAIPipeline(
		gpt_model, enc, system_prompt=system_prompt
	) for enc in gpt_encoders
]
pipelines = bart_pipelines + t5_pipelines

In [15]:
encodings1 = bart_encoders[1](texts, return_batch=False)
encodings2 = bart_encoders[2](texts, return_batch=False)

token_lengths1 = [len(enc) for enc in encodings1]
token_lengths2 = [len(enc) for enc in encodings2]

avg_tokens1 = np.mean(token_lengths1)
avg_tokens2 = np.mean(token_lengths2)

avg_tokens1, avg_tokens2

(749.5, 735.0)

In [None]:
for i, text in enumerate(texts):
	print(f"Processing text {i + 1}")
	bart_encoders[3](text, return_batch=False)

In [16]:
text = preprocessor(texts[3])
text = sent_segmenter(text)

[count_words(seg) for seg in text]

IndexError: list index out of range

In [17]:
text = texts[1]
text = preprocessor(text)

out = keywords(text, words=20)

print(out)

banks
bank
banking
financial
financially
financials
financier
governments
government
governance
governing
economic
economics
economically
markets
market
marketing
billion
billions
report
reported
reports
reporting
reportedly
countries
country
include
including
included
includes
new
economies
economy
china
international
internationally
internal
internally
states
state
stated
policy
policies
global
globally
globalization
said
globalized trading
national
nations
nationally
nationalize
nationalism
nationalization
nation
nationalizing
nationalized
crisis
trade
trades


In [10]:
summary = summarize(text)
print(summary)

The Global Financial Crisis: Analysis and Policy Implications Section Recent Developments1:
January 27 President Obama pledged to divert $30 billion of money repaid from the Troubled Asset Relief Program to smaller banks to help them make loans to small businesses.
September 24-25 At the Group of 20 Summit held in Pittsburgh, world leaders agreed to make the G-20 the leading forum for coordinating global economic policy; not to withdraw stimulus measures until a durable recovery is in place; to co-ordinate their exit strategies from the stimulus measures; to harmonize macroeconomic policies to avoid imbalances (America's deficits and Asia's savings glut) that worsened the financial crisis; and to eliminate subsidies on fossil fuels (only in the medium term).
Section The Global Financial Crisis and U.S Interests2:
Policymaking to deal with the global financial crisis and ensuing global recession has now moved from containing the contagion to specific actions aimed at promoting recovery 

In [18]:
def get_keywords(
		text: str,
		num_words: int = 20,
		stop_words: list | None = None
	):
	if stop_words is None:
		stop_words = []
	text = re.sub(r"[^\w\s']", "", text)
	text = re.sub(r"(\W|\b)[0-9]+(\W|\b)", "", text).strip()
	words = text.split()
	word_freq = {}
	for word in words:
		if word.lower() not in stop_words:
			word_freq[word] = word_freq.get(word, 0) + 1
	sorted_words = sorted(
		word_freq.keys(), key=lambda x: word_freq[x],
		reverse=True
	)
	return sorted_words[:num_words]

In [19]:
my_stop_words = [
	"also", "however", "therefore", "thus", "hence", "moreover",
	"must", "may", "might", "could", "would", "shall", "need",
	"needs", "given", "since", "though",
]

stop_words = nltk.corpus.stopwords.words("english") + my_stop_words

In [20]:
all_words = get_keywords(text, stop_words=stop_words)
all_words

['financial',
 'crisis',
 'countries',
 'US',
 'economic',
 'government',
 'banks',
 'IMF',
 'global',
 'United',
 'markets',
 'economy',
 'market',
 'China',
 'growth',
 'States',
 'Bank',
 'European',
 'stimulus',
 'credit']

In [40]:
for word in my_stop_words:
	if word in nltk.corpus.stopwords.words("english"):
		print(word)

In [64]:
re.sub(r"(\W|\b)[0-9]+(\W|\b)", "", "4534 4G 896 hv 7867")

' 4G  hv '