In [2]:
import json
import os
import numpy as np
import re
import nltk
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer
import inspect

from utils import *

device = get_device()
device

'mps'

In [19]:
stopwords = nltk.corpus.stopwords.words("english")
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## gov-report

In [7]:
data_dir = "/Users/naman/Workspace/Data/UCCS-REU"

crs_files = os.listdir(crs_dir := f"{data_dir}/GovReport/crs")
gao_files = os.listdir(gao_dir := f"{data_dir}/GovReport/gao")

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

preprocessor = TextPreprocessor()

crs files: 7238, gao files: 12228


### Processing text

In [28]:
for file in crs_files:
	with open(f"{crs_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections([data["reports"]])
	text = preprocessor.preprocess(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor.preprocess(summary)
	with open(f"{crs_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [34]:
for file in gao_files:
	with open(f"{gao_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections(data["report"])
	text = preprocessor.preprocess(text)
	print(data["highlight"])
	summary = "\n".join(data["highlight"])
	summary = preprocessor.preprocess(summary)
	with open(f"{gao_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

[]


---

In [14]:
tokenizer_dir = f"{data_dir}/Models/BART/tokenizer"
model_dir = f"{data_dir}/Models/BART/model"
checkpoint = "facebook/bart-large-cnn"

tokenizer = BartTokenizer.from_pretrained(tokenizer_dir)
model = BartForConditionalGeneration.from_pretrained(model_dir)

context_size, _ = max_lengths(model)
context_size

max_output_tokens = 500

In [8]:
file = f"{crs_out}/{crs_files[0]}"

with open(file) as fp:
	data = json.load(fp)
count_words(data["text"]), count_words(data["summary"])

(8357, 479)

In [37]:
def pick_sents(texts, sent_tokenizer, tokenizer, context_size):
	processed_texts = []

	for text in texts:
		# Extract and encode sentences
		sents = sent_tokenizer(text)
		sents = tokenizer(sents)["input_ids"]
		sents = np.array(sents, dtype=list)

		# Mean length of sentences
		mean_length = np.mean([
			len(sent) for sent in sents
		])

		# Approximate number of sentences needed
		num_samples = int(context_size / mean_length)

		# Check if there are enough sentences
		if len(sents) <= num_samples:
			flattened = [elm for lis in sents for elm in lis]
			processed_texts.append(flattened)
			continue

		# Sample until sentences fit in model
		while True:
			sampled = np.random.choice(sents, size=num_samples, replace=False)
			flattened = [elm for lis in sampled for elm in lis]
			if len(flattened) <= context_size:
				processed_texts.append(flattened)
				break

	# Pad sentences and create attention mask
	padded_ids = tokenizer.pad({
		"input_ids": processed_texts
	}, return_tensors="pt")

	return padded_ids

In [45]:
model_input = pick_sents([
	"today is a fine",
	"yeah, you're right its a really nice",
	data["text"]
], nltk.sent_tokenize, tokenizer, context_size)
model_input

{'input_ids': tensor([[    0, 34375,    16,  ...,     1,     1,     1],
        [    0, 42803,     6,  ...,     1,     1,     1],
        [    0,  4528,  4504,  ...,   532,     4,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [46]:
output = model.generate(**model_input, max_length=max_output_tokens)
output

tensor([[    2,     0, 34375,    16,    10,  2051,   183,     4,  2477,    16,
            10,   372,   183,     4,    85,    18,    86,     7,   386,    10,
            92,   183,     4,  2776,    10,    92,   301,     4,  2776,   452,
             4,  2776,   110,    92,   183,    19,    10,    92,  1786,     4,
          2776,    30,  1158,    23,     5,  2576,     9,     5,  1842,     8,
           173,   110,   169,    62,     7,     5,   299,     4,  2578,   259,
             7,   386,     4,     2,     1,     1],
        [    2,     0,     0, 42803,     6,    47,   214,   235,    63,    10,
           269,  2579,     4, 42803,     6,   370,   214,   235,     4,  3139,
            10,   269, 34033,     4,  8976,     6,    47,    32,   235,     4,
            85,    18,    10,   269, 41541,     4,  8976,     4,   370,   214,
          4070,     4,    85,    16,    10,   269, 16911,     4,  8976, 39747,
           370,   214, 13984,     4,    85,    17,    27,    29,    10, 30327,


In [47]:
output.shape

torch.Size([3, 66])

In [48]:
summary = tokenizer.decode(output[2])
print(summary)

</s><s>The Glass-Steagall Act was enacted in 1933 to prevent banks from taking on too much debt. The act was intended to prevent the creation of a financial system that was too risky for investors. The Act was later amended to allow banks to take on more debt. It was also intended to protect investors' interests.</s>


In [61]:
class TextPostprocessor:

	def __init__(self, special_tokens: list[str]):
		self.special_tokens = re.compile(r"|".join(special_tokens))
	
	def __call__(self, texts: list[str]):
		if isinstance(texts, str):
			texts = [texts]
		texts = [self.special_tokens.sub("", text) for text in texts]
		return texts

In [62]:
special_tokens = tokenizer.special_tokens_map.values()
postprocessor = TextPostprocessor(special_tokens)
special_tokens

dict_values(['<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>'])

In [63]:
postprocessor(summary)

["The Glass-Steagall Act was enacted in 1933 to prevent banks from taking on too much debt. The act was intended to prevent the creation of a financial system that was too risky for investors. The Act was later amended to allow banks to take on more debt. It was also intended to protect investors' interests."]

In [6]:
inp = truncate_middle([data["text"]], tokenizer, context_size, .4)
inp

{'input_ids': tensor([[    0, 43480, 24474,  ..., 10914,     4,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [7]:
out = model.generate(**inp, max_length=max_output_tokens)
out

tensor([[    2,     0, 44189,    12, 21426, 44195,  1783,  8254,  1861,  3454,
            31,   915,  3454,     4,  6974, 36466,     9,  1861,     8,   915,
          3454,    64,   244,  7540, 10246, 24323,  8068,  9023,    31,  4975,
            11,  5157,  1048,     4,    20, 10875,     6,    30,  1495,     6,
           473,    45,  1100,   141,   915,  1520,    32, 13588,   624,  5157,
          1048,    50,   141,   786, 27045,    64,   304,  5157,  1713,     7,
          1391,  2267,     8,  1861,  1126,     4,     2]])

In [8]:
tokenizer.decode(out[0])

'</s><s>Glass-Steagall Act separated commercial banking from investment banking. Separation of commercial and investment banking can help insulate insured depositories from volatility in securities markets. The separation, by itself, does not address how investment banks are regulated within securities markets or how nonbanks can use securities activities to fund consumer and commercial debt.</s>'

In [13]:
inp = truncate_middle([
	"hey, this is a small text!",
	data["text"]
], tokenizer, context_size)
inp

{'input_ids': tensor([[    0, 12229,     6,  ...,     1,     1,     1],
        [    0, 43480, 24474,  ..., 10914,     4,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [3]:
sent_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
sent_save_dir = f"{data_dir}/Models/Sent-Transformer"

sent_model = SentenceTransformer(sent_save_dir)
sent_model

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [9]:
out = sent_model.encode([
	"hey bruh",
	"whats up huh?",
	data["text"]
])
out

array([[-0.06765417, -0.03888808,  0.05449073, ...,  0.01187451,
         0.04905998,  0.01324423],
       [-0.11608477, -0.07559214,  0.05973152, ..., -0.01747406,
        -0.00257643,  0.05474273],
       [ 0.10994177, -0.04852243, -0.07865883, ...,  0.00304659,
         0.0611313 , -0.02873247]], dtype=float32)

In [10]:
out.shape

(3, 384)

In [65]:
class UniformSampler:

	def __init__(
			self, text_preprocessor, text_postprocessor,  sent_tokenizer, tokenizer,
			summarizer, summarizer_context_size, max_output_tokens
		):
		self.preprocessor = text_preprocessor
		self.postprocessor = text_postprocessor
		self.sent_tokenizer = sent_tokenizer
		self.tokenizer = tokenizer
		self.summarizer = summarizer
		self.context_size = summarizer_context_size
		self.max_tokens = max_output_tokens

	def __call__(self, texts: list[str]):
		texts = self.preprocessor(texts)
		inputs = self.pick_sents(texts)
		outputs = self.summarizer.generate(**inputs, max_length=self.max_tokens)
		summaries = [self.tokenizer.decode(out) for out in outputs]
		processed_summaries = self.postprocessor(summaries)
		return processed_summaries

	def pick_sents(self, texts):
		sent_tokenizer = self.sent_tokenizer
		tokenizer = self.tokenizer
		context_size = self.context_size

		processed_texts = []
		for text in texts:
			# Extract and encode sentences
			sents = sent_tokenizer(text)
			sents = tokenizer(sents)["input_ids"]
			sents = np.array(sents, dtype=list)

			# Mean length of sentences
			mean_length = np.mean([
				len(sent) for sent in sents
			])

			# Approximate number of sentences needed
			num_samples = int(context_size / mean_length)

			# Check if there are enough sentences
			if len(sents) <= num_samples:
				flattened = [elm for lis in sents for elm in lis]
				processed_texts.append(flattened)
				continue

			# Sample until sentences fit in model
			while True:
				sampled = np.random.choice(sents, size=num_samples, replace=False)
				flattened = [elm for lis in sampled for elm in lis]
				if len(flattened) <= context_size:
					processed_texts.append(flattened)
					break

		# Pad sentences and create attention mask
		padded_ids = tokenizer.pad({
			"input_ids": processed_texts
		}, return_tensors="pt")

		return padded_ids

In [67]:
summarizer = UniformSampler(
	preprocessor, postprocessor, nltk.sent_tokenize, tokenizer,
	model, context_size, max_output_tokens
)

In [68]:
summarizer(data["text"])

["The Glass-Steagall Act was designed to prevent banks from lending money to each other. It was intended to prevent the banks from using the money to invest in each other's businesses. The law was not intended to stop banks from investing in one another's businesses, but to prevent them from lending to one another."]