In [1]:
import json
import os
import numpy as np
import re
import nltk
import stanza
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import inspect

from utils import *

In [19]:
stopwords = nltk.corpus.stopwords.words("english")
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [2]:
sample_text = """Hey there! 😊 Have you checked out the new website? Visit us at https://www.example.com or email us at info@example.com for more details. #ExcitingNews 📰

The meeting is scheduled for 12:30 PM on 15th August, 2023. Don't be late! 🚀 We’ll discuss the Q3 financials, which include a profit margin of 12.5% and a YoY growth of 8%. 📊

Here are the main points:

Increase marketing budget by 25% 🚀
Launch the new product line in Q4 🎉
Expand into the APAC region 🌏

To-do List:

 Finalize the budget proposal
 Review the Q3 report
 Prepare for the product launch event

Alice said, “We need to push our limits!” while Bob replied, “Let’s ensure quality over quantity.” This debate has been ongoing for weeks now. Some say, "Quality is key," but others argue, "We need quantity to drive growth."

Contact our support team at support@example.com or call 1-800-555-0199. For international calls, dial +1-800-555-0199.

Here's a snippet of code we need to review:

def hello_world():
    print("Hello, World!")

Make sure to check it and ensure it’s compliant with PEP 8 standards. Also, we have some legacy code:

<!DOCTYPE html>
<html>
<body>
    <h1>My First Heading</h1>
    <p>My first paragraph.</p>
</body>
</html>

Random thoughts:

I can't believe it's almost the end of the year!
What are everyone's New Year's resolutions? 🎆
The weather has been crazy lately, right? 🌦️
Check out this great article: https://www.example.com/blog/awesome-article. It's packed with useful info and insights.

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue semper porta. Mauris massa.

Nullam quis risus eget urna mollis ornare vel eu leo.
Integer posuere erat a ante venenatis dapibus posuere velit aliquet.
Special characters to consider: !@#$%^&*()_+-=[]{}|;':",.<>/?`~

Do you remember the last project's details? If not, here's a quick refresher: the budget was $500,000, and the timeline was 6 months. We faced several challenges, including supply chain issues and unexpected delays. Yet, we managed to deliver on time. Kudos to the team! 🎉

By the way, here’s a fun fact: the longest recorded flight of a chicken is 13 seconds! 🐔 How cool is that?

Our next team-building activity will be a hiking trip. Bring your gear and be ready for an adventure. 🌄

Hiking Essentials:
Sturdy boots 🥾
Water bottle 🚰
Snacks 🍫
First-aid kit 🩹
The goal is to reach the summit by noon. Let's make it happen, team! 💪
"""

In [86]:
preprocessed_text = preprocessor(sample_text)
# preprocessed_text = preprocessor(sample_text, stop_words=stopwords)
print(preprocessed_text)

Hey there! Have you checked out the new website? Visit us at or email us at for more details.

The meeting is scheduled for 12:30 PM on 15th August, 2023. Don't be late! We'll discuss the Q3 financials, which include a profit margin of 12.5% and a YoY growth of 8%.

Here are the main points:

Increase marketing budget by 25%
Launch the new product line in Q4
Expand into the APAC region

To-do List:

 Finalize the budget proposal
 Review the Q3 report
 Prepare for the product launch event

Alice said, "We need to push our limits!" while Bob replied, "Let's ensure quality over quantity." This debate has been ongoing for weeks now. Some say, "Quality is key," but others argue, "We need quantity to drive growth."

Contact our support team at or call 1-800-555-0199. For international calls, dial +1-800-555-0199.

Here's a snippet of code we need to review:

def hello_world():
 print("Hello, World!")

Make sure to check it and ensure it's compliant with PEP 8 standards. Also, we have some le

In [100]:
stanza_tokenizer = stanza.Pipeline("en")
parsed_text = stanza_tokenizer(preprocessed_text)
[sentence.text for sentence in parsed_text.sentences]

['Hey there!',
 'Have you checked out the new website?',
 'Visit us at or email us at for more details.',
 'The meeting is scheduled for 12:30 PM on 15th August, 2023.',
 "Don't be late!",
 "We'll discuss the Q3 financials, which include a profit margin of 12.5% and a YoY growth of 8%.",
 'Here are the main points:',
 'Increase marketing budget by 25%\nLaunch the new product line in Q4\nExpand into the APAC region',
 'To-do List:',
 'Finalize the budget proposal\n Review the Q3 report\n Prepare for the product launch event',
 'Alice said, "We need to push our limits!" while Bob replied, "Let\'s ensure quality over quantity."',
 'This debate has been ongoing for weeks now.',
 'Some say, "Quality is key," but others argue, "We need quantity to drive growth."',
 'Contact our support team at or call 1-800-555-0199.',
 'For international calls, dial +1-800-555-0199.',
 "Here's a snippet of code we need to review:",
 'def hello_world():\n print("Hello, World!")',
 "Make sure to check it and 

## gov-report

In [3]:
data_dir = "/Users/naman/Workspace/Data/UCCS-REU"

crs_files = os.listdir(crs_dir := f"{data_dir}/GovReport/crs")
gao_files = os.listdir(gao_dir := f"{data_dir}/GovReport/gao")

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

preprocessor = TextPreprocessor()

crs files: 7238, gao files: 12228


### Processing text

In [28]:
for file in crs_files:
	with open(f"{crs_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections([data["reports"]])
	text = preprocessor(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor(summary)
	with open(f"{crs_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [34]:
for file in gao_files:
	with open(f"{gao_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections(data["report"])
	text = preprocessor(text)
	print(data["highlight"])
	summary = "\n".join(data["highlight"])
	summary = preprocessor(summary)
	with open(f"{gao_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

[]


---

In [154]:
tokenizer_dir = f"{data_dir}/Models/BART/tokenizer"
model_dir = f"{data_dir}/Models/BART/model"
checkpoint = "facebook/bart-large-cnn"

tokenizer = BartTokenizer.from_pretrained(tokenizer_dir)
model = BartForConditionalGeneration.from_pretrained(model_dir)

In [60]:
print(inspect.signature(tokenizer))

(text: Union[str, List[str], List[List[str]]] = None, text_pair: Union[str, List[str], List[List[str]], NoneType] = None, text_target: Union[str, List[str], List[List[str]]] = None, text_pair_target: Union[str, List[str], List[List[str]], NoneType] = None, add_special_tokens: bool = True, padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False, truncation: Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy] = None, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, **kwargs) -> transformers.tokenization_utils_base.BatchEncodi

In [4]:
file = f"{crs_out}/{crs_files[0]}"

with open(file) as fp:
	data = json.load(fp)
count_words(data["text"]), count_words(data["summary"])

(8357, 479)

In [6]:
inputs = tokenizer([data["text"]], return_tensors="pt")
inputs["input_ids"].shape

torch.Size([1, 10740])

In [125]:
context_size, _ = max_lengths(model)
context_size

1024

In [24]:
model_input = pick_sents(data["text"], nltk.sent_tokenize, tokenizer, context_size)
model_input.shape

torch.Size([861])

In [120]:
output = model.generate(model_input[None, :], max_length=500)
output

tensor([[    2,     0,   133,  1783,    21,  1887,     7,  2097,  1520,    31,
          7580,    11, 21779,  1713,     4,    20,  1783,    21,  1595,    11,
             5,  3874,     9,     5,   613,  1486,     9,  3010,    12, 27418,
             4,    20,  1760,    21,  1595,    30,     5,   121,     4,   104,
             4,  1148,    11,   494,  2266,     4,    85,    21,  1595,    19,
             5,   323,     9,     5,   446,     9,  7395,     8,     5,  1112,
             4,     2]])

In [123]:
tokenizer.decode(output[0])

'</s><s>The Act was designed to prevent banks from engaging in speculative activities. The Act was passed in the wake of the financial crisis of 2007-2008. The act was passed by the U.S. Congress in March 2008. It was passed with the support of the House of Representatives and the Senate.</s>'

In [124]:
data["summary"]

'The phrase "Glass-Steagall" generally refers to the separation of commercial banking from investment banking. Congress effected a separation of commercial and investment banking through four sections of the Banking Act of 1933Sections 16, 20, 21, and 32. These four statutory provisions are commonly referred to as the Glass-Steagall Act.\nKey Takeaways of This Report\nThe Glass-Steagall debate is not centered on prohibiting risky financial services; rather, the debate is about whether to permit inherently risky commercial and investment banking activities to be conducted within a single firmspecifically within firms holding federally insured deposits. Over the course of the nearly 70-year-long Glass-Steagall era, the clear-cut separation of traditional commercial banking and securities activities gradually eroded. This erosion was the result of a confluence of matters, including market changes, statutory changes, and regulatory and judicial interpretations. The Glass-Steagall era forma

In [135]:
def truncate_middle(text_ids, size, head_size=.5):
	head_len = int(size * head_size)
	tail_len = size - head_len
	truncated = np.concatenate([
		text_ids[:head_len],
		text_ids[len(text_ids) - tail_len:]
	])
	return torch.tensor(truncated)

In [157]:
all_tokenized = tokenizer([data["text"]], return_tensors="pt")["input_ids"]
all_tokenized.shape

torch.Size([1, 10740])

In [163]:
inp = truncate_middle(all_tokenized[0], context_size)
inp.shape

torch.Size([1024])

In [165]:
out = model.generate(inp[None, :], max_length=500)
out

tensor([[    2,     0, 25997,  8254,  1861,     8,   915,  3454,   149,   237,
          9042,     9,     5, 12539,  1783,     9, 26873,     4,    20, 18753,
            12, 16025,  1783,    34,   314, 29929, 37028,   545,     8,   733,
             9,     5, 10352,    12, 21426, 44195,  1783,     4,  6974, 36466,
             9,   915,  3454,   189,   244,  1888,     5, 13879,     9, 14951,
          8068,  9023,     4,    20, 10875,     6,    30,  1495,     6,   473,
            45,  1100,   141,   915,  1520,    32, 13588,   624,  5157,  1048,
             4,     2]])

In [166]:
tokenizer.decode(out[0])

'</s><s>Congress separated commercial and investment banking through four sections of the Banking Act of 1933. The Dodd-Frank Act has left untouched Sections 16 and 21 of the Glass-Steagall Act. Separation of investment banking may help reduce the complexity of examining depositories. The separation, by itself, does not address how investment banks are regulated within securities markets.</s>'