In [1]:
import json
import os
import numpy as np
import re
import nltk
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import inspect

from utils import *

In [19]:
stopwords = nltk.corpus.stopwords.words("english")
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## gov-report

In [2]:
data_dir = "/Users/naman/Workspace/Data/UCCS-REU"

crs_files = os.listdir(crs_dir := f"{data_dir}/GovReport/crs")
gao_files = os.listdir(gao_dir := f"{data_dir}/GovReport/gao")

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

preprocessor = TextPreprocessor()

crs files: 7238, gao files: 12228


### Processing text

In [28]:
for file in crs_files:
	with open(f"{crs_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections([data["reports"]])
	text = preprocessor.preprocess(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor.preprocess(summary)
	with open(f"{crs_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [34]:
for file in gao_files:
	with open(f"{gao_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections(data["report"])
	text = preprocessor.preprocess(text)
	print(data["highlight"])
	summary = "\n".join(data["highlight"])
	summary = preprocessor.preprocess(summary)
	with open(f"{gao_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

[]


---

In [3]:
tokenizer_dir = f"{data_dir}/Models/BART/tokenizer"
model_dir = f"{data_dir}/Models/BART/model"
checkpoint = "facebook/bart-large-cnn"

tokenizer = BartTokenizer.from_pretrained(tokenizer_dir)
model = BartForConditionalGeneration.from_pretrained(model_dir)

In [4]:
file = f"{crs_out}/{crs_files[0]}"

with open(file) as fp:
	data = json.load(fp)
count_words(data["text"]), count_words(data["summary"])

(8357, 479)

In [5]:
context_size, _ = max_lengths(model)
context_size

1024

In [6]:
inputs = tokenizer([data["text"]], return_tensors="pt")
inputs["input_ids"].shape

torch.Size([1, 10740])

In [18]:
model_input = pick_sents([data["text"]], nltk.sent_tokenize, tokenizer, context_size)
model_input

{'input_ids': tensor([[    0,   133,   315,   532,  2152,  6636,  3454, 16645,  2052,     7,
             5, 39553,     9,     5, 10352,    12, 21426, 44195,  1783,     4,
             2,     0,   133,  4033,    74,  2120,     7,  7057, 10352,    12,
         21426, 44195,    11,   430,  1319,     6,  1712,  4146,     9,   106,
            74,  1622,   769,    12,   225,  7257,     5,  1461, 10352,    12,
         21426, 44195, 17947,  2788,    14,    21, 29643,    30, 12209,  3813,
             4,     2,     0, 38867,   940, 10514,     7,  2337,  2973,     6,
           217,  3881,     7,   696,  1126,  5157,     6,    67,    21,  1202,
            13,   650,     8,  1084,    12,  8407,  1520,     6,    11,   233,
           142,  5157,  1048,    58,    67, 15902,   148,     5,  2860, 23384,
             8,   142,  2735,  3353,   747,    56,    45, 14681,  5157,  1048,
          5705,   615,    50,    15,    10,   739,   615,  3189,  2052,     7,
             5,  2860, 23384,     7,  

In [19]:
output = model.generate(**model_input, max_length=500)
output

tensor([[    2,     0,   133, 10352,    12, 21426, 44195,  1783,    21,  1595,
            11, 26873,     4,    85,    21,  3833,     7,  2097,  1520,    31,
           602,    15,   350,   203,  1126,     4,    20,  1783,    21,   423,
         13522,     7,  1157,     5,  1520,     7,   185,    15,    55,  1126,
             4,    85,    67,  1220,   106,     7,  1331,   103,     9,     5,
          1126,     7,    97,  1520,     4,    20,  1760,    21, 29643,    11,
          4013,     4,     2]])

In [20]:
tokenizer.decode(output[0])

'</s><s>The Glass-Steagall Act was passed in 1933. It was intended to prevent banks from taking on too much debt. The Act was later amended to allow the banks to take on more debt. It also allowed them to sell some of the debt to other banks. The act was repealed in 2005.</s>'

In [124]:
data["summary"]

'The phrase "Glass-Steagall" generally refers to the separation of commercial banking from investment banking. Congress effected a separation of commercial and investment banking through four sections of the Banking Act of 1933Sections 16, 20, 21, and 32. These four statutory provisions are commonly referred to as the Glass-Steagall Act.\nKey Takeaways of This Report\nThe Glass-Steagall debate is not centered on prohibiting risky financial services; rather, the debate is about whether to permit inherently risky commercial and investment banking activities to be conducted within a single firmspecifically within firms holding federally insured deposits. Over the course of the nearly 70-year-long Glass-Steagall era, the clear-cut separation of traditional commercial banking and securities activities gradually eroded. This erosion was the result of a confluence of matters, including market changes, statutory changes, and regulatory and judicial interpretations. The Glass-Steagall era forma

In [157]:
all_tokenized = tokenizer([data["text"]], return_tensors="pt")["input_ids"]
all_tokenized.shape

torch.Size([1, 10740])

In [6]:
inp = truncate_middle([data["text"]], tokenizer, context_size, .4)
inp

{'input_ids': tensor([[    0, 43480, 24474,  ..., 10914,     4,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [7]:
out = model.generate(**inp, max_length=500)
out

tensor([[    2,     0, 44189,    12, 21426, 44195,  1783,  8254,  1861,  3454,
            31,   915,  3454,     4,  6974, 36466,     9,  1861,     8,   915,
          3454,    64,   244,  7540, 10246, 24323,  8068,  9023,    31,  4975,
            11,  5157,  1048,     4,    20, 10875,     6,    30,  1495,     6,
           473,    45,  1100,   141,   915,  1520,    32, 13588,   624,  5157,
          1048,    50,   141,   786, 27045,    64,   304,  5157,  1713,     7,
          1391,  2267,     8,  1861,  1126,     4,     2]])

In [8]:
tokenizer.decode(out[0])

'</s><s>Glass-Steagall Act separated commercial banking from investment banking. Separation of commercial and investment banking can help insulate insured depositories from volatility in securities markets. The separation, by itself, does not address how investment banks are regulated within securities markets or how nonbanks can use securities activities to fund consumer and commercial debt.</s>'