In [2]:
import json
import os
import numpy as np
import re
import nltk
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer
import inspect

from utils import *

device = get_device()
device

'mps'

In [2]:
data_dir = "/Users/naman/Workspace/Data/UCCS-REU"

crs_files = os.listdir(crs_dir := f"{data_dir}/GovReport/crs")
gao_files = os.listdir(gao_dir := f"{data_dir}/GovReport/gao")

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

preprocessor = TextPreprocessor()

crs files: 7238, gao files: 12228


In [8]:
tokenizer_dir = f"{data_dir}/Models/BART/tokenizer"
model_dir = f"{data_dir}/Models/BART/model"
checkpoint = "facebook/bart-large-cnn"

tokenizer = BartTokenizer.from_pretrained(tokenizer_dir)
model = BartForConditionalGeneration.from_pretrained(model_dir)

context_size, _ = max_lengths(model)
max_output_tokens = 500

context_size

In [None]:
special_tokens = tokenizer.special_tokens_map.values()
postprocessor = TextPostprocessor(special_tokens)
special_tokens

In [4]:
file = f"{crs_out}/{crs_files[0]}"

with open(file) as fp:
	data = json.load(fp)
count_words(data["text"]), count_words(data["summary"])

(8357, 479)

## gov-report

In [28]:
for file in crs_files:
	with open(f"{crs_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections([data["reports"]])
	text = preprocessor.preprocess(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor.preprocess(summary)
	with open(f"{crs_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [34]:
for file in gao_files:
	with open(f"{gao_dir}/{file}") as fp:
		data = json.load(fp)
	text = combine_subsections(data["report"])
	text = preprocessor.preprocess(text)
	print(data["highlight"])
	summary = "\n".join(data["highlight"])
	summary = preprocessor.preprocess(summary)
	with open(f"{gao_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

[]


## Scratch

In [6]:
inp = truncate_middle([data["text"]], tokenizer, context_size, .4)
inp

{'input_ids': tensor([[    0, 43480, 24474,  ..., 10914,     4,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [7]:
out = model.generate(**inp, max_length=max_output_tokens)
out

tensor([[    2,     0, 44189,    12, 21426, 44195,  1783,  8254,  1861,  3454,
            31,   915,  3454,     4,  6974, 36466,     9,  1861,     8,   915,
          3454,    64,   244,  7540, 10246, 24323,  8068,  9023,    31,  4975,
            11,  5157,  1048,     4,    20, 10875,     6,    30,  1495,     6,
           473,    45,  1100,   141,   915,  1520,    32, 13588,   624,  5157,
          1048,    50,   141,   786, 27045,    64,   304,  5157,  1713,     7,
          1391,  2267,     8,  1861,  1126,     4,     2]])

In [8]:
tokenizer.decode(out[0])

'</s><s>Glass-Steagall Act separated commercial banking from investment banking. Separation of commercial and investment banking can help insulate insured depositories from volatility in securities markets. The separation, by itself, does not address how investment banks are regulated within securities markets or how nonbanks can use securities activities to fund consumer and commercial debt.</s>'

In [13]:
inp = truncate_middle([
	"hey, this is a small text!",
	data["text"]
], tokenizer, context_size)
inp

{'input_ids': tensor([[    0, 12229,     6,  ...,     1,     1,     1],
        [    0, 43480, 24474,  ..., 10914,     4,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [3]:
sent_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
sent_save_dir = f"{data_dir}/Models/Sent-Transformer"

sent_model = SentenceTransformer(sent_save_dir)
sent_model

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [9]:
out = sent_model.encode([
	"hey bruh",
	"whats up huh?",
	data["text"]
])
out.shape

array([[-0.06765417, -0.03888808,  0.05449073, ...,  0.01187451,
         0.04905998,  0.01324423],
       [-0.11608477, -0.07559214,  0.05973152, ..., -0.01747406,
        -0.00257643,  0.05474273],
       [ 0.10994177, -0.04852243, -0.07865883, ...,  0.00304659,
         0.0611313 , -0.02873247]], dtype=float32)