In [1]:
import os
from sys import getsizeof
import json
import re
import pickle
import inspect
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	GPT2TokenizerFast
)
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from dotenv import load_dotenv
import tiktoken
import openai

from utils.pipelines import *
from utils.helpers import *

In [2]:
# Load variables from a local .env file into the process environment.
# The OpenAI client used further down reads OPENAI_API_KEY from the environment.
load_dotenv()

True

In [3]:
# Root folder holding the GovReport dataset and the local model checkpoints.
# Override it per machine through the UCCS_REU_DATA_DIR environment variable
# (e.g. UCCS_REU_DATA_DIR=/Users/naman/Workspace/Data/UCCS-REU in the .env file)
# instead of editing this cell.
data_dir = os.environ.get("UCCS_REU_DATA_DIR", "/home/nchibbar/Data")

# Raw GovReport inputs
crs_dir = f"{data_dir}/GovReport/crs"
gao_dir = f"{data_dir}/GovReport/gao"

# os.listdir returns entries in arbitrary order; sorting makes every cell that
# iterates over these lists see the same order on every run and machine.
crs_files = sorted(os.listdir(crs_dir))
gao_files = sorted(os.listdir(gao_dir))

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

# Destinations for the flattened {"text", "summary"} JSON files
crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

crs files: 7238, gao files: 12228


In [4]:
# Token budget handed to every SummarizationPipeline built later in the notebook
# (presumably the cap on generated summary length — confirm in utils.pipelines)
max_tokens = 512

# Sentence transformer
# Loaded from a local directory; sent_checkpoint is not used in this cell
# (presumably the hub id the local copy was downloaded from — TODO confirm)
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"

sent_encoder = SentenceTransformer(sent_dir)
# NOTE(review): bare expression in the middle of a cell; Jupyter only displays
# a cell's last expression, so this line shows nothing
sent_encoder

# BART
# The tokenizer is read from bart_dir while the weights are read from
# bart_fine_tuned; bart_checkpoint is not used in this cell
bart_dir = f"{data_dir}/Models/BART"
bart_fine_tuned = f"{data_dir}/Models/BART-GovReport-SentenceSampler"
bart_checkpoint = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(bart_dir)
model = BartForConditionalGeneration.from_pretrained(bart_fine_tuned)
# Number of positions declared by the model config (the recorded output shows 1024)
context_size = model.config.max_position_embeddings

# T5
# t5_dir = f"{data_dir}/Models/T5"
# t5_checkpoint = "google/flan-t5-base"
# t5_checkpoint = "pszemraj/long-t5-tglobal-base-16384-book-summary"
# tokenizer = T5Tokenizer.from_pretrained(t5_dir)
# model = T5ForConditionalGeneration.from_pretrained(t5_dir)
# context_size = model.config.n_positions

# GPT 3.5 turbo tokenizer
# NOTE(review): loaded through GPT2TokenizerFast from a local directory; whether
# that directory matches gpt-3.5-turbo's tokenization cannot be told from here
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)

context_size

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1024

In [5]:
# Collect the tokenizer's special-token strings through the project helper
# (the recorded output contains duplicates such as '<s>' and '</s>')
special_tokens = extract_special_tokens(
	tokenizer.special_tokens_map.values()
)
# Text processor handed to the encoders and used when flattening the raw reports
preprocessor = TextProcessor(preprocessing=True)
# No post-processing of generated summaries; the commented line below is the
# alternative that would pass the special tokens as ignore_tokens
postprocessor = None
# postprocessor = TextProcessor(ignore_tokens=special_tokens)
special_tokens

['<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>']

In [None]:
# Hyper-parameters shared by the encoders below
head_size, threshold, seed = .5, .7, 69

# The detected device is overridden straight away: everything below runs on CPU
device = get_device()
device = "cpu"

# SentenceSampler and RemoveRedundancy are constructed from the same arguments
_embedding_args = (
	tokenizer, context_size, nltk.sent_tokenize, sent_encoder,
	preprocessor, threshold, device, seed
)

# One encoder per strategy for fitting a long document into the context window
encoders = [
	TruncateMiddle(tokenizer, context_size, head_size, preprocessor),
	UniformSampler(
		tokenizer, context_size, nltk.sent_tokenize, preprocessor, seed
	),
	SentenceSampler(*_embedding_args),
	RemoveRedundancy(*_embedding_args),
]

# One summarization pipeline per encoder, all sharing the same model
pipelines = [
	SummarizationPipeline(model, enc, max_tokens, postprocessor, device)
	for enc in encoders
]

## GovReport

In [None]:
def combine_subsections(sections):
	"""
	Flatten a (possibly nested) list of report sections into a single string.

	Each section is a mapping that may carry:
	- "section_title": heading text; when non-empty the section body is
	  prefixed with "Section <title>:" and a blank line
	- "paragraphs": list of paragraph strings, joined with blank lines
	- "subsections": list of sections of the same shape, flattened recursively
	  and placed right after their parent section

	Missing or empty keys are treated as "nothing to add", and pieces that end
	up empty are skipped so they do not leave stray blank-line separators.

	Returns the pieces joined by blank lines ("" for an empty list).
	"""
	parts = []
	for sec in sections:
		sec_text = "\n\n".join(sec.get("paragraphs") or [])
		title = sec.get("section_title")
		if title:
			# Title is pulled into a local first: reusing double quotes inside
			# an f-string replacement field only parses on Python 3.12+
			sec_text = f"Section {title}:\n\n{sec_text}"
		if sec_text:
			parts.append(sec_text)
		subsections = sec.get("subsections")
		if subsections:
			sub_text = combine_subsections(subsections)
			if sub_text:
				parts.append(sub_text)
	# Single join instead of growing one string piece by piece
	return "\n\n".join(parts)

In [None]:
# Flatten every raw CRS report into {"text", "summary"} and write it to crs_out.
# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs(crs_out, exist_ok=True)
for file in crs_files:
	# JSON is read and written as UTF-8 regardless of the platform default
	with open(f"{crs_dir}/{file}", encoding="utf-8") as fp:
		data = json.load(fp)
	# data["reports"] is a single root section, hence the wrapping list
	text = combine_subsections([data["reports"]])
	text = preprocessor.process(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor.process(summary)
	with open(f"{crs_out}/{file}", "w", encoding="utf-8") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [None]:
# Flatten every raw GAO report into {"text", "summary"} and write it to gao_out.
# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs(gao_out, exist_ok=True)
for file in gao_files:
	# JSON is read and written as UTF-8 regardless of the platform default
	with open(f"{gao_dir}/{file}", encoding="utf-8") as fp:
		data = json.load(fp)
	# data["report"] is already a list of sections
	text = combine_subsections(data["report"])
	text = preprocessor.process(text)
	# NOTE(review): in the GovReport release the GAO "highlight" entries appear
	# to be section dicts with a "paragraphs" list rather than plain strings —
	# confirm against the data. Plain strings are joined as before; dict
	# entries contribute their paragraphs (section titles are not included).
	summary = "\n".join(
		item if isinstance(item, str)
		else "\n".join(item.get("paragraphs", []))
		for item in data["highlight"]
	)
	# Same processing entry point as for the text above and for the CRS
	# summaries (this line previously called .preprocess)
	summary = preprocessor.process(summary)
	with open(f"{gao_out}/{file}", "w", encoding="utf-8") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

## LDA

In [None]:
# Bag-of-words vectorizer configured with scikit-learn's built-in English stop-word list
vectorizer = CountVectorizer(stop_words="english")
vectorizer

In [None]:
# Build the document-term matrix from an explicitly loaded processed report.
# Reading data["text"] here depended on whichever cell last assigned `data`;
# on a top-to-bottom run that is the raw GAO JSON, which has no "text" key.
# NOTE(review): the first processed CRS report is an arbitrary choice — swap in
# the document(s) the topic model is actually meant to describe.
with open(f"{crs_out}/{crs_files[0]}", encoding="utf-8") as fp:
	lda_text = json.load(fp)["text"]
dtm = vectorizer.fit_transform([lda_text])
# (documents, vocabulary size)
dtm.shape

In [None]:
# Show the document-term matrix produced by the vectorizer
print(dtm)

In [None]:
# Number of latent topics to fit
topics = 4
# random_state pins LDA's random initialisation so the fitted topics are the
# same on every run; `seed` is the notebook-wide seed set with the encoder
# hyper-parameters.
lda = LatentDirichletAllocation(n_components=topics, random_state=seed)
lda.fit(dtm)

In [None]:
# Topic mixture of each row of the document-term matrix under the fitted LDA model
topic_dist = lda.transform(dtm)
print(topic_dist)

In [None]:
def display_topics(model, feature_names, num_top_words):
	"""
	Print the highest-weighted words of every topic of a fitted topic model.

	model: object exposing `components_`, one row of per-word weights per topic
	feature_names: vocabulary, indexable by the column positions of `components_`
	num_top_words: how many words to list per topic, heaviest first

	For each topic this prints a "Topic <index>:" line followed by a line with
	the selected words separated by spaces.
	"""
	for topic_idx, weights in enumerate(model.components_):
		# argsort is ascending: reverse it, then keep the leading entries
		top_indices = weights.argsort()[::-1][:num_top_words]
		top_words = [feature_names[idx] for idx in top_indices]
		print(f"Topic {topic_idx}:")
		print(" ".join(top_words))

In [None]:
# List the 10 highest-weighted vocabulary terms of each fitted LDA topic
num_top_words = 10
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, num_top_words)

## Rough

In [6]:
# Start with empty, parallel lists of documents and reference summaries
texts, summaries = [], []

In [7]:
# Pick the first processed CRS report with at least min_words_text words.
# max 73_791 (original note — presumably the largest word count in the corpus)
min_words_text = 50_000
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	if count_words(data["text"]) >= min_words_text:
		# Append only on a real match, then stop scanning
		texts.append(data["text"])
		summaries.append(data["summary"])
		break
else:
	# Without this branch the last file read would be appended even though it
	# is below the threshold (and `data` would be undefined for an empty list)
	raise ValueError(
		f"no processed CRS report has at least {min_words_text} words"
	)

count_words(data["text"]), count_words(data["summary"])

(53559, 500)

In [5]:
# Load every processed CRS report; texts[i] and summaries[i] belong together
texts = []
summaries = []
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	texts += [data["text"]]
	summaries += [data["summary"]]

len(texts)

7238

In [9]:
# Batches of 3 documents; no batch size at all when fewer than 3 texts are loaded
batch_size = 3 if len(texts) >= 3 else None
# Evaluator over all pipelines, with the loaded texts and their reference summaries
evaluator = Evaluator(pipelines, texts, summaries, device=device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# NOTE(review): num_workers is computed but never passed on — the call below
# hands None as its second argument; confirm whether that is intended
num_workers = os.cpu_count()
# num_workers = 3
# The recorded output is a list of four floats, one per pipeline
# (presumably elapsed seconds — confirm in utils)
evaluator.generate_summaries(batch_size, None)

[6870.044142997358, 3887.949132011272, 4018.1995559833013, 9793.55108900927]

In [11]:
# ROUGE results; the recorded output holds one dict per pipeline with
# rouge-1/2/l/w entries of three numbers each (their order is not shown here)
evaluator.get_rouge_score()

[{'rouge-1': [0.18313253012048192, 0.76, 0.10410958904109589],
  'rouge-2': [0.11138014527845035, 0.46938775510204084, 0.06318681318681318],
  'rouge-l': [0.23237093419338278, 0.7251437262463217, 0.13835293516474403],
  'rouge-w': [0.041725977592485294, 0.420434591230423, 0.02195231621305737]},
 {'rouge-1': [0.0625, 0.2549019607843137, 0.03561643835616438],
  'rouge-2': [0.009661835748792272, 0.04, 0.005494505494505495],
  'rouge-l': [0.08358463007595818, 0.25725166042361713, 0.04989870556426403],
  'rouge-w': [0.01384352889403138, 0.13688925530533755, 0.00729040142495195]},
 {'rouge-1': [0.0625, 0.2549019607843137, 0.03561643835616438],
  'rouge-2': [0.009661835748792272, 0.04, 0.005494505494505495],
  'rouge-l': [0.08358463007595818, 0.25725166042361713, 0.04989870556426403],
  'rouge-w': [0.01384352889403138, 0.13688925530533755, 0.00729040142495195]},
 {'rouge-1': [0.1411764705882353, 0.5, 0.0821917808219178],
  'rouge-2': [0.023640661938534275, 0.0847457627118644, 0.01373626373626

In [24]:
# BERTScore results; the recorded output holds one list of three numbers per pipeline
evaluator.get_bert_score()

[[0.8340007662773132, 0.8763248920440674, 0.7955765724182129],
 [0.7822791934013367, 0.8163412809371948, 0.7509458065032959],
 [0.7822791934013367, 0.8163412809371948, 0.7509458065032959],
 [0.7969955801963806, 0.8280269503593445, 0.7682060599327087]]

In [106]:
# Encode the loaded texts with the third encoder (SentenceSampler) and decode
# the first encoded document back into a string for inspection
_sentence_sampler = encoders[2]
_encoded = _sentence_sampler(texts)
_first_input_ids = _encoded["input_ids"][0]
inp = tokenizer.decode(_first_input_ids)
inp

"<s>Participation data: In June 2009, a total of 18 million families, composed of 43 million recipients (including 33 million children), received TANF- or MOE-funded cash assistance In June 2010, a total of 19 million families, composed of 45 million recipients (including 34 million children), received TANF- or MOE-funded cash assistance The larger number of individuals or families receiving any TANF- or MOE-funded benefit or service is not known\n\nCRS report: CRS Report R40946, The Temporary Assistance for Needy Families Block Grant: An Introduction, by [author name scrubbed].\nIn some limited circumstances, families may be low-income, with incomes as high as 80% of area median income\n\nForm and recipient of federal assistance: Project-based rental assistance contracts between HUD and private property owners HUD has not had the authority to enter into new contracts since 1983, but does have the authority to renew existing contracts when they expire There are properties with project-

In [8]:
# Reset first so that a failed call cannot leave an older `response` around
# for the following cells to consume by accident.
response = None
try:
	response = openai.chat.completions.create(
		model="gpt-3.5-turbo",
		messages=[
			{"role": "system", "content": "You are a summarizer who summarizes very long texts given important sentences."},
			{"role": "user", "content": inp}
		],
		# max_tokens limits the generated tokens, and prompt plus completion
		# have to fit in the model's context window together. 4097 asked for
		# roughly gpt-3.5-turbo's original 4k window as completion alone,
		# leaving no room for the prompt. Use the notebook-wide summary budget.
		max_tokens=max_tokens
	)
except Exception as e:
	# Report the failure (e.g. missing OPENAI_API_KEY) without stopping the notebook
	show_exception(e)

Encountered exception of type OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable, Traceback (most recent call last):
  File "/tmp/ipykernel_26527/1819395941.py", line 2, in <module>
    response = openai.chat.completions.create(
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nchibbar/naman-venv-3.12/lib/python3.12/site-packages/openai/_utils/_proxy.py", line 20, in __getattr__
    proxied = self.__get_proxied__()
              ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nchibbar/naman-venv-3.12/lib/python3.12/site-packages/openai/_utils/_proxy.py", line 55, in __get_proxied__
    return self.__load__()
           ^^^^^^^^^^^^^^^
  File "/home/nchibbar/naman-venv-3.12/lib/python3.12/site-packages/openai/_module_client.py", line 12, in __load__
    return _load_client().chat
           ^^^^^^^^^^^^^^
  File "/home/nchibbar/naman-venv-3.12/lib/python3.12/site-packages/openai/__init__.py", line 323

In [121]:
# Text of the first choice returned by the chat-completion call
# (`response` is only usable if that call succeeded)
generated = response.choices[0].message.content

In [120]:
# Reference summary of the first loaded document, for comparison
summaries[0]

'The federal government spent almost $708 billion in FY2009 on programs for low-income people, and nearly $578 billion the previous year. The increased spending between the two years was largely due to the recession, with almost two-thirds coming from the American Recovery and Reinvestment Act (ARRA, P.L. 111-5), the economic stimulus enacted in February 2009.\nLow-income programs discussed in this report are distinct from social insurance programs, such as Social Security or Medicare, which aim to protect American workers universally against lost wages or benefits when they retire, become disabled, or lose a job. In contrast, programs addressed here focus explicitly on low-income populations. They provide assistance in obtaining basic needs, such as health care, food, or housing, and seek to address the causes of low income through education, training, or other services. While these programs are very diverse, the analysis in this report yields certain general findings:\n Health care d

In [4]:
# System message for the chat model; adjacent string literals are concatenated
# by the parser, so the value is one single-line string
system_prompt = (
	"You are an expert summarizer. "
	"Your task is to summarize long texts into concise and informative summaries. "
	"When provided with key segments of a longer text, please summarize these "
	"segments while ensuring the main ideas and important details are preserved."
)

'You are an expert summarizer.'

In [None]:
class OpenAIPipeline:

	def __init__(
		self, model: str, encoder: Encoder, max_tokens: int,
		prompt_template: str="", system_prompt: str="",
		previous_messages: list[tuple[str, str]]|None=None
	) -> None:
		"""
		Store the configuration used to build chat-completion requests.

		model: model name placed in the request
		encoder: encoder object kept on the instance (not used in this method)
		max_tokens: value placed in the request's "max_tokens" field
		prompt_template: string put in front of the text in the final user message
		system_prompt: content of an optional leading "system" message; skipped when empty
		previous_messages: optional (text, summary) pairs that create_inputs replays
			as alternating "user" / "assistant" messages ahead of the final prompt
		"""
		self.model = model
		self.encoder = encoder
		self.max_tokens = max_tokens
		self.prompt_template = prompt_template
		self.system_prompt = system_prompt
		self.previous_messages = previous_messages
		# Filled in by create_inputs with the keyword arguments of the request
		self.call_inputs = None
	
	def create_inputs(self, text: str):
		system_prompt = self.system_prompt
		previous_messages = self.previous_messages
		messages = []
		if system_prompt:
			messages.append({"role": "system", "content": system_prompt})
		if previous_messages is not None:
			for text, summary in previous_messages:
				messages.append({"role": "user", "content": text})
				messages.append({"role": "assistant", "content": summary})
		prompt = f"{self.prompt_template}{text}"
		messages.append({"role": "user", "content": prompt})
		self.call_inputs = {
			"model": self.model,
			"messages": messages,
			"max_tokens": self.max_tokens
		}