In [1]:
import os
from sys import getsizeof
import json
import re
import pickle
import inspect
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	GPT2TokenizerFast
)
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from dotenv import load_dotenv
import tiktoken
import openai

from utils.helpers import *
from utils.encoders import *
from utils.pipelines import *
from utils.trainer_utils import *
from utils.evaluator_utils import *

In [2]:
# Load environment variables from a .env file, if one is found.
# NOTE(review): presumably this supplies the OpenAI credentials used by the
# API calls further down — confirm against the project's .env.
load_dotenv()

True

In [3]:
# Root of the local data tree (the commented line is the path on another machine)
# data_dir = "/Users/naman/Workspace/Data/UCCS-REU"
data_dir = "/home/nchibbar/Data"

# Raw GovReport inputs: one directory per source, listed once up front
crs_dir = f"{data_dir}/GovReport/crs"
crs_files = os.listdir(crs_dir)
gao_dir = f"{data_dir}/GovReport/gao"
gao_files = os.listdir(gao_dir)

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

# Destinations for the processed {"text", "summary"} files
crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

crs files: 7238, gao files: 12228


In [4]:
# Handed to every SummarizationPipeline built later in the notebook.
# NOTE(review): presumably the token limit for generated summaries — confirm
# against utils.pipelines.
max_tokens = 512

# Sentence transformer, loaded from a local model directory
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_encoder = SentenceTransformer(sent_dir)

# BART: the tokenizer is read from the base model directory, the weights from
# the "BART-GovReport-SentenceSampler" directory
bart_dir = f"{data_dir}/Models/BART"
bart_fine_tuned = f"{data_dir}/Models/BART-GovReport-SentenceSampler"
tokenizer = BartTokenizer.from_pretrained(bart_dir)
model = BartForConditionalGeneration.from_pretrained(bart_fine_tuned)
# Position-embedding limit of the model; used as max_tokens for the encoders
context_size = model.config.max_position_embeddings

# T5 (alternative model, currently disabled)
# t5_dir = f"{data_dir}/Models/T5"
# tokenizer = T5Tokenizer.from_pretrained(t5_dir)
# model = T5ForConditionalGeneration.from_pretrained(t5_dir)
# context_size = model.config.n_positions

# GPT 3.5 turbo tokenizer, loaded through GPT2TokenizerFast from a local
# directory; used by the encoder that feeds the OpenAI pipeline
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)

# Display the context size as the cell output
context_size

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1024

In [5]:
# Gather the special tokens declared in the active tokenizer's token map
_token_map_values = tokenizer.special_tokens_map.values()
special_tokens = extract_special_tokens(_token_map_values)

# Processor applied to text before encoding
preprocessor = TextProcessor(preprocessing=True)

# Post-processing is switched off; the disabled alternative is kept for reference
# postprocessor = TextProcessor(ignore_tokens=special_tokens)
postprocessor = None

special_tokens

['<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>']

In [6]:
# Encoder hyperparameters
head_size = 0.5
threshold = 0.7
seed = 69

# get_device() is still called, but its result is immediately replaced so that
# everything below runs on the CPU
device = get_device()
device = "cpu"

# Keyword arguments shared by the encoders, layered from least to most specific
_base_kwargs = dict(
	tokenizer=tokenizer, max_tokens=context_size, preprocessor=preprocessor
)
_sentence_kwargs = dict(_base_kwargs, sent_tokenizer=nltk.sent_tokenize)
_sampler_kwargs = dict(
	_sentence_kwargs, sent_encoder=sent_encoder, threshold=threshold,
	device=device, seed=seed
)

# One encoder per strategy being compared
encoders = [
	TruncateMiddle(head_size=head_size, **_base_kwargs),
	UniformSampler(seed=seed, **_sentence_kwargs),
	SentenceSampler(**_sampler_kwargs),
	RemoveRedundancy(**_sampler_kwargs),
]

# One summarization pipeline per encoder, all sharing the same model
pipelines = [
	SummarizationPipeline(model, enc, max_tokens, postprocessor, device)
	for enc in encoders
]

## GovReport

In [None]:
def combine_subsections(sections):
	"""Flatten a list of nested report sections into a single string.

	Each section is a mapping with the keys ``"paragraphs"`` (list of
	strings), ``"section_title"`` (string, may be empty) and
	``"subsections"`` (list of sections of the same shape, may be empty).
	Sections are visited in order, each followed by its subsections
	(recursively). A section contributes its paragraphs joined by blank
	lines, prefixed with ``Section <title>:`` when the title is non-empty.
	Pieces are separated by blank lines.

	Args:
		sections: Iterable of section mappings as described above.

	Returns:
		The combined text; an empty string when ``sections`` is empty.

	Raises:
		KeyError: If a section lacks one of the three keys.
	"""
	text = ""
	for sec in sections:
		sec_text = "\n\n".join(sec["paragraphs"])
		# Bind the title first: reusing double quotes inside a double-quoted
		# f-string is a SyntaxError on Python versions before 3.12
		title = sec["section_title"]
		if title:
			sec_text = f"Section {title}:\n\n{sec_text}"
		# Only insert a separator once something has been accumulated
		text = f"{text}\n\n{sec_text}" if text else sec_text
		subsections = sec["subsections"]
		if subsections:
			sub_text = combine_subsections(subsections)
			text = f"{text}\n\n{sub_text}" if text else sub_text
	return text

In [None]:
# Convert every raw CRS report into a {"text", "summary"} JSON file in crs_out.
# The output directory is created first: open(..., "w") does not create
# missing parent directories.
os.makedirs(crs_out, exist_ok=True)
for file in crs_files:
	with open(f"{crs_dir}/{file}") as fp:
		data = json.load(fp)
	# "reports" is wrapped in a list, i.e. it is treated as one root section
	text = combine_subsections([data["reports"]])
	text = preprocessor.process(text)
	# The summary entries are joined one per line before processing
	summary = "\n".join(data["summary"])
	summary = preprocessor.process(summary)
	# The processed file keeps the name of its source file
	with open(f"{crs_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [None]:
# Convert every raw GAO report into a {"text", "summary"} JSON file in gao_out.
# The output directory is created first: open(..., "w") does not create
# missing parent directories.
os.makedirs(gao_out, exist_ok=True)
for file in gao_files:
	with open(f"{gao_dir}/{file}") as fp:
		data = json.load(fp)
	# "report" is passed as-is, i.e. it is treated as a list of sections
	text = combine_subsections(data["report"])
	text = preprocessor.process(text)
	# NOTE(review): str.join needs "highlight" to be a list of strings; if its
	# entries turn out to be section mappings, combine_subsections should be
	# used here instead — confirm against the raw GAO files.
	summary = "\n".join(data["highlight"])
	# Same processing call as for the report text
	summary = preprocessor.process(summary)
	# The processed file keeps the name of its source file
	with open(f"{gao_out}/{file}", "w") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

## LDA

In [None]:
# Word-count vectorizer configured with the built-in "english" stop-word list
vectorizer = CountVectorizer(stop_words="english")
# Display the configured vectorizer as the cell output
vectorizer

In [None]:
# Fit the vectorizer on a single document and build its document-term matrix.
# `data` is not defined in this cell: it is whichever record an earlier cell
# loaded last, so the result depends on what was run before.
dtm = vectorizer.fit_transform([data["text"]])
# Display the (now fitted) vectorizer as the cell output
vectorizer

In [None]:
# Print the document-term matrix built from the single input document
print(dtm)

In [None]:
# Number of latent topics to fit
topics = 4
# Fix random_state with the notebook-wide seed so that repeated runs of the
# (stochastic) fit produce the same topics
lda = LatentDirichletAllocation(n_components=topics, random_state=seed)
lda.fit(dtm)

In [None]:
# Transform the document-term matrix with the fitted LDA model and print the
# resulting per-document topic distribution
topic_dist = lda.transform(dtm)
print(topic_dist)

In [None]:
def display_topics(model, feature_names, num_top_words):
	"""Print the highest-weighted words of every topic of a fitted model.

	For each row of ``model.components_`` a ``Topic <index>:`` header is
	printed, followed by one line holding the ``num_top_words`` entries of
	``feature_names`` with the largest weights in that row, heaviest
	first, separated by single spaces.

	Args:
		model: Object exposing ``components_``, an iterable of weight
			arrays that support ``argsort()``.
		feature_names: Sequence of words indexed by feature position.
		num_top_words: How many words to list per topic.

	Returns:
		None; the topics are written to standard output.
	"""
	for number, weights in enumerate(model.components_):
		print(f"Topic {number}:")
		# argsort is ascending, so reverse it and keep the leading entries
		ranked = weights.argsort()[::-1][:num_top_words]
		print(" ".join(feature_names[position] for position in ranked))

In [None]:
# Number of highest-weighted words to list for each topic
num_top_words = 10
# Feature names of the fitted vectorizer, indexed by column of the matrix
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, num_top_words)

## Rough

In [7]:
# Parallel accumulators: document texts and their summaries
texts, summaries = [], []

In [8]:
# Pick the first processed CRS document with at least min_words_text words.
# max 73_791
min_words_text = 50_000
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	if count_words(data["text"]) >= min_words_text:
		break
# `data` is the record left over from the loop. If no document reached the
# threshold the loop ends without break and the last file read is used.
texts.append(data["text"])
summaries.append(data["summary"])

# Display the word counts of the selected document and its summary
count_words(data["text"]), count_words(data["summary"])

(53559, 500)

In [5]:
# Load every processed CRS document. This rebinds texts/summaries to fresh
# lists, so anything appended by earlier cells is discarded.
# NOTE(review): this cell's execution label does not follow the cells around
# it; it looks like output from a different session — confirm whether it is
# still meant to run before the OpenAI cells.
texts, summaries = [], []
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	texts.append(data["text"])
	summaries.append(data["summary"])

# Display how many documents were loaded
len(texts)

7238

In [9]:
class OpenAIPipeline:

	def __init__(
		self, model: str, encoder: Encoder,
		prompt_template: str="", system_prompt: str=""
	) -> None:
		self.model = model
		self.encoder = encoder
		self.max_tokens = encoder.max_tokens
		self.prompt_template = prompt_template
		self.system_prompt = system_prompt
		self.call_inputs = None
		self.response = None
	
	def __call__(self):
		...
	
	def create_inputs(
		self, text: str, previous_messages: list[str]|None=None
	) -> int:
		encoder = self.encoder
		max_tokens = self.max_tokens
		prompt_template = self.prompt_template
		tokenizer = encoder.tokenizer

		# Tokens used to create OpenAI prompt template
		# 3 tokens for prompt base
		# 4 tokens each for every message
		num_prev_msgs = 0 if previous_messages is None else len(previous_messages)
		tokens_used = 3 + 4 * (2 * num_prev_msgs + 1)

		# Create system prompt
		system_prompt = self.system_prompt
		messages = []
		if system_prompt:
			messages.append({"role": "system", "content": system_prompt})
			tokens_used += count_tokens(system_prompt, tokenizer) + 4
		if num_prev_msgs:
			for text, summary in previous_messages:
				messages.append({"role": "user", "content": text})
				messages.append({"role": "assistant", "content": summary})
				tokens_used += count_tokens([text, summary], tokenizer)
		tokens_used += count_tokens(prompt_template, tokenizer)
		encodings = encoder.encode(text, max_tokens - tokens_used)
		text = tokenizer.decode(encodings, ignore_special_tokens=True)
		prompt = f"{prompt_template}{text}"
		messages.append({"role": "user", "content": prompt})
		self.call_inputs = {
			"model": self.model,
			"messages": messages,
			"max_tokens": max_tokens
		}
		return tokens_used
	
	def send_call(self):
		call_inputs = self.call_inputs
		assert call_inputs is not None, "Call inputs not created"
		try:
			self.response = openai.chat.completions.create(**call_inputs)
		except Exception as e:
			show_exception(e)
		return self.response

In [10]:
# Rebinds the notebook-level max_tokens (512 in the model-setup cell) to the
# token budget used for the OpenAI encoder
max_tokens = 4096

# Sentence-sampling encoder built on the GPT tokenizer.
# NOTE(review): the first six arguments are positional; the sixth (False)
# presumably lands on `threshold` — confirm against SentenceSampler's signature.
encoder = SentenceSampler(
	gpt_tokenizer, max_tokens, nltk.sent_tokenize, sent_encoder,
	preprocessor, False, device=device, seed=seed
)

In [11]:
# Chat model named in every request made by the pipeline
openai_model = "gpt-3.5-turbo"

# Content of the system message; the literal is split across lines only for
# readability and concatenates to a single string
system_prompt = (
	"You are an expert summarizer. "
	"You summarize very long texts, given some of its sentences. "
	"You extract key information and ideas from the sentences to generate "
	"a concise and coherent summary with more than 300 words. "
	"Do not refer to the text in the summary"
)

# No prompt template is supplied, so the class default applies
openai_pipeline = OpenAIPipeline(
	model=openai_model, encoder=encoder, system_prompt=system_prompt
)

In [47]:
# Prepare the request for the first collected document; the displayed value is
# the token count create_inputs returns (everything except the document itself)
openai_pipeline.create_inputs(texts[0])

62

In [49]:
# Send the prepared request; the result is kept on openai_pipeline.response
openai_pipeline.send_call()

In [54]:
# Text of the first choice in the stored response, i.e. the generated summary
openai_pipeline.response.choices[0].message.content

'The growth in federal spending between FY2008 and FY2009, largely due to the economic recession and the American Recovery and Reinvestment Act (ARRA) of 2009, focused on addressing economic disadvantage through various programs like education, training, and services. The ARRA authorized numerous initiatives targeting the causes and effects of economic hardship. Spending increased in areas like housing and development, with SNAP becoming one of the largest programs in FY2009. Health programs, including Medicaid, accounted for over half of the top 10 program spending. Different federal programs, such as Pell Grants and Title I-A, distribute funds to low-income populations through formula grants, competitive awards, or direct benefits. States also have their own earned income tax credit programs that supplement federal programs. Various federal programs set income eligibility limits using the federal poverty guidelines and may have overlapping target populations. Spending snapshots for l

In [51]:
# Token usage reported by the API for the stored response
openai_pipeline.response.usage

CompletionUsage(completion_tokens=244, prompt_tokens=3623, total_tokens=3867)