In [None]:
import os
from sys import getsizeof
import json
import re
import pickle
import inspect
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration,
	GPT2TokenizerFast
)
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from dotenv import load_dotenv
import tiktoken
import openai

from utils.helpers import *
from utils.encoders import *
from utils.pipelines import *
from utils.trainer_utils import *
from utils.evaluator_utils import *

In [None]:
# Load variables from a .env file into the process environment
# (presumably the OpenAI credentials used by the pipeline below — TODO confirm)
load_dotenv()

In [None]:
# Root directory holding the datasets and the model checkpoints
# data_dir = "/Users/naman/Workspace/Data/UCCS-REU"
data_dir = "/home/nchibbar/Data"

# Raw GovReport inputs
crs_dir = f"{data_dir}/GovReport/crs"
gao_dir = f"{data_dir}/GovReport/gao"

# os.listdir returns entries in arbitrary order; sort them so every run
# (and every machine) processes the files in the same order
crs_files = sorted(os.listdir(crs_dir))
gao_files = sorted(os.listdir(gao_dir))

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

# Destinations for the preprocessed {"text", "summary"} JSON files
crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

In [None]:
# Token budget; NOTE: reassigned to 4096 in the "Rough" section further down
max_tokens = 512

# Sentence transformer
# Loaded from a local directory under data_dir
sent_dir = f"{data_dir}/Models/Sent-Transformer"
sent_encoder = SentenceTransformer(sent_dir)

# BART
# The tokenizer is read from the base BART directory, the weights from the
# fine-tuned GovReport checkpoint directory
bart_dir = f"{data_dir}/Models/BART"
bart_fine_tuned = f"{data_dir}/Models/BART-GovReport-SentenceSampler"
tokenizer = BartTokenizer.from_pretrained(bart_dir)
model = BartForConditionalGeneration.from_pretrained(bart_fine_tuned)
# Number of positions the model config allows
context_size = model.config.max_position_embeddings

# T5
# (alternative model, currently disabled)
# t5_dir = f"{data_dir}/Models/T5"
# tokenizer = T5Tokenizer.from_pretrained(t5_dir)
# model = T5ForConditionalGeneration.from_pretrained(t5_dir)
# context_size = model.config.n_positions

# GPT 3.5 turbo tokenizer
# Used further down together with the OpenAI pipeline's encoder
gpt_dir = f"{data_dir}/Models/GPT-3.5-turbo-tokenizer"
gpt_tokenizer = GPT2TokenizerFast.from_pretrained(gpt_dir)

# Display the context size as the cell output
context_size

In [None]:
# Text processor applied to report texts and reference summaries below,
# and also handed to the encoder in the "Rough" section
preprocessor = TextProcessor(preprocessing=True)
# No post-processing is configured; not referenced in the visible cells
postprocessor = None

## GovReport

In [None]:
def combine_subsections(sections):
	"""
	Flatten a list of (possibly nested) report sections into one string.

	Each section is a dict that may hold:
	- "section_title": heading text; when non-empty the section is
	  prefixed with "Section <title>:"
	- "paragraphs": list of paragraph strings, joined by blank lines
	- "subsections": list of dicts of the same form, flattened
	  recursively and placed right after their parent section

	Missing keys are treated as empty. Pieces are separated by a blank
	line ("\n\n"). Returns "" for an empty list of sections.
	"""
	text = ""
	for sec in sections:
		# Pull the title out first: reusing double quotes inside an
		# f-string replacement field is a SyntaxError before Python 3.12
		title = sec.get("section_title")
		sec_text = "\n\n".join(sec.get("paragraphs") or [])
		if title:
			sec_text = f"Section {title}:\n\n{sec_text}"
		# Only insert a separator once there is something to separate from
		text = f"{text}\n\n{sec_text}" if text else sec_text
		subsections = sec.get("subsections")
		if subsections:
			sub_text = combine_subsections(subsections)
			text = f"{text}\n\n{sub_text}" if text else sub_text
	return text

In [None]:
# Preprocess every raw CRS report into a {"text", "summary"} JSON file.
# The output directory is not created in any visible cell, so make sure it
# exists before the first write.
os.makedirs(crs_out, exist_ok=True)

for file in crs_files:
	# Read explicitly as UTF-8 instead of relying on the platform default
	with open(f"{crs_dir}/{file}", encoding="utf-8") as fp:
		data = json.load(fp)
	# "reports" is passed as a single root section, hence the wrapping list
	text = combine_subsections([data["reports"]])
	text = preprocessor.process(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor.process(summary)
	with open(f"{crs_out}/{file}", "w", encoding="utf-8") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [None]:
# Preprocess every raw GAO report into a {"text", "summary"} JSON file.
# The output directory is not created in any visible cell, so make sure it
# exists before the first write.
os.makedirs(gao_out, exist_ok=True)

for file in gao_files:
	# Read explicitly as UTF-8 instead of relying on the platform default
	with open(f"{gao_dir}/{file}", encoding="utf-8") as fp:
		data = json.load(fp)
	# "report" is passed straight through as the list of sections
	text = combine_subsections(data["report"])
	text = preprocessor.process(text)
	summary = "\n".join(data["highlight"])
	# Use the same `process` entry point as the text above and as the CRS
	# loop; this line previously called `preprocess`
	summary = preprocessor.process(summary)
	with open(f"{gao_out}/{file}", "w", encoding="utf-8") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

## LDA

In [None]:
# Word-count vectorizer configured with the "english" stop-word setting
vectorizer = CountVectorizer(stop_words="english")
# Display the vectorizer as the cell output
vectorizer

In [None]:
# Build the document-term matrix from a single document.
# NOTE(review): `data` is whatever an earlier loop left behind. After the GAO
# preprocessing loop it is the raw report dict, which may have no "text" key;
# this cell presumably expects a processed {"text", "summary"} record — confirm
# and load one explicitly.
dtm = vectorizer.fit_transform([data["text"]])
vectorizer

In [None]:
# Inspect the document-term matrix built above
print(dtm)

In [None]:
# Number of topics to fit
topics = 4
# NOTE(review): no random_state is passed, so the fitted topics may differ
# from run to run — consider fixing a seed
lda = LatentDirichletAllocation(n_components=topics)
lda.fit(dtm)

In [None]:
# Topic distribution of the document(s) in the document-term matrix
topic_dist = lda.transform(dtm)
print(topic_dist)

In [None]:
def display_topics(model, feature_names, num_top_words):
	"""
	Print the highest-weighted words of every topic in a fitted model.

	For each row of `model.components_` a "Topic <index>:" header is
	printed, followed by one line with the `num_top_words` entries of
	`feature_names` that carry the largest weights, strongest first.
	"""
	for topic_idx, weights in enumerate(model.components_):
		print(f"Topic {topic_idx}:")
		# argsort is ascending: reverse it, then keep the leading entries
		ranked = weights.argsort()[::-1][:num_top_words]
		top_words = [feature_names[word_idx] for word_idx in ranked]
		print(" ".join(top_words))

In [None]:
# Number of words to print per topic
num_top_words = 10
# Vocabulary learned by the vectorizer; topic weights index into it
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, num_top_words)

## Rough

In [None]:
# Documents and their reference summaries to evaluate on
texts, summaries = [], []

In [None]:
# max 73_791
min_words_text = 70_000
# Scan the processed CRS reports and stop at the first one whose text has at
# least min_words_text words
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	if count_words(data["text"]) >= min_words_text:
		break
# NOTE(review): if no report reaches the threshold the loop ends without a
# break and the last file read is appended anyway. Re-running this cell
# appends the same document again.
texts.append(data["text"])
summaries.append(data["summary"])

# Word counts of the selected text and its summary, shown as the cell output
count_words(data["text"]), count_words(data["summary"])

In [None]:
# Start over and load every processed CRS report; this replaces whatever the
# previous cells put into texts / summaries
texts, summaries = [], []
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	texts.append(data["text"])
	summaries.append(data["summary"])

# Number of loaded documents, shown as the cell output
len(texts)

In [None]:
# Token budget handed to the encoder (overrides the 512 set in the model cell)
max_tokens = 4096
# NOTE(review): head_size and threshold are not referenced in the visible cells
head_size = .5
threshold = .7
seed = 69
device = get_device()
# device = "cpu"

# Encoder built from the GPT tokenizer, NLTK sentence splitting, the sentence
# encoder and the preprocessor.
# NOTE(review): the meaning of the positional `False` is not visible here —
# confirm against the RemoveRedundancy definition.
encoder = RemoveRedundancy(
	gpt_tokenizer, max_tokens, nltk.sent_tokenize, sent_encoder,
	preprocessor, False, device=device, seed=seed
)

In [None]:
# OpenAI chat model used for summarization
openai_model = "gpt-3.5-turbo"
# Instruction given to the model as the system prompt
system_prompt = "You summarize very long texts, given some of its sentences. You extract key information and ideas from the sentences to generate a detailed, concise, and coherent summary with more than 500 words. Do not refer to the source text in any way."

# Pipeline combining the model name, the encoder defined above and the prompt
openai_pipeline = OpenAIPipeline(
	openai_model, encoder, system_prompt=system_prompt
)

In [None]:
class Evaluator:

	def __init__(
		self, pipelines, rouge_metrics: list[str]|None=None,
		rougen_max_n: int=2, rougew_weight_factor: int=1.2,
		device: str|torch.device|None=None
	) -> None:
		# Initialize pipelines
		pipelines = self.pipelines = pipelines if \
			isinstance(pipelines, list) else [pipelines]
		self.num_pipelines = len(pipelines)

		# Initialize BERT scorer
		self.bert_scorer = BERTScorer(lang="en", device=device)
		self.device = device

		# Initialise ROUGE scorer
		if rouge_metrics is None:
			rouge_metrics = ["rouge-n", "rouge-l", "rouge-w"]
		self.rouge_scorer = Rouge(
			metrics=rouge_metrics, max_n=rougen_max_n, limit_length=False,
			weight_factor=rougew_weight_factor
		)
		if "rouge-n" in rouge_metrics:
			rouge_metrics.remove("rouge-n")
			self.rouge_metrics = [
				f"rouge-{i+1}" for i in range(rougen_max_n)
			]
			self.rouge_metrics.extend(rouge_metrics)
		else:
			self.rouge_metrics = rouge_metrics
		self.rougen_max_n = rougen_max_n
		self.rougew_weight_factor = rougew_weight_factor

		self.generated_summaries = None

	def __call__(
		self, texts: str|list[str], summaries: str|list[str],
		batch_size: int|None=None, num_workers: int|None=None
	) -> dict:
		time_taken = self.generate_summaries(texts, batch_size, num_workers)
		print(f"Time taken to generate summaries: {time_taken}")
		bert_score = self.get_bert_score(summaries)
		rouge_score = self.get_rouge_score(summaries)
		scores = {
			"time-taken": time_taken,
			"bert-scores": bert_score,
			"rouge-scores": rouge_score
		}
		return scores
	
	def generate_summaries(
		self, texts: str|list[str], batch_size: int|None=None,
		num_workers: int|None=None
	) -> list[int]:
		if isinstance(texts, str):
			texts = [texts]
		generated_summaries = self.generated_summaries = []
		time_taken = []
		inputs = [
			(texts, i, batch_size) for i in range(self.num_pipelines)
		]
		if num_workers is not None and num_workers > 1:
			with ProcessPoolExecutor(max_workers=num_workers) as executor:
				results = executor.map(self._generate_summaries, inputs)
		else:
			results = map(self._generate_summaries, inputs)
		for summary, time in results:
			generated_summaries.extend(summary)
			time_taken.append(time)
		return time_taken
	
	# P, R, F
	def get_bert_score(
		self, summaries: str|list[str]
	) -> list[torch.Tensor]:
		generated_summaries = self.generated_summaries
		assert generated_summaries is not None, "Summaries not generated"
		num_pipelines = self.num_pipelines
		summaries = num_pipelines * summaries
		metrics = self.bert_scorer.score(generated_summaries, summaries)
		metrics = np.array([
			metric.reshape((num_pipelines, -1)).mean(dim=1)
			for metric in metrics
		])
		order = [2, 0, 1]
		metrics = metrics.T[:, order].tolist()
		return metrics
	
	# F, P, R
	def get_rouge_score(
		self, summaries: str|list[str]
	) -> list[dict[str, np.ndarray]]:
		generated_summaries = self.generated_summaries
		assert generated_summaries is not None, "Summaries not generated"
		num_generated_summaries = len(generated_summaries)
		num_summaries = len(summaries)
		scores = []
		for i in range(0, num_generated_summaries, num_summaries):
			pipeline_summaries = generated_summaries[i:i+num_summaries]
			mean_score = {
				metric: np.array([0., 0, 0])
				for metric in self.rouge_metrics
			}
			for cand, ref in zip(pipeline_summaries, summaries):
				score = self.rouge_scorer.get_scores(cand, ref)
				for metric, values in score.items():
					mean_score[metric] += list(values.values())
			for metric, values in mean_score.items():
				mean_score[metric] = (values / num_summaries).tolist()
			scores.append(mean_score)
		return scores
	
	def _generate_summaries(self, args):
		texts, ind, batch_size = args
		pipeline = self.pipelines[ind]
		start = perf_counter()
		summaries = pipeline(texts, batch_size)
		time_taken = (perf_counter() - start)
		print(f"Generated summary for pipeline {ind+1} in {time_taken}s")
		return summaries, time_taken


In [None]:
# Evaluate the single OpenAI pipeline with the default ROUGE metrics
# (no device is passed to the BERT scorer)
evaluator = Evaluator(openai_pipeline)

In [None]:
# Generate summaries for all loaded texts and score them against the references
evaluator(texts, summaries)

In [None]:
# Usage information of the response stored on the pipeline
# (presumably from the most recent API call — TODO confirm)
openai_pipeline.response.usage

In [None]:
# Encode the first text under the max_tokens budget and decode it again to
# inspect what the encoder keeps
gpt_tokenizer.decode(encoder.encode(texts[0], max_tokens))

In [None]:
# Full first text, for comparison with the encoded version
print(texts[0])