In [1]:
import os
import sys
import time
import pickle
import json
import re
import inspect
import warnings

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
import transformers as tfm
import sentence_transformers as stfm
import dotenv

import configs as c
import encoders as e
import pipelines as p
import utils as u

def plot_histogram(data):
	bins = int(len(data) ** .5)
	plt.hist(data, bins=bins)
	plt.show()

warnings.filterwarnings("ignore")
device = u.get_device(c.GPU_USAGE_TOLERANCE)
dotenv.load_dotenv()

True

In [None]:
model_name = "pegasus"

sent_dir = f"{c.MODELS_DIR}/sent-transformer"
model_dir = f"{c.MODELS_DIR}/{model_name.lower()}"

govreport_dir = f"{c.BASE_DIR}/GovReport/processed"
bigpatent_dir = f"{c.BASE_DIR}/BigPatent/processed"
govreport_files = os.listdir(govreport_dir)
bigpatent_files = os.listdir(bigpatent_dir)

len(govreport_files), len(bigpatent_files)

In [None]:
# Sentence transformer
# Automatically loads into gpu if available
sent_encoder = stfm.SentenceTransformer(sent_dir, device=device)

match model_name:

	case "bart":
		tokenizer = tfm.BartTokenizer.from_pretrained(model_dir)
		model = tfm.BartForConditionalGeneration.from_pretrained(model_dir)
		context_size = model.config.max_position_embeddings

	case "t5":
		tokenizer = tfm.T5Tokenizer.from_pretrained(model_dir)
		model = tfm.T5ForConditionalGeneration.from_pretrained(model_dir)
		context_size = model.config.n_positions

	case "pegasus":
		tokenizer = tfm.PegasusTokenizerFast.from_pretrained(model_dir)
		model = tfm.PegasusForConditionalGeneration.from_pretrained(model_dir)
		context_size = model.config.max_position_embeddings

	case "gpt":
		tokenizer = tfm.GPT2TokenizerFast.from_pretrained(model_dir)
		model = "gpt-3.5-turbo"
		context_size = 4096

context_size

In [None]:
preprocessor = u.TextProcessor(preprocessing=True)
postprocessor = None

## BigPatent

In [None]:
word_counts = []
for file in bigpatent_files:
	file_path = f"{bigpatent_dir}/{file}"
	with open(file_path) as fp:
		data = json.load(fp)
	for text in data["texts"]:
		word_counts.append(u.count_words(text))

plot_histogram(word_counts)

In [None]:
max(word_counts), np.mean(word_counts), len(word_counts)

In [None]:
sum([
	1
	for count in word_counts
	if count > 40_000
])

## Rough

In [None]:
texts, summaries = [], []
num_texts = 0
for file in govreport_files:
	file_path = f"{govreport_dir}/{file}"
	with open(file_path) as fp:
		data = json.load(fp)
	if c.MIN_WORDS < u.count_words(data["text"]) < c.MAX_WORDS:
		texts.append(data["text"])
		summaries.append(data["summary"])
		num_texts += 1
	if num_texts == c.MAX_TEXTS:
		break

num_texts

In [None]:
SEGMENT_MIN_WORDS = 20
text_segmenter = u.TextSegmenter(nltk.sent_tokenize, SEGMENT_MIN_WORDS)
keywords_preprocessor = u.TextProcessor(
	only_words_nums = True,
	remove_nums = True
)
stop_words = u.get_stop_words(extra_stop_words=c.EXTRA_STOP_WORDS)
len(stop_words)

In [None]:
encoders = [
	e.TruncateMiddle(
		tokenizer, context_size, 1, preprocessor
	),
	e.TruncateMiddle(
		tokenizer, context_size, c.HEAD_SIZE, preprocessor, True
	),
	e.UniformSampler(
		tokenizer, c.MIN_TOKEN_FRAC * context_size, context_size,
		text_segmenter, preprocessor, True, c.SEED
	),
	e.SegmentSampler(
		tokenizer, c.MIN_TOKEN_FRAC * context_size, context_size,
		text_segmenter, sent_encoder, preprocessor, c.THRESHOLD, c.PROB_BOOST, c.SEED
	),
	e.RemoveRedundancy(
		tokenizer, c.MIN_TOKEN_FRAC * context_size, context_size,
		text_segmenter, sent_encoder, preprocessor, c.THRESHOLD, c.SEED
	),
	e.KeywordScorer(
		tokenizer, context_size, text_segmenter, sent_encoder,
		preprocessor, c.NUM_KEYWORDS, keywords_preprocessor, stop_words
	)
]

pipelines = [
	p.SummarizationPipeline(
		model, enc, postprocessor, c.MIN_SUMMARY_TOKENS,
		context_size, device, c.TEMPERATURE, c.REPETITION_PENALTY, c.TOP_P
	) for enc in encoders
] if model_name != "gpt" else [
	p.OpenAIPipeline(
		model, enc, postprocessor, c.SYSTEM_PROMPT
	) for enc in encoders
]

In [None]:
processed_texts = preprocessor(texts)
threshold = .5
num_segments_found = []
for text in processed_texts:
	keywords = u.get_keywords(text, 20, stop_words, keywords_preprocessor)
	keywords = " ".join(keywords)
	keyword_emb = sent_encoder.encode(keywords)
	segments = text_segmenter(text)
	segment_embs = sent_encoder.encode(segments)
	scores = segment_embs @ keyword_emb
	num_segments = (scores > threshold).sum()
	num_segments_found.append(num_segments)

In [None]:
np.sort(num_segments_found)

In [None]:
with open(f"{c.BASE_DIR}/pegasus-govreport.pkl", "rb") as fp:
	results = pickle.load(fp)
scores = results["scores"]
sort1, sort2, sort3 = results["sort1"], results["sort2"], results["sort3"]
gen_summaries = results["gen_summaries"]
scores[0][sort1]

In [None]:
ind = 0
problem_text = results["texts"][sort1[ind]]
print(gen_summaries[sort1[ind]])

In [None]:
with open(f"{c.BASE_DIR}/bart-bigpatent-times.json") as fp:
	results = json.load(fp)
times = np.array(results["encoder_times"])[1:]
times

In [None]:
plt.bar([
	"Truncate\nMiddle", "Document\nSkimming",
	"Skimming w/\npost-sampling\nremoval",
	"Skimming\nw/ pre-\nsampling\nremoval", "Summarization\nw/ Keyword\nExtraction"
], times, color="green")

In [None]:
a = np.array([
	[1, 2],
	[3, 4],
	[5, 6]
])
b = np.array([1, 1])

a @ b

In [33]:
class Parameter(torch.nn.Parameter):

	def __init__(self, val) -> None:

		super().__init__()
		self.val = float(val)
		self.param = torch.nn.Parameter(torch.tensor(self.val))

	# def __add__(self, other):
	# 	return self.param + other
	
	# def __radd__(self, other):
	# 	return self.param + other
	
	# def __mul__(self, other):
	# 	return self.param * other
	
	# def __rmul__(self, other):
	# 	return self.param * other

class Custom(torch.nn.Module):

	def __init__(self):

		super().__init__()
		self.c = 5
		self.param = Parameter(2)

	def forward(self, x):
		return x * self.param + self.c

In [35]:
model = Custom()

model(2)

AttributeError: 'int' object has no attribute 'detach'

In [30]:
for param in model.parameters():
	print(param)