In [1]:
import os
from sys import getsizeof
import json
import re
import pickle
from time import perf_counter, sleep
import inspect
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import nltk
import matplotlib.pyplot as plt
import torch
from transformers import (
	BartTokenizer, BartForConditionalGeneration,
	T5Tokenizer, T5ForConditionalGeneration
)
from transformers.tokenization_utils_base import BatchEncoding
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from bert_score import BERTScorer
from rouge import Rouge
from dotenv import load_dotenv

from utils.pipelines import *
from utils.helpers import *

In [2]:
# Root folder holding the GovReport dump and the local model checkpoints.
# data_dir = "/Users/naman/Workspace/Data/UCCS-REU"
data_dir = "/home/nchibbar/Data"

# Raw report folders. os.listdir returns names in arbitrary order, so the
# listings are sorted: every later loop over these files (including the
# "first document with at least N words" selection) is then reproducible.
crs_dir = f"{data_dir}/GovReport/crs"
gao_dir = f"{data_dir}/GovReport/gao"
crs_files = sorted(os.listdir(crs_dir))
gao_files = sorted(os.listdir(gao_dir))

print(f"crs files: {len(crs_files)}, gao files: {len(gao_files)}")

# Destination folders for the preprocessed JSON files written further down.
crs_out = f"{data_dir}/GovReport/crs-processed"
gao_out = f"{data_dir}/GovReport/gao-processed"

crs files: 7238, gao files: 12228


In [3]:
# Token budget handed to every SummarizationPipeline built later on.
max_tokens = 512

# BART: locations of the local checkpoint, the fine-tuned variant and the
# hub identifier (only bart_dir is loaded in this cell).
bart_checkpoint = "facebook/bart-large-cnn"
bart_fine_tuned = f"{data_dir}/Models/BART-GovReport-SentenceSampler"
bart_dir = f"{data_dir}/Models/BART"

tokenizer = BartTokenizer.from_pretrained(bart_dir)
model = BartForConditionalGeneration.from_pretrained(bart_dir)
# Longest input the loaded model accepts, read from its config.
context_size = model.config.max_position_embeddings

# T5
# t5_dir = f"{data_dir}/Models/T5"
# t5_checkpoint = "google/flan-t5-base"
# t5_checkpoint = "pszemraj/long-t5-tglobal-base-16384-book-summary"
# tokenizer = T5Tokenizer.from_pretrained(t5_dir)
# model = T5ForConditionalGeneration.from_pretrained(t5_dir)
# context_size = model.config.n_positions

context_size

1024

In [4]:
# Special-token strings declared by the tokenizer, passed through the
# project's extract_special_tokens helper.
special_token_values = tokenizer.special_tokens_map.values()
special_tokens = extract_special_tokens(special_token_values)

# Preprocessing is applied to inputs; no postprocessing is applied for now.
preprocessor = TextProcessor(preprocessing=True)
# postprocessor = TextProcessor(ignore_tokens=special_tokens)
postprocessor = None

special_tokens

['<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>']

## GovReport

In [None]:
def combine_subsections(sections):
	"""Flatten a nested list of report sections into one text string.

	Each section is a dict that may carry a ``"section_title"`` string, a
	``"paragraphs"`` list of strings and a ``"subsections"`` list of further
	section dicts. Sections are visited depth-first: a section's own
	paragraphs (joined by blank lines and prefixed with
	``Section <title>:`` when a title is present) come first, followed by
	the flattened text of its subsections.

	Parameters:
		sections: iterable of section dicts as described above.

	Returns:
		The combined text, with fragments separated by a blank line.
		Fragments that turn out empty are skipped, so empty sections do not
		leave dangling separators. An empty input yields ``""``.
	"""
	fragments = []
	for sec in sections:
		# Missing or null keys are treated the same as empty values.
		body = "\n\n".join(sec.get("paragraphs") or [])
		title = sec.get("section_title")
		if title:
			# The title is pulled into a local first: reusing double quotes
			# inside a double-quoted f-string only parses on Python >= 3.12.
			body = f"Section {title}:\n\n{body}"
		if body:
			fragments.append(body)
		children = sec.get("subsections")
		if children:
			nested = combine_subsections(children)
			if nested:
				fragments.append(nested)
	# A single join avoids rebuilding the growing string on every section.
	return "\n\n".join(fragments)

In [None]:
# Convert every raw CRS report into a flat {"text", "summary"} JSON file.
# The output folder is created up front so the first write cannot fail on a
# machine where it does not exist yet, and files are opened as UTF-8 instead
# of relying on the platform's default encoding.
os.makedirs(crs_out, exist_ok=True)
for file in crs_files:
	with open(f"{crs_dir}/{file}", encoding="utf-8") as fp:
		data = json.load(fp)
	# data["reports"] is wrapped in a list because combine_subsections
	# iterates over a sequence of sections.
	text = combine_subsections([data["reports"]])
	text = preprocessor.process(text)
	summary = "\n".join(data["summary"])
	summary = preprocessor.process(summary)
	with open(f"{crs_out}/{file}", "w", encoding="utf-8") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

In [None]:
# Convert every raw GAO report into a flat {"text", "summary"} JSON file,
# mirroring the CRS loop. The output folder is created up front and files
# are opened as UTF-8 instead of relying on the platform's default encoding.
os.makedirs(gao_out, exist_ok=True)
for file in gao_files:
	with open(f"{gao_dir}/{file}", encoding="utf-8") as fp:
		data = json.load(fp)
	# Here data["report"] is passed as-is, i.e. it is expected to already be
	# a sequence of sections.
	text = combine_subsections(data["report"])
	text = preprocessor.process(text)
	# NOTE(review): the join requires data["highlight"] to be a list of
	# strings — confirm against the raw GAO schema; if the entries are
	# section dicts they need combine_subsections instead.
	summary = "\n".join(data["highlight"])
	# Uses .process like every other call site in this notebook (the
	# original called .preprocess here only); the per-file debug print of
	# data["highlight"] was dropped as well.
	summary = preprocessor.process(summary)
	with open(f"{gao_out}/{file}", "w", encoding="utf-8") as fp:
		json.dump({
			"text": text,
			"summary": summary
		}, fp)

## LDA

In [None]:
# Bag-of-words vectorizer for the LDA experiment, configured with
# scikit-learn's "english" stop-word setting.
vectorizer = CountVectorizer(stop_words="english")
vectorizer

In [None]:
# Fit the vectorizer on a single document and build its document-term matrix.
# NOTE(review): `data` is whatever the most recently executed loading cell
# left in the kernel, and it must contain a "text" key. The processed JSON
# files are written with that key; whether the raw files have it is not
# visible here — confirm which cell is meant to run before this one.
dtm = vectorizer.fit_transform([data["text"]])
vectorizer

In [None]:
# Print the document-term matrix produced by the vectorizer.
print(dtm)

In [None]:
# Number of latent topics to extract.
topics = 4
# LDA's variational inference starts from a random initialisation; pinning
# random_state makes the fitted topics identical across kernel restarts.
lda = LatentDirichletAllocation(n_components=topics, random_state=69)
lda.fit(dtm)

In [None]:
# Apply the fitted LDA model to the same document-term matrix it was fitted
# on and print the resulting per-document topic weights.
topic_dist = lda.transform(dtm)
print(topic_dist)

In [None]:
def display_topics(model, feature_names, num_top_words):
	"""Print the highest-weighted words of every topic in a fitted model.

	For each row of ``model.components_`` a ``Topic <index>:`` header line
	is printed, followed by one line holding the ``num_top_words`` entries
	of ``feature_names`` with the largest weights in that row, strongest
	first, separated by single spaces.
	"""
	for topic_idx, weights in enumerate(model.components_):
		print(f"Topic {topic_idx}:")
		# argsort is ascending; flipping it orders indices from the largest
		# weight down, and the slice keeps the requested number of them.
		strongest_first = weights.argsort()[::-1][:num_top_words]
		top_words = [feature_names[idx] for idx in strongest_first]
		print(" ".join(top_words))

In [None]:
# Show the ten highest-weighted vocabulary terms for each fitted topic.
num_top_words = 10
# Vocabulary terms in the vectorizer's column order, used to turn the
# column indices picked by display_topics back into words.
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, num_top_words)

## Rough

In [5]:
# Start from empty document / reference-summary lists; the following cell
# appends to them, so re-run this cell first to avoid duplicates.
texts, summaries = [], []

In [6]:
# Pick the first processed CRS document with at least `min_words_text` words.
# max 73_791  (NOTE(review): presumably the largest word count in the corpus — confirm)
min_words_text = 70_000
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	if count_words(data["text"]) >= min_words_text:
		break
else:
	# Without this guard the loop would fall through and the last file read
	# would be appended even though it is below the threshold.
	raise ValueError(
		f"no processed CRS document has at least {min_words_text} words"
	)
# Appends to the lists created in the previous cell; running this cell
# twice adds the same document twice.
texts.append(data["text"])
summaries.append(data["summary"])

count_words(data["text"]), count_words(data["summary"])

(70163, 364)

In [5]:
# Alternative to the single-document selection: load every processed CRS
# file. The lists are re-created here, so this cell can be re-run safely and
# replaces whatever an earlier cell put into `texts` / `summaries`.
texts, summaries = [], []
for file in crs_files:
	with open(f"{crs_out}/{file}") as fp:
		data = json.load(fp)
	texts.append(data["text"])
	summaries.append(data["summary"])

len(texts)

7238

In [7]:
# Hub identifier of the sentence-embedding model; it is not used in this
# cell — the encoder is loaded from the local copy in `sent_dir`.
sent_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
sent_dir = f"{data_dir}/Models/Sent-Transformer"

sent_encoder = SentenceTransformer(sent_dir)
sent_encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
# Hyper-parameters passed to the encoders below.
head_size = .5
threshold = .7
seed = 69

# get_device() is still called, but its result is replaced right away, so
# every component in this notebook is told to use the CPU.
device = get_device()
device = "cpu"

# SentenceSampler and RemoveRedundancy are constructed with exactly the same
# positional arguments, so they are spelled out once.
embedding_encoder_args = (
	tokenizer, context_size, nltk.sent_tokenize, sent_encoder,
	preprocessor, threshold, device, seed,
)

encoders = [
	TruncateMiddle(tokenizer, context_size, head_size, preprocessor),
	UniformSampler(
		tokenizer, context_size, nltk.sent_tokenize, preprocessor, seed
	),
	SentenceSampler(*embedding_encoder_args),
	RemoveRedundancy(*embedding_encoder_args),
]

# One summarization pipeline per encoder, in the same order as `encoders`.
pipelines = [
	SummarizationPipeline(model, encoder, max_tokens, postprocessor, device)
	for encoder in encoders
]

In [9]:
# Use batches of three once at least three documents are loaded; with fewer
# documents batch_size is None.
batch_size = 3 if len(texts) >= 3 else None
evaluator = Evaluator(pipelines, texts, summaries, device=device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# NOTE(review): num_workers is computed here but never passed on — the call
# below hands None as its second argument. Confirm whether that argument is
# the worker count and whether None is intentional.
num_workers = os.cpu_count()
# num_workers = 3
evaluator.generate_summaries(batch_size, None)

[6870.044142997358, 3887.949132011272, 4018.1995559833013, 9793.55108900927]

In [11]:
# Display the evaluator's ROUGE results for the generated summaries
# (the recorded output shows one dict per pipeline).
evaluator.get_rouge_score()

[{'rouge-1': [0.18313253012048192, 0.76, 0.10410958904109589],
  'rouge-2': [0.11138014527845035, 0.46938775510204084, 0.06318681318681318],
  'rouge-l': [0.23237093419338278, 0.7251437262463217, 0.13835293516474403],
  'rouge-w': [0.041725977592485294, 0.420434591230423, 0.02195231621305737]},
 {'rouge-1': [0.0625, 0.2549019607843137, 0.03561643835616438],
  'rouge-2': [0.009661835748792272, 0.04, 0.005494505494505495],
  'rouge-l': [0.08358463007595818, 0.25725166042361713, 0.04989870556426403],
  'rouge-w': [0.01384352889403138, 0.13688925530533755, 0.00729040142495195]},
 {'rouge-1': [0.0625, 0.2549019607843137, 0.03561643835616438],
  'rouge-2': [0.009661835748792272, 0.04, 0.005494505494505495],
  'rouge-l': [0.08358463007595818, 0.25725166042361713, 0.04989870556426403],
  'rouge-w': [0.01384352889403138, 0.13688925530533755, 0.00729040142495195]},
 {'rouge-1': [0.1411764705882353, 0.5, 0.0821917808219178],
  'rouge-2': [0.023640661938534275, 0.0847457627118644, 0.01373626373626

In [24]:
# Display the evaluator's BERTScore results for the generated summaries
# (the recorded output shows one list of three values per pipeline).
evaluator.get_bert_score()

[[0.8340007662773132, 0.8763248920440674, 0.7955765724182129],
 [0.7822791934013367, 0.8163412809371948, 0.7509458065032959],
 [0.7822791934013367, 0.8163412809371948, 0.7509458065032959],
 [0.7969955801963806, 0.8280269503593445, 0.7682060599327087]]

In [None]:
# Sanity check: run the second encoder (UniformSampler) on the loaded texts
# and decode the first document's token ids back to text to see what the
# model would actually receive. The commented line is the plain-tokenizer
# equivalent for comparison.
# enc = tokenizer(texts)["input_ids"][0]
enc = encoders[1](texts)["input_ids"][0]

print(tokenizer.decode(enc))

In [25]:
# Display the loaded model's full configuration for reference.
model.config

BartConfig {
  "_name_or_path": "/home/nchibbar/Data/Models/BART",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1