In [None]:
import os
import torch
import logging
import textstat
import subprocess
import bert_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from rouge import Rouge
from pathlib import Path
from typing import Optional
from nltk.translate.bleu_score import sentence_bleu

import io_functions

from transformers import (
    PegasusTokenizer,
    PegasusForConditionalGeneration,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm

In [None]:
# Log at INFO and above; force=True replaces any handlers already attached to the root logger.
# Note: logging.debug calls elsewhere in this notebook are not shown at this level.
logging.basicConfig(level=logging.INFO, force=True)

In [None]:
# Report the GPU environment (or the lack of one) before loading any model.
if torch.cuda.is_available():
    logging.info(f'GPU Name: {torch.cuda.get_device_name(0)}')
    logging.info(f'CUDA Version: {torch.version.cuda}')
    logging.info(f'GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB')
    logging.info(f'GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB')

    try:
        logging.info('\nDetailed GPU Info:\n')
        subprocess.run(['nvidia-smi'], check=True)
    except FileNotFoundError:
        logging.info('nvidia-smi not found. Ensure NVIDIA drivers are installed.')
    except subprocess.CalledProcessError as error:
        # check=True raises on a non-zero exit status; this report is purely
        # informational, so a failing nvidia-smi must not abort the notebook.
        logging.warning(f'nvidia-smi exited with status {error.returncode}.')
else:
    logging.info('No GPU detected. Running on CPU.')

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays the selected device.
device

In [None]:
# Identifier of the company whose records are loaded and summarized in this notebook.
company_id = '312932093'
# NOTE(review): load_if_scraped comes from the project-local io_functions module; later cells
# read full_text, uuid, companyid and companyname from the returned frame — confirm its schema.
df = io_functions.load_if_scraped(company_id=company_id)

# Column names, dtypes and non-null counts of the loaded data.
df.info()

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Identifier of the pretrained Pegasus checkpoint; it is also reused later to build the
# summary column name and the output file name.
model_name = 'human-centered-summarization/financial-summarization-pegasus'

# Load the matching tokenizer and model, and move the model to the selected device.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

In [None]:
# Token budget shared by chunking, truncation and generation below.
# model_max_length can be a very large placeholder when the tokenizer config does not set it,
# so cap it with the model's own position limit to keep the derived sizes meaningful.
max_length = min(tokenizer.model_max_length, model.config.max_position_embeddings)
max_length

In [None]:
# chunk_overlap must be an integer, so use floor division instead of true division.
# NOTE(review): the splitter presumably measures chunk_size in characters (its default
# length function), whereas max_length is a token count — confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_length, chunk_overlap=max_length // 10
)


def summarize_text(text: str):
    '''
    Splits a text into chunks and summarizes each chunk with the loaded model.

    Parameters:
    text (str): Text to summarize.

    Returns:
    list: One decoded summary string per chunk, in chunk order. Empty when the
    splitter produces no chunks.
    '''
    chunks = text_splitter.split_text(text)
    logging.debug([len(c) for c in chunks])

    # generate() takes an integer token budget; true division would hand it a float.
    summary_max_length = max_length // 4

    summaries = []
    for chunk in tqdm(chunks):
        # Truncation guards against chunks that tokenize to more than max_length tokens.
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_length).to(
            device
        )
        summary_ids = model.generate(
            **inputs, max_length=summary_max_length, early_stopping=True
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries

In [None]:
def recursive_summary(text, target_length=max_length):
    '''
    Repeatedly summarizes a text until it fits into target_length tokens.

    Parameters:
    text (str): Text to condense.
    target_length (int): Maximum number of tokens (as counted by tokenizer.tokenize)
        the result should have.

    Returns:
    str: The original text when it already fits; otherwise the chunk summaries of the
    last pass joined with spaces. If a pass fails to reduce the token count, the loop
    stops and the result of that pass is returned even though it exceeds target_length.
    '''
    combined_summary = text
    token_count = len(tokenizer.tokenize(combined_summary))
    logging.debug(f'token size: {token_count}')

    while token_count > target_length:
        summaries = summarize_text(combined_summary)
        combined_summary = ' '.join(summaries)
        new_token_count = len(tokenizer.tokenize(combined_summary))

        # Without this guard a pass that does not shrink the text would loop forever.
        if new_token_count >= token_count:
            logging.warning(
                f'summary stopped shrinking at {new_token_count} tokens '
                f'(target {target_length}); returning it as is.'
            )
            break
        token_count = new_token_count

    return combined_summary

In [None]:
# Summarize every document in df order. The comprehension keeps the per-row temporaries
# out of the notebook namespace, and desc labels the outer progress bar.
summaries = [recursive_summary(text=text) for text in tqdm(df.full_text, desc='summarizing')]

In [None]:
# Column name under which this model's summaries are stored.
summary_column = f'{model_name}-summaries'
# Adds (or overwrites) the column on df in place; summaries was built by iterating
# df.full_text, so it lines up with the rows.
df[summary_column] = summaries

In [None]:
def evaluate_summary(row, model_name, type='baseline'):
    '''
    Scores one generated summary against its source text.

    Parameters:
    row (pandas.Series): Row providing full_text, uuid, companyid, companyname and the
        '<model_name>-summaries' column that holds the generated summary.
    model_name (str): Model identifier; stored in the result and used to locate the
        summary column.
    type (str): Unused; kept so existing callers keep working.

    Returns:
    pandas.DataFrame: Single-row frame with the identifying fields, ROUGE recall /
    precision / F per ROUGE metric, BLEU, BERTScore precision / recall / F1,
    compression ratio and Flesch reading ease.
    '''
    text_to_summarize = row.full_text
    # Locate the summary through the model_name argument instead of the notebook-level
    # summary_column global, so the function does not depend on hidden kernel state.
    summary = row[f'{model_name}-summaries']

    # The summary is the hypothesis; the full source text serves as the reference.
    rouge_scores = Rouge().get_scores(summary, text_to_summarize)
    if isinstance(rouge_scores, list):
        rouge_scores = rouge_scores[0]

    # Whitespace tokenization is shared by BLEU and the compression ratio.
    source_words = text_to_summarize.split()
    summary_words = summary.split()
    bleu_score = sentence_bleu([source_words], summary_words)

    P, R, F1 = bert_score.score(
        [summary], [text_to_summarize], rescale_with_baseline=True, lang='en'
    )

    # Ratio of summary words to source words; 0 for an empty source avoids dividing by zero.
    compression_ratio = len(summary_words) / len(source_words) if source_words else 0

    readability = textstat.flesch_reading_ease(summary)

    # Insertion order determines the column order of the returned frame.
    results = {
        'model_name': model_name,
        'uuid': row.uuid,
        'companyid': row.companyid,
        'companyname': row.companyname,
    }

    for metric, scores in rouge_scores.items():
        results[f'{metric}_r'] = scores['r']
        results[f'{metric}_p'] = scores['p']
        results[f'{metric}_f'] = scores['f']

    results['bleu'] = bleu_score
    results['bert_precision'] = P.item()
    results['bert_recall'] = R.item()
    results['bert_f1'] = F1.item()
    results['compression_ratio'] = compression_ratio
    results['readability'] = readability

    return pd.DataFrame([results])

In [None]:
# Evaluate every row, then concatenate once: growing a DataFrame with pd.concat inside the
# loop copies all previous rows on each iteration. total lets tqdm show real progress,
# since iterrows() is a generator without a length.
evaluation_frames = [
    evaluate_summary(row, model_name) for _, row in tqdm(df.iterrows(), total=len(df))
]
# pd.concat raises on an empty list, so keep the empty-frame result for an empty df.
evaluation_results = (
    pd.concat(evaluation_frames, ignore_index=True) if evaluation_frames else pd.DataFrame()
)

evaluation_results.head()

In [None]:
# Append this run's evaluation rows to the shared results file unless this
# model/company combination has already been recorded.
csv_filename = Path('..') / 'data' / 'evaluation_results.csv'

if csv_filename.exists():
    existing_df = pd.read_csv(csv_filename)
    # read_csv infers an integer dtype for all-digit ids while company_id is a str, so
    # compare both sides as strings; otherwise the duplicate check can never match and
    # the same rows are appended on every run.
    already_saved = (existing_df.model_name == model_name) & (
        existing_df.companyid.astype(str) == str(company_id)
    )
    if already_saved.any():
        logging.info(
            f'model {model_name} and {company_id} combination already exists in {csv_filename}. no new row added.'
        )
        updated_df = existing_df
    else:
        updated_df = pd.concat([existing_df, evaluation_results], ignore_index=True)
        logging.info(f'model {model_name} not found. appending new row to {csv_filename}.')
else:
    updated_df = evaluation_results
    logging.info(f'{csv_filename} not found. creating new file.')

# to_csv does not create missing directories.
csv_filename.parent.mkdir(parents=True, exist_ok=True)
updated_df.to_csv(csv_filename, index=False)
logging.info(f'results saved to {csv_filename}')

In [None]:
def visualize_results(df):
    '''
    Plots the numeric evaluation metrics of a results DataFrame.

    Parameters:
    df (pandas.DataFrame): Evaluation results; only its numeric columns are plotted.

    Two figures are shown:
    - A grid of histograms, one per numeric column.
    - An annotated heatmap of the pairwise correlations between those columns.
    '''
    metrics = df[df.select_dtypes(include='number').columns.tolist()]

    # DataFrame.hist creates its own figure, which suptitle/tight_layout then act on.
    metrics.hist(bins=20, figsize=(15, 10))
    plt.suptitle('Histograms of Evaluation Metrics', fontsize=16)
    # Leave the top 4% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

    _, heatmap_ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(metrics.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Heatmap of Evaluation Metrics', fontsize=16)
    plt.show()


# Plot the metric distributions and correlations for this run's evaluation results.
visualize_results(evaluation_results)

In [None]:
# model_name contains a '/', so it is replaced to keep the file name a single path component.
summaries_file = f'{company_id}_{model_name}.csv'.replace('/', '-')
summaries_path = Path('..') / 'data' / 'summaries' / summaries_file
# to_csv does not create missing directories.
summaries_path.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated export of df, including the generated summary column.
df.to_csv(
    summaries_path,
    sep='\t',
    index=False,
    quoting=1,  # same value as csv.QUOTE_ALL: every field is quoted
    escapechar='\\',
    doublequote=True,
    quotechar='"',
    lineterminator='\n',
)

In [None]:
# Show the first source text; .iloc selects by position, so this also works when the
# index does not contain the label 0.
df.full_text.iloc[0]

In [None]:
# Show the summary generated for the first row; .iloc selects by position, so this also
# works when the index does not contain the label 0.
df[summary_column].iloc[0]