## Setup and Imports

In [16]:
import os
import concurrent.futures
import fitz
import logging
from pathlib import Path
import pandas as pd
import torch
from cleantext import clean
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm.auto import tqdm
import re
import json
import TextAnalysis as TA
from generate_config import load_model_config, get_generation_parameters, save_model_config

## Configure logging

In [17]:
# Configure the root logger so INFO-level (and more severe) messages are emitted.
logging.basicConfig(level=logging.INFO)

## Directory Layout and Models to Benchmark

In [None]:
# Project layout; every path is relative to the current working directory.
BASE_DIR = Path(".")
CONTENT_DIR = BASE_DIR.joinpath("content")              # source PDFs
MODELS_DIR = BASE_DIR.joinpath("models")                # one sub-directory per model
SUMMARIES_DIR = BASE_DIR.joinpath("summaries")          # generated output
MODEL_CONFIGS_DIR = BASE_DIR.joinpath("model_configs")  # per-model generation configs
DEPRECATED_CONFIGS_DIR = MODEL_CONFIGS_DIR.joinpath("deprecated")  # superseded configs

# Fail fast when a required input directory is missing.
if not CONTENT_DIR.exists():
    raise FileNotFoundError(f"Content directory not found at {CONTENT_DIR}. Please create it and add PDF files.")

if not MODELS_DIR.exists():
    raise FileNotFoundError(f"Models directory not found at {MODELS_DIR}. Please create it and add model directories.")

# Every sub-directory of MODELS_DIR is treated as a model to benchmark.
MODELS_TO_BENCHMARK = list(map(str, filter(Path.is_dir, MODELS_DIR.iterdir())))
if not MODELS_TO_BENCHMARK:
    raise FileNotFoundError(f"No model directories found in {MODELS_DIR}")

logging.info(f"Found {len(MODELS_TO_BENCHMARK)} models to benchmark: {[Path(m).name for m in MODELS_TO_BENCHMARK]}")

# Output directories: the config directory must exist before its "deprecated" child.
SUMMARIES_DIR.mkdir(exist_ok=True)
MODEL_CONFIGS_DIR.mkdir(exist_ok=True)
DEPRECATED_CONFIGS_DIR.mkdir(exist_ok=True)

# One summaries sub-directory per model, named after the model directory.
for model_path in MODELS_TO_BENCHMARK:
    model_name = Path(model_path).name
    SUMMARIES_DIR.joinpath(model_name).mkdir(exist_ok=True)

## Load Stopwords

In [None]:
def load_stopwords(filepath='stopwords.txt'):
    """
    Return the set of stopwords stored in `filepath` (one word per line).

    Lines are stripped of surrounding whitespace and blank lines are skipped.
    If `filepath` does not exist, a built-in default list is written to it
    (sorted, newline-separated) and that default set is returned.
    """
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        loaded = set()
        for raw in raw_lines:
            word = raw.strip()
            if word:
                loaded.add(word)

        logging.info(f"Loaded {len(loaded)} stopwords from {filepath}")
        return loaded

    # No file yet: seed it with the defaults so it can be edited later.
    fallback = {
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with', 'the', 'this', 'but', 'they',
        'have', 'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
    }
    logging.info(f"Stopwords file not found at {filepath}. Creating default stopwords file.")
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(sorted(fallback)))
    return fallback

# Load the stopword set once at module level; load_stopwords() writes a default
# stopwords.txt first when the file does not exist yet.
stopwords = load_stopwords()

## Text Processing Functions

In [20]:
def process_text(text):
    """Run the extracted text through the cleaning pipeline and return the result.

    Steps, in order: cut everything from the first references/acknowledgements
    heading onwards, strip bracketed text, blank lines that start with a digit
    or symbol, drop single-word lines, then drop empty lines.
    """
    section_markers = ["REFERENCES", "ACKNOWLEDGEMENTS", "References", "Acknowledgements"]
    cleaned = find_and_remove_references(text, section_markers)

    # The remaining steps all take and return a string, so apply them in sequence.
    remaining_steps = (
        remove_text_in_brackets,
        remove_lines_starting_with_number_or_symbol,
        remove_lines_with_one_word,
        remove_empty_lines,
    )
    for step in remaining_steps:
        cleaned = step(cleaned)

    return cleaned

def find_and_remove_references(text, keywords):
    """Truncate `text` at the earliest occurrence of any keyword.

    The keyword itself and everything after it are dropped. If none of the
    keywords occurs, `text` is returned unchanged.
    """
    # str.find returns -1 for "not found"; keep only real hits.
    hits = [pos for pos in (text.find(keyword) for keyword in keywords) if pos != -1]
    if not hits:
        return text
    return text[:min(hits)]

def remove_text_in_brackets(text):
    """Delete every (...) and [...] group from `text`, delimiters included.

    A group runs from an opening bracket to the first closing bracket of the
    same kind, so nested brackets are not balanced; an opening bracket with no
    matching close is left in place.
    """
    bracketed = re.compile(r'\([^)]*\)|\[[^\]]*\]')
    return bracketed.sub('', text)

def remove_lines_starting_with_number_or_symbol(text):
    """Blank out every line whose first character is a digit or a non-word character.

    The line's content is replaced by an empty string; the line break itself is
    kept. "Non-word" means anything other than a letter, digit or underscore,
    so a line that begins with a space or tab is blanked as well.

    Args:
        text: Multi-line string to filter.

    Returns:
        The text with the matching lines emptied.
    """
    # `\W` on its own would also match '\n'. On an empty line that lets the
    # leading character class consume the newline, after which `.*` swallows
    # the *next* line too. Excluding '\n' keeps each match inside one line.
    pattern = r'^(?:\d|[^\w\n]).*$'
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def remove_lines_with_one_word(text):
    """Drop every line that consists of a single word (optionally padded with whitespace).

    A "word" is one unbroken run of letters, digits and underscores. Empty
    lines contain no word and are therefore kept.
    """
    single_word = re.compile(r'^\s*\w+\s*$')
    kept = []
    for line in text.split('\n'):
        if single_word.match(line) is None:
            kept.append(line)
    return '\n'.join(kept)

def remove_empty_lines(text):
    """Return `text` without its empty or whitespace-only lines."""
    # str.strip yields '' (falsy) for blank lines, so filter() discards them.
    return '\n'.join(filter(str.strip, text.split('\n')))

## PDF Processing Functions

In [21]:
def extract_and_process_text(pdf_path, output_path):
    """Extract the text of a PDF, clean it, and write it to a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the text file to write (overwritten if present).

    Returns:
        `output_path`, unchanged.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; join the page texts once instead of repeated `+=`.
    with fitz.open(pdf_path) as pdf_doc:
        text = ''.join(page.get_text() for page in pdf_doc)

    cleaned_text = process_text(text)

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)
    return output_path

def process_pdf(pdf_filename):
    """Extract and clean the text of one PDF, writing it next to the PDF as a .txt file.

    Args:
        pdf_filename: Path of the PDF (a `Path` or a plain string).

    Returns:
        The path of the text file that was written (same location and stem as
        the PDF, with a `.txt` suffix). Note this is the path, not the text.
    """
    pdf_path = Path(pdf_filename)  # accept plain strings as well as Path objects
    text_path = pdf_path.with_suffix('.txt')
    return extract_and_process_text(pdf_path, text_path)

## Summarization Functions

In [22]:
def save_summaries(summaries, output_dir, filename):
    """
    Write per-window summaries to "<stem>_summary.json" and return their joined text.

    Args:
        summaries: List of dicts, each with "input_tokens" (an object exposing
            .tolist(), e.g. a tensor) and "summary" (a sequence whose first
            item is the summary text)
        output_dir: Path of the directory the JSON file is written to
        filename: Name of the original file; only its stem is used

    Returns:
        tuple: (output_path, summary texts joined with single spaces)
    """
    output_path = output_dir / f"{Path(filename).stem}_summary.json"

    records = []  # JSON-serializable copy of each entry
    texts = []    # first summary string of each entry
    for entry in summaries:
        records.append({
            "input_tokens": entry["input_tokens"].tolist(),
            "summary": entry["summary"]
        })
        texts.append(entry["summary"][0])

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file, ensure_ascii=False, indent=2)

    return output_path, " ".join(texts)

def summarize_with_model(model_path, text_filenames, device, settings):
    """
    Generate summaries for a list of text files using a specific model.

    The model and tokenizer are loaded from `model_path`; each file's window
    summaries are saved as JSON under SUMMARIES_DIR/<model name>/.

    Args:
        model_path: Path to the model directory
        text_filenames: List of text files to summarize
        device: torch device the model is moved to
        settings: Dictionary of generation settings; the keys read here are
            'token_batch_length', 'batch_stride' and 'parameters' (a dict of
            keyword arguments forwarded to generation)

    Returns:
        Dictionary mapping "<text file stem>.pdf" to the concatenated summary text
    """
    logging.info(f"Loading model from {model_path}")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model_name = Path(model_path).name
    output_dir = SUMMARIES_DIR / model_name

    summaries = {}
    for file_path in text_filenames:
        logging.info(f"Processing {file_path} with model {model_name}")

        # The cleaned text files are written as UTF-8, so read them back with
        # the same encoding instead of the platform default.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            raw_text = f.read()

        clean_text = clean(raw_text)
        _summaries = summarize_via_tokenbatches(
            clean_text, model, tokenizer, device, settings['token_batch_length'], settings['batch_stride'], **settings['parameters']
        )

        # Save summaries and get the concatenated text
        _, summary_text = save_summaries(_summaries, output_dir, file_path)
        # Store with the original PDF filename as key
        pdf_name = Path(file_path).stem + '.pdf'
        summaries[pdf_name] = summary_text

        # Return cached GPU memory between documents.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return summaries

def summarize(ids, mask, model, tokenizer, device, **kwargs):
    """Generate a summary for a single window of tokens.

    Args:
        ids: Token ids of one window (expected to be a 1-D tensor; a batch
            dimension is added here).
        mask: Attention mask matching `ids`.
        model: Model exposing `generate()`.
        tokenizer: Tokenizer exposing `batch_decode()`.
        device: torch device the inputs are moved to before generation.
        **kwargs: Extra keyword arguments forwarded to `model.generate()`.

    Returns:
        The list of decoded strings returned by `tokenizer.batch_decode()`.
    """
    # Add the batch dimension and make sure the tensors that are actually fed
    # to generate() live on the target device.
    input_ids = ids[None, :].to(device)
    attention_mask = mask[None, :].to(device)

    # Global attention on the first token only.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    summary_pred_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        return_dict_in_generate=True,
        **kwargs
    )
    # NOTE(review): remove_invalid_values looks like a generate() option rather
    # than a decode option; kept as-is here - confirm whether it has any effect.
    summary = tokenizer.batch_decode(
        summary_pred_ids.sequences,
        skip_special_tokens=True,
        remove_invalid_values=True,
    )

    return summary

def summarize_via_tokenbatches(input_text, model, tokenizer, device, token_batch_length, batch_stride, **kwargs):
    """Summarize `input_text` window by window.

    The text is tokenized into windows of `token_batch_length` tokens
    (overflowing tokens are returned as further windows, using `batch_stride`
    as the tokenizer stride), and each window is passed to `summarize`.

    Returns:
        A list with one dict per window: {"input_tokens": <window ids>,
        "summary": <result of summarize()>}.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=token_batch_length,
        stride=batch_stride,
        return_overflowing_tokens=True,
        add_special_tokens=False,
        return_tensors='pt',
    )
    encoded_input = tokenizer(input_text, **tokenizer_options)

    window_ids = encoded_input.input_ids.to(device)
    window_masks = encoded_input.attention_mask.to(device)

    gen_summaries = []
    with tqdm(total=len(window_ids)) as progress:
        for token_ids, token_mask in zip(window_ids, window_masks):
            gen_summaries.append({
                "input_tokens": token_ids,
                "summary": summarize(
                    ids=token_ids,
                    mask=token_mask,
                    model=model,
                    tokenizer=tokenizer,
                    device=device,
                    **kwargs
                ),
            })
            progress.update()

    return gen_summaries

## Summary Processing Functions

In [None]:
def process_summaries(df, stopwords):
    """Add top word / bigram / trigram columns for each row's summary.

    Reads the 'Summary' column and adds the columns 'Top_Words', 'Top_Bigrams'
    and 'Top_Trigrams' to `df` in place. Rows whose summary is missing or empty
    get ten ('', 0) placeholder entries in each column.

    Args:
        df: DataFrame with a 'Summary' column (a missing column is treated as
            empty summaries).
        stopwords: Stopwords passed through to the TextAnalysis helpers.

    Returns:
        The same DataFrame, with the three columns added.
    """
    ngram_columns = {'Top_Words': [], 'Top_Bigrams': [], 'Top_Trigrams': []}

    for _, row in df.iterrows():
        summary = row.get('Summary', '')
        if pd.isna(summary) or summary == '':
            # Fresh placeholder list per cell so cells never share one object.
            results = {name: [('', 0)] * 10 for name in ngram_columns}
        else:
            results = {
                'Top_Words': TA.topwords(summary, stopwords),
                'Top_Bigrams': TA.topbigrams(summary, stopwords),
                'Top_Trigrams': TA.toptrigrams(summary, stopwords)
            }

        for name, value in results.items():
            ngram_columns[name].append(value)

    # Assign whole object-dtype columns at once. Writing list values cell by
    # cell with `df.at` into columns that do not exist yet makes pandas try to
    # unpack the list across cells and raise instead of storing it.
    for name, values in ngram_columns.items():
        df[name] = pd.Series(values, index=df.index, dtype=object)

    return df

## Main Execution

In [None]:
def main():
    """Run the whole benchmark.

    Steps: convert every PDF in CONTENT_DIR to a cleaned .txt file, summarize
    the text files with every model in MODELS_TO_BENCHMARK, write a small
    processing report, compute n-gram columns for the summaries, and save the
    combined table as a CSV in SUMMARIES_DIR.
    """
    # Set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Process PDF files to .txt
    logging.info("Processing PDF files...")
    pdf_filenames = list(CONTENT_DIR.glob('*.pdf'))
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Consume the results so an exception raised in a worker is re-raised
        # here instead of being silently discarded with the unused iterator.
        list(executor.map(process_pdf, pdf_filenames))

    # The set of text files does not change while the models run.
    text_filenames = list(CONTENT_DIR.glob('*.txt'))

    # One DataFrame per model, concatenated at the end
    all_model_summaries_dfs = []

    # Model name -> generation config used for it
    all_model_names = {}

    # Iterate through models and generate summaries
    for model_path in MODELS_TO_BENCHMARK:
        model_name = Path(model_path).name
        model_config_path = MODEL_CONFIGS_DIR / f"{model_name}_config.json"

        # Generate a config first when none exists yet, then load it
        if not model_config_path.exists():
            save_model_config(model_path, model_name)
        model_config = load_model_config(model_config_path)

        logging.info(f"Starting summarization with model: {model_path}")
        model_summaries = summarize_with_model(
            model_path,
            text_filenames,
            device,
            model_config
        )

        # Create DataFrame for current model's summaries (keyed by PDF file name)
        document_names = [Path(f).name for f in pdf_filenames]
        model_df = pd.DataFrame({
            'Document': document_names,
            'Model': [model_name] * len(document_names),
            'Summary': [model_summaries[name] for name in document_names]
        })

        all_model_summaries_dfs.append(model_df)
        all_model_names[model_name] = model_config

    # Concatenate DataFrames from all models
    df = pd.concat(all_model_summaries_dfs, ignore_index=True)

    # Create processing summary
    summary_report = {
        "models_processed": list(all_model_names.keys()),
        "timestamp": pd.Timestamp.now().isoformat()
    }

    with open(SUMMARIES_DIR / "processing_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary_report, f, indent=2)

    # Pass the loaded stopword set, not the name of the stopwords file.
    process_summaries(df, stopwords)

    # The table holds every model's summaries, so it gets one combined file
    # name rather than a name built from a single model.
    df.to_csv(SUMMARIES_DIR / "all_models_summaries.csv", index=False)

# Script entry point: configure log formatting, then run the benchmark.
if __name__ == "__main__":
    # NOTE(review): logging.basicConfig() does nothing when the root logger
    # already has handlers, and basicConfig() was already called earlier in
    # this file - so this format presumably never takes effect; confirm.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    main()

In [10]:
# Ask PyTorch to release cached GPU memory it is no longer using.
torch.cuda.empty_cache()