## Setup and Imports

In [37]:
import os
import concurrent.futures
import fitz
import logging
from pathlib import Path

import pandas as pd
import torch
from cleantext import clean
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm.auto import tqdm
import re
import json
import TextAnalysis as TA

## Configure logging

In [38]:
# Configure the root logger to emit INFO-level records and above, so the
# progress messages logged throughout this notebook are shown.
logging.basicConfig(level=logging.INFO)

## Models to benchmark

In [39]:
# Project layout: all directories are resolved relative to the working directory.
BASE_DIR = Path(".")
CONTENT_DIR = BASE_DIR / "content"      # input PDFs
MODELS_DIR = BASE_DIR / "models"        # one sub-directory per model
SUMMARIES_DIR = BASE_DIR / "summaries"  # generated output

# Fail fast when a required input directory is missing.
if not CONTENT_DIR.exists():
    raise FileNotFoundError(
        f"Content directory not found at {CONTENT_DIR}. Please create it and add PDF files."
    )

if not MODELS_DIR.exists():
    raise FileNotFoundError(
        f"Models directory not found at {MODELS_DIR}. Please create it and add model directories."
    )

# Every sub-directory of MODELS_DIR is treated as a model to benchmark.
MODELS_TO_BENCHMARK = [str(entry) for entry in MODELS_DIR.iterdir() if entry.is_dir()]
if not MODELS_TO_BENCHMARK:
    raise FileNotFoundError(f"No model directories found in {MODELS_DIR}")

logging.info(f"Found {len(MODELS_TO_BENCHMARK)} models to benchmark: {[Path(m).name for m in MODELS_TO_BENCHMARK]}")

# Make sure the output root and one output folder per model exist.
SUMMARIES_DIR.mkdir(exist_ok=True)
for model_path in MODELS_TO_BENCHMARK:
    model_name = Path(model_path).name
    SUMMARIES_DIR.joinpath(model_name).mkdir(exist_ok=True)

INFO:root:Found 1 models to benchmark: ['large-book_summary']


## Load Stopwords

In [40]:
def load_stopwords(filepath='stopwords.txt'):
    """
    Return the set of stopwords stored at ``filepath``.

    If the file does not exist, a built-in default list is written to
    ``filepath`` (sorted, one word per line) and that default set is
    returned. When the file exists, each line is stripped of surrounding
    whitespace and blank lines are skipped.
    """
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            loaded = {word for word in stripped_lines if word}

        logging.info(f"Loaded {len(loaded)} stopwords from {filepath}")
        return loaded

    # No file yet: seed it with the built-in list so later runs can edit it.
    fallback = {
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with', 'this', 'but', 'they',
        'have', 'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how',
    }
    logging.info(f"Stopwords file not found at {filepath}. Creating default stopwords file.")
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(sorted(fallback)))
    return fallback

# Module-level stopword set: loaded from the default 'stopwords.txt' path,
# which load_stopwords() creates with built-in defaults if it is missing.
stopwords = load_stopwords()

INFO:root:Loaded 583 stopwords from stopwords.txt


## Text Processing Functions

In [41]:
def process_text(text):
    """Run the full cleaning pipeline over raw extracted text and return the result.

    The text is first truncated at the earliest references/acknowledgements
    heading, then passed through the bracket, number/symbol-line,
    single-word-line and empty-line filters, in that order.
    """
    section_markers = ["REFERENCES", "ACKNOWLEDGEMENTS", "References", "Acknowledgements"]
    text = find_and_remove_references(text, section_markers)

    # Remaining steps all take and return a string, so apply them in sequence.
    cleanup_steps = (
        remove_text_in_brackets,
        remove_lines_starting_with_number_or_symbol,
        remove_lines_with_one_word,
        remove_empty_lines,
    )
    for step in cleanup_steps:
        text = step(text)

    return text

def find_and_remove_references(text, keywords):
    """Cut the text off at the earliest occurrence of any keyword.

    Everything from the first matching keyword onward (keyword included)
    is dropped. The text is returned unchanged when no keyword occurs.
    """
    # str.find returns -1 for "not found"; keep only real hit positions.
    hit_positions = [index for index in map(text.find, keywords) if index != -1]
    if not hit_positions:
        return text
    return text[:min(hit_positions)]

def remove_text_in_brackets(text):
    """Strip every ``(...)`` and ``[...]`` group, delimiters included, from the text."""
    round_group = r'\([^)]*\)'
    square_group = r'\[[^\]]*\]'
    bracketed = re.compile(round_group + '|' + square_group)
    return bracketed.sub('', text)

def remove_lines_starting_with_number_or_symbol(text):
    """Blank out lines whose first visible character is a digit or a symbol.

    A matching line is replaced by an empty string; its newline is kept, so
    the line count is unchanged. Leading spaces/tabs are ignored when
    deciding whether a line starts with a digit or symbol.

    Whitespace is deliberately NOT treated as a symbol. The previous
    pattern used ``\\W``, which also matches ``\\n``: an empty line then
    matched and ``.*`` consumed the *following* line, so every line after a
    blank line was deleted regardless of its content.
    """
    # [^\w\s] = any character that is neither a word character nor whitespace.
    pattern = r'^[ \t]*(?:\d|[^\w\s]).*$'
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def remove_lines_with_one_word(text):
    """Drop every line that consists of exactly one word (optionally padded with whitespace)."""
    single_word = re.compile(r'^\s*\w+\s*$')
    kept = []
    for candidate in text.split('\n'):
        if single_word.match(candidate) is None:
            kept.append(candidate)
    return '\n'.join(kept)

def remove_empty_lines(text):
    """Drop lines that are empty or contain only whitespace."""
    # str.strip yields '' (falsy) for blank lines, so filter keeps the rest.
    return '\n'.join(filter(str.strip, text.split('\n')))

## PDF Processing Functions

In [42]:
def extract_and_process_text(pdf_path, output_path):
    """Extract the text of a PDF, clean it, and write it to a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to (over)write with the cleaned text.

    Returns:
        ``output_path``, unchanged, so callers can collect the written files.
    """
    # Open the document as a context manager so its handle is released even
    # if text extraction raises part-way through.
    with fitz.open(pdf_path) as pdf_doc:
        # Join once instead of growing a string page by page.
        text = ''.join(page.get_text() for page in pdf_doc)

    cleaned_text = process_text(text)

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)
    return output_path

def process_pdf(pdf_filename):
    """Extract and clean the text of one PDF, writing it next to the PDF as ``.txt``.

    Args:
        pdf_filename: Path to the PDF, as a ``pathlib.Path`` or a string.

    Returns:
        Path of the written text file (the PDF path with its suffix replaced
        by ``.txt``) -- not the cleaned text itself.
    """
    # Coerce so plain string paths work too; with_suffix only exists on Path.
    pdf_path = Path(pdf_filename)
    text_path = pdf_path.with_suffix('.txt')
    return extract_and_process_text(pdf_path, text_path)

## Summarization Functions

In [43]:
def save_summaries(summaries, output_dir, filename):
    """
    Write a list of summaries to ``<stem of filename>_summary.json`` in ``output_dir``.

    Args:
        summaries: List of dicts with an "input_tokens" entry (an object
            exposing ``.tolist()``, e.g. a torch tensor) and a "summary" entry
        output_dir: Path to output directory
        filename: Name of the original file; only its stem is used

    Returns:
        Path of the JSON file that was written
    """
    target = output_dir / (Path(filename).stem + "_summary.json")

    # Tensors are not JSON serializable, so store the token ids as plain lists.
    payload = [
        {"input_tokens": entry["input_tokens"].tolist(), "summary": entry["summary"]}
        for entry in summaries
    ]

    with open(target, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=2)

    return target

def summarize_with_model(model_path, text_filenames, device, token_batch_length, batch_stride, settings):
    """
    Summarize a list of text files with one model and save the results as JSON.

    The model and tokenizer are loaded once from ``model_path``; each file is
    read, cleaned, summarized in token batches, and written to
    ``SUMMARIES_DIR / <model directory name>``.

    Args:
        model_path: Path to the model
        text_filenames: List of text files to summarize
        device: torch device
        token_batch_length: Maximum token length for processing
        batch_stride: Stride length for processing
        settings: Dictionary of generation settings

    Returns:
        Dictionary mapping each input text file path to the path of the
        JSON file holding its summaries (not to the summaries themselves)
    """
    logging.info(f"Loading model from {model_path}")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    model_name = Path(model_path).name
    output_dir = SUMMARIES_DIR / model_name
    
    summaries = {}
    for file_path in text_filenames:
        logging.info(f"Processing {file_path} with model {model_name}")
        
        # The text files are written as UTF-8 by the PDF extraction step, so
        # read them back as UTF-8 rather than with the platform default.
        # errors='ignore' is kept: undecodable bytes are silently dropped.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            raw_text = f.read()
        
        clean_text = clean(raw_text)
        _summaries = summarize_via_tokenbatches(
            clean_text, model, tokenizer, device, 
            token_batch_length, batch_stride, **settings
        )
        
        # Save summaries for this file
        summary_path = save_summaries(_summaries, output_dir, file_path)
        summaries[file_path] = summary_path
        
        # Release cached GPU memory between files.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    return summaries

def summarize(ids, mask, model, tokenizer, device, **kwargs):
    """Generate a summary for a single token window.

    Args:
        ids: 1-D tensor of input token ids for one window.
        mask: 1-D attention mask matching ``ids``.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``batch_decode``.
        device: torch device the inputs are moved to before generation.
        **kwargs: Extra generation settings forwarded to ``model.generate``.

    Returns:
        The list of decoded strings returned by ``tokenizer.batch_decode``.
    """
    # Add a leading batch dimension and move both inputs to the target
    # device. These moved tensors are what generate() receives; previously
    # the un-moved originals were passed and the moved copies went unused.
    input_ids = ids[None, :].to(device)
    attention_mask = mask[None, :].to(device)

    # Global attention on the first token only.
    # NOTE(review): presumably targets LED/Longformer-style models, which
    # accept global_attention_mask -- confirm for other architectures.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    summary_pred_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        return_dict_in_generate=True,
        **kwargs
    )
    summary = tokenizer.batch_decode(
        summary_pred_ids.sequences,
        skip_special_tokens=True,
        remove_invalid_values=True,
    )

    return summary

def summarize_via_tokenbatches(input_text, model, tokenizer, device, token_batch_length, batch_stride, **kwargs):
    """Tokenize the text into overlapping fixed-length windows and summarize each one.

    Returns a list with one dict per window, holding the window's token ids
    under "input_tokens" and the result of ``summarize`` under "summary".
    Extra keyword arguments are forwarded to ``summarize``.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=token_batch_length,
        stride=batch_stride,
        return_overflowing_tokens=True,
        add_special_tokens=False,
        return_tensors='pt',
    )
    encoding = tokenizer(input_text, **tokenizer_options)

    window_ids = encoding.input_ids.to(device)
    window_masks = encoding.attention_mask.to(device)

    collected = []
    with tqdm(total=len(window_ids)) as progress:
        for window, window_mask in zip(window_ids, window_masks):
            decoded = summarize(
                ids=window,
                mask=window_mask,
                model=model,
                tokenizer=tokenizer,
                device=device,
                **kwargs
            )
            collected.append({"input_tokens": window, "summary": decoded})
            progress.update()

    return collected

## Summary Processing Functions

In [44]:
def process_summaries(df, stopwords_path):
    """Add top-10 word, bigram and trigram columns for each row's summaries.

    Args:
        df: DataFrame whose 'Summaries' column holds the text to analyse.
            Rows with a missing, NaN or empty value get a placeholder.
        stopwords_path: Path of the stopwords file passed to ``load_stopwords``.

    Returns:
        The same ``df`` object, modified in place with three new columns:
        'Summaries Top 10 Words', 'Summaries Top 10 Bigrams' and
        'Summaries Top 10 Trigrams'.
    """
    # Load stopwords at the start of processing
    stopword_set = load_stopwords(stopwords_path)

    results = {
        'Summaries Top 10 Words': [],
        'Summaries Top 10 Bigrams': [],
        'Summaries Top 10 Trigrams': []
    }

    for _, row in df.iterrows():
        summaries = row.get('Summaries', '')
        if pd.isna(summaries) or summaries == '':
            # Build a fresh placeholder per column so the three cells do not
            # share one list object (mutating one would change the others).
            # NOTE(review): shape presumably mirrors what the TextAnalysis
            # functions return (ten (term, count) pairs) -- confirm.
            for column_values in results.values():
                column_values.append([('', 0)] * 10)
            continue

        results['Summaries Top 10 Words'].append(TA.topwords(summaries, stopword_set))
        results['Summaries Top 10 Bigrams'].append(TA.topbigrams(summaries, stopword_set))
        results['Summaries Top 10 Trigrams'].append(TA.toptrigrams(summaries, stopword_set))

    # Add results to DataFrame
    for key, column_values in results.items():
        df[key] = column_values

    return df


## Main Execution

In [45]:
def main():
    """Run the benchmark: extract text from PDFs, summarize with every model, write a report.

    Steps:
        1. Convert every ``*.pdf`` in ``CONTENT_DIR`` to a cleaned ``.txt`` file.
        2. Summarize all text files with each model in ``MODELS_TO_BENCHMARK``.
        3. Write ``processing_summary.json`` into ``SUMMARIES_DIR``.

    Returns early, with a warning, when ``CONTENT_DIR`` holds no PDFs, so
    that no model is loaded for nothing.
    """
    # Set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Set parameters
    token_batch_length = 3072
    batch_stride = 50
    max_len_ratio = 8

    settings = {
        'min_length': 5,
        'max_length': int(token_batch_length // max_len_ratio),
        'no_repeat_ngram_size': 7,
        'encoder_no_repeat_ngram_size': 7,
        'repetition_penalty': 3.7,
        'num_beams': 12,
        'length_penalty': 0.5,
        'early_stopping': True,
        'do_sample': False
    }

    # Process PDFs (sorted so runs handle files in a stable order)
    pdf_filenames = sorted(CONTENT_DIR.glob('*.pdf'))
    if not pdf_filenames:
        logging.warning(f"No PDF files found in {CONTENT_DIR}; nothing to summarize.")
        return

    # Extract sequentially: PyMuPDF documents that it does not support being
    # used from multiple threads, so a thread pool risks corrupt output or a
    # crash of the interpreter.
    text_filenames = [process_pdf(pdf_filename) for pdf_filename in pdf_filenames]

    # Process with each model
    all_model_summaries = {}
    for model_path in MODELS_TO_BENCHMARK:
        logging.info(f"Starting summarization with model: {model_path}")
        model_summaries = summarize_with_model(
            model_path,
            text_filenames,
            device,
            token_batch_length,
            batch_stride,
            settings
        )
        all_model_summaries[Path(model_path).name] = model_summaries

    # Create a summary of the processing
    summary_report = {
        "models_processed": list(all_model_summaries.keys()),
        "files_processed": len(text_filenames),
        "timestamp": pd.Timestamp.now().isoformat()
    }

    with open(SUMMARIES_DIR / "processing_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary_report, f, indent=2)

    logging.info(f"Completed processing {len(text_filenames)} files with {len(MODELS_TO_BENCHMARK)} models")

# %%
# Run the pipeline only when this file is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

TypeError: Path.replace() takes 2 positional arguments but 3 were given

Deprecated Below

In [6]:
# import os
# import concurrent.futures
# import fitz
# import logging
# from pathlib import Path

# import pandas as pd
# import torch
# from cleantext import clean
# # from datasets import load_dataset
# # from nltk.corpus import stopwords
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.feature_extraction.text import TfidfVectorizer
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from tqdm.auto import tqdm
# import re

# # Assuming you have a TextAnalysis module
# import TextAnalysis as TA

# # Configure logging
# logging.basicConfig(level=logging.INFO)

# # --- Helper Functions ---

# def load_stopwords(filepath='stopwords.txt'):
#     """Load stopwords from local file."""
#     try:
#         with open(filepath, 'r', encoding='utf-8') as f:
#             return {line.strip() for line in f}
#     except FileNotFoundError:
#         print(f"Warning: {filepath} not found. Using empty stopwords set.")
#         return set()


# def remove_text_in_brackets(text):
#   """Removes text enclosed in parentheses or square brackets."""
#   pattern = r'\([^)]*\)|\[[^\]]*\]'
#   return re.sub(pattern, '', text)

# def remove_lines_starting_with_number_or_symbol(text):
#   """Removes lines starting with a number or symbol."""
#   pattern = r'^[\d\W].*$'
#   return re.sub(pattern, '', text, flags=re.MULTILINE)

# def remove_lines_with_one_word(text):
#   """Removes lines containing only one word."""
#   lines = text.split('\n')
#   pattern = r'^\s*\w+\s*$'
#   filtered_lines = [line for line in lines if not re.match(pattern, line)]
#   return '\n'.join(filtered_lines)

# def remove_empty_lines(text):
#   """Removes empty lines."""
#   lines = text.split('\n')
#   non_empty_lines = [line for line in lines if line.strip() != '']
#   return '\n'.join(non_empty_lines)

# def find_and_remove_references(text, keywords):
#   """Finds and removes text after the first occurrence of any of the given keywords."""
#   earliest_position = -1
#   for keyword in keywords:
#     position = text.find(keyword)
#     if position != -1:
#       earliest_position = position if earliest_position == -1 else min(position, earliest_position)
#   if earliest_position != -1:
#     text = text[:earliest_position]
#   return text

# def process_text(text):
#   """Applies a series of cleaning steps to the text."""
#   keywords = ["REFERENCES", "ACKNOWLEDGEMENTS", "References", "Acknowledgements"]

#   text = find_and_remove_references(text, keywords)
#   text = find_and_remove_references(text, keywords)
#   text = remove_text_in_brackets(text)
#   text = remove_lines_starting_with_number_or_symbol(text)
#   text = remove_lines_with_one_word(text)
#   text = remove_empty_lines(text)

#   return text

# def extract_and_process_text(pdf_path, output_path):
#   """Extracts text from a PDF file, cleans it, and saves it to a text file."""
#   pdf_doc = fitz.open(pdf_path)
#   text = ''
#   for page in pdf_doc:
#     text += page.get_text()

#   lines = text.split('\n')
#   filtered_lines = [line for line in lines if len(line.strip()) > 1]
#   filtered_text = '\n'.join(filtered_lines)
#   cleaned_text = process_text(filtered_text)

#   with open(output_path, 'w', encoding='utf-8') as output_file:
#     output_file.write(cleaned_text)
#   return output_path

# def process_pdf(pdf_filename):
#   """Processes a single PDF file, extracts and cleans text, and returns the text filename."""
#   text_filename = pdf_filename.replace(".pdf", ".txt")
#   cleaned_text = extract_and_process_text(pdf_filename, text_filename)
#   return cleaned_text


# def summarize(ids, mask, **kwargs):
#     ids = ids[None, :]
#     mask = mask[None, :]

#     input_ids = ids.to(_device)
#     attention_mask = mask.to(_device)
#     global_attention_mask = torch.zeros_like(attention_mask)
#     global_attention_mask[:, 0] = 1

#     summary_pred_ids = model.generate(
#         ids,
#         attention_mask=mask,
#         global_attention_mask=global_attention_mask,
#         return_dict_in_generate=True,
#         **kwargs
#     )
#     summary = tokenizer.batch_decode(
#         summary_pred_ids.sequences,
#         skip_special_tokens=True,
#         remove_invalid_values=True,
#     )

#     return summary

# def summarize_via_tokenbatches(input_text, **kwargs):
#     encoded_input = tokenizer(
#         input_text,
#         padding='max_length',
#         truncation=True,
#         max_length=token_batch_length,
#         stride=batch_stride,
#         return_overflowing_tokens=True,
#         add_special_tokens=False,
#         return_tensors='pt',
#     )

#     in_id_arr, att_arr = encoded_input.input_ids.to(_device), encoded_input.attention_mask.to(_device)
#     gen_summaries = []

#     with tqdm(total=len(in_id_arr)) as pbar:
#         for _id, _mask in zip(in_id_arr, att_arr):
#             result = summarize(ids=_id, mask=_mask, **kwargs)
#             _sum = {"input_tokens": _id, "summary": result}
#             gen_summaries.append(_sum)
#             pbar.update()

#     return gen_summaries

# # --- Combine Summaries for Given Author ---

# # Assuming you have a DataFrame named 'datf'
# def combine_summaries(datf):
#   for index, row in datf.iterrows():
#     identifiers = row['arxiv-id'].split(',')
#     concat_summ = ""

#     for identifier in identifiers:
#       if identifier.strip() == 'None':
#         concat_summ = 'None'
#         break

#       sanitized_identifier = identifier.strip().replace('/', '_')
#       filename = f'SUM_{sanitized_identifier}.txt'  # Assuming summaries are saved as SUM_*.txt

#       if os.path.exists(filename):
#         with open(filename, 'r', encoding='utf-8') as file:
#           concat_summ += file.read()
#       else:
#         concat_summ = 'File not found'

#     datf.at[index, 'Summaries'] = concat_summ

# # --- N-grams ---

# def sum_n_grams(df, directorypath):
#     top10Dict = {'Summaries Top 10 Words': [],
#                  'Summaries Top 10 Bigrams': [],
#                  'Summaries Top 10 Trigrams': []}

#     for i in df.values:
#         summaries = i[15]  # Assuming 'Summaries' is the column name for your combined summaries

#         # Now use functions from TextAnalysis
#         top10words = TA.topwords(summaries, directorypath)
#         top10bigrams = TA.topbigrams(summaries, directorypath)
#         top10trigrams = TA.toptrigrams(summaries, directorypath)

#         top10Dict['Summaries Top 10 Words'].append(top10words)
#         top10Dict['Summaries Top 10 Bigrams'].append(top10bigrams)
#         top10Dict['Summaries Top 10 Trigrams'].append(top10trigrams)

#     sum_top10Df = df
#     sum_top10Df['Summaries Top 10 Words'] = top10Dict['Summaries Top 10 Words']
#     sum_top10Df['Summaries Top 10 Bigrams'] = top10Dict['Summaries Top 10 Bigrams']
#     sum_top10Df['Summaries Top 10 Trigrams'] = top10Dict['Summaries Top 10 Trigrams']

#     sum_top10Df = sum_top10Df[['Input Author', 'Input Institution', 'First Author', 'Bibcode', 'Title', 'Publication Date',
#              'Keywords', 'Affiliations', 'Abstract', 'Top 10 Words', 'Top 10 Bigrams', 'Top 10 Trigrams', 'Data Type',
#             'arxiv-id', 'Summaries', 'Summaries Top 10 Words', 'Summaries Top 10 Bigrams', 'Summaries Top 10 Trigrams']]

#     return sum_top10Df

## Process the PDF

In [None]:
# # --- PDF Processing ---

# # Specify the directory containing your PDF files
# content_directory = ".\content"

# # Find all PDF files in the directory
# pdf_filenames = [os.path.join(content_directory, file) for file in os.listdir(content_directory) if file.endswith('.pdf')]

# # Process the PDF files in parallel
# text_filenames = []
# num_workers = 100

# with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
#   results = list(executor.map(process_pdf, pdf_filenames))

# text_filenames.extend(result for result in results if result is not None)

## Set Parameters for Summarization

In [None]:
# # --- Summarization Model ---
# #Set your User ID
# username = "ielhaime"

# hf_tag = "pszemraj/led-large-book-summary"
# _device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# model = AutoModelForSeq2SeqLM.from_pretrained(f"./model").to(_device)
# tokenizer = AutoTokenizer.from_pretrained(f"./model")
# token_batch_length = 3072
# batch_stride = 50
# max_len_ratio = 8

# settings = {
#     'min_length': 5,
#     'max_length': int(token_batch_length // max_len_ratio),
#     'no_repeat_ngram_size': 7,
#     'encoder_no_repeat_ngram_size': 7,
#     'repetition_penalty': 3.7,
#     'num_beams': 12,
#     'length_penalty': 0.5,
#     'early_stopping': True,
#     'do_sample': False
# }

# logging.info(f"using textgen params:\n\n:{settings}")

## Run the Summarization 

In [None]:
# # --- Summarization ---

# directory = './content'
# text_files = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith('.txt')]

# # Summarize each text file and store in a dictionary
# summaries = {} 
# for file_path in text_files:
#   with open(file_path, 'r', errors='ignore') as f:
#     raw_text = f.read()

#   long_text = clean(raw_text)
#   logging.info(f"removed {len(long_text) - len(raw_text)} chars via cleaning")

#   _summaries = summarize_via_tokenbatches(long_text, **settings)
#   summaries[file_path] = _summaries  # Store summaries by filename

#   if torch.cuda.is_available():
#     torch.cuda.empty_cache()

## Output Summaries and N-Grams

In [None]:
# # --- Main Execution ---

# summaries = {}

# datf = pd.read_csv('output.csv')

# stop_dir = "stopwords.txt"

# sample_text = summaries

# sample_ngrams = {
#     'top_words': TA.topwords(sample_text, stop_dir),
#     'top_bigrams': TA.topbigrams(sample_text, stop_dir),
#     'top_trigrams': TA.toptrigrams(sample_text, stop_dir)
# }

# # Add columns to DataFrame with the same summary and n-grams for all rows
# datf['Summary'] = sample_text
# datf['Top_Words'] = [sample_ngrams['top_words']] * len(datf)
# datf['Top_Bigrams'] = [sample_ngrams['top_bigrams']] * len(datf)
# datf['Top_Trigrams'] = [sample_ngrams['top_trigrams']] * len(datf)

# # Save the updated DataFrame
# datf.to_csv('Results.csv', index=False)


# # # summaries = {}

# # arXiv is broken (uncomment when it's up again). For now, make do with the above code; it will output the summary and the n-grams.

# # # Load your DataFrame
# # datf = pd.read_csv('output.csv')  # Replace 'output.csv' with your actual file

# # # Combine summaries
# # combine_summaries(datf)

# # # Calculate N-grams and update DataFrame
# # path_stop = '/content/'
# # stop_file = 'stopwords.txt'
# # stop_dir = path_stop + stop_file
# # final_df = sum_n_grams(datf, stop_dir)

# # # Save the updated DataFrame
# # final_df.to_csv('Results.csv', index=False)