# Common setup

## Install dependencies

In [78]:
!pip install transformers beautifulsoup4

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


## Imports

In [79]:
import argparse
import os
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from bs4 import BeautifulSoup

## Set logging

In [80]:
# `logging` is the transformers logging utility imported above, not the stdlib module.
# Silences "Some weights were not initialized from the model checkpoint" warning
logging.set_verbosity_error()

## Input files

In [81]:
# Documents to summarize. NOTE: this is an absolute path on one specific machine;
# point it at your own copy of the document before running the notebook.
input_files = [
    "/home/levi/rhel-8-docs/rhel-9/titles/configuring-and-maintaining/deduplicating-and-compressing-logical-volumes-on-rhel/master.html",
]

## Define HTML preprocessing helper and main function

In [82]:
def preprocess_html(html):
    """Return the text content of ``html`` as one space-separated string.

    The markup is parsed with BeautifulSoup's built-in ``html.parser`` and the
    whitespace-stripped text fragments are joined with single spaces.
    """
    parsed_document = BeautifulSoup(html, 'html.parser')
    return ' '.join(parsed_document.stripped_strings)

In [83]:
def main(input_files):
    """Summarize every usable file in ``input_files`` and print the results.

    A path is skipped, after printing an error message, when it does not
    exist, when reading it raises an exception, or when it contains only
    whitespace. Every other path is passed to ``abstractive_summarization``
    and each (filename, summary) pair of the returned mapping is printed.

    Args:
        input_files: Iterable of paths to the text files to summarize.
    """
    for path in input_files:
        # Guard: nothing to do for a path that is not on disk.
        if not os.path.exists(path):
            print(f"Error: File '{path}' does not exist.")
            continue

        try:
            with open(path, "r", encoding="utf-8") as handle:
                contents = handle.read()

            # Whitespace-only content is treated as an empty file.
            if contents.strip() == "":
                print(f"Error: File '{path}' is empty.")
                continue
        except Exception as err:
            # Any failure while opening or decoding the file is reported and
            # the file is skipped.
            print(f"Error reading file '{path}': {err}")
            continue

        for name, text in abstractive_summarization(path).items():
            print(f"\nFilename: {name}\nSummary: {text}")


# Summarization function (GPT-2)

In [84]:
# Holds the loaded GPT-2 objects so that repeated calls reuse them.
_GPT2_CACHE = {}


def _load_gpt2():
    """Return the ``(model, tokenizer)`` pair for "gpt2", loading it on first use only."""
    if not _GPT2_CACHE:
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # Add a padding token to the tokenizer (end-of-sequence is reused).
        tokenizer.pad_token = tokenizer.eos_token
        # Only populate the cache once both objects loaded successfully.
        _GPT2_CACHE["model"] = model
        _GPT2_CACHE["tokenizer"] = tokenizer
    return _GPT2_CACHE["model"], _GPT2_CACHE["tokenizer"]


def abstractive_summarization(file, chunk_size=512, max_new_tokens=100):
    """Generate a GPT-2 continuation for each chunk of a text file and join them.

    The file is read as UTF-8, cut into consecutive pieces of ``chunk_size``
    characters, and each piece is sent to GPT-2 with the prefix
    ``"summarize: "``. Only the newly generated tokens of each piece are
    kept; the per-chunk results are joined with single spaces.

    Args:
        file: Path of the text file to summarize.
        chunk_size: Number of characters per chunk (must be at least 1).
        max_new_tokens: Upper bound on the tokens generated per chunk.

    Returns:
        A one-entry dict mapping ``file`` to the joined summary string. An
        empty file yields an empty summary string.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Load pre-trained GPT-2 model and tokenizer only once per kernel session
    model, tokenizer = _load_gpt2()

    # Read the whole file up front so the handle is closed before the slow
    # generation loop starts. UTF-8 matches how main() reads the same file.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    # NOTE(review): the text is summarized exactly as read. If the input is an
    # HTML document, the markup is part of every chunk; the notebook defines
    # preprocess_html() for stripping it -- confirm whether it should be
    # applied before chunking.

    # Split text into chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    summaries = []
    for chunk in chunks:
        inputs = tokenizer("summarize: " + chunk, return_tensors="pt", padding=True, truncation=True, max_length=300)
        prompt_length = inputs["input_ids"].shape[1]

        # max_new_tokens (rather than max_length) is used because max_length
        # also counts the prompt: a prompt truncated to 300 tokens would leave
        # no room to generate anything under max_length=300.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # A decoder-only model returns the prompt followed by the continuation;
        # drop the prompt tokens so the input chunk is not echoed back.
        generated_ids = summary_ids[0][prompt_length:]
        summary = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        summaries.append(summary)

    return {file: ' '.join(summaries)}

# Run the pipeline on the configured files; each summary is printed as it is produced.
main(input_files)

KeyboardInterrupt: 