# Install dependencies

In [18]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


# Imports

In [19]:
import argparse

In [20]:
import os

In [21]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [22]:
from transformers import logging

# Set logging

In [23]:
# Raise the transformers log threshold to ERROR. This silences the
# "Some weights were not initialized from the model checkpoint" warning.
logging.set_verbosity_error()

# Define main function

In [24]:
def main(input_files):
    """Summarize each input text file and print its summary.

    Every path is validated before it is handed to
    ``abstractive_summarization``: it must exist, be readable as UTF-8
    text, and contain something other than whitespace. A file that fails
    a check is reported on stdout and skipped; the remaining files are
    still processed.

    Args:
        input_files: An iterable of file paths. A single path given as a
            ``str`` or ``os.PathLike`` is also accepted and treated as a
            one-element list.

    Returns:
        None. Summaries and error messages are printed.
    """
    # A lone string would otherwise be iterated character by character,
    # producing one "does not exist" error per character.
    if isinstance(input_files, (str, os.PathLike)):
        input_files = [input_files]

    for input_file_path in input_files:
        if not os.path.exists(input_file_path):
            print(f"Error: File '{input_file_path}' does not exist.")
            continue

        # Only I/O and decoding failures are expected here (permissions,
        # path is a directory, bytes that are not valid UTF-8, ...).
        try:
            with open(input_file_path, "r", encoding="utf-8") as file:
                input_text = file.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading file '{input_file_path}': {e}")
            continue

        # Whitespace-only content counts as empty.
        if not input_text.strip():
            print(f"Error: File '{input_file_path}' is empty.")
            continue

        summary = abstractive_summarization(input_file_path)

        print(summary)


# Define summarization function

In [25]:
# Checkpoint used for both the model and its tokenizer.
_PEGASUS_CHECKPOINT = "google/pegasus-xsum"

# Holds the loaded (model, tokenizer) pair so repeated calls reuse it
# instead of re-reading the checkpoint for every file.
_PEGASUS_CACHE = {}


def _load_pegasus():
    """Return the Pegasus (model, tokenizer) pair, loading it on first use only."""
    if "pair" not in _PEGASUS_CACHE:
        model = PegasusForConditionalGeneration.from_pretrained(_PEGASUS_CHECKPOINT)
        tokenizer = PegasusTokenizer.from_pretrained(_PEGASUS_CHECKPOINT, model_max_length=1024)
        _PEGASUS_CACHE["pair"] = (model, tokenizer)
    return _PEGASUS_CACHE["pair"]


def abstractive_summarization(file):
    """Generate an abstractive summary of a UTF-8 text file.

    Args:
        file: Path of the text file to summarize.

    Returns:
        The decoded summary string produced by beam search (4 beams,
        at most 150 generated tokens), with special tokens stripped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    model, tokenizer = _load_pegasus()

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    # The file handle is no longer needed once the text is in memory.
    # Input longer than the tokenizer's model_max_length is truncated.
    # NOTE(review): the "summarize: " prefix looks like a T5-style task
    # prefix; kept to preserve existing output — confirm it is wanted here.
    inputs = tokenizer("summarize: " + text, return_tensors="pt", padding="longest", truncation=True)

    # Pass the attention mask alongside the ids so generation does not have
    # to guess which positions are padding.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Run main function

In [26]:
# Document(s) to summarize in this run.
input_paths = ['/home/levi/rhel-8-docs/rhel-9/modules/performance/ref_vdo-thread-types.adoc']
main(input_paths)

VDO uses various thread types to handle specific operations, such as read and write operations.
