In [1]:
!pip install PyPDF2 transformers pytesseract pillow

import PyPDF2
from transformers import pipeline
from pytesseract import image_to_string
from PIL import Image
import io

def extract_text_from_pdf(pdf_file):
    """
    Extracts text from a PDF file, prioritizing direct text extraction and falling back to OCR only when necessary.
    """
    try:
        with open(pdf_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:  # If text is extracted directly
                    text += page_text + "\n"
                else:  # Fallback to OCR for scanned PDFs
                    print(f"Page {reader.pages.index(page) + 1} is scanned. Extracting text using OCR...")
                    for image in page.images:
                        img = Image.open(io.BytesIO(image.data))
                        text += image_to_string(img) + "\n"
            return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def summarize_text(text, max_length=300, min_length=100):
    """
    Summarizes text using the BART model, optimized for large texts by chunking.
    """
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    try:
        chunk_size = 1024  # BART's max token size per input
        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        summaries = []

        for chunk in text_chunks:
            summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summaries.append(summary[0]['summary_text'])

        return " ".join(summaries)
    except Exception as e:
        print(f"Error during summarization: {e}")
        return "Summarization failed."

def main():
    """
    Main function to handle user input and summarize text or PDF content.
    """
    input_type = input("Enter the type of input (pdf/text): ").strip().lower()

    if input_type == "pdf":
        pdf_path = input("Enter the path to the PDF file: ").strip()
        print("Extracting text from PDF...")
        text = extract_text_from_pdf(pdf_path)
    elif input_type == "text":
        text = input("Enter the text: ").strip()
    else:
        print("Invalid input type. Please choose either 'pdf' or 'text'.")
        return

    if not text:
        print("No text to summarize.")
        return

    print("\nSummarizing...")
    summarized_text = summarize_text(text)
    print("\nSummary:\n", summarized_text)

if __name__ == "__main__":
    main()


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyPDF2
Successfully installed PyPDF2-3.0.1 pytesseract-0.3.13
Enter the type of input (pdf/text): pdf
Enter the path to the PDF file: /content/example.pdf
Extracting text from PDF...

Summarizing...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Your max_length is set to 300, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)



Summary:
 The Indian Constitution has a total of 470 articles. The articles are divided into 25 parts: the Constitution, Articles 1-4, Articles 5-11, Articles 6-10, Articles 11-12, Articles 13-14, and Articles 15-16. The Constitution includes the Indian Citizenship, Indian Laws, and Indian Migration and Indian Reservations. The Indian Indian Constituent Assembly has the power to amend the Constitution. The U.S. House of Representatives has the power to pass bills into law. The Senate has the authority to pass laws. The U.N. Security Council has the ability to impose sanctions on violators of the laws of the United States. The European Union has the right to enforce the European Convention on Human Rights (ECHR) through the European Court of Justice (ECJ). The European Parliament has the duty to protect the rights of the people of the EU. The EU has the obligation to protect human rights, including the right of people to equal rights. A list of some of the most important words in the E