<a href="https://colab.research.google.com/github/Layswaferstyle/Docu_scan/blob/main/Docu_scan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies in Colab
!pip install opencv-python pytesseract transformers Pillow pdf2image numpy

# If you want to process PDFs, install poppler (for pdf2image)

!apt-get install -y poppler-utils


In [None]:
import cv2
import pytesseract
from transformers import pipeline
from PIL import Image
import os
import platform
import numpy as np
from pdf2image import convert_from_path
import torch



# pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Determine the device to use
device = 0 if torch.cuda.is_available() else -1

# Initialize models
summarizer = pipeline("summarization", model="t5-base", device=device)
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=device)

def ocr_image_from_path(image_path):
    image = cv2.imread(image_path)
    if image is None:
        raise Exception(f"Failed to load image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text.strip()

def ocr_image_from_pil(pil_image):
    image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text.strip()

def summarize_text(text):
    # Truncate text to avoid exceeding model's maximum input size
    max_model_input_size = 1024  # BART's max input size is 1024 tokens
    if len(text) > max_model_input_size:
        text = text[:max_model_input_size]

    # Calculate minimum summary length (at least 10% of original text, but not more than max_length)
    min_summary_length = max(10, int(len(text) * 0.10))
    max_summary_length = min(int(len(text) * 0.10), len(text)) # Ensure max_length is not more than text length

    if len(text) > 100:
        summary = summarizer(text, max_length=max_summary_length, min_length=min_summary_length, do_sample=False)
        return summary[0]['summary_text']
    return "Text is too short to summarize. Here's the extracted text:\n" + text

def process_image(image_path):
    print(f"\nProcessing image: {image_path}")
    extracted_text = ocr_image_from_path(image_path)
    # print("Extracted Text:\n", extracted_text)
    summary = summarize_text(extracted_text)
    print("Summary:\n", summary)
    while True:
        question = input("\nAsk a question about the summary (type 'quit' or 'stop' to exit): ")
        if question.lower() in ['quit', 'stop']:
            print("Next File")
            break
        answer = qa_pipeline(question=question, context=summary)
        print("Question:", question)
        print("Answer:", answer['answer'])

def process_pdf(pdf_path):
    print(f"\nProcessing PDF: {pdf_path}")
    pages = convert_from_path(pdf_path)
    for i, page in enumerate(pages):
        print(f"\n--- Page {i+1} Summary ---")
        extracted_text = ocr_image_from_pil(page)
        # print("Extracted Text:\n", extracted_text) # Optional: uncomment to see full extracted text
        summary = summarize_text(extracted_text)
        print("Summary:\n", summary)
        while True:
            question = input("\nAsk a question about this page's summary (type 'quit' or 'stop' to exit page): ")
            if question.lower() in ['quit', 'stop']:
                print("Moving to next page or file...")
                break
            answer = qa_pipeline(question=question, context=summary)
            print("Question:", question)
            print("Answer:", answer['answer'])

def process_folder(folder_path):
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            process_image(filepath)
        elif filename.lower().endswith('.pdf'):
            process_pdf(filepath)

def main():
    user_input = input("Enter image file path, PDF file path, or folder path: ").strip()
    if os.path.isdir(user_input):
        process_folder(user_input)
    elif user_input.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
        process_image(user_input)
    elif user_input.lower().endswith('.pdf'):
        process_pdf(user_input)
    else:
        print("Invalid input. Please provide a valid image, PDF, or folder path.")

if __name__ == "__main__":
    main()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Task
Modify the provided Python code to generate a single document-level summary of a PDF file instead of page-by-page summaries. The code should also be runnable on a GPU. Additionally, suggest faster and better models compared to BART for summarization.

## Modify pdf processing

### Subtask:
Update the `process_pdf` function to extract text from all pages and concatenate it into a single string.


**Reasoning**:
The subtask is to modify the `process_pdf` function to concatenate the text from all pages. This involves initializing an empty string, iterating through the pages, extracting text from each page, and appending it to the string with a separator.



In [4]:
def process_pdf(pdf_path):
    print(f"\nProcessing PDF: {pdf_path}")
    pages = convert_from_path(pdf_path)
    all_text = ""
    for i, page in enumerate(pages):
        extracted_text = ocr_image_from_pil(page)
        all_text += f"--- Page {i+1} ---\n{extracted_text}\n\n"
    # The rest of the process_pdf function will be modified in subsequent steps
    # to use all_text for summarization instead of processing page by page.
    # For now, just print the concatenated text to verify the change.
    print("Concatenated Text from all pages:\n", all_text)

def process_image(image_path):
    print(f"\nProcessing image: {image_path}")
    extracted_text = ocr_image_from_path(image_path)
    print("Extracted Text:\n", extracted_text)
    summary = summarize_text(extracted_text)
    print("Summary:\n", summary)
    while True:
        question = input("\nAsk a question about the summary (type 'quit' or 'stop' to exit): ")
        if question.lower() in ['quit', 'stop']:
            print("Next File")
            break
        answer = qa_pipeline(question=question, context=summary)
        print("Question:", question)
        print("Answer:", answer['answer'])


def process_folder(folder_path):
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            process_image(filepath)
        elif filename.lower().endswith('.pdf'):
            process_pdf(filepath)

def main():
    user_input = input("Enter image file path, PDF file path, or folder path: ").strip()
    if os.path.isdir(user_input):
        process_folder(user_input)
    elif user_input.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
        process_image(user_input)
    elif user_input.lower().endswith('.pdf'):
        process_pdf(user_input)
    else:
        print("Invalid input. Please provide a valid image, PDF, or folder path.")

if __name__ == "__main__":
    main()

Enter image file path, PDF file path, or folder path: /content/inputs

Processing image: /content/inputs/WhatsApp Image 2024-10-03 at 11.48.02 AM.jpeg
Extracted Text:
 a Sena Sachiv Shakha/MS-18
Ekikrit Mukhyalaya

Military Secretary's Branch
Integrated HO of MoD (Army)
BHO PO, New Delhi-110011

A/14184/143/MS-184 9.7 Aug 2024

HQ Eastern Comd (MS / Sigs)

3 Corps Op Sig Ret

POSTING OFFICER

Peron verter fam UN iscton i south Sudan (UNMISS), 1671423 Lt Col Madhur Male

Sigs \s posted to 3 Corps Op Sig Regt

 

UN

  

2 On arrival in India, orders for attachment (if any) will be issued Dy SD)
|HQ of MoD (Amy) {or de-briefing On termination of allachment
‘SD Dte/ SD-3 (UN) officer has been granted 30 days PAL for the yr 2023

3 During the period of attachment / leave the officer will be borne on the strength of his
new unit

4 Attention of the officer (s drawn to AO 209/76 (JoiningiJourney pd). AO 02/2016/MS
(instrs for Rendition of CRs) and AO 09/201 1/DGMS (Health Cate System) for co

**Reasoning**:
The previous step successfully concatenated the text from all pages of the PDF. The next step is to integrate the summarization and question-answering logic into the `process_pdf` function, using the concatenated text as the input for the summarizer and the context for the question-answering pipeline.



In [5]:
def process_pdf(pdf_path):
    print(f"\nProcessing PDF: {pdf_path}")
    pages = convert_from_path(pdf_path)
    all_text = ""
    for i, page in enumerate(pages):
        extracted_text = ocr_image_from_pil(page)
        all_text += f"--- Page {i+1} ---\n{extracted_text}\n\n"

    print("Processing all pages for a single summary...")
    summary = summarize_text(all_text)
    print("Document Summary:\n", summary)

    while True:
        question = input("\nAsk a question about the document summary (type 'quit' or 'stop' to exit): ")
        if question.lower() in ['quit', 'stop']:
            print("Finished processing PDF.")
            break
        answer = qa_pipeline(question=question, context=summary)
        print("Question:", question)
        print("Answer:", answer['answer'])

def process_image(image_path):
    print(f"\nProcessing image: {image_path}")
    extracted_text = ocr_image_from_path(image_path)
    print("Extracted Text:\n", extracted_text)
    summary = summarize_text(extracted_text)
    print("Summary:\n", summary)
    while True:
        question = input("\nAsk a question about the summary (type 'quit' or 'stop' to exit): ")
        if question.lower() in ['quit', 'stop']:
            print("Next File")
            break
        answer = qa_pipeline(question=question, context=summary)
        print("Question:", question)
        print("Answer:", answer['answer'])


def process_folder(folder_path):
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            process_image(filepath)
        elif filename.lower().endswith('.pdf'):
            process_pdf(filepath)

def main():
    user_input = input("Enter image file path, PDF file path, or folder path: ").strip()
    if os.path.isdir(user_input):
        process_folder(user_input)
    elif user_input.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
        process_image(user_input)
    elif user_input.lower().endswith('.pdf'):
        process_pdf(user_input)
    else:
        print("Invalid input. Please provide a valid image, PDF, or folder path.")

if __name__ == "__main__":
    main()

Enter image file path, PDF file path, or folder path: /content/inputs

Processing PDF: /content/inputs/wcms_160629.pdf
Processing all pages for a single summary...
Document Summary:
 Nurses are in high demand in developed and developing countries. Western countries have intensified their efforts to recruit foreign nurses. In response to the global nurse shortage, more Asian countries are sending nurses abroad. The World Health Organization estimates that the world needs to increase the number of health workers by more than four million? to achieve the global health goal set by the Millennium Development Goals (MDGs) The demand for nurses is growing as an integral part of the overall health system to provide better health care.

Ask a question about the document summary (type 'quit' or 'stop' to exit): quit
Finished processing PDF.

Processing PDF: /content/inputs/Guidebook.pdf
Processing all pages for a single summary...
Document Summary:
 The Riviera Sapphire is located in Bamonwado S

PDFPageCountError: Unable to get page count.
I/O Error: Couldn't open file '/content/inputs/Sara_Goa_Del_Boarding.pdf': No such file or directory.


## Suggest alternative models

### Subtask:
Suggest faster and better models compared to BART for summarization.


**Reasoning**:
Research and describe alternative models for summarization, explaining their potential benefits over BART and how they could be integrated, without providing code for them.

