In [None]:
!sudo apt install libtesseract-dev
!sudo apt install tesseract-ocr-ita
!pip install pytesseract -q
!pip install PyPDF2 -q
!pip install pdf2image -q
!pip install pymupdf -q
!apt-get install poppler-utils -q
!pip install easyocr -q

In [None]:
import os

import easyocr
import fitz
import numpy as np
import pytesseract
import requests
from pdf2image import convert_from_bytes

In [None]:
def _ocr_pdf_bytes(content, reader=None):
    """Rasterise a PDF given as raw bytes and return the OCR'd text of all pages.

    Args:
        content: The PDF file contents as bytes.
        reader: An ``easyocr.Reader`` to recognise text with, or ``None`` to
            use Tesseract with the Italian language model.

    Returns:
        The recognised text of every page, concatenated in page order.
    """
    page_texts = []
    for img in convert_from_bytes(content):
        if reader is None:
            page_texts.append(pytesseract.image_to_string(img, lang='ita'))
        else:
            # EasyOCR takes a path, bytes or a numpy array, not the PIL image
            # pdf2image yields, so convert it first. With detail=0 it returns
            # a list of strings, which has to be joined into one string.
            lines = reader.readtext(np.array(img), batch_size=50, detail=0)
            page_texts.append("\n".join(lines) + "\n")
    return "".join(page_texts)


def process_pdfs_and_save(urls_or_paths, easy_ocr: bool = False,
                          directory: str = '/content/drive/path_to_folder') -> bool:
    """Extract the text of each PDF and save it as ``ftx-<name>.txt``.

    Each entry is either an ``http...`` URL (downloaded) or a local file path
    (read from disk). The embedded text layer is extracted with PyMuPDF; if
    it is empty the pages are OCR'd instead. Entries that do not end in
    ``.pdf`` or whose output file already exists in ``directory`` are skipped.

    Args:
        urls_or_paths: Iterable of URL or file-path strings.
        easy_ocr: If true, OCR with EasyOCR (Italian, GPU requested);
            otherwise OCR with Tesseract, skipping documents of 50 or more
            pages that have no text layer.
        directory: Existing folder that holds the ``ftx-*.txt`` output files.

    Returns:
        ``True`` once every entry has been processed or skipped.

    Raises:
        requests.HTTPError: If a download responds with an error status.
        OSError: If ``directory`` or a local PDF cannot be read, or an output
            file cannot be written.
    """
    # Output files already present; a set gives O(1) membership checks and
    # lets us record files written during this call as well.
    already_done = {
        name for name in os.listdir(directory)
        if name.startswith('ftx-') and name.endswith('.txt')
    }
    # Build the (expensive) EasyOCR model once, and only when requested.
    reader = easyocr.Reader(['it'], gpu=True) if easy_ocr else None

    with requests.Session() as session:
        for url_or_path in urls_or_paths:
            out_name = "ftx-" + url_or_path.replace('/', '_') + ".txt"
            # Skip entries that were already processed.
            if out_name in already_done:
                continue

            # Skip anything that does not point to a PDF file.
            if not url_or_path.endswith(".pdf"):
                continue

            if url_or_path.startswith("http"):
                response = session.get(url_or_path)
                # Fail on 4xx/5xx here instead of handing an HTML error page
                # to the PDF parser.
                response.raise_for_status()
                content = response.content
            else:
                with open(url_or_path, "rb") as f:
                    content = f.read()

            # Try the embedded text layer first; it is far cheaper than OCR.
            with fitz.open(stream=content, filetype="pdf") as doc:
                num_pages = len(doc)
                text = "".join(page.get_text() for page in doc)

            if not text:
                # Tesseract is not run on documents of 50 or more pages.
                if num_pages >= 50 and not easy_ocr:
                    continue
                text = _ocr_pdf_bytes(content, reader)

            # Explicit UTF-8 so accented characters are written the same way
            # regardless of the platform's default encoding.
            with open(os.path.join(directory, out_name), "w", encoding="utf-8") as f:
                f.write(text)
            already_done.add(out_name)
    return True
