# OCR model

### Install required packages

In [16]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


### Import the required libraries and configure Tesseract OCR

In [17]:
import os
import cv2
import pytesseract
import numpy as np
from pdf2image import convert_from_path
import shutil
from IPython.display import display, FileLink
import ipywidgets as widgets
from concurrent.futures import ThreadPoolExecutor, as_completed

# Locate the Tesseract executable. Resolution order:
#   1. the TESSERACT_CMD environment variable, if set;
#   2. the default Windows install location, if that file actually exists;
#   3. whatever `tesseract` resolves to on PATH.
# If none of these is found, pytesseract's own default command is left untouched.
_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if not _tesseract_cmd and os.path.isfile(_WINDOWS_TESSERACT):
    _tesseract_cmd = _WINDOWS_TESSERACT
if not _tesseract_cmd:
    _tesseract_cmd = shutil.which("tesseract")
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

# Upload folder (created if missing; exist_ok avoids a check-then-create race)
UPLOAD_FOLDER = "uploads/"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Output folder (created if missing)
PROJECT_DIR = "output"
os.makedirs(PROJECT_DIR, exist_ok=True)


### Conversion functions

In [18]:
def pdf_to_images(pdf_path):
    """Convert the PDF at `pdf_path` into a list of numpy arrays.

    Each item returned by `convert_from_path` is turned into a
    `numpy.ndarray`; the list order follows the order of that result.
    NOTE(review): pdf2image presumably yields one RGB PIL image per page —
    confirm the channel order before feeding the arrays to OpenCV code.
    """
    return [np.array(page) for page in convert_from_path(pdf_path)]


def ocr_image(image, lang='eng'):
    """Run Tesseract OCR on a 3-channel image and return the recognized text.

    Args:
        image: `numpy.ndarray` with three dimensions (height, width, channels)
            and exactly 3 channels, in BGR order (it is converted to
            grayscale with `cv2.COLOR_BGR2GRAY`).
        lang: Tesseract language code forwarded to
            `pytesseract.image_to_string` (several codes can be joined with
            `+`, e.g. ``'eng+ita'``). Defaults to ``'eng'``, the value that
            was previously hard-coded.

    Returns:
        The text recognized by Tesseract, as a string.

    Raises:
        TypeError: if `image` is not a `numpy.ndarray`.
        ValueError: if `image` is not 3-dimensional or does not have 3 channels.
    """
    # Guard clauses: same checks, order and messages as before, without nesting.
    if not isinstance(image, np.ndarray):
        raise TypeError("Input should be a numpy.ndarray")
    if len(image.shape) != 3:
        raise ValueError("Input image should have 3 dimensions (height, width, channels)")
    if image.shape[2] != 3:
        raise ValueError("Input image should have 3 channels (BGR format)")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray, lang=lang)

# Characters with special meaning in LaTeX and their literal replacements.
# Applied with str.translate so the substitution is single-pass: the braces
# introduced by one replacement are never re-escaped by another.
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def text_to_latex(text, escape=True):
    """Wrap plain text in a minimal LaTeX `article` document.

    Args:
        text: Plain text to embed in the document body.
        escape: When True (default), LaTeX special characters in `text`
            (``\\ & % $ # _ { } ~ ^``) are escaped so they are typeset
            literally instead of breaking compilation (e.g. a raw ``%``
            would comment out the rest of its line). Pass False to insert
            `text` verbatim, e.g. when it already contains LaTeX markup.

    Returns:
        The LaTeX source as a string. Every newline in the body is turned
        into a forced line break (``\\\\`` followed by a space and a newline).
    """
    body = text.translate(_LATEX_ESCAPES) if escape else text
    body = body.replace("\n", "\\\\ \n")
    return "\\documentclass{article}\n\\begin{document}\n" + body + "\n\\end{document}"


### Upload and process PDF files

In [19]:
def _ocr_rgb_page(page):
    """OCR one RGB page, reordering channels to the BGR layout `ocr_image` expects."""
    return ocr_image(cv2.cvtColor(page, cv2.COLOR_RGB2BGR))


def process_pdf(file_path):
    """OCR every page of a PDF and write the combined text to a file.

    The PDF is converted to page images with `pdf_to_images`, each page is
    OCR'd in a thread pool, and the per-page texts are joined with newlines
    in page order and written to ``output.txt`` inside `PROJECT_DIR`.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The path of the written text file
        (``os.path.join(PROJECT_DIR, "output.txt")``).

    Raises:
        Whatever `pdf_to_images` or `ocr_image` raise; an OCR error from any
        page propagates to the caller.
    """
    pages = pdf_to_images(file_path)

    page_texts = []
    if pages:
        # ThreadPoolExecutor rejects max_workers=0, so the pool is only
        # created when there is work. The pool size is capped at the CPU
        # count instead of starting one thread per page.
        worker_count = min(len(pages), os.cpu_count() or 1)
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            # executor.map yields results in input order, so page order is
            # preserved without manual index bookkeeping.
            # pdf_to_images builds its arrays from PIL images (RGB), hence
            # the RGB->BGR conversion done in _ocr_rgb_page.
            page_texts = list(executor.map(_ocr_rgb_page, pages))

    full_text = "\n".join(page_texts)

    # NOTE: LaTeX/zip export (text_to_latex + shutil.make_archive) is
    # currently disabled; only the plain-text output is produced.
    output_path = os.path.join(PROJECT_DIR, "output.txt")
    # Explicit UTF-8: OCR text may contain characters the platform default
    # encoding (e.g. cp1252 on Windows) cannot represent.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(full_text)
    return output_path


### Interface

In [20]:
# Upload widget for the PDF file
file_upload = widgets.FileUpload(accept='.pdf', multiple=False)
display(file_upload)

# Output area that captures everything the button callback prints or displays,
# so messages and the download link are rendered next to the widgets.
output_area = widgets.Output()


def on_button_clicked(b):
    """Save the uploaded PDF, run OCR on it and show a link to the result.

    Args:
        b: The button instance passed by ipywidgets to click handlers (unused).

    Prints "Nessun file caricato!" when `file_upload.value` is not a
    non-empty tuple (this includes the case where nothing has been uploaded).
    """
    with output_area:
        output_area.clear_output()

        uploads = file_upload.value
        # Guard clause: also covers the empty-upload case, which previously
        # fell through without any feedback.
        if not (isinstance(uploads, tuple) and len(uploads) > 0):
            print("Nessun file caricato!")
            return

        file_info = uploads[0]
        # Keep only the final path component so an uploaded name cannot
        # point outside UPLOAD_FOLDER.
        file_name = os.path.basename(file_info['name']) or "upload.pdf"
        file_content = file_info['content']
        upload_path = os.path.join(UPLOAD_FOLDER, file_name)

        # Save the uploaded bytes to disk
        with open(upload_path, 'wb') as f:
            f.write(file_content)

        # Process the PDF and generate the output file
        output_path = process_pdf(upload_path)

        # Show a link to the generated output file
        display(FileLink(output_path))


# Button that triggers the processing of the uploaded file
button = widgets.Button(description="Processa PDF")
button.on_click(on_button_clicked)
display(button)
display(output_area)


FileUpload(value=(), accept='.pdf', description='Upload')

Button(description='Processa PDF', style=ButtonStyle())