In [1]:
# FOR COLAB ONLY
# from google.colab import drive
# import pandas as pd

# # Mounting file-system from drive
# drive.mount('/content/drive')

# # Function to load files
# def load_file(file_path):
#   with open(file_path, 'r', encoding='utf-8') as f:
#     content = f.read()
#   return content

# # Logging in and authenticating the Hugging Face account
# !pip install -q huggingface_hub
# from huggingface_hub import login
# login(token=load_file('/content/drive/My Drive/work/cimatec/enade_to_edag/data/keys/hf') \
#     .strip())

In [60]:
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import pytesseract
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3Model
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Building the LayoutLMv3 processor with its built-in OCR step enabled
# (OCR language code "por") and choosing the torch device used later on.
processor = LayoutLMv3Processor.from_pretrained(
    "microsoft/layoutlmv3-large",
    apply_ocr=True,
    ocr_lang="por",
)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Function to extract questions from PDF
def extract_questions_layout(pdf_path, dpi=300, visual=-1, skip_first=1, skip_last=2):
    """Rasterize a PDF and run the LayoutLMv3 processor (OCR) on its pages.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF handed to ``convert_from_path``.
    dpi : int, default 300
        Rasterization resolution handed to ``convert_from_path``.
    visual : int, default -1
        1-based position (counted after the skipped pages are removed) of a
        page to preview with its token boxes drawn in red. When that page is
        reached the preview is plotted and the function returns right away,
        so the returned lists stop at that page. ``-1`` never matches a page,
        which disables the preview.
    skip_first : int, default 1
        Number of leading pages to drop before processing.
    skip_last : int, default 2
        Number of trailing pages to drop before processing. The defaults
        select the same pages as the former hard-coded ``pages[1:-2]``.

    Returns
    -------
    tuple
        ``(pages, tokens, boxes)``: the kept page images, one list of token
        strings per processed page and one box tensor per processed page
        (moved to ``device``).
    """
    # Converting PDF pages to PIL images and dropping the unwanted ones
    all_pages = convert_from_path(pdf_path, dpi=dpi)
    # An explicit end index keeps skip_last=0 meaningful and never goes
    # below the start index when the document is very short.
    last = max(len(all_pages) - skip_last, skip_first)
    pages = all_pages[skip_first:last]

    # Running LayoutLMv3 OCR
    tokens = []
    boxes = []
    for i, page in enumerate(pages, start=1):
        # Running the processor (OCR + tokenization) on the page image
        enc = processor(page, return_tensors="pt", truncation=True,
                        max_length=1024).to(device)

        # Extracting tokens and bounding boxes; the 'Ġ' character is stripped
        # from the token strings (presumably the tokenizer's leading-space
        # marker, TODO confirm).
        page_tokens = [token.replace('Ġ', '') for token in enc.tokens()]
        page_boxes = enc.bbox.squeeze(0)

        tokens.append(page_tokens)
        boxes.append(page_boxes)

        # Drawing bounding boxes on the requested page (i starts at 1, so the
        # default visual=-1 can never match).
        if i == visual:
            # Draw on a copy: the original page object is returned to the
            # caller and must stay free of debug rectangles.
            annotated = page.copy()
            orig_w, orig_h = annotated.size
            draw = ImageDraw.Draw(annotated)
            # tolist() yields plain Python numbers (and copies from the GPU
            # if needed) so PIL never receives tensors as coordinates.
            # Boxes are rescaled from a 0-1000 range to pixels.
            for x0, y0, x1, y1 in page_boxes.tolist():
                draw.rectangle([(orig_w*x0/1000, orig_h*y0/1000),
                                (orig_w*x1/1000, orig_h*y1/1000)],
                               outline="red", width=2)

            plt.figure(figsize=(12, 6))
            plt.imshow(annotated)

            # Stop early: the preview is only meant for inspecting one page
            return pages, tokens, boxes

    return pages, tokens, boxes

# Running the extraction on data/enade_2023.pdf; visual=-1 means no page
# preview is plotted.
pages, tokens, boxes = extract_questions_layout(
    pdf_path="data/enade_2023.pdf",
    visual=-1,
)

In [94]:
# Saving a screenshot of each question for later usage on an LLM
import os

OUT_DIR = "data/visual_approach"
PAD_PX = 5        # margin (pixels) kept around every crop
BOX_SCALE = 1000  # box coordinates are divided by this before scaling to pixels


def _crop_question(img, left, right, bottom, top=None):
    """Crop `img` to a region given in box units, padded by PAD_PX.

    `left`, `right`, `bottom` and `top` are plain numbers in box units
    (divided by BOX_SCALE and multiplied by the image size to get pixels).
    When `top` is None the crop starts PAD_PX pixels below the top edge of
    the image, which is what the continuation of a question on the next
    page uses.
    """
    orig_w, orig_h = img.size
    left_px = (left/BOX_SCALE)*orig_w - PAD_PX
    right_px = (right/BOX_SCALE)*orig_w + PAD_PX
    bottom_px = (bottom/BOX_SCALE)*orig_h + PAD_PX
    if top is None:
        top_px = PAD_PX
    else:
        top_px = (top/BOX_SCALE)*orig_h - PAD_PX
    return img.crop((left_px, top_px, right_px, bottom_px))


# Image.save() does not create missing folders
os.makedirs(OUT_DIR, exist_ok=True)

page_overflow = False
prev = {'left': None, "right": None, "discursiva": None}
open_counter = 1
closed_counter = 1
for i, t in enumerate(tokens):
    page_boxes = boxes[i]

    # Getting indices of the question markers on this page
    indices = [j for j, token in enumerate(t) if token == 'QUEST']

    # Saving the continuation of the previous page's last question.
    # Only a single continuation page ("_p2") is handled: the flag is reset
    # below even when this page holds no question marker.
    if page_overflow:
        if indices:
            # The continuation ends where the first question of this page starts
            bot_limit = page_boxes[indices[0], 1]
        else:
            # No question marker here: use the largest top coordinate among
            # the tokens, ignoring the last one.
            # NOTE(review): this reads column 1 (top edge) rather than
            # column 3 (bottom edge) - confirm this is intended.
            bot_limit = torch.max(page_boxes[:-1, 1])

        crop = _crop_question(pages[i], prev["left"].item(),
                              prev["right"].item(), bot_limit.item())
        # Use the state stored with the overflowing question instead of the
        # loop variable left over from the previous page.
        if prev["discursiva"]:
            fname = f"{OUT_DIR}/open_question_{open_counter-1:02d}_p2.png"
        else:
            fname = f"{OUT_DIR}/closed_question_{closed_counter-1:02d}_p2.png"
        crop.save(fname)

        page_overflow = False

    # Going through the questions on this page
    for j, idx in enumerate(indices):
        # Checking if it's closed or open: the third token after the marker
        # is compared with 'DIS'. The length check avoids an IndexError when
        # the marker sits at the very end of the token list.
        discursiva = idx + 3 < len(t) and t[idx+3] == 'DIS'

        # Computing limits for cropping: from the marker's left edge to the
        # right-most token edge on the page (last token ignored).
        left_limit, right_limit = page_boxes[idx, 0], torch.max(page_boxes[:-1, 2])
        top_limit = page_boxes[idx, 1]

        # Two questions on the same page: stop at the next marker
        if j < len(indices)-1:
            bot_limit = page_boxes[indices[j+1], 1]

        # A question that either finishes the page or goes to another page
        else:
            page_overflow = True
            # NOTE(review): the bottom comes from the 10th-from-last token,
            # presumably to skip footer tokens - confirm. The index is
            # clamped so pages with fewer than 10 tokens do not raise.
            bot_limit = page_boxes[max(page_boxes.shape[0] - 10, 0), 3]

        # Saving question
        crop = _crop_question(pages[i], left_limit.item(), right_limit.item(),
                              bot_limit.item(), top=top_limit.item())
        if discursiva:
            fname = f"{OUT_DIR}/open_question_{open_counter:02d}_p1.png"
            open_counter += 1
        else:
            fname = f"{OUT_DIR}/closed_question_{closed_counter:02d}_p1.png"
            closed_counter += 1
        crop.save(fname)

        # Remembering the horizontal limits and the question type so the
        # next page can save the continuation.
        if page_overflow:
            prev.update({"left": left_limit, "right": right_limit,
                         "discursiva": discursiva})

In [95]:
import os
import re

# "Glueing" multiple page questions
# Folder holding the "_p1"/"_p2" crops and the patterns that recognize them
folder = "data/visual_approach"
open_pattern = re.compile(r"open_question_(\d{2})_p[12]\.png$")
closed_pattern = re.compile(r"closed_question_(\d{2})_p[12]\.png$")

# Gathering all part files and grouping them by question type and number
groups = {}
for fname in os.listdir(folder):
    open_m = open_pattern.match(fname)
    closed_m = closed_pattern.match(fname)

    if open_m:
        groups.setdefault(f'open_{open_m.group(1)}', []).append(fname)
    elif closed_m:
        groups.setdefault(f'closed_{closed_m.group(1)}', []).append(fname)

# Producing one "<type>_question_<NN>.png" per question
for key, fnames in groups.items():
    # Sorting puts "_p1" before "_p2" so parts are stacked in reading order
    fnames_sorted = sorted(fnames)
    target = os.path.join(folder, f"{fnames_sorted[0].split('_p')[0]}.png")

    # Single part: just drop the "_pN" suffix. os.replace overwrites an
    # existing target on every platform, so re-running the cell is safe.
    if len(fnames_sorted) < 2:
        os.replace(os.path.join(folder, fnames_sorted[0]), target)
        continue

    imgs = []
    try:
        for f in fnames_sorted:
            imgs.append(Image.open(os.path.join(folder, f)))

        # Computing dimensions for the new canvas
        widths, heights = zip(*(im.size for im in imgs))
        max_width = max(widths)
        total_height = sum(heights)

        # Creating a blank white canvas and pasting each part below the
        # previous one, aligned to the left edge
        combined = Image.new("RGB", (max_width, total_height), (255, 255, 255))
        y_offset = 0
        for im in imgs:
            combined.paste(im, (0, y_offset))
            y_offset += im.height
    finally:
        # Closing the part files before they are deleted: open handles
        # leak and make os.remove fail on Windows.
        for im in imgs:
            im.close()

    # Saving the stacked image
    combined.save(target)

    # Deleting the old files with parts
    for f in fnames_sorted:
        os.remove(os.path.join(folder, f))