In [None]:
# https://huggingface.co/microsoft/table-transformer-structure-recognition-v1.1-all
# https://huggingface.co/docs/transformers/main/en/model_doc/table-transformer

In [None]:
from transformers import TableTransformerForObjectDetection, DetrImageProcessor
from PIL import Image
# import requests


# The detection model and its image processor are both pulled from the same Hub checkpoint
TABLE_STRUCTURE_CHECKPOINT = "microsoft/table-transformer-structure-recognition-v1.1-all"
processor = DetrImageProcessor.from_pretrained(TABLE_STRUCTURE_CHECKPOINT)
model = TableTransformerForObjectDetection.from_pretrained(TABLE_STRUCTURE_CHECKPOINT)

In [None]:
import torch

# Load the table image and force 3-channel RGB: PNG files can be RGBA or
# palette-based, while the image processor normalizes with 3-channel statistics.
image = Image.open("table.png").convert("RGB")

# Preprocess the image into PyTorch tensors
inputs = processor(images=image, return_tensors="pt")

# Perform inference; gradients are not needed here, so skip autograd bookkeeping
with torch.no_grad():
    outputs = model(**inputs)


In [None]:
# Minimum confidence a detection must exceed to be reported. It is passed to the
# post-processor so there is a single threshold: filtering only in the loop below
# would never see detections the post-processor's own default had already dropped.
CONFIDENCE_THRESHOLD = 0.5

# Convert the raw outputs into boxes scaled to the original image.
# target_sizes takes (height, width); PIL's image.size is (width, height), hence [::-1].
results = processor.post_process_object_detection(
    outputs,
    threshold=CONFIDENCE_THRESHOLD,
    target_sizes=[image.size[::-1]],
)[0]

# Report every detected table-structure element (no cell text is extracted here)
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(coord, 2) for coord in box.tolist()]
    print(f"Detected {model.config.id2label[label.item()]} with confidence {round(score.item(), 3)} at location {box}")


In [None]:
import os
import dotenv
from pdf2image import convert_from_path
from PIL import Image


def convert_pdf_to_images(pdf_path, output_folder="output_images", dpi=200):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to convert; passed unchanged to
            ``convert_from_path``.
        output_folder: Directory that receives the page images. It is created
            (including missing parent directories) when it does not exist.
        dpi: Rendering resolution passed to ``convert_from_path``; higher
            values give larger images.

    Returns:
        The written image paths in page order, named ``page_1.png``,
        ``page_2.png``, ... inside ``output_folder``.
    """
    # exist_ok=True avoids the race between a separate existence check and creation
    os.makedirs(output_folder, exist_ok=True)

    # Convert the PDF to a list of page images
    pages = convert_from_path(pdf_path, dpi=dpi)

    # Save each page; file names are 1-based to match human page numbering
    image_files = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        page.save(image_path, 'PNG')
        image_files.append(image_path)

    return image_files

In [None]:
# Load variables from a .env file (if any) into the process environment
dotenv.load_dotenv()
pdf_path = os.getenv("PDF_PATH")
if not pdf_path:
    # os.getenv returns None when the variable is missing; fail here with an
    # actionable message instead of handing None to the PDF converter.
    raise RuntimeError("PDF_PATH is not set; define it in the environment or in a .env file")
image_files = convert_pdf_to_images(pdf_path)
print("Images saved:", image_files)

In [None]:
import torch

# Environment sanity check: CUDA availability on the first line, PyTorch version on the second
print(torch.cuda.is_available(), torch.__version__, sep="\n")

In [None]:
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from PIL import Image
import os
import dotenv
import torch


dotenv.load_dotenv()

# Load processor and model.
# The default processor configuration runs its own OCR and raises a ValueError
# when the caller also supplies boxes; the "no_ocr" revision disables the built-in
# OCR so the words/boxes defined below can be passed in.
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
# NOTE(review): this is the base checkpoint, so the token-classification head is
# presumably not fine-tuned and its predictions are placeholders — confirm.
model = LayoutLMv2ForTokenClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")

# Load image
image_path = os.getenv("PDF_IMAGE_PATH")
if not image_path:
    # os.getenv returns None when the variable is missing; fail with a clear message
    raise RuntimeError("PDF_IMAGE_PATH is not set; define it in the environment or in a .env file")
# Force 3-channel RGB so PNGs with alpha or palette modes are accepted
image = Image.open(image_path).convert("RGB")
words = ["Hello", "world"]  # List of words recognized in the OCR process (you would use an actual OCR tool here)
# Example bounding boxes for each word.
# NOTE(review): the LayoutLMv2 docs ask for boxes normalized to a 0-1000 scale — confirm for real data.
boxes = [[27, 76, 91, 112], [95, 73, 191, 113]]

# Prepare encoding
encoding = processor(image, words, boxes=boxes, return_tensors="pt")

# Forward pass; gradients are not needed for inference
with torch.no_grad():
    output = model(**encoding)

# Post-process here to extract desired output