In [1]:
from transformers import TrOCRProcessor

# Fetch the TrOCR processor (used later for image preprocessing and for
# decoding generated token ids) from the small printed-text checkpoint.
trocr_processor_checkpoint = "microsoft/trocr-small-printed"
processor = TrOCRProcessor.from_pretrained(trocr_processor_checkpoint)




In [2]:
from transformers import VisionEncoderDecoderModel

# Load the vision encoder-decoder weights from the small printed-text TrOCR
# checkpoint; later cells call model.generate() on preprocessed images.
trocr_model_checkpoint = "microsoft/trocr-small-printed"
model = VisionEncoderDecoderModel.from_pretrained(trocr_model_checkpoint)

config.json:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
import torch

# Report what PyTorch can see. current_device() and get_device_name() raise
# when no CUDA device is usable, so they are only queried when CUDA is
# available; the rest of the notebook already falls back to the CPU.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
print("Number of GPUs Available:", torch.cuda.device_count())
if cuda_available:
    current_device = torch.cuda.current_device()
    print("Current Device:", current_device)
    print("Device Name:", torch.cuda.get_device_name(current_device))
else:
    print("Current Device: none (CUDA unavailable, falling back to CPU)")

CUDA Available: True
Number of GPUs Available: 1
Current Device: 0
Device Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM

# Locations of the per-row image folders, the input table, and the output table.
images_dir = '../../data/preprocessed'
data_csv_path = '../../data/data-cross-section.csv'
output_csv_path = '../../data/data-extracted.csv'

# Read the table that the OCR results will be attached to.
df = pd.read_csv(data_csv_path)

# Use the GPU when PyTorch can see one, otherwise the CPU, and move the model
# (loaded in an earlier cell) onto that device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

def extract_text_from_image(image_path):
    """Run OCR on a single image file and return the decoded text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The first decoded string produced by ``processor.batch_decode`` for
        the token ids generated by ``model``, with special tokens removed.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.
    """
    # Open the file in a context manager so its handle is closed as soon as the
    # RGB copy exists; this function is called from several worker threads and
    # a bare Image.open() would keep handles open until garbage collection.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Preprocess the image into model inputs on the selected device.
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)

    # A single image was passed in, so take the first decoded string.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Serializes the read-modify-write on `df` in process_image. That function is
# submitted to a thread pool and several images can belong to the same row, so
# an unguarded read-then-write could overwrite another worker's text.
_df_lock = threading.Lock()


def process_image(index, folder_path, image_name):
    """OCR one image and append its text to row ``index`` of ``df``.

    Args:
        index: Label of the ``df`` row whose ``extracted_text`` cell is updated.
        folder_path: Directory that contains the image.
        image_name: File name of the image inside ``folder_path``.

    Returns:
        None. The notebook-level ``df`` is updated in place: the new text is
        appended to any existing text with a single space, or stored as-is
        when the cell holds no text yet. Nothing is written when OCR yields
        an empty string.
    """
    image_path = os.path.join(folder_path, image_name)
    # OCR is the slow step; keep it outside the lock so workers can overlap.
    extracted_text = extract_text_from_image(image_path)
    if not extracted_text:
        # Appending an empty result would only add a trailing space.
        return

    with _df_lock:
        current_text = df.at[index, 'extracted_text']
        # Only a non-empty string counts as existing text. A missing value
        # (NaN, e.g. an empty cell read back from CSV) is truthy and would
        # otherwise be concatenated as the literal text "nan".
        if isinstance(current_text, str) and current_text:
            df.at[index, 'extracted_text'] = f"{current_text} {extracted_text}"
        else:
            df.at[index, 'extracted_text'] = extracted_text

# Make sure the target column exists and holds only strings. A column that was
# read back from CSV stores empty cells as NaN, which is truthy and would end
# up in the output as the literal text "nan".
if 'extracted_text' not in df.columns:
    df['extracted_text'] = ''
else:
    df['extracted_text'] = df['extracted_text'].astype(object).fillna('').astype(str)

# OCR the images on a small thread pool. Row `index` keeps its images in the
# folder images_dir/<index>; rows without such a folder are left untouched.
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = []
    for index in df.index:
        folder_path = os.path.join(images_dir, str(index))
        if not os.path.isdir(folder_path):
            continue
        # os.listdir returns entries in arbitrary order, so sort them to submit
        # work in a reproducible order, and skip anything that is not a file.
        # Workers may still finish in a different order than they were queued.
        for image_name in sorted(os.listdir(folder_path)):
            if os.path.isfile(os.path.join(folder_path, image_name)):
                futures.append(executor.submit(process_image, index, folder_path, image_name))

    # result() re-raises any exception raised inside a worker, so a failed
    # image surfaces here instead of being silently dropped.
    for future in as_completed(futures):
        future.result()

# Save the updated DataFrame to a new CSV file
df.to_csv(output_csv_path, index=False)

print(f"Updated data saved to {output_csv_path}")

df



Updated data saved to ../../data/data-extracted.csv


Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,NATURE PROP@S' DESODORISER LE LINGE. QUI PERME...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,MCIOUS/ CONVENTION WITH ASYDU LIKE V DESIGNED ...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION 50 MG 25 mg SERVING SIZE: 1 TABLET...
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,"THIS PRODUCT IS YOU ARE PREGNANT, RURSING, BRI..."
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,HORBEACH HIGH STRENGTH CAPSULES HUSK PSYLLIUM ...
...,...,...,...,...,...
94,https://m.media-amazon.com/images/I/61Dq3LRei9...,523149,item_weight,10.0 kilogram,0.00 FOR RINETT SILICONE RINGS
95,https://m.media-amazon.com/images/I/71XK5d3Oh9...,416664,wattage,49.0 watt,
96,https://m.media-amazon.com/images/I/61kyBEJYDe...,459516,item_weight,500 milligram,TERTAL MAX GREEN DISCOVER WELLNESS COTEL 60 BU...
97,https://m.media-amazon.com/images/I/71uQmsTESv...,459516,item_weight,500 milligram,TOTAL MAX HEALTHY BENEFITS OF GREEN COFFEE SYS...
