In [1]:
!pip install --upgrade datasets



In [2]:
from transformers import (
    VisionEncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,  # Questo è l'import mancante!
    TrOCRProcessor
)
from torchvision import transforms
from PIL import Image
import torch

In [3]:
#ok
import kagglehub

# Download latest version
path = kagglehub.dataset_download("robikscube/textocr-text-extraction-from-images-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/textocr-text-extraction-from-images-dataset


In [4]:
#ok
import json

dataset_path = "/kaggle/input/textocr-text-extraction-from-images-dataset"
train_json = f"{dataset_path}/TextOCR_0.1_train.json"

# Carica il file JSON
with open(train_json) as f:
    data = json.load(f)

# Crea un file JSONL con solo {image_path, text}
output_path = "train_data_trocr.jsonl"
with open(output_path, "w", encoding="utf-8") as f_out:
    for img_id, img in list(data["imgs"].items())[:700]:  # Puoi aumentare il limite
        filename = img['file_name'].replace('train/', '')
        img_path = f"{dataset_path}/train_val_images/train_images/{filename}"

        # Trova gli ID delle annotazioni
        annotation_ids = data["imgToAnns"].get(img_id, [])

        # Recupera le trascrizioni, ignorando bounding box
        texts = []
        for ann_id in annotation_ids:
            ann = data["anns"][str(ann_id)]
            text = ann["utf8_string"].strip()
            if text:  # Esclude stringhe vuote
                texts.append(text)

        if texts:
            full_text = " ".join(texts)
            entry = {"image_path": img_path, "text": full_text}
            f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("✅ File JSONL pronto per il fine-tuning con TrOCR.")
!head -n 5 train_data_trocr.jsonl


✅ File JSONL pronto per il fine-tuning con TrOCR.
{"image_path": "/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images/a4ea732cd3d5948a.jpg", "text": "Performance Sport Watch ...period. . 400 300 15 12 60 65 170 . . . . 100 GTOR® 45 . . 10 20 30 40 50 ."}
{"image_path": "/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images/4bf43a7b2a898044.jpg", "text": "400 Z 7 at nLa A. James LYNCH REAL ESTATE 781.599.1599 INSURANCE 781.598.4700 . . . . 12 . . 23 n's Roast Beef Seafood stbeef.com Life Care Center of the North Shore 111 Binch Street/ Lyna, MA 01902 Her Ea . KNEE SAVER KNEE SAVER ."}
{"image_path": "/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images/1b55b309b0f50d02.jpg", "text": "CAOL ILA DISTILLERY 1996 GLE MALT SCOTCH WHISKY . . ."}
{"image_path": "/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images/00c359f294f7dcd9.jpg", "text": "G-ATCO HUSK

In [5]:
from datasets import load_dataset
import os

# 1. Verifica che il file esista
file_path = "/content/train_data_trocr.jsonl"
assert os.path.exists(file_path), f"File non trovato: {file_path}"

# 2. Carica il dataset correttamente
try:
    # Prova prima con questo metodo (più comune)
    dataset = load_dataset("json", data_files=file_path)["train"]
except Exception as e:
    print(f"Metodo standard fallito: {e}\nProvando alternativa...")
    # Metodo alternativo se il primo non funziona
    from datasets import Dataset
    dataset = Dataset.from_json(file_path)

# 3. Verifica il risultato
print(f"Dataset caricato con {len(dataset)} esempi")
print(dataset[0])  # Mostra il primo esempio

Generating train split: 0 examples [00:00, ? examples/s]

Dataset caricato con 700 esempi
{'image_path': '/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images/a4ea732cd3d5948a.jpg', 'text': 'Performance Sport Watch ...period. . 400 300 15 12 60 65 170 . . . . 100 GTOR® 45 . . 10 20 30 40 50 .'}


In [6]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
from PIL import Image
import torch

# 1. Inizializza il processore e il modello
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")  # <-- AGGIUNTO
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

# 2. Congela il decoder
for param in model.decoder.parameters():
    param.requires_grad = False

# 3. Configurazione del training
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-encoder-only",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    save_steps=500,
    remove_unused_columns=False,
    eval_strategy="no",
)

# 4. Funzione di preprocessing corretta
def preprocess_function(examples):
    # Apre le immagini
    images = [Image.open(img_path).convert("RGB") for img_path in examples["image_path"]]

    # Processa immagini e testo
    pixel_values = processor(images, return_tensors="pt").pixel_values
    labels = processor.tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors="pt"
    ).input_ids

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

# 5. Applica il preprocessing
dataset = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=8,
    remove_columns=["image_path", "text"]  # Rimuove le colonne originali
)

# 6. Crea e avvia il trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.save_model()
processor.save_pretrained("./trocr-encoder-only")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream ta

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

[]

In [9]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Carica processor e modello dal checkpoint fine-tuned
model_path = "./trocr-encoder-only"
processor = TrOCRProcessor.from_pretrained(model_path)
model = VisionEncoderDecoderModel.from_pretrained(model_path)
model.eval()

from google.colab import files
uploaded = files.upload()
filename = next(iter(uploaded))
image = Image.open(filename).convert("RGB")


# Preprocessing e inferenza
pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    generated_ids = model.generate(pixel_values)

# Decodifica del testo
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Riconosciuto:", text)


Saving love.jpg to love (1).jpg
Riconosciuto: I LOVE YOU
