<a href="https://colab.research.google.com/github/Mazafard/BD-Landing/blob/main/TrainOcr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## initial

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests

# load image from the IAM database
url = 'https://aesas.pt/wp-content/uploads/2022/05/Divulgacao-dos-cursos1-scaled.jpg'
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

In [None]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')
pixel_values = processor(images=image, return_tensors="pt").pixel_values

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [None]:
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
print(generated_text)

***


In [None]:
!pip install faker Pillow


Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [None]:
!mkdir -p noto

# دانلود فونت NotoSerif از گوگل فونت
!wget https://www.1001fonts.com/download/font/noto-serif.regular.ttf -O noto/NotoSerif-Regular.ttf

--2025-04-21 23:48:47--  https://www.1001fonts.com/download/font/noto-serif.regular.ttf
Resolving www.1001fonts.com (www.1001fonts.com)... 54.39.177.155
Connecting to www.1001fonts.com (www.1001fonts.com)|54.39.177.155|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 374460 (366K) [application/font-sfnt]
Saving to: ‘noto/NotoSerif-Regular.ttf’


2025-04-21 23:48:49 (396 KB/s) - ‘noto/NotoSerif-Regular.ttf’ saved [374460/374460]



### Make the dataset

In [None]:
import requests
import random
import os
from PIL import Image, ImageDraw, ImageFont

# آدرس کتاب
url = "https://gutenberg.org/cache/epub/3333/pg3333.txt"

# دانلود متن
response = requests.get(url)
raw_text = response.text


# ط بین 'Luís Vaz de Camões' و 'END' نگه داشته شود
start_name = "Luís Vaz de Camões"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK OS LUSÍADAS ***"

start_idx = raw_text.find(start_name)
end_idx = raw_text.find(end_marker)

if start_idx != -1 and end_idx != -1:
    cleaned_text = raw_text[start_idx:end_idx].strip()
else:
    raise ValueError("Start or end markers not found in the text.")


# جدا کردن به خطوط و فیلتر کردن خط‌های خالی
lines = [line.strip() for line in cleaned_text.splitlines() if line.strip()]

# فقط خط‌هایی که خیلی کوتاه نیستن
valid_lines = [line for line in lines if len(line) > 30 and len(line)< 46]

# پوشه برای ذخیره تصاویر
os.makedirs("ocr_dataset_full/images", exist_ok=True)
os.makedirs("ocr_dataset_full/labels", exist_ok=True)

# فونت – اگه فونت پرتغالی خاصی داری جایگزین کن
font_path = "/content/noto/NotoSerif-Regular.ttf"
font = ImageFont.truetype(font_path, size=40)

# ساخت دیتاست تصویری ساده از خطوط شعر
for i in range(len(valid_lines)):  # تعداد تصاویری که می‌خوای بسازی
    text = random.choice(valid_lines)

    # ساخت تصویر
    image = Image.new("RGB", (1200, 100), color="white")
    draw = ImageDraw.Draw(image)
    draw.text((10, 10), text, fill="black", font=font)

    # ذخیره تصویر و برچسب
    image_path = f"ocr_dataset_full/images/sample_{i}.png"
    label_path = f"ocr_dataset_full/labels/sample_{i}.txt"
    image.save(image_path)
    with open(label_path, "w", encoding="utf-8") as f:
        f.write(text)


## make HDF5


In [None]:
!pip install h5py numpy



In [None]:
import h5py
import numpy as np
from PIL import Image
import os


data_dir = "ocr_dataset_full"
image_dir = os.path.join(data_dir, "images")
label_dir = os.path.join(data_dir, "labels")

images = []
texts = []


for filename in os.listdir(image_dir):
  if filename.endswith(".png"):
    image_path = os.path.join(image_dir, filename)
    img = Image.open(image_path)
    img = np.array(img)
    images.append(img)

    label_path = os.path.join(label_dir, filename.replace(".png", ".txt"))

    with open(label_path, encoding="utf-8") as f:
      text = f.read().strip()

    texts.append(text)



with h5py.File('dataset_full.h5', 'w') as f:
    f.create_dataset('images', data=images)
    dt = h5py.special_dtype(vlen=str)
    f.create_dataset('texts', data=texts, dtype=dt)



In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import login
from google.colab import userdata

api_token = userdata.get('HUGGINGFACE_API_TOKEN')

login(api_token)

from huggingface_hub import create_repo, upload_file

repo_name = "mazafard/portugues_ocr_dataset_full"
create_repo(repo_id=repo_name, repo_type="dataset", exist_ok=True)

hdf5_file_path = "dataset_full.h5"

upload_file(
    path_or_fileobj=hdf5_file_path,
    path_in_repo="dataset_full.h5",
    repo_id=repo_name,
    repo_type="dataset"
)

dataset_full.h5:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mazafard/portugues_ocr_dataset_full/commit/83b89c0da2506d99f8769e3b26c786db284ddfc3', commit_message='Upload dataset_full.h5 with huggingface_hub', commit_description='', oid='83b89c0da2506d99f8769e3b26c786db284ddfc3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mazafard/portugues_ocr_dataset_full', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mazafard/portugues_ocr_dataset_full'), pr_revision=None, pr_num=None)

In [None]:
with h5py.File("dataset_full.h5", "r") as f:
    print("Keys:", list(f.keys()))  # ['images', 'texts']
    print("Number of samples:", len(f["texts"]))
    print("Image shape:", f["images"].shape)
    print("Example text:", f["texts"][200])

Keys: ['images', 'texts']
Number of samples: 8174
Image shape: (8174, 100, 1200, 3)
Example text: b'E de Helicona as Musas fez passar-se'


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
from datasets import Dataset
import h5py
import pandas as pd

# Load HDF5 data
with h5py.File("dataset_full.h5", "r") as f:
    images = f["images"][:]
    texts = [t.decode("utf-8") if isinstance(t, bytes) else t for t in f["texts"][:]]

# Make a DataFrame
df = pd.DataFrame({
    "text": texts,
    # optionally: encode images or use image paths if saved separately
})

# Create Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df)

# Convert to parquet and push to hub
hf_dataset.to_parquet("ocr_dataset_full.parquet")


Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

326629

In [None]:
upload_file(
    path_or_fileobj='ocr_dataset_full.parquet',
    path_in_repo="ocr_dataset_full.parquet",
    repo_id=repo_name,
    repo_type="dataset"
)

ocr_dataset_full.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mazafard/portugues_ocr_dataset_full/commit/745286d3b0f96c9bd581e760823276e749b69f1e', commit_message='Upload ocr_dataset_full.parquet with huggingface_hub', commit_description='', oid='745286d3b0f96c9bd581e760823276e749b69f1e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mazafard/portugues_ocr_dataset_full', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mazafard/portugues_ocr_dataset_full'), pr_revision=None, pr_num=None)

In [None]:
!pip install transformers datasets torch torchvision




In [None]:
from datasets import Dataset
import os

data_dir = "ocr_dataset"
image_dir = os.path.join(data_dir, "images")
label_dir = os.path.join(data_dir, "labels")

data = []

for filename in os.listdir(image_dir):
    if filename.endswith(".png"):
        image_path = os.path.join(image_dir, filename)
        label_path = os.path.join(label_dir, filename.replace(".png", ".txt"))
        with open(label_path, encoding="utf-8") as f:
            text = f.read().strip()
        data.append({"image_path": image_path, "text": text})

dataset = Dataset.from_list(data)


In [None]:
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")


In [None]:
from PIL import Image

def preprocess_batch(batch):
    # Access image_path and text from the batch using batch.get
    # Check if image_path is in the batch and is not empty
    if "image_path" in batch and batch["image_path"]:
        images = [Image.open(path).convert("RGB") for path in batch["image_path"]]
        pixel_values = processor(images=images, return_tensors="pt", padding=True).pixel_values
        labels = processor.tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).input_ids

        batch["pixel_values"] = pixel_values
        batch["labels"] = labels
    return batch

# This line was causing the issue
# remove_columns = ["image_path", "text"]

# No need to remove columns here, the map function will replace the columns as needed
dataset = dataset.map(
    preprocess_batch,
    batched=True,
    batch_size=100,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [None]:
from transformers import TrainerCallback
from PIL import Image, ImageDraw
import torch
import os

class PredictionLoggerCallback(TrainerCallback):
    def __init__(self, dataset, processor, output_dir="predictions"):
        self.dataset = dataset
        self.processor = processor
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        model.eval()
        sample = self.dataset.select(range(3))  # انتخاب ۳ نمونه اول

        # Preprocess the sample to ensure it has the necessary format for the model
        sample = sample.map(
            preprocess_batch, # Use the same preprocess_batch function
            batched=True,
            batch_size=3, # Process all 3 samples at once
        )

        for idx, example in enumerate(sample):
            with torch.no_grad():
                # pixel_values should be already a tensor at this point
                # Access the first element of the list, which should be the tensor
                # The original line: pixel_values = example["pixel_values"][0].to(model.device)
                # example["pixel_values"] is a list of tensors, one for each image in the batch.
                # Select the first tensor from this list and move it to the device.
                # The fix: access the correct tensor in the list using [idx] not [0]
                pixel_values = example["pixel_values"][idx].to(model.device)
                # Add batch dimension since model.generate expects a batch
                generated_ids = model.generate(pixel_values.unsqueeze(0), max_length=128)
                pred_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Decode labels, ensure 'labels' key exists
            true_text = self.processor.decode(example.get("labels", []), skip_special_tokens=True)

            # Since the dataset has been preprocessed, image_path is removed
            # Save predictions without images or just use a placeholder
            #  image = Image.new("RGB", (500, 100), color="white") # create a placeholder image
            #  draw = ImageDraw.Draw(image)
            #  draw.text((10, 10), f"PRED: {pred_text}", fill="blue")
            #  draw.text((10, 60), f"TRUE: {true_text}", fill="green")
            #  image.save(os.path.join(self.output_dir, f"epoch_{state.epoch}_sample_{idx}.png"))

            # Or, just print the predictions:
            print(f"Sample {idx}:")
            print(f"PRED: {pred_text}")
            print(f"TRUE: {true_text}")
            print("-" * 20) # separator

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    fp16=False,
    save_total_limit=2,
    remove_unused_columns=False, # Add this line
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=processor.tokenizer,
    train_dataset=dataset,
    callbacks=[PredictionLoggerCallback(dataset, processor)]

)

  trainer = Trainer(


In [None]:
trainer.train()


Exception in thread Thread-28 (_loader_worker):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch_xla/distributed/parallel_loader.py", line 165, in _loader_worker
    _, data = next(data_iter)
              ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/accelerate/data_loader.py", line 566, in __iter__
    current_batch = next(dataloader_iter)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 708, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 764, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

AttributeError: 'list' object has no attribute 'to'

In [None]:
from datasets import Dataset
import os

data_dir = "ocr_dataset"
image_dir = os.path.join(data_dir, "images")
label_dir = os.path.join(data_dir, "labels")

data = []

for filename in os.listdir(image_dir):
    if filename.endswith(".png"):
        image_path = os.path.join(image_dir, filename)
        label_path = os.path.join(label_dir, filename.replace(".png", ".txt"))
        with open(label_path, encoding="utf-8") as f:
            text = f.read().strip()
        data.append({"image_path": image_path, "text": text})

dataset = Dataset.from_list(data)

from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")



from PIL import Image

def preprocess_batch(batch):
    # Ensure the batch contains both image_path and text
    if "image_path" in batch and batch["image_path"]:
        # Open images from file paths
        images = [Image.open(path).convert("RGB") for path in batch["image_path"]]
        # Use processor to convert images to pixel_values
        pixel_values = processor(images=images, return_tensors="pt", padding=True).pixel_values
        # Tokenize the texts and create labels
        labels = processor.tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).input_ids

        # Add processed pixel_values and labels to batch
        batch["pixel_values"] = pixel_values
        batch["labels"] = labels
    return batch

# Process the dataset
dataset = dataset.map(
    preprocess_batch,
    batched=True,
    batch_size=1000,
)


from transformers import VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed", use_fast=True)
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:




# Callback to log predictions after each epoch
from transformers import TrainerCallback
import torch
import os

class PredictionLoggerCallback(TrainerCallback):
    def __init__(self, dataset, processor, output_dir="predictions"):
        self.dataset = dataset
        self.processor = processor
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        model.eval()
        sample = self.dataset.select(range(3))  # Select first 3 samples

        # Preprocess the sample
        sample = sample.map(
            preprocess_batch,
            batched=True,
            batch_size=3,
        )

        for idx, example in enumerate(sample):
            with torch.no_grad():
                # Access the pixel_values tensor and move it to the device
                pixel_values = example["pixel_values"][idx]  # فقط تانسور را می‌گیریم
                generated_ids = model.generate(pixel_values.unsqueeze(0), max_length=128)
                pred_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Decode true labels
            true_text = self.processor.decode(example.get("labels", []), skip_special_tokens=True)

            # Print or log predictions
            print(f"Sample {idx}:")
            print(f"PRED: {pred_text}")
            print(f"TRUE: {true_text}")
            print("-" * 20)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    fp16=False,
    save_total_limit=2,
    remove_unused_columns=False,  # Ensure that unused columns are not included
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=processor,
    train_dataset=dataset,
)

trainer.train()




NameError: name 'model' is not defined

In [None]:
!pip install torch torchvision torchaudio torch_xla




In [None]:
import torch
import torch_xla.core.xla_model as xm

# انتخاب دستگاه: اگر TPU در دسترس باشد، آن را استفاده می‌کنیم
device = xm.xla_device()  # استفاده از TPU

# انتقال مدل به TPU
model = model.to(device)


NameError: name 'model' is not defined

In [None]:
model.save_pretrained("./trocr-finetuned")
processor.save_pretrained("./trocr-finetuned")


In [None]:
image = Image.open("ocr_dataset/images/sample_123.png").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(predicted_text)


In [None]:
# 1. نصب کتابخانه‌های مورد نیاز
!pip install torch torchvision torchaudio torch_xla transformers accelerate




In [None]:

# 2. وارد کردن کتابخانه‌ها
import torch
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoProcessor
from accelerate import Accelerator
import torch_xla.core.xla_model as xm
import os

from datasets import Dataset
import os

data_dir = "ocr_dataset"
image_dir = os.path.join(data_dir, "images")
label_dir = os.path.join(data_dir, "labels")

data = []

for filename in os.listdir(image_dir):
    if filename.endswith(".png"):
        image_path = os.path.join(image_dir, filename)
        label_path = os.path.join(label_dir, filename.replace(".png", ".txt"))
        with open(label_path, encoding="utf-8") as f:
            text = f.read().strip()
        data.append({"image_path": image_path, "text": text})

dataset = Dataset.from_list(data)

# 3. انتخاب دستگاه: اگر TPU در دسترس باشد، از آن استفاده می‌کنیم
device = xm.xla_device()  # انتخاب TPU

# 4. بارگذاری مدل و توکنایزر
model_name = "microsoft/trocr-base-printed"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

# انتقال مدل به TPU
model = model.to(device)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": 

In [None]:
def preprocess_batch(batch):
    # پردازش تصاویر
    images = [processor(image_path) for image_path in batch['image_path']]
    pixel_values = processor(images=images, return_tensors="pt").pixel_values

    # بررسی وجود 'text' در داده‌ها
    if 'text' in batch:
        # توکنایز کردن متن‌ها
        labels = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128, return_tensors="pt").input_ids
    else:
        raise ValueError("'text' column is missing in the batch data.")

    # اضافه کردن pixel_values و labels به batch
    batch['pixel_values'] = pixel_values
    batch['labels'] = labels
    return batch

print(dataset[:2])  # لیست نام ستون‌ها



{'image_path': ['ocr_dataset/images/sample_7856.png', 'ocr_dataset/images/sample_9670.png'], 'text': ['Serão dadas na terra leis melhores.', 'Quem o gerou, vingança já lhe ordena:']}


In [None]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset


dataset = Dataset.from_dict(dataset)

train_dataset, test_dataset = dataset['train'].train_test_split(test_size=0.2).values()


# ایجاد DataLoader برای داده‌های آموزش
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# ایجاد DataLoader برای داده‌های تست
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


AttributeError: 'Dataset' object has no attribute 'items'

In [None]:

print(len(dataset))
iterator = iter(train_dataloader)

first_batch = next(iterator)
second_batch = next(iterator)

print("First batch:", first_batch)
print("Second batch:", second_batch)



# 7. تنظیمات و آموزش مدل با استفاده از `Accelerator` (برای ساده‌سازی مدیریت دستگاه‌ها)
accelerator = Accelerator()

# آماده‌سازی مدل، optimizer و DataLoader برای استفاده با TPU
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)


10000
First batch: {'image_path': ['ocr_dataset/images/sample_9350.png', 'ocr_dataset/images/sample_2149.png', 'ocr_dataset/images/sample_5609.png', 'ocr_dataset/images/sample_5043.png', 'ocr_dataset/images/sample_7244.png', 'ocr_dataset/images/sample_2986.png', 'ocr_dataset/images/sample_4165.png', 'ocr_dataset/images/sample_7228.png', 'ocr_dataset/images/sample_9803.png', 'ocr_dataset/images/sample_2559.png'], 'text': ['Não sofre amores, nem delicadeza;', 'Vede, Ninfas, que engenhos de senhores', '(Já Cristo neste tempo lhe ordenava', 'E a maneira do trajo diferente.', 'Mostra a Fortuna injusta seus poderes.', 'E o Ganges, que no céu terreno mora.', 'Quando o mar descobrindo lhe mostrava', 'Que divide Asia de Africa; e as milhores', 'additions or deletions to any Project Gutenberg™ work, and (c) any', '--"Se pretendes, Rei alto, de vingar-te']}
Second batch: {'image_path': ['ocr_dataset/images/sample_5258.png', 'ocr_dataset/images/sample_1610.png', 'ocr_dataset/images/sample_1568.png

In [None]:

# 8. حلقه آموزش
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch_idx, batch in enumerate(train_dataloader):
        print(batch)
        break
        # انتقال داده‌ها به TPU
        inputs = batch['pixel_values'].to(accelerator.device)
        labels = batch['labels'].to(accelerator.device)

        # آموزش مدل
        optimizer.zero_grad()

        # انجام پیش‌بینی
        outputs = model(inputs, labels=labels)
        loss = outputs.loss

        # عقب‌گرد و به‌روزرسانی وزن‌ها
        accelerator.backward(loss)
        optimizer.step()

        # نمایش خطا در هر 100 گام
        if accelerator.is_local_main_process and (batch_idx % 100 == 0):
            print(f"Epoch {epoch+1}/{num_epochs}, Step {batch_idx}, Loss: {loss.item()}")

# 9. ذخیره مدل
if accelerator.is_local_main_process:
    model.save_pretrained("path_to_save_model")
    tokenizer.save_pretrained("path_to_save_tokenizer")

## install


In [None]:
!pip install h5py requests datasets torch torchvision torchaudio transformers accelerate


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

## 1- load model


In [None]:
!pip uninstall -y tensorflow
!pip install tensorflow-cpu


Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow-cpu)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow-cpu)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading tensorflow_cpu-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (251.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.8/251.8 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m
[?25hDow

In [None]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))



True
NVIDIA A100-SXM4-40GB


In [None]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor, AutoTokenizer, AutoProcessor # Import TrOCRProcessor here
import torch


device = torch.device("cuda")  # Get your CUDA device


model_name = "microsoft/trocr-base-printed"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = TrOCRProcessor.from_pretrained(model_name)

# Access the internal image processor
image_processor = processor.image_processor

# Access the internal tokenizer
text_processor = processor.tokenizer


# Set decoder_start_token_id and other configurations AFTER model and processor loading
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size


model = model.to(device)



Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

## 2- load dataset


In [None]:
!pip install datasets requests



In [None]:
import torch
from transformers import VisionEncoderDecoderModel,TrOCRProcessor,AutoTokenizer,AutoProcessor
from datasets import Dataset
import requests
import h5py
from PIL import Image
import io
import os




url = "https://huggingface.co/datasets/mazafard/portugues_ocr_dataset_full/resolve/main/dataset_full.h5"
file_path = "dataset_full.h5"

if not os.path.exists(file_path):
    print(f"Downloading {file_path}...")
    response = requests.get(url)
    with open(file_path, "wb") as f:
        f.write(response.content)
    print(f"{file_path} downloaded successfully.")
else:
    print(f"{file_path} already exists. Skipping download.")

with h5py.File(file_path, "r") as f:
  images = f["images"][:]
  texts = f["texts"][:]



data = []

for i in range(len(images)):
        image = Image.fromarray(images[i])
        image_byte_arr = io.BytesIO()
        image.save(image_byte_arr, format='PNG')
        image_byte_arr = image_byte_arr.getvalue()
        text = texts[i].decode("utf-8")
        data.append({"image": image_byte_arr, "text": text})

dataset = Dataset.from_list(data)

print(dataset[0])


dataset_full.h5 already exists. Skipping download.
{'image': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04\xb0\x00\x00\x00d\x08\x02\x00\x00\x00\x19\xa3^\xb2\x00\x00$[IDATx\x9c\xed\xddiT\x14W\xda\x07\xf0\xdb\xd0\xd0\xec(\x02\x02\xe2\x02\xc1\x05\x02\x88\xa2\xe2\x16\xc5\r\x10EDMT\x12\xc7\x88J$j\xa2\x98e\x9c\x981\x13\x9d\x98IF#\x99\x18=9\x1aMB\x8c\x8c(\x01E\x83q\x03\xe3\x1a\x83K4c\x14P@\x02\x82(\xb4B\xb34\xf4\xfb\xa1\xce{\xcf\xb5\x97\xa2\xa9^M\xff\x7f\x1f<E\xd7\xad[O\x97\xb7\xaa\xeb\xa9\xbauK\xa4P(\x08\x00\x00\x00\x00\x00\x00X\x1e+S\x07\x00\x00\x00\x00\x00\x00\x00\xa6\x81\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x08\x01\x00\x00\x00\x00\x00,\x14\x12B\x00\x00\x00\x00\x00\x00\x0b\x85\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x08\x01\x00\x00\x00\x00\x00,\x14\x12B\x00\x00\x00\x00\x00\x00\x0b\x85\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x08\x01\x00\x00\x00\x00\x00,\x14\x12B\x00\x00\x00\x00\x00\x00\x0b\x85\x84\x10\x00\x00\

## 3- preprocess data

In [None]:
from PIL import Image

def preprocess_batch(examples):
    images = [Image.open(io.BytesIO(b)).convert("RGB") for b in examples["image"]]
    # Process images separately
    pixel_values = image_processor(images=images, return_tensors="pt",device=device).pixel_values
    pixel_values = pixel_values.to(device) # Move to device

    texts = examples["text"]
    if not isinstance(texts[0], str):
        texts = [t.decode("utf-8") if isinstance(t, bytes) else str(t) for t in texts]

    # Process texts separately
    labels = text_processor(text=texts,
                      padding="max_length",
                      truncation=True,
                      max_length=128,
                      return_tensors="pt").input_ids
    labels = labels.to(device) # Move to device

    examples["pixel_values"] = [pixel_value for pixel_value in pixel_values]
    examples["labels"] = labels

    del examples["image"]

    return examples

## 4- apply preprocess


In [None]:
print(dataset[:2])
dataset = dataset.map(preprocess_batch, batched=True, batch_size=1000)


{'image': [b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04\xb0\x00\x00\x00d\x08\x02\x00\x00\x00\x19\xa3^\xb2\x00\x00$[IDATx\x9c\xed\xddiT\x14W\xda\x07\xf0\xdb\xd0\xd0\xec(\x02\x02\xe2\x02\xc1\x05\x02\x88\xa2\xe2\x16\xc5\r\x10EDMT\x12\xc7\x88J$j\xa2\x98e\x9c\x981\x13\x9d\x98IF#\x99\x18=9\x1aMB\x8c\x8c(\x01E\x83q\x03\xe3\x1a\x83K4c\x14P@\x02\x82(\xb4B\xb34\xf4\xfb\xa1\xce{\xcf\xb5\x97\xa2\xa9^M\xff\x7f\x1f<E\xd7\xad[O\x97\xb7\xaa\xeb\xa9\xbauK\xa4P(\x08\x00\x00\x00\x00\x00\x00X\x1e+S\x07\x00\x00\x00\x00\x00\x00\x00\xa6\x81\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x08\x01\x00\x00\x00\x00\x00,\x14\x12B\x00\x00\x00\x00\x00\x00\x0b\x85\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x08\x01\x00\x00\x00\x00\x00,\x14\x12B\x00\x00\x00\x00\x00\x00\x0b\x85\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x08\x01\x00\x00\x00\x00\x00,\x14\x12B\x00\x00\x00\x00\x00\x00\x0b\x85\x84\x10\x00\x00\x00\x00\x00\xc0B!!\x04\x00\x00\x00\x00\x00\xb0PH\x

Map:   0%|          | 0/8174 [00:00<?, ? examples/s]

  return self.preprocess(images, **kwargs)


In [None]:
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2).values()

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

print(train_dataset[0])

Training set size: 6539
Test set size: 1635
{'text': 'E o louvor altos casos persuade.', 'pixel_values': [[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.

##don't need


In [None]:
# Callback to log predictions after each epoch
from transformers import TrainerCallback
import torch
import os

class PredictionLoggerCallback(TrainerCallback):
    def __init__(self, dataset, processor, output_dir="predictions"):
        self.dataset = dataset
        self.processor = processor
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        model.eval()
        sample = self.dataset.select(range(3))  # Select first 3 samples

        # Preprocess the sample
        sample = sample.map(
            preprocess_batch,
            batched=True,
            batch_size=3,
        )

        for idx, example in enumerate(sample):
            with torch.no_grad():
                # Access the pixel_values tensor and move it to the device
                pixel_values = example["pixel_values"][idx]
                generated_ids = model.generate(pixel_values.unsqueeze(0), max_length=128)
                pred_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Decode true labels
            true_text = self.processor.decode(example.get("labels", []), skip_special_tokens=True)

            # Print or log predictions
            print(f"Sample {idx}:")
            print(f"PRED: {pred_text}")
            print(f"TRUE: {true_text}")
            print("-" * 20)




## upload

In [None]:
import os
from huggingface_hub import login, create_repo, upload_file

def upload_model_to_hub(model_path):
    """
    Uploads a model to the Hugging Face Hub.

    Args:
        model_path (str): The local path to the saved model directory.

    """
    huggingface_token = userdata.get('HUGGINGFACE_API_TOKEN')

    repo_id=f"mazafard/{model_path}"

    # Login to Hugging Face Hub
    login(token=huggingface_token)

    # Create a repo on Hugging Face Hub (if it doesn't exist)
    create_repo(repo_id=repo_id, exist_ok=True)

    # Upload the model files
    for filename in os.listdir(model_path):
        upload_file(
            path_or_fileobj=os.path.join(model_path, filename),
            path_in_repo=filename,
            repo_id=repo_id,
        )
        print(f"Uploaded {filename}")

    print(f"Model uploaded to Hugging Face Hub: {repo_id}")

## train

In [None]:
from transformers import TrainingArguments, Trainer
import datetime

training_args = TrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=56,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=50,
    learning_rate=5e-5,
    gradient_accumulation_steps=2,
    fp16=True,
    save_total_limit=2,
    remove_unused_columns=False,
     dataloader_num_workers=2,
)


trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=processor,
    train_dataset=train_dataset,
)

training_completed = True
try:
    trainer.train()
except KeyboardInterrupt:
    print("Training interrupted.")
    training_completed = False
except Exception as e:  # Catch other exceptions
    print(f"An error occurred during training: {e}")
    training_completed = False
finally:
    current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    status_suffix = "" if training_completed else "_interrupted"  #
    model_name = f"trocr-finetuned_{current_datetime}{status_suffix}"  # Include status
    model_path = f"./{model_name}"
    trainer.save_model(model_path)
    print(f"Model saved to {model_path}")
    # upload_model_to_hub(model_name)
# Call the function and check the return value
#training_status = train_and_save_model(trainer)


Step,Training Loss
50,1.3412
100,0.1204
150,0.0573


Model saved to ./trocr-finetuned_20250422_125947


In [None]:
 upload_model_to_hub(model_name)

Uploaded tokenizer_config.json


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Uploaded model.safetensors
Uploaded special_tokens_map.json
Uploaded merges.txt
Uploaded vocab.json
Uploaded config.json


training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Uploaded training_args.bin
Uploaded preprocessor_config.json
Uploaded tokenizer.json
Uploaded generation_config.json
Model uploaded to Hugging Face Hub: mazafard/trocr-finetuned_20250422_125947


In [None]:
# prompt:  add an script to upload readme  file and Inference Providers in huggingface

import io
from huggingface_hub import HfApi

def upload_readme(repo_id, readme_content):
    """Uploads a README file to a Hugging Face repository.

    Args:
        repo_id (str): The ID of the repository (e.g., "username/repo_name").
        readme_content (str): The content of the README file.
    """
    api = HfApi()
    try:
        readme_bytes = readme_content.encode("utf-8") # Encode to bytes
        api.upload_file(
            path_or_fileobj=io.StringIO(readme_content),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="model",
        )
        print("README.md uploaded successfully!")
    except Exception as e:
        print(f"Error uploading README.md: {e}")


def upload_inference_providers(repo_id, provider_files):
    """Uploads inference provider files to a Hugging Face repository.

    Args:
        repo_id (str): The ID of the repository (e.g., "username/repo_name").
        provider_files (list): A list of file paths to the provider files.
    """
    api = HfApi()
    for file_path in provider_files:
        try:
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=os.path.basename(file_path),  # Use the original filename
                repo_id=repo_id,
                repo_type="model",
            )
            print(f"{os.path.basename(file_path)} uploaded successfully!")
        except Exception as e:
            print(f"Error uploading {os.path.basename(file_path)}: {e}")

# Example usage (replace with your actual values)
repo_id = "mazafard/trocr-finetuned_20250422_125947"  # Replace with your repo ID
readme_content = "# My Fine-tuned TrOCR Model\nThis is a README file for my fine-tuned TrOCR model."  # Replace with your README content
provider_files = ["inference_provider_1.txt", "inference_provider_2.txt"]  # Replace with actual file paths


upload_readme(repo_id, readme_content)

# Check if the provider files exist before uploading
for file_path in provider_files:
    if os.path.exists(file_path):
        upload_inference_providers(repo_id, provider_files)
    else:
        print(f"Warning: Provider file '{file_path}' not found. Skipping upload.")


Error uploading README.md: path_or_fileobj must be either an instance of str, bytes or io.BufferedIOBase. If you passed a file-like object, make sure it is in binary mode.


In [None]:
# prompt: write a code to create onnx for infrence in huggingface

from transformers import pipeline
import torch



# Load the pipeline with the saved model
pipe = pipeline("image-to-text", model=model_path, device=0 if torch.cuda.is_available() else -1)


# Example usage
image_path = "ocr_dataset/images/sample_123.png"  # Replace with your image path
text = pipe(image_path)
print(text)


# Export to ONNX
from pathlib import Path

onnx_path = Path("./trocr-finetuned.onnx")

# Export the model to ONNX format (replace with appropriate parameters)
# Note: This is a simplified example and may need adjustments depending on your model
dummy_input = pipe.feature_extractor(images=Image.open(image_path), return_tensors="pt").pixel_values
torch.onnx.export(
    pipe.model,
    dummy_input,
    onnx_path,
    input_names=["pixel_values"],  # Replace with your actual input names
    output_names=["output"],  # Replace with your actual output names
    opset_version=11,  # Adjust as needed
    dynamic_axes={
        "pixel_values": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},  # Add dynamic axes for output if needed
    },
)

print(f"Model exported to ONNX format at {onnx_path}")
