# Vision Transformer (ViT) encoder with a text decoder for OCR (TrOCR from Hugging Face)

**Step 1: Install Required Libraries**

In [None]:
!pip install datasets
!pip install transformers datasets


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

**Step 2: Import Libraries and Load Model Components**

Load the pretrained TrOCR model (ViT encoder plus text decoder), then set up the feature extractor and tokenizer used for preprocessing.

In [None]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from datasets import Dataset
import torch
import os
from PIL import Image

# Pick the compute device first: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained checkpoint together with its feature extractor and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
feature_extractor = ViTFeatureExtractor.from_pretrained("microsoft/trocr-base-stage1")
tokenizer = AutoTokenizer.from_pretrained("microsoft/trocr-base-stage1")

# The padding id in the model config is taken straight from the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# Decoding starts from the tokenizer's CLS token, falling back to BOS when no CLS token is defined
model.config.decoder_start_token_id = (
    tokenizer.bos_token_id if tokenizer.cls_token_id is None else tokenizer.cls_token_id
)

# Move the model onto the selected device (the returned module is shown as the cell output)
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

**Step 3: Data Loading and Preprocessing**

Define a function that loads each image together with its corresponding text file.

In [None]:
def load_data(image_folder, text_folder, image_extensions=(".jpg", ".jpeg"), encoding="utf-8"):
    """Load every image in ``image_folder`` together with its transcription.

    For an image named ``<stem><ext>`` the transcription is read from
    ``<text_folder>/<stem>.txt``.

    Args:
        image_folder: Directory containing the image files.
        text_folder: Directory containing one ``.txt`` file per image.
        image_extensions: File extensions (compared case-insensitively) that
            count as images. Any other directory entry is skipped.
        encoding: Text encoding used to read the transcription files.

    Returns:
        A list of ``{"image": <RGB PIL image>, "text": <stripped str>}`` dicts,
        ordered by image file name.

    Raises:
        FileNotFoundError: If an image has no matching transcription file.
    """
    allowed_extensions = tuple(ext.lower() for ext in image_extensions)
    data = []

    # os.listdir returns entries in arbitrary order; sort for a reproducible dataset.
    for img_name in sorted(os.listdir(image_folder)):
        img_path = os.path.join(image_folder, img_name)
        stem, extension = os.path.splitext(img_name)

        # Skip sub-directories (e.g. .ipynb_checkpoints) and files that are not images.
        if extension.lower() not in allowed_extensions or not os.path.isfile(img_path):
            continue

        # Swap only the real extension; str.replace would rewrite every '.jpg' in the name.
        txt_path = os.path.join(text_folder, stem + ".txt")

        # Read text first so a missing transcription fails before the image is decoded
        with open(txt_path, "r", encoding=encoding) as file:
            text = file.read().strip()

        # Read image; the context manager closes the underlying file once converted
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        data.append({"image": image, "text": text})
    return data

# Locations of the source images and of their transcriptions
image_folder, text_folder = "/Source_jpeg", "/Source_text"

# Build the in-memory list of image/text samples
dataset = load_data(image_folder=image_folder, text_folder=text_folder)


**Step 4: Preprocess Each Image and Text Pair**

Convert each image to pixel values (the ViT encoder splits them into patches internally), tokenize the corresponding text, and prepare both as tensors for the model.

In [None]:
def preprocess(data):
    """Turn image/text samples into stacked model inputs.

    Each item of ``data`` must provide an ``"image"`` and a ``"text"`` entry.
    Images go through the module-level ``feature_extractor`` and texts through
    the module-level ``tokenizer``; padding token ids in the labels are replaced
    by -100 so that padding is ignored in the loss calculation.

    Returns:
        A dict with the concatenated ``"pixel_values"`` and ``"labels"`` tensors.
    """

    def encode(sample):
        # Image -> pixel values (patches are handled internally by the ViT model)
        pixels = feature_extractor(sample["image"], return_tensors="pt").pixel_values

        # Text -> token ids padded to the tokenizer's maximum length
        token_ids = tokenizer(
            sample["text"],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids
        # Mask the padding positions with -100
        token_ids = token_ids.masked_fill(token_ids == tokenizer.pad_token_id, -100)
        return pixels, token_ids

    # Single pass over the input, exactly one encode call per sample
    encoded = [encode(sample) for sample in data]

    return {
        "pixel_values": torch.cat([pixels for pixels, _ in encoded]),
        "labels": torch.cat([token_ids for _, token_ids in encoded]),
    }

# Run the preprocessing over every loaded sample
processed_data = preprocess(data=dataset)


**Step 5: Define DataLoader for Training**

With the preprocessed data, create a PyTorch DataLoader for batching.

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Pair up the pixel values with their labels, then batch and shuffle them for training
tensor_dataset = TensorDataset(*(processed_data[key] for key in ("pixel_values", "labels")))
dataloader = DataLoader(dataset=tensor_dataset, shuffle=True, batch_size=2)  # Adjust batch size as needed


**Step 6: Training Setup**

Set up the optimizer and write a custom training loop (since Colab often has memory constraints).

In [None]:
from torch.optim import AdamW

# Set up optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay are
# passed explicitly so the optimizer keeps the defaults of the transformers implementation.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
num_epochs = 30

# Fail early with a clear message instead of a NameError on `loss` after an empty epoch
if len(dataloader) == 0:
    raise ValueError("dataloader yields no batches; nothing to train on")

model.train()

for epoch in range(num_epochs):
    running_loss = 0.0

    for batch in dataloader:
        pixel_values, labels = batch
        pixel_values = pixel_values.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean over all batches; the loss of the last batch alone is too noisy
    # to tell whether training is progressing.
    mean_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} completed. Mean loss: {mean_loss:.4f}")


Epoch 1/30 completed. Loss: 4.951147079467773
Epoch 2/30 completed. Loss: 4.797314643859863
Epoch 3/30 completed. Loss: 5.158468246459961
Epoch 4/30 completed. Loss: 5.095791339874268
Epoch 5/30 completed. Loss: 6.572628974914551
Epoch 6/30 completed. Loss: 5.508522987365723
Epoch 7/30 completed. Loss: 5.168274402618408
Epoch 8/30 completed. Loss: 4.774728775024414
Epoch 9/30 completed. Loss: 5.174046516418457
Epoch 10/30 completed. Loss: 5.0552849769592285
Epoch 11/30 completed. Loss: 6.343886375427246
Epoch 12/30 completed. Loss: 5.307904243469238
Epoch 13/30 completed. Loss: 5.104947090148926
Epoch 14/30 completed. Loss: 4.77267599105835
Epoch 15/30 completed. Loss: 4.650323867797852
Epoch 16/30 completed. Loss: 4.871545314788818
Epoch 17/30 completed. Loss: 5.993780612945557
Epoch 18/30 completed. Loss: 4.635336875915527
Epoch 19/30 completed. Loss: 5.476109504699707
Epoch 20/30 completed. Loss: 4.41399621963501
Epoch 21/30 completed. Loss: 4.325629711151123
Epoch 22/30 completed. 

**Step 7: Inference and Evaluation**

After training, you can test the model on a sample image and its corresponding text:

In [None]:
# Open the sample image and normalise it to RGB
sample_image_path = "/content/WhatsApp Image 2024-10-28 at 14.14.17_e118b6eb.jpg"
sample_image = Image.open(sample_image_path).convert("RGB")

# Convert the image to pixel values and move them to the model's device
pixel_values = feature_extractor(sample_image, return_tensors="pt")["pixel_values"].to(device)

# Switch to evaluation mode and generate token ids without tracking gradients
model.eval()
with torch.no_grad():
    generated_ids = model.generate(pixel_values)

# Decode the first (and only) generated sequence, dropping special tokens
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Generated Text:", generated_text)


In [None]:
import matplotlib.pyplot as plt

# Open the image to inspect and normalise it to RGB
sample_image_path = "/content/PT-ADBGC-AC-GCBGC-PAS-077-01143_m0142_derivada B.jpg"
sample_image = Image.open(sample_image_path).convert("RGB")

# Draw it on the current axes with the axis decorations hidden, then render the figure
plt.gca().imshow(sample_image)
plt.gca().axis("off")
plt.show()
