# Entrenamiento del modelo con las imágenes preprocesadas

In [1]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
import os
from torchvision import transforms

class OCRDataset(Dataset):
    """Dataset of (grayscale image, encoded text label) pairs listed in a CSV file.

    The CSV is read with pandas. For each row, the ``Direccion`` column is an
    image path relative to ``image_folder`` and the ``Texto`` column is the
    text label of that image.
    """

    def __init__(self, csv_file, image_folder, transform=None, char2idx=None, idx2char=None):
        """Load the annotation table and store the lookup tables.

        Args:
            csv_file: Path of the CSV file with ``Direccion`` and ``Texto`` columns.
            image_folder: Directory that the ``Direccion`` paths are joined onto.
            transform: Optional callable applied to each loaded PIL image.
            char2idx: Mapping from character to class index (used by ``encode_label``).
            idx2char: Mapping from class index to character (used by ``decode_label``).
        """
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.transform = transform
        self.char2idx = char2idx
        self.idx2char = idx2char

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.data)

    def encode_label(self, text):
        """Return the class indices of the characters of ``text``.

        Characters that are not keys of ``char2idx`` are dropped. Scalar
        non-string values are normalised first: missing values (``None``/NaN,
        which is what pandas yields for an empty CSV cell) become the empty
        string, anything else is converted with ``str`` so that a label column
        parsed as numbers still encodes correctly.
        """
        if not isinstance(text, str) and pd.api.types.is_scalar(text):
            text = '' if pd.isna(text) else str(text)
        return [self.char2idx[c] for c in text if c in self.char2idx]

    def decode_label(self, indices):
        """Return the string spelled by ``indices``; unknown indices are skipped.

        No CTC collapsing is done here: every index found in ``idx2char`` is
        emitted as-is. ``indices`` may be a list of ints or a 1-D tensor/array.
        """
        chars = []
        for i in indices:
            # 0-dim tensors hash by identity, so they never match the integer
            # keys of idx2char; unwrap them to plain Python numbers first.
            key = i.item() if hasattr(i, 'item') else i
            if key in self.idx2char:
                chars.append(self.idx2char[key])
        return ''.join(chars)

    def __getitem__(self, idx):
        """Return ``(image, encoded_label)`` for row ``idx``.

        The image is converted to single-channel mode ``'L'`` and passed
        through ``transform`` when one was given. The label is returned as a
        1-D ``torch.long`` tensor whose length depends on the text.
        """
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_folder, row['Direccion'])
        label = row['Texto']

        # Image.open keeps the file open until the image object is collected;
        # convert() returns an independent copy, so the handle can be released
        # immediately instead of accumulating across a whole epoch.
        with Image.open(img_path) as img:
            image = img.convert('L')
        if self.transform:
            image = self.transform(image)

        encoded = torch.tensor(self.encode_label(label), dtype=torch.long)
        return image, encoded

In [2]:
# Preprocessing applied to every image: convert the PIL image to a tensor,
# then normalise the single channel with mean 0.5 and std 0.5.
# NOTE(review): no resize is applied here — presumably the images on disk
# already share one size that the model accepts; confirm.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,))]
)

In [3]:
import string

# Character vocabulary: ASCII letters, digits, punctuation and the space.
# Extend `all_chars` if the labels contain other characters.
all_chars = string.ascii_letters + string.digits + string.punctuation + ' '

# Real characters are numbered from 1 because index 0 is reserved for the
# CTC blank token, which is registered after them.
char2idx = dict(zip(all_chars, range(1, len(all_chars) + 1)))
char2idx['<blank>'] = 0

# Inverse lookup (class index -> character), in the same insertion order.
idx2char = dict(zip(char2idx.values(), char2idx.keys()))

In [4]:
from torch.utils.data import DataLoader

def _collate_ctc_batch(batch):
    """Collate ``(image, label)`` samples without stacking the labels.

    The default collate function stacks every field with ``torch.stack``,
    which fails as soon as two label tensors in a batch differ in length.
    Images are still stacked into one ``(B, C, H, W)`` tensor (so they must
    share a size); labels are returned as a list of 1-D tensors.
    """
    images, labels = zip(*batch)
    return torch.stack(images), list(labels)


dataset = OCRDataset("../Data/ImagenTexto.csv", "../Data/Anotaciones", transform=transform, char2idx=char2idx, idx2char=idx2char)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=_collate_ctc_batch)

In [5]:
import torch
import torch.nn as nn

class CRNN(nn.Module):
    """Convolutional-recurrent network that emits per-timestep class scores for CTC.

    A CNN reduces the input to a feature map of height 1, whose width axis is
    then read as a sequence by two stacked bidirectional LSTMs followed by a
    linear classifier.
    """

    def __init__(self, img_height, num_channels, num_classes, rnn_hidden_size=256):
        """Build the layers.

        Args:
            img_height: Height of the input images. It is not used to size any
                layer; the pooling stack below only collapses the height to 1
                for inputs whose height is in the range 32-47.
            num_channels: Number of channels of the input images.
            num_classes: Size of the output alphabet, including the CTC blank.
            rnn_hidden_size: Hidden size of each LSTM direction.
        """
        super().__init__()

        # Feature extractor (CNN backbone)
        self.cnn = nn.Sequential(
            nn.Conv2d(num_channels, 64, 3, 1, 1),  # output: (64, H, W)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                   # output: (64, H/2, W/2)

            nn.Conv2d(64, 128, 3, 1, 1),          # output: (128, H/2, W/2)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                   # output: (128, H/4, W/4)

            nn.Conv2d(128, 256, 3, 1, 1),         # output: (256, H/4, W/4)
            nn.BatchNorm2d(256),
            nn.ReLU(),

            nn.Conv2d(256, 256, 3, 1, 1),         # output: (256, H/4, W/4)
            nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),         # output: (256, H/8, W/4)

            nn.Conv2d(256, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),

            nn.Conv2d(512, 512, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),         # output: (512, H/16, W/4)

            nn.Conv2d(512, 512, 2, 1, 0),         # output: (512, H/16 -1, W/4 -1)
            nn.ReLU()
        )

        # RNN for sequence modeling. nn.LSTM returns (output, state), so the
        # two layers cannot be chained by nn.Sequential (the second LSTM would
        # receive a tuple). A ModuleList keeps the same parameter names
        # ("rnn.0.*", "rnn.1.*") and is chained by hand in forward().
        self.rnn = nn.ModuleList([
            nn.LSTM(512, rnn_hidden_size, bidirectional=True, batch_first=True),
            nn.LSTM(2 * rnn_hidden_size, rnn_hidden_size, bidirectional=True, batch_first=True)
        ])

        # Final classifier
        self.fc = nn.Linear(2 * rnn_hidden_size, num_classes)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(T, B, num_classes)``.

        Args:
            x: Image batch of shape ``(batch, channels, height, width)``.

        Raises:
            ValueError: If the CNN output height is not 1, i.e. the input
                height does not match the pooling stack.
        """
        conv_out = self.cnn(x)  # shape: (B, C, H, W)
        height = conv_out.size(2)

        # squeeze(2) below only removes the axis when its size is exactly 1;
        # any other height would leave a 4-D tensor and break the permute.
        if height != 1:
            raise ValueError(
                f"Unexpected height: {height}, check image input size and pooling"
            )

        sequence = conv_out.squeeze(2)        # remove height dim -> (B, C, W)
        sequence = sequence.permute(0, 2, 1)  # (B, W, C)

        for lstm in self.rnn:
            sequence, _ = lstm(sequence)      # (B, W, 2*hidden)
        out = self.fc(sequence)               # (B, W, num_classes)

        return out.permute(1, 0, 2)  # (W, B, num_classes) for CTC loss

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The constructor parameter is named `img_height`; passing `img_h` raises TypeError.
model = CRNN(img_height=32, num_channels=1, num_classes=len(char2idx)).to(device)

# CTC Loss (index 0 is the blank; infinite losses are zeroed instead of propagated)
ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 10
num_batches = len(dataloader)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    loop = tqdm(dataloader, desc=f"Epoch [{epoch+1}/{num_epochs}]", leave=False)

    for images, labels in loop:
        images = images.to(device)

        # Prepare label lengths and targets. `labels` is iterated sample by
        # sample, so it may be a list of 1-D tensors or a stacked 2-D tensor.
        label_lengths = torch.tensor([len(t) for t in labels], dtype=torch.long)
        targets = torch.cat(list(labels)).to(device)

        # Forward pass
        outputs = model(images)  # (T, B, C)
        log_probs = outputs.log_softmax(2)

        # Compute input lengths (same for all samples since image widths are equal)
        input_lengths = torch.full(size=(log_probs.size(1),), fill_value=log_probs.size(0), dtype=torch.long)

        # Compute loss
        loss = ctc_loss(log_probs, targets, input_lengths, label_lengths)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once: each .item() call forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    print(f"Epoch {epoch+1}: Avg Loss = {total_loss / max(num_batches, 1):.4f}")