In [None]:
import os
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms
import csv
import string

In [None]:
# Transformaciones
transform_im = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normaliza a [-1, 1]
])


#Preprocesamiento para texto
# Diccionario de mapeo letra -> índice (0 a 51)
all_letters = list(string.ascii_uppercase + string.ascii_lowercase)
letter_to_idx = {char: idx+1 for idx, char in enumerate(all_letters)}



In [None]:
idx_to_letter = {v: k for k, v in letter_to_idx.items()}

In [None]:
# Listas para acumular los datos
img_list = []
label_values = []  # Cambiado el nombre para mayor claridad
lengths = []
CSV_PATH = "word_dataset/word_labels.csv"
IMG_DIR = "word_dataset"

with open(CSV_PATH, "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Saltar encabezado

    for row in reader:
        img_name, label = row
        img_path = os.path.join(IMG_DIR, img_name)

        # Procesar imagen
        img = Image.open(img_path)
        img_list.append(transform_im(img))

        # Procesar texto
        label_idx = [letter_to_idx[char] for char in label]
        label_values.append(torch.tensor(label_idx, dtype=torch.long).clone().detach())
        lengths.append(len(label_idx))


# Convertir a tensores al final
img_tensor = torch.stack(img_list)  # Tensor de imágenes
text_tensor = [torch.tensor(label, dtype=torch.long) for label in label_values]  # Lista de tensores de etiquetas


# Longitudes de las secuencias (necesario para CTC Loss)
lengths_tensor = torch.tensor(lengths, dtype=torch.long)


  text_tensor = [torch.tensor(label, dtype=torch.long) for label in label_values]  # Lista de tensores de etiquetas


In [None]:
# Inspeccionar primeras 10 etiquetas
for i in range(10):
    label_indices = label_values[i].tolist()
    label_str = "".join([idx_to_letter[idx] for idx in label_indices])
    print(f"Label {i}: índices = {label_indices}, texto = '{label_str}'")


Label 0: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 1: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 2: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 3: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 4: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 5: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 6: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 7: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 8: índices = [12, 9, 13, 15, 14], texto = 'LIMON'
Label 9: índices = [12, 9, 13, 15, 14], texto = 'LIMON'


In [None]:
print("size im: \n\n", img_tensor.shape, "\n\n")





size im: 

 torch.Size([1000, 1, 224, 224]) 




In [None]:
from sklearn.model_selection import train_test_split
import torch


In [None]:
# Convertimos img_tensor a lista para emparejarlo con las etiquetas
img_list = list(img_tensor)
print(text_tensor)
# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    img_list, text_tensor, test_size=0.2, random_state=42
)

# Volver a convertir a tensores
X_train_tensor = torch.stack(X_train)
X_test_tensor = torch.stack(X_test)

# Calcular longitudes de etiquetas (para CTCLoss)
train_target_lengths = torch.tensor([len(seq) for seq in y_train], dtype=torch.long)
test_target_lengths = torch.tensor([len(seq) for seq in y_test], dtype=torch.long)


print(y_train)
print(train_target_lengths)
# Guardar todos los datos
torch.save({
    'X_train': X_train_tensor,
    'X_test': X_test_tensor,
    'y_train': y_train,
    'y_test': y_test,
    'train_target_lengths': train_target_lengths,
    'test_target_lengths': test_target_lengths
}, 'train_test_split.pt')


[tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([1

In [None]:
import torch

In [None]:
# Carga el contenido del archivo
data = torch.load("train_test_split.pt")

# Accede a los datos si están organizados en un diccionario
X_train = data["X_train"]
X_test = data["X_test"]
y_train = data["y_train"]
print(y_train)



y_test = data["y_test"]
train_target_lengths = data["train_target_lengths"]
test_target_lengths = data["test_target_lengths"]



[tensor([12,  9, 13, 15, 14]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([16,  5, 18,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([12,  9, 13, 15, 14]), tensor([16,  5, 18,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([16,  5, 18,  1]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([16,  5, 18,  1]), tensor([16,  5, 18,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([16,  5, 18,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([16,  5, 18,  1]), tensor([12,  9, 13, 15, 14]), tensor([16,  5, 18,  1]), tensor([12,  9, 13, 15, 14]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([16,  5, 18,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([14,  1,

In [None]:
from torch.utils.data import Dataset, DataLoader

class SimpleCTCDataset(Dataset):
    def __init__(self, images, labels, target_lengths):
        self.images = images
        self.labels = labels
        self.target_lengths = target_lengths

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.images[idx], self.labels[idx], self.target_lengths[idx]


In [None]:
def simple_ctc_collate_fn(batch):
    images, labels, target_lengths = zip(*batch)
    images = torch.stack(images)
    label_lengths = torch.tensor([len(l) for l in labels], dtype=torch.long)
    flat_labels = torch.cat(labels)
    target_lengths = torch.stack(target_lengths)  # Asegúrate de que target_lengths sea un tensor
    return images, flat_labels, label_lengths, target_lengths


In [None]:



print(y_train)

print(train_target_lengths.shape)
print(test_target_lengths.shape)

train_dataset = SimpleCTCDataset(X_train, y_train, train_target_lengths)
test_dataset = SimpleCTCDataset(X_test, y_test, test_target_lengths)

batch_size = 32



train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=simple_ctc_collate_fn)
for batch_idx, (imgs, labels, label_lengths, target_lengths) in enumerate(train_loader):
    print(f"Batch {batch_idx}:")
    print(f" - imgs.shape: {imgs.shape}")
    print(f" - labels.shape: {labels.shape}")
    print(f" - label_lengths.shape: {label_lengths.shape}")
    print(f" - target_lengths.shape: {target_lengths.shape}")
    break  # Solo imprimimos las dimensiones del primer batch
def obtener_labels_segmentados(labels, target_lengths):
    """
    Devuelve una lista de tensores, donde cada uno representa la secuencia original.
    """
    secuencias = []
    indice = 0
    for long in target_lengths:
        secuencia = labels[indice:indice + long]
        secuencias.append(secuencia)
        indice += long
    return secuencias


for i, (imgs, labels, label_lengths, target_lengths) in enumerate(train_loader):

    print(f"  🏷️ labels: ", obtener_labels_segmentados(labels, label_lengths))

    break  # Quita esto si quieres ver más de un batch

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=simple_ctc_collate_fn)


[tensor([12,  9, 13, 15, 14]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([16,  5, 18,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([12,  9, 13, 15, 14]), tensor([16,  5, 18,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([16,  5, 18,  1]), tensor([12,  9, 13, 15, 14]), tensor([12,  9, 13, 15, 14]), tensor([16,  5, 18,  1]), tensor([16,  5, 18,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([16,  5, 18,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([13,  1, 14, 26,  1, 14,  1]), tensor([16,  5, 18,  1]), tensor([12,  9, 13, 15, 14]), tensor([16,  5, 18,  1]), tensor([12,  9, 13, 15, 14]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([14,  1, 18,  1, 14, 10,  1]), tensor([16,  5, 18,  1]), tensor([ 3,  9, 18, 21,  5, 12,  1]), tensor([14,  1,

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
def desencoder(x):
    # Revertir la normalización (de [0, 1] a [1, 28])
    x_denorm = x * 27 + 1  # Ahora x está entre 1 y 28

    # Quitar la dimensión del canal si no es necesaria
    x_processed = x_denorm.squeeze(1)  # [32, 28]

    # Redondear si deseas convertir a enteros (por ejemplo, índices entre 1 y 28)
    x_indices = torch.round(x_processed).long()

    # Clip por si acaso algún valor salió levemente fuera de rango
    x_indices = torch.clamp(x_indices, 1, 28)

    print(x_indices.shape)  # [32, 28]
    print(x_indices)
class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 224 → 112

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 112 → 56
        )

        self.rnn = nn.LSTM(128 * 56, 128, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(128 * 2, num_classes)

    def forward(self, x):
        x = self.cnn(x)  # (B, 128, 56, 56)
        b, c, h, w = x.size()
        x = x.permute(0, 3, 1, 2)  # (B, W, C, H)
        x = x.contiguous().view(b, w, c * h)  # (B, W, C*H)
        x, _ = self.rnn(x)
        x = self.fc(x)  # (B, W, num_classes)
        x = x.permute(1, 0, 2)  # (T, B, C) para CTCLoss
        return x.log_softmax(2)

In [None]:
import string

# Lista de caracteres (mayúsculas) + blank (índice 0)
all_letters = list(string.ascii_uppercase)  # 26 letras
letter_to_idx = {char: idx + 1 for idx, char in enumerate(all_letters)}  # Índices 1-26
letter_to_idx['-'] = 0  # Añadir blank explícitamente

idx_to_letter = {idx: char for char, idx in letter_to_idx.items()}
idx_to_letter[0] = '-'  # Mapear 0 a blank

print("letter_to_idx:", letter_to_idx)
print("idx_to_letter:", idx_to_letter)


letter_to_idx: {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26, '-': 0}
idx_to_letter: {1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'E', 6: 'F', 7: 'G', 8: 'H', 9: 'I', 10: 'J', 11: 'K', 12: 'L', 13: 'M', 14: 'N', 15: 'O', 16: 'P', 17: 'Q', 18: 'R', 19: 'S', 20: 'T', 21: 'U', 22: 'V', 23: 'W', 24: 'X', 25: 'Y', 26: 'Z', 0: '-'}


In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:

'''
def transferir_pesos_conv(letter_model_path, crnn_model):
    """
    Carga los pesos de un modelo LetterClassifier y transfiere las capas convolucionales
    al modelo CRNN si sus nombres y tamaños coinciden.
    """
    print("Cargando pesos de LetterClassifier desde:", letter_model_path)
    letter_state = torch.load(letter_model_path, map_location='cpu')

    crnn_state = crnn_model.state_dict()
    transferidos = []

    for name in letter_state:
        if name in crnn_state and letter_state[name].size() == crnn_state[name].size():
            crnn_state[name] = letter_state[name]
            transferidos.append(name)

    crnn_model.load_state_dict(crnn_state)
    print(f"Se han transferido {len(transferidos)} capas:")
    for capa in transferidos:
        print(f"  ✓ {capa}")


'''


'\ndef transferir_pesos_conv(letter_model_path, crnn_model):\n    """\n    Carga los pesos de un modelo LetterClassifier y transfiere las capas convolucionales\n    al modelo CRNN si sus nombres y tamaños coinciden.\n    """\n    print("Cargando pesos de LetterClassifier desde:", letter_model_path)\n    letter_state = torch.load(letter_model_path, map_location=\'cpu\')\n\n    crnn_state = crnn_model.state_dict()\n    transferidos = []\n\n    for name in letter_state:\n        if name in crnn_state and letter_state[name].size() == crnn_state[name].size():\n            crnn_state[name] = letter_state[name]\n            transferidos.append(name)\n\n    crnn_model.load_state_dict(crnn_state)\n    print(f"Se han transferido {len(transferidos)} capas:")\n    for capa in transferidos:\n        print(f"  ✓ {capa}")\n\n\n'

In [None]:


num_classes = len(idx_to_letter) + 1  # +1 para el símbolo "blank" de CTC
print(num_classes)
model = CRNN(num_classes).to(device)
criterion = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)

state_dict = torch.load("letter_classifier.pth", map_location="cpu")

model.cnn[3].weight.data = state_dict['conv_layers.6.weight'].clone()
model.cnn[3].bias.data   = state_dict['conv_layers.6.bias'].clone()




In [85]:
def ctc_decode(preds, idx_to_letter):
    """
    preds: Tensor (T, N, C) con logits sin softmax
    idx_to_letter: dict mapeo {índice: letra}
    """
    print("Shape original preds:", preds.shape)  # (T, N, C)
    probs = preds.softmax(dim=2)                 # (T, N, C)
    batch_size = probs.size(1)
    blank_idx = 0                                # suponemos blank=0

    decoded = []
    for n in range(batch_size):
        seq = []
        prev_idx = None
        for t in range(probs.size(0)):
            idx = probs[t, n].argmax().item()
            if idx != blank_idx and idx != prev_idx:
                seq.append(idx_to_letter.get(idx, '?'))
            prev_idx = idx
        decoded.append("".join(seq))
    return decoded


def train(model, loader, optimizer, criterion, device, idx_to_letter):
    model.train()
    total_loss = 0.0
    for batch_idx, (imgs, labels, label_lengths, target_lengths) in enumerate(loader):
        print("labels",labels)
        print("label_lengths",label_lengths)
        print("target_lengths",target_lengths)
        imgs = imgs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        # 1) Forward
        outputs = model(imgs)                  # (N, C, H, W) o similar

        # 2) Decodificación de debug
        decoded_batch = ctc_decode(outputs.cpu(), idx_to_letter)
        print(f"[Batch {batch_idx}] Predicciones decodificadas:", decoded_batch)


        decoded_labels = []
        start = 0

        for l in target_lengths:
            seq = labels[start:start + l].tolist()
            decoded = ''.join([idx_to_letter[i] for i in seq])
            decoded_labels.append(decoded)
            start += l

        print("Respuesta esperada: ", decoded_labels)

        #Hacemos unos ajustes con permute:
        #outputs = outputs.permute(1, 0, 2)

            # 3) Cálculo de la pérdida CTC
        input_lengths = torch.full((imgs.size(0),), 56, dtype=torch.long, device=device)


        print("target_lengths", target_lengths.shape)
        print("outputs", outputs.shape)
        print("labels", labels.shape)
        print("input_lengths", input_lengths.shape)
        # Ajusta input_lengths según la longitud temporal real de cada secuencia
        labels2 = labels

        # Imprime los valores de input_lengths para verificar
        print("input_lengths:", input_lengths)

        loss = criterion(outputs.log_softmax(2), labels, input_lengths, target_lengths)

        # 4) Backward + optim
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Pérdida media en entrenamiento: {avg_loss:.4f}")
    return avg_loss


In [None]:

import torchvision.models as models
import torch.nn as nn

# Cargar modelo preentrenado
model2 = models.resnet18(pretrained=True)

# Modificar la primera capa para aceptar 1 canal (escala de grises)
model2.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

# Reemplazar la última capa para 52 clases (letras mayúsculas y minúsculas)
num_features = model2.fc.in_features
model2.fc = nn.Linear(num_features, 28)  # 52 clases (A-Z y a-z)

# Congelar capas (opcional)
for param in model2.parameters():
    param.requires_grad = False  # Descongelar solo las capas modificadas
model2.conv1.requires_grad = True
model2.fc.requires_grad = True




In [87]:
for batch_idx, (imgs, labels, label_lengths, target_lengths) in enumerate(train_loader):
    print(f"Batch {batch_idx}:")
    print(" - imgs.shape:", imgs.shape)  # Ej: torch.Size([8, 1, 224, 224])
    print(" - labels.shape:", labels.shape)
    print(" - label_lengths:", label_lengths)
    print(" - target_lengths:", target_lengths)
    print(" - len(target_lengths):", len(target_lengths))

    # Solo inspeccionar el primer batch
    break


Batch 0:
 - imgs.shape: torch.Size([32, 1, 224, 224])
 - labels.shape: torch.Size([198])
 - label_lengths: tensor([7, 7, 7, 7, 4, 7, 7, 4, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 7, 5, 7,
        5, 5, 7, 7, 4, 4, 7, 7])
 - target_lengths: tensor([7, 7, 7, 7, 4, 7, 7, 4, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 7, 5, 7,
        5, 5, 7, 7, 4, 4, 7, 7])
 - len(target_lengths): 32


In [88]:

for epoch in range(50):
    loss = train(model, train_loader, optimizer, criterion, device, idx_to_letter)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


labels tensor([16,  5, 18,  1, 12,  9, 13, 15, 14,  3,  9, 18, 21,  5, 12,  1, 16,  5,
        18,  1, 12,  9, 13, 15, 14, 14,  1, 18,  1, 14, 10,  1, 12,  9, 13, 15,
        14, 13,  1, 14, 26,  1, 14,  1, 16,  5, 18,  1, 13,  1, 14, 26,  1, 14,
         1, 13,  1, 14, 26,  1, 14,  1, 13,  1, 14, 26,  1, 14,  1, 16,  5, 18,
         1,  3,  9, 18, 21,  5, 12,  1, 14,  1, 18,  1, 14, 10,  1,  3,  9, 18,
        21,  5, 12,  1, 13,  1, 14, 26,  1, 14,  1,  3,  9, 18, 21,  5, 12,  1,
        13,  1, 14, 26,  1, 14,  1, 16,  5, 18,  1, 14,  1, 18,  1, 14, 10,  1,
        16,  5, 18,  1, 16,  5, 18,  1, 13,  1, 14, 26,  1, 14,  1, 14,  1, 18,
         1, 14, 10,  1, 16,  5, 18,  1, 12,  9, 13, 15, 14,  3,  9, 18, 21,  5,
        12,  1,  3,  9, 18, 21,  5, 12,  1, 16,  5, 18,  1, 16,  5, 18,  1, 13,
         1, 14, 26,  1, 14,  1])
label_lengths tensor([4, 5, 7, 4, 5, 7, 5, 7, 4, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 4, 7, 4, 4, 7,
        7, 4, 5, 7, 7, 4, 4, 7])
target_lengths tensor([4, 5, 7, 4

In [101]:
def evaluate(model, loader, criterion, device):
    """
    Evalúa el modelo en el dataloader proporcionado.
    Devuelve la pérdida media (CTC Loss) sobre todos los batches.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for imgs, labels, label_lengths, target_lengths in loader:
            outputs = model(imgs)
            imgs = imgs.to(device)
            labels = labels.to(device)

            # Forward                   # (N, C, H, W) o similar según tu CRNN

            # Todas las secuencias de entrada tienen longitud T
            input_lengths = torch.full(
                (outputs.size(1),),
                outputs.size(0),
                dtype=torch.long,
                device=device
            )
            # Cálculo de la pérdida CTC
            print(outputs.shape, labels.shape, label_lengths.shape, target_lengths.shape)
            loss = criterion(
    outputs.log_softmax(2),   # Logits con softmax
    labels,
    input_lengths,
    target_lengths.to(device)
)

            total_loss += loss.item()

    return total_loss / len(loader)


# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# Uso tras el entrenamiento:
# (suponiendo que ya tienes `model`, `test_loader`, `criterion` y `device` definidos)


test_loss = evaluate(model, test_loader, criterion, device)
print(f"Pérdida media en test: {test_loss:.4f}")


torch.Size([56, 32, 28]) torch.Size([187]) torch.Size([32]) torch.Size([32])
torch.Size([56, 32, 28]) torch.Size([192]) torch.Size([32]) torch.Size([32])
torch.Size([56, 32, 28]) torch.Size([208]) torch.Size([32]) torch.Size([32])
torch.Size([56, 32, 28]) torch.Size([187]) torch.Size([32]) torch.Size([32])
torch.Size([56, 32, 28]) torch.Size([177]) torch.Size([32]) torch.Size([32])
torch.Size([56, 32, 28]) torch.Size([199]) torch.Size([32]) torch.Size([32])
torch.Size([56, 8, 28]) torch.Size([52]) torch.Size([8]) torch.Size([8])
Pérdida media en test: 0.0742


In [90]:
# Guardar modelo + pesos + hiperparámetros
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
    'loss': loss,
    'letter_to_idx': letter_to_idx,  # Guardar el diccionario de caracteres
}, 'modelo_completo_copy.pth')


In [116]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Asegúrate de tener definida la misma clase que usaste
model = CRNN(num_classes).to(device)  # Usa los mismos argumentos
checkpoint = torch.load('modelo_completo_copy.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

# Diccionario de letras
letter_to_idx = checkpoint['letter_to_idx']
idx_to_letter = {v: k for k, v in letter_to_idx.items()}
from PIL import Image
from torchvision import transforms

# Transforms (ajústalos a lo que usaste en entrenamiento)
transform = transforms.Compose([
    transforms.Grayscale(),               # Asegura escala de grises
    transforms.Resize((224, 224)),        # Tamaño usado durante entrenamiento
    transforms.ToTensor(),                # Convierte a tensor
    transforms.Normalize((0.5,), (0.5,))  # Normalización típica para imágenes grises
])

# Cargar imagen
image = Image.open("word_dataset/CIRUELA_0.png")  # Cambia por tu imagen
input_tensor = transform(image).unsqueeze(0).to(device)  # Añade dimensión batch

with torch.no_grad():
    output = model(input_tensor)  # → (T, N, C) o (N, T, C), depende de tu modelo

    if output.shape[0] != input_tensor.shape[0]:  # Asegura que sea (T, N, C)
        output = output.permute(1, 0, 2)

    logits = output[0]  # (T, C) — secuencia de logits para una sola imagen
    predicted_indices = logits.argmax(dim=-1)  # Elegir la clase más probable en cada paso

    # Decodificación tipo CTC: eliminar duplicados y el símbolo blank
    decoded = []
    previous = -1
    blank_idx = 0  # Asegúrate de que este índice es el correcto para el símbolo "blank"

    for idx in predicted_indices:
        idx = idx.item()
        if idx != previous and idx != blank_idx:
            decoded.append(idx)
        previous = idx

    # Convertir a letras
    predicted_text = ''.join([idx_to_letter[i] for i in decoded])
    print("Texto predicho:", predicted_text, type(predicted_text))








Texto predicho: CIRUELA <class 'str'>


In [5]:
!pip install torch torchvision torchaudio

Collecting torchaudio
  Downloading torchaudio-2.7.0-cp310-cp310-win_amd64.whl.metadata (6.7 kB)
Downloading torchaudio-2.7.0-cp310-cp310-win_amd64.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---------------- ----------------------- 1.0/2.5 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 2.5/2.5 MB 7.9 MB/s eta 0:00:00
Installing collected packages: torchaudio
Successfully installed torchaudio-2.7.0


In [10]:
import os
import csv
import torch
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

import string
import torch
import torch.nn as nn
import torch.optim as optim


class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 224 → 112

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 112 → 56
        )

        self.rnn = nn.LSTM(128 * 56, 128, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(128 * 2, num_classes)

    def forward(self, x):
        x = self.cnn(x)  # (B, 128, 56, 56)
        b, c, h, w = x.size()
        x = x.permute(0, 3, 1, 2)  # (B, W, C, H)
        x = x.contiguous().view(b, w, c * h)  # (B, W, C*H)
        x, _ = self.rnn(x)
        x = self.fc(x)  # (B, W, num_classes)
        x = x.permute(1, 0, 2)  # (T, B, C) para CTCLoss
        return x.log_softmax(2)
    
    
# --- Cargar modelo y diccionarios ---
all_letters = list(string.ascii_uppercase + string.ascii_lowercase)
letter_to_idx = {char: idx+1 for idx, char in enumerate(all_letters)}
idx_to_letter = {v: k for k, v in letter_to_idx.items()}


num_classes = 28
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CRNN(num_classes).to(device)  # Asegúrate que 'num_classes' está definido
checkpoint = torch.load("modelo_completo_copy.pth", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()


blank_idx = 0  # Asegúrate de que este es el índice correcto para tu símbolo "blank"

# --- Transforms ---
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# --- Paths ---
IMG_DIR = "word_dataset"
CSV_PATH = "word_dataset/word_labels.csv"

# --- Evaluación ---
correct = 0
total = 0

with open(CSV_PATH, "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Saltar encabezado

    for row in tqdm(reader):
        img_name, true_label = row
        img_path = os.path.join(IMG_DIR, img_name)

        # Preprocesar imagen
        image = Image.open(img_path).convert("RGB")
        input_tensor = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(input_tensor)  # (T, N, C) o (N, T, C)
            if output.shape[0] != input_tensor.shape[0]:
                output = output.permute(1, 0, 2)
            logits = output[0]  # (T, C)

            predicted_indices = logits.argmax(dim=-1)
            decoded = []
            previous = -1
            for idx in predicted_indices:
                idx = idx.item()
                if idx != previous and idx != blank_idx:
                    decoded.append(idx)
                previous = idx

            predicted_text = ''.join([idx_to_letter[i] for i in decoded])

        if predicted_text.strip().lower() == true_label.strip().lower():
            correct += 1
        total += 1

accuracy = correct / total * 100
print(f"Accuracy: {accuracy:.2f}%")


1000it [00:21, 45.92it/s]

Accuracy: 99.20%





In [135]:
from gtts import gTTS
from pydub import AudioSegment
import winsound

# Texto que deseas decir
texto = predicted_text

# 1. Crear archivo MP3 con gTTS
tts = gTTS(text=texto, lang='es')
tts.save("temp.mp3")

# 2. Convertir MP3 a WAV usando pydub
AudioSegment.from_mp3("temp.mp3").export("temp.wav", format="wav")

# 3. Reproducir sin abrir reproductor
winsound.PlaySound("temp.wav", winsound.SND_FILENAME | winsound.SND_ASYNC)