# Carga y Exploración del Dataset


In [1]:
import pandas as pd
import os

# Locations of the caption annotations file and of the image directory.
captions_file = 'archive/captions.txt'
images_folder = 'archive/Images'

# Read the annotations into a DataFrame and preview its first rows.
captions_df = pd.read_csv(captions_file)
print(captions_df.head())

# The same image file name is listed on several rows (one per caption),
# so deduplicate the 'image' column before counting.
unique_images = captions_df['image'].unique()
print(f"Número de imágenes únicas: {len(unique_images)}")

                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  
Número de imágenes únicas: 8091


In [2]:
# Collect every referenced image whose file is absent from the image folder.
missing_images = [
    img_name
    for img_name in unique_images
    if not os.path.exists(os.path.join(images_folder, img_name))
]

# Report the outcome of the integrity check.
if not missing_images:
    print("¡Todas las imágenes existen!")
else:
    print("Imágenes faltantes:", missing_images)


¡Todas las imágenes existen!


# Preprocesar las descripciones

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# word_tokenize relies on the Punkt sentence-tokenizer models. Older NLTK
# releases load them from the 'punkt' package, while newer releases look for
# 'punkt_tab' instead, so fetch both to keep this notebook runnable on either.
# Packages that are already up to date are not downloaded again.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mauriciotorres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Lower-case and trim every caption, then split it into word tokens.
captions_list = captions_df['caption'].tolist()
tokenized_captions = [
    word_tokenize(raw_caption.lower().strip())
    for raw_caption in captions_list
]

# Display the first tokenized caption as a sanity check.
tokenized_captions[:1]

[['a',
  'child',
  'in',
  'a',
  'pink',
  'dress',
  'is',
  'climbing',
  'up',
  'a',
  'set',
  'of',
  'stairs',
  'in',
  'an',
  'entry',
  'way',
  '.']]

In [5]:
# Upper bound on the vocabulary size, control tokens included.
vocab_size = 5000
# Control tokens occupy the first indices of the vocabulary, in this order.
special_tokens = ['<PAD>', '<START>', '<END>', '<UNK>']

# Count how often each token occurs across all captions.
all_words = [word for caption in tokenized_captions for word in caption]
word_counts = Counter(all_words)

# Keep the most frequent words that still fit after the control tokens.
most_frequent = word_counts.most_common(vocab_size - len(special_tokens))
vocab = special_tokens + [word for word, _ in most_frequent]

# Lookup tables in both directions.
word_to_idx = {token: position for position, token in enumerate(vocab)}
idx_to_word = {position: token for token, position in word_to_idx.items()}

# Encode each caption as <START> + word indices + <END>; words outside the
# vocabulary are mapped to <UNK>.
start_idx = word_to_idx['<START>']
end_idx = word_to_idx['<END>']
unk_idx = word_to_idx['<UNK>']
captions_indices = [
    [start_idx, *(word_to_idx.get(token, unk_idx) for token in caption), end_idx]
    for caption in tokenized_captions
]

In [6]:
# Show the first caption as tokens and as vocabulary indices; the indexed form
# is two entries longer because it is wrapped in the <START> and <END> indices.
print("Ejemplo tokenizado:", tokenized_captions[0])
print("Ejemplo indexado:", captions_indices[0])

Ejemplo tokenizado: ['a', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', '.']
Ejemplo indexado: [1, 4, 45, 6, 4, 93, 173, 9, 122, 56, 4, 399, 14, 396, 6, 31, 3, 697, 5, 2]


# Preprocesar las imágenes

In [7]:
#Definir las transformaciones necesarias para las imágenes.

import torchvision.transforms as transforms

# Standard preprocessing pipeline for a ResNet backbone
image_transform = transforms.Compose([
    # Resize every image to 224x224 pixels
    transforms.Resize((224, 224)),
    # Convert the PIL image to a tensor
    transforms.ToTensor(),
    # Channel-wise normalization (the mean/std values commonly used with
    # ImageNet-pretrained torchvision models)
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Dataset y DataLoader

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# NOTE: the triple-quoted string below is NOT executed as code. It holds a
# disabled copy of an ImageCaptionDataset class; the class actually used later
# in this notebook is imported from the `dataset` module. Because the string is
# the last expression of the cell, the notebook echoes it as the cell output.
"""
class ImageCaptionDataset(Dataset):
    def __init__(self, captions_df, images_folder, transform, word_to_idx, max_length=50):
        self.captions_df = captions_df
        self.images_folder = images_folder
        self.transform = transform
        self.word_to_idx = word_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.captions_df)

    def __getitem__(self, idx):
        # Obtener la imagen y su caption
        img_name = self.captions_df.iloc[idx]['image']
        caption = self.captions_df.iloc[idx]['caption']

        # Cargar y transformar la imagen
        image = Image.open(os.path.join(self.images_folder, img_name)).convert('RGB')
        image = self.transform(image)

        # Tokenizar y convertir caption a índices
        tokens = word_tokenize(caption.lower().strip())
        indices = [self.word_to_idx['<START>']]
        indices.extend([self.word_to_idx.get(word, self.word_to_idx['<UNK>']) for word in tokens])
        indices.append(self.word_to_idx['<END>'])

        # Padding para que todas las secuencias tengan la misma longitud
        if len(indices) < self.max_length:
            indices.extend([self.word_to_idx['<PAD>']] * (self.max_length - len(indices)))
        else:
            indices = indices[:self.max_length-1] + [self.word_to_idx['<END>']]

        return image, torch.tensor(indices)"""

"\nclass ImageCaptionDataset(Dataset):\n    def __init__(self, captions_df, images_folder, transform, word_to_idx, max_length=50):\n        self.captions_df = captions_df\n        self.images_folder = images_folder\n        self.transform = transform\n        self.word_to_idx = word_to_idx\n        self.max_length = max_length\n\n    def __len__(self):\n        return len(self.captions_df)\n\n    def __getitem__(self, idx):\n        # Obtener la imagen y su caption\n        img_name = self.captions_df.iloc[idx]['image']\n        caption = self.captions_df.iloc[idx]['caption']\n\n        # Cargar y transformar la imagen\n        image = Image.open(os.path.join(self.images_folder, img_name)).convert('RGB')\n        image = self.transform(image)\n\n        # Tokenizar y convertir caption a índices\n        tokens = word_tokenize(caption.lower().strip())\n        indices = [self.word_to_idx['<START>']]\n        indices.extend([self.word_to_idx.get(word, self.word_to_idx['<UNK>']) for word 

In [9]:
from dataset import ImageCaptionDataset


# Build the dataset from the caption table. Reuse `images_folder` (defined when
# the data was loaded) instead of repeating the path literal, so the image
# location is configured in a single place.
dataset = ImageCaptionDataset(
    captions_df=captions_df,
    images_folder=images_folder,
    transform=image_transform,
    word_to_idx=word_to_idx
)

# Create the dataloader
batch_size = 128
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,  # data is loaded in the main process; tune to your CPU
    # Pinned (page-locked) host memory only speeds up host-to-CUDA copies, so
    # request it only when a CUDA device is actually available.
    pin_memory=torch.cuda.is_available()
)

# Sanity-check a single batch
images, captions = next(iter(dataloader))
print("Batch de imágenes:", images.shape)  # expected: [batch_size, 3, 224, 224]
print("Batch de captions:", captions.shape)  # expected: [batch_size, max_length]



Batch de imágenes: torch.Size([128, 3, 224, 224])
Batch de captions: torch.Size([128, 50])


# Crear el modelo

In [10]:
import torch
import torchvision.models as models
import torch.nn as nn

### Encoder (extrae características)

In [11]:
# ResNet50 is a 50-layer deep convolutional network built from residual blocks
# (skip connections), which make very deep networks easier to train by
# mitigating the vanishing-gradient problem. It is widely used for computer
# vision tasks such as classification, detection and feature extraction.

class EncoderCNN(nn.Module):
    """Image encoder: a pretrained ResNet50 trunk followed by a linear projection.

    The classification head of ResNet50 is removed and replaced by a linear
    layer that maps the pooled features to `embed_size` dimensions.

    Args:
        embed_size: Size of the feature vector produced for each image.
    """

    def __init__(self, embed_size):
        super().__init__()
        # Load ImageNet-pretrained ResNet50. `pretrained=True` is deprecated in
        # recent torchvision releases in favour of the `weights=` argument
        # (IMAGENET1K_V1 is the weight set `pretrained=True` refers to); fall
        # back to the legacy flag on older torchvision versions.
        if hasattr(models, "ResNet50_Weights"):
            resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        else:
            resnet = models.resnet50(pretrained=True)
        # Drop the last child module (the fully connected classifier)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        # Project the pooled features to the embedding size. The input width is
        # read from the removed classifier (2048 for ResNet50).
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)

    def forward(self, images):
        """Encode a batch of images into feature vectors.

        Args:
            images: Image batch (assumed [batch, 3, H, W], as produced by the
                notebook's dataloader).

        Returns:
            Tensor of shape [batch, embed_size].
        """
        # Extract convolutional features
        features = self.resnet(images)
        # Collapse everything but the batch axis into one feature axis
        features = torch.flatten(features, 1)
        # Project to the embedding size
        return self.linear(features)

### Decoder (genera la descripción)

In [12]:
class DecoderRNN(nn.Module):
    """LSTM decoder that turns an image feature vector and caption tokens into vocabulary scores.

    Args:
        embed_size: Dimension of the word embeddings (and of the image features).
        hidden_size: Hidden state size of the LSTM.
        vocab_size: Number of entries in the vocabulary.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Score the vocabulary at every time step.

        The image features are inserted as the first time step, ahead of the
        embedded caption tokens, so the output has one more step than `captions`.

        Args:
            features: Image features, one vector of size embed_size per sample.
            captions: Integer token indices, one row per sample.

        Returns:
            Scores of shape [batch, caption_length + 1, vocab_size].
        """
        # The image acts as the first element of the input sequence
        image_step = features.unsqueeze(1)
        # Look up an embedding for every caption token
        word_steps = self.embed(captions)
        sequence = torch.cat((image_step, word_steps), dim=1)

        # Run the recurrent layer; the final hidden/cell states are not needed
        hidden_states, _ = self.lstm(sequence)

        # Map each hidden state to vocabulary scores
        return self.linear(hidden_states)

### Juntamos decoder y encoder

In [13]:
class ImageCaptioningModel(nn.Module):
    """End-to-end captioning model: a CNN encoder feeding an LSTM decoder.

    Args:
        embed_size: Size shared by the image features and the word embeddings.
        hidden_size: Hidden state size of the decoder LSTM.
        vocab_size: Number of entries in the vocabulary.
        num_layers: Number of stacked LSTM layers in the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        # The encoder output size must match the decoder's embedding size
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderRNN(
            embed_size, hidden_size, vocab_size, num_layers
        )

    def forward(self, images, captions):
        """Encode the images, then decode them together with the captions.

        Returns:
            The decoder's vocabulary scores for every time step.
        """
        return self.decoder(self.encoder(images), captions)

### Creamos un modelo

In [14]:
embed_size = 256
hidden_size = 512
vocab_size = len(vocab)  # use the size of the vocabulary actually built above

model = ImageCaptioningModel(embed_size, hidden_size, vocab_size)

# Smoke test: run a single batch through the model.
# Only the output shape is inspected, so disable gradient tracking to avoid
# building (and holding in memory) an autograd graph for the whole network.
with torch.no_grad():
    images, captions = next(iter(dataloader))
    outputs = model(images, captions)

# The decoder inserts the image features as an extra first time step, so the
# sequence axis is one longer than the captions.
print("Shape de la salida:", outputs.shape)  # expected: [batch_size, max_length + 1, vocab_size]



Shape de la salida: torch.Size([128, 51, 5000])


# Entrenar el modelo

In [15]:
# Check the tensor shapes going into and out of the model on a single batch.
# No gradients are needed for this check, so skip building the autograd graph.
with torch.no_grad():
    for images, captions in dataloader:
        outputs = model(images, captions)
        print("Dimensiones del modelo:")
        print(f"Input images shape: {images.shape}")
        print(f"Input captions shape: {captions.shape}")
        print(f"Output shape: {outputs.shape}")
        break

Dimensiones del modelo:
Input images shape: torch.Size([128, 3, 224, 224])
Input captions shape: torch.Size([128, 50])
Output shape: torch.Size([128, 51, 5000])


In [None]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# 1. Build a fresh model for training (this replaces any model created earlier)
embed_size = 256
hidden_size = 512
vocab_size = len(vocab)
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size)

# 2. Define the loss and the optimizer
# Every caption batch has a fixed length of 50, i.e. short captions are padded
# (assumed to use <PAD>, as in the disabled dataset copy in this notebook —
# confirm against dataset.py). Padded target positions carry no information,
# so exclude them from the loss; otherwise they dominate it and the model is
# rewarded mainly for predicting padding.
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 3. Training function
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    """Train `model` on (images, captions) batches and checkpoint it periodically.

    The ground-truth captions are passed to the model as inputs and also serve
    as the targets of the loss. The model output is truncated to the caption
    length before the loss is computed (the notebook's ImageCaptioningModel
    emits one extra time step because the image features are inserted first).

    Args:
        model: Module called as `model(images, captions)` that returns scores
            of shape [batch, steps, vocab_size] with steps >= caption length.
        dataloader: Iterable yielding `(images, captions)` tensor pairs.
        criterion: Loss taking `[N, vocab_size]` scores and `[N]` targets.
        optimizer: Optimizer over the model parameters.
        num_epochs: Number of passes over `dataloader`.

    Side effects:
        Prints the mean batch loss after every epoch and writes the model's
        state dict to `model_epoch_<k>.pth` every 5 epochs.
    """
    # Run each batch on the device the model lives on, so the function also
    # works when the model has been moved to an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    model.train()  # put the model in training mode
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        # tqdm renders a progress bar for the epoch
        for images, captions in tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            if device is not None:
                images = images.to(device)
                captions = captions.to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(images, captions)  # [batch_size, steps, vocab_size]

            # Keep as many output steps as there are caption tokens, so that
            # output step t is scored against caption token t. The length is
            # read from the batch instead of being hard-coded.
            seq_length = captions.size(1)
            outputs = outputs[:, :seq_length, :]

            # Flatten batch and time axes for the loss:
            # scores -> [batch_size * seq_length, vocab_size], targets -> [batch_size * seq_length]
            loss = criterion(
                outputs.reshape(-1, outputs.size(-1)),
                captions.reshape(-1),
            )

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            # Accumulate the loss for the epoch average
            running_loss += loss.item()
            num_batches += 1

        # Mean loss per batch; counting batches avoids relying on
        # len(dataloader) and guards against division by zero when it is empty
        epoch_loss = running_loss / max(num_batches, 1)
        print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}')

        # Save a checkpoint every 5 epochs
        if (epoch + 1) % 5 == 0:
            torch.save(model.state_dict(), f'model_epoch_{epoch+1}.pth')

# 4. Launch training
num_epochs = 10
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

Epoch 1/10:   1%|▏         | 4/317 [03:13<4:11:30, 48.21s/it]

# Evaluar y Ajustar