In [None]:
pip install torch torchvision transformers nltk




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class TransformerCaptioningModel(nn.Module):
    """Encoder-decoder Transformer that turns image features into caption logits.

    The image features are projected to ``embed_dim`` and used as the encoder
    input; the caption tokens (plus a learned positional table) are the
    decoder input.

    Args:
        feature_dim: size of the incoming image feature vectors.
        embed_dim: model width shared by encoder and decoder.
        num_heads: number of attention heads.
        ff_hidden_dim: width of the feed-forward sub-layers.
        num_layers: number of encoder layers and of decoder layers.
        vocab_size: number of output classes (and of embeddable token ids).
        max_len: longest caption the positional table can cover.
    """

    def __init__(self, feature_dim, embed_dim, num_heads, ff_hidden_dim, num_layers, vocab_size, max_len):
        super(TransformerCaptioningModel, self).__init__()
        self.feature_embed = nn.Linear(feature_dim, embed_dim)
        # Token ids must be embedded before a positional table can be added to them.
        self.caption_embed = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers, ff_hidden_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.embed_dim = embed_dim

    def forward(self, features, captions):
        """Compute per-position vocabulary logits.

        Args:
            features: ``(batch, feature_dim)`` or ``(batch, src_len, feature_dim)``.
            captions: integer token ids of shape ``(batch, seq_len)``, or
                already-embedded floats of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(seq_len, batch, vocab_size)`` (sequence-first,
            the layout ``nn.Transformer`` uses when ``batch_first`` is left at
            its default).

        Raises:
            ValueError: on unsupported input ranks or when ``seq_len`` exceeds
                the positional table.
        """
        features = self.feature_embed(features)
        if features.dim() == 2:
            features = features.unsqueeze(0)  # (1, batch, embed_dim)
        elif features.dim() == 3:
            features = features.permute(1, 0, 2)  # (src_len, batch, embed_dim)
        else:
            raise ValueError(f"Unexpected feature dimensions after embedding: {features.dim()}")

        # Integer inputs are token ids; float inputs are taken as embeddings.
        if not torch.is_floating_point(captions):
            captions = self.caption_embed(captions)
        if captions.dim() != 3:
            raise ValueError(f"Unexpected captions dimensions: {captions.dim()}")

        seq_len = captions.size(1)
        table_len = self.positional_encoding.size(1)
        if seq_len > table_len:
            raise ValueError(f"Caption length {seq_len} exceeds max_len {table_len}")

        captions = captions + self.positional_encoding[:, :seq_len, :]
        # nn.Transformer is sequence-first here, so move batch to dim 1.
        captions = captions.permute(1, 0, 2)  # (seq_len, batch, embed_dim)

        # Causal mask: position t may only attend to positions <= t, otherwise
        # teacher-forced training can read the token it is asked to predict.
        causal_mask = self.transformer.generate_square_subsequent_mask(seq_len)
        causal_mask = causal_mask.to(device=captions.device, dtype=captions.dtype)

        out = self.transformer(features, captions, tgt_mask=causal_mask)
        return self.fc(out)

# Hyperparameters
feature_dim = 2048
embed_dim = 512
num_heads = 8
ff_hidden_dim = 2048
num_layers = 6
vocab_size = 10000  # Example vocabulary size, change accordingly
max_len = 50

# Compute device. It was previously used without ever being defined, which
# raises NameError on a fresh kernel.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model
model = TransformerCaptioningModel(feature_dim, embed_dim, num_heads, ff_hidden_dim, num_layers, vocab_size, max_len)
model = model.to(device)


In [None]:
def extract_features(image_path, model, transform):
    """Run one image through ``model`` and return its flattened features.

    Args:
        image_path: path of the image file to load.
        model: feature extractor; called with a ``(1, ...)`` image batch.
        transform: callable that turns the RGB PIL image into a tensor.

    Returns:
        Tensor of shape ``(1, N)``: the model output with every non-batch
        dimension flattened.
    """
    # Follow the model's own device rather than a notebook-level global, so the
    # input can never end up on a different device than the weights.
    first_param = next(model.parameters(), None) if isinstance(model, torch.nn.Module) else None
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # The context manager closes the file handle once the pixels are converted.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    image = transform(image).unsqueeze(0).to(target_device)

    # Extract in eval mode (BatchNorm/Dropout behave deterministically and no
    # running statistics are updated), then restore the caller's mode.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            features = model(image)
    finally:
        if was_training:
            model.train()

    return features.reshape(features.size(0), -1)

# Feature extractor and preprocessing used by extract_features. Neither name
# was defined anywhere in the notebook, so this cell failed on a fresh kernel.
# NOTE(review): the recorded output shows 2048-dim features, which is consistent
# with a ResNet-50 whose final fc layer is removed; the exact backbone and
# preprocessing of the original run are not recoverable from the notebook, so
# confirm these choices.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
resnet = nn.Sequential(*list(resnet.children())[:-1])  # drop the classification head
resnet = resnet.to(device).eval()

# Standard ImageNet preprocessing for torchvision classification backbones.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Example usage for the uploaded images
image_path_1 = '/content/1000268201_693b08cb0e.jpg'
image_path_2 = '/content/3637013_c675de7705.jpg'
features_1 = extract_features(image_path_1, resnet, transform)
features_2 = extract_features(image_path_2, resnet, transform)
print("Extracted Features Shape for Image 1:", features_1.shape)
print("Extracted Features Shape for Image 2:", features_2.shape)


Extracted Features Shape for Image 1: torch.Size([1, 2048])
Extracted Features Shape for Image 2: torch.Size([1, 2048])


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from collections import Counter

# NOTE(review): this class is defined in two cells of the notebook; after
# Run All the later definition is the one in effect. Consider keeping one copy.
class ImageCaptionDataset(Dataset):
    """Pairs each image file with its caption encoded as fixed-length token ids.

    Args:
        image_paths: sequence of image file paths.
        captions: sequence of caption strings, aligned with ``image_paths``.
        transform: callable applied to the RGB PIL image.
        vocab: mapping from word to integer id; must contain ``<pad>``,
            ``<start>``, ``<end>`` and ``<unk>``.
        max_len: length of every encoded caption, special tokens included.
    """

    def __init__(self, image_paths, captions, transform, vocab, max_len=50):
        self.image_paths = image_paths
        self.captions = captions
        self.transform = transform
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per image path)."""
        return len(self.image_paths)

    def _encode_caption(self, caption):
        """Map a caption string to exactly ``max_len`` token ids."""
        unknown_id = self.vocab['<unk>']
        word_ids = [self.vocab.get(word, unknown_id) for word in caption.split()]
        # Truncate the words, not the finished sequence, so that a long caption
        # still ends with <end> instead of having it sliced off.
        word_ids = word_ids[:max(self.max_len - 2, 0)]
        token_ids = [self.vocab['<start>']] + word_ids + [self.vocab['<end>']]
        token_ids += [self.vocab['<pad>']] * (self.max_len - len(token_ids))
        return token_ids[:self.max_len]

    def __getitem__(self, idx):
        """Return ``(transformed_image, caption_ids)`` for sample ``idx``."""
        # The context manager releases the file handle once the image is used.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = self.transform(raw_image.convert('RGB'))
        caption_ids = self._encode_caption(self.captions[idx])
        return image, torch.tensor(caption_ids, dtype=torch.long)

# Toy vocabulary: ids follow list order, special tokens first.
vocab = {
    token: index
    for index, token in enumerate(
        ['<pad>', '<start>', '<end>', '<unk>', 'a', 'child', 'playing', 'by', 'the', 'house']
    )
}

# One caption per example image; words missing from the vocabulary map to <unk>.
image_paths = [image_path_1, image_path_2]
captions = [
    'a child playing by the house',
    'a woman standing by a pond',
]

dataset = ImageCaptionDataset(image_paths, captions, transform, vocab)
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from collections import Counter

# NOTE(review): this class is defined in two cells of the notebook; after
# Run All this later definition is the one in effect. Consider keeping one copy.
class ImageCaptionDataset(Dataset):
    """Pairs each image file with its caption encoded as fixed-length token ids.

    Args:
        image_paths: sequence of image file paths.
        captions: sequence of caption strings, aligned with ``image_paths``.
        transform: callable applied to the RGB PIL image.
        vocab: mapping from word to integer id; must contain ``<pad>``,
            ``<start>``, ``<end>`` and ``<unk>``.
        max_len: length of every encoded caption, special tokens included.
    """

    def __init__(self, image_paths, captions, transform, vocab, max_len=50):
        self.image_paths = image_paths
        self.captions = captions
        self.transform = transform
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per image path)."""
        return len(self.image_paths)

    def _encode_caption(self, caption):
        """Map a caption string to exactly ``max_len`` token ids."""
        unknown_id = self.vocab['<unk>']
        word_ids = [self.vocab.get(word, unknown_id) for word in caption.split()]
        # Truncate the words, not the finished sequence, so that a long caption
        # still ends with <end> instead of having it sliced off.
        word_ids = word_ids[:max(self.max_len - 2, 0)]
        token_ids = [self.vocab['<start>']] + word_ids + [self.vocab['<end>']]
        token_ids += [self.vocab['<pad>']] * (self.max_len - len(token_ids))
        return token_ids[:self.max_len]

    def __getitem__(self, idx):
        """Return ``(transformed_image, caption_ids)`` for sample ``idx``."""
        # The context manager releases the file handle once the image is used.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = self.transform(raw_image.convert('RGB'))
        caption_ids = self._encode_caption(self.captions[idx])
        return image, torch.tensor(caption_ids, dtype=torch.long)

# Toy vocabulary: ids follow list order, special tokens first.
vocab = {
    token: index
    for index, token in enumerate(
        ['<pad>', '<start>', '<end>', '<unk>', 'a', 'child', 'playing', 'by', 'the', 'house']
    )
}

# One caption per example image; words missing from the vocabulary map to <unk>.
image_paths = [image_path_1, image_path_2]
captions = [
    'a child playing by the house',
    'a woman standing by a pond',
]

dataset = ImageCaptionDataset(image_paths, captions, transform, vocab)
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)


In [None]:
import torch
import torch.nn as nn

class CaptionGenerator(nn.Module):
    """Transformer captioner: image features in, per-token vocabulary logits out.

    Args:
        feature_dim: size of the incoming image feature vectors.
        embed_dim: model width shared by encoder and decoder.
        vocab_size: number of token ids / output classes.
        num_layers: number of encoder layers and of decoder layers.
        num_heads: number of attention heads.
        max_len: longest caption the learned positional table can cover
            (defaults to the previously hard-coded 5000).
    """

    def __init__(self, feature_dim, embed_dim, vocab_size, num_layers, num_heads, max_len=5000):
        super(CaptionGenerator, self).__init__()
        self.feature_embed = nn.Linear(feature_dim, embed_dim)
        self.caption_embed = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        # Pass num_layers to BOTH stacks; given positionally it only set the
        # encoder depth and left the decoder at nn.Transformer's default.
        self.transformer = nn.Transformer(
            embed_dim,
            num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
        )
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, features, captions):
        """Compute logits for every caption position.

        Args:
            features: ``(batch, feature_dim)`` or ``(batch, src_len, feature_dim)``;
                sparse tensors are densified first.
            captions: integer token ids of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(seq_len, batch, vocab_size)``.

        Raises:
            ValueError: on unsupported input ranks or when ``seq_len`` exceeds
                the positional table.
        """
        # Ensure features is a dense tensor
        if features.is_sparse:
            features = features.to_dense()

        features = self.feature_embed(features)
        if features.dim() == 2:  # (batch_size, embed_dim)
            features = features.unsqueeze(1)  # (batch_size, 1, embed_dim)
        elif features.dim() != 3:
            raise ValueError(f"Unexpected feature dimensions after embedding: {features.dim()}")

        seq_len = captions.size(1)
        table_len = self.positional_encoding.size(1)
        if seq_len > table_len:
            raise ValueError(f"Caption length {seq_len} exceeds max_len {table_len}")

        # (batch_size, seq_len, embed_dim)
        captions = self.caption_embed(captions) + self.positional_encoding[:, :seq_len, :]
        if captions.dim() != 3:
            raise ValueError(f"Unexpected captions dimensions: {captions.dim()}")

        # nn.Transformer is sequence-first here, so move batch to dim 1.
        features = features.permute(1, 0, 2)  # (src_len, batch_size, embed_dim)
        captions = captions.permute(1, 0, 2)  # (seq_len, batch_size, embed_dim)

        # Causal mask: position t may only attend to positions <= t, otherwise
        # teacher-forced training can read the token it is asked to predict.
        causal_mask = self.transformer.generate_square_subsequent_mask(seq_len)
        causal_mask = causal_mask.to(device=captions.device, dtype=captions.dtype)

        out = self.transformer(features, captions, tgt_mask=causal_mask)  # (seq_len, batch_size, embed_dim)
        out = self.fc(out)  # (seq_len, batch_size, vocab_size)

        return out

# Hyperparameters for the caption generator
feature_dim = 2048  # matches the width of the extracted image features
embed_dim = 512  # Example value
# Size the output layer from the vocabulary actually used for decoding, so that
# every index the model can predict corresponds to a real token.
vocab_size = max(vocab.values()) + 1
num_layers = 6  # Example value
num_heads = 8  # Example value
model = CaptionGenerator(feature_dim, embed_dim, vocab_size, num_layers, num_heads)
# Keep the weights on the same device that generate_caption sends its inputs to.
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


In [None]:
import torch

def generate_caption(model, features, vocab, max_len=20):
    """Greedily decode a caption for one image.

    Args:
        model: called as ``model(features, token_ids)`` and expected to return
            logits of shape ``(seq_len, 1, vocab_size)``.
        features: features of a single image; a leading dimension is added
            before they are passed to ``model``.
        vocab: mapping from word to integer id; must contain ``<start>`` and
            ``<end>``.
        max_len: maximum number of tokens to generate after ``<start>``.

    Returns:
        The generated words joined by spaces, without ``<start>``/``<end>``.
    """
    model.eval()

    # Keep the inputs on the model's own device; choosing by CUDA availability
    # alone breaks when the model was left on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # as_tensor avoids the copy-construct warning that torch.tensor(tensor) emits.
    features = torch.as_tensor(features).unsqueeze(0).to(device)  # Add batch dimension

    # Reverse lookup built once; setdefault keeps the first word for an id,
    # matching a first-match search through the vocabulary.
    index_to_word = {}
    for word, index in vocab.items():
        index_to_word.setdefault(index, word)

    end_id = vocab['<end>']
    caption = [vocab['<start>']]
    for _ in range(max_len):
        caption_tensor = torch.tensor(caption).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(features, caption_tensor)

        # output is [seq_len, batch_size, vocab_size]; take the last timestep.
        next_word = output[-1].argmax(dim=-1).item()

        # Stop if the model predicts an index that has no word.
        if next_word not in index_to_word:
            print(f"Warning: Generated index {next_word} not in vocabulary")
            break

        caption.append(next_word)
        if next_word == end_id:
            break

    # Drop <start>; drop the final token only when it really is <end>, so that
    # decoding which stops at max_len (or on a bad index) keeps its last word.
    generated = caption[1:]
    if generated and generated[-1] == end_id:
        generated = generated[:-1]
    return ' '.join(index_to_word[index] for index in generated)

# Decode a caption for each example image, in order, using the model, image
# features and vocabulary created in the cells above.
generated_caption_1, generated_caption_2 = [
    generate_caption(model, image_features, vocab)
    for image_features in (features_1, features_2)
]

for label, text in (
    ("Generated Caption for Image 1:", generated_caption_1),
    ("Generated Caption for Image 2:", generated_caption_2),
):
    print(label, text)


Initial features shape: torch.Size([1, 1, 2048])
Original features shape: torch.Size([1, 1, 2048])
Original captions shape: torch.Size([1, 1])
After embedding features shape: torch.Size([1, 1, 512])
After unsqueeze features shape: torch.Size([1, 1, 512])
Embedded captions shape: torch.Size([1, 1, 512])
Permuted features shape: torch.Size([1, 1, 512])
Permuted captions shape: torch.Size([1, 1, 512])
Output shape: torch.Size([1, 1, 10000])
Initial features shape: torch.Size([1, 1, 2048])
Original features shape: torch.Size([1, 1, 2048])
Original captions shape: torch.Size([1, 1])
After embedding features shape: torch.Size([1, 1, 512])
After unsqueeze features shape: torch.Size([1, 1, 512])
Embedded captions shape: torch.Size([1, 1, 512])
Permuted features shape: torch.Size([1, 1, 512])
Permuted captions shape: torch.Size([1, 1, 512])
Output shape: torch.Size([1, 1, 10000])
Generated Caption for Image 1: 
Generated Caption for Image 2: 


  features = torch.tensor(features).unsqueeze(0).to(device)  # Add batch dimension
