<a href="https://colab.research.google.com/github/Kushi2407/Codsoft_1/blob/main/IMAGE_CAPTIONING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchvision import transforms
from torch.autograd import Variable
from PIL import Image
import numpy as np

# Download and load the pre-trained ResNet model, then strip its final
# classification layer so it acts as a fixed image feature extractor.
resnet = models.resnet152(pretrained=True)
modules = list(resnet.children())[:-1]
resnet = nn.Sequential(*modules)

# Freeze the encoder: no gradients are computed for its weights.
for param in resnet.parameters():
    param.requires_grad = False

# Freezing the weights is not enough on its own: in training mode BatchNorm
# layers still normalise with per-batch statistics and update their running
# averages. Eval mode makes the extracted features deterministic.
resnet.eval()

# Define the captioning model
class ImageCaptioningModel(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first time step, followed by
    the embedded caption tokens. Every LSTM output step is then projected to
    vocabulary logits.

    Args:
        embed_size: Dimension of the word embeddings (and of the LSTM input).
        hidden_size: Dimension of the LSTM hidden state.
        vocab_size: Number of tokens in the vocabulary.
        num_layers: Number of stacked LSTM layers.
        feature_size: Flattened size of raw image features that need to be
            projected down to ``embed_size``. The default of 2048 matches the
            pooled output of torchvision's ResNet-152.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1,
                 feature_size=2048):
        super().__init__()
        # Maps raw encoder features into the word-embedding space so they can
        # be concatenated with the caption embeddings.
        self.feature_proj = nn.Linear(feature_size, embed_size)
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions, lengths):
        """Compute vocabulary logits for every non-padded caption position.

        Args:
            features: Image features with the batch dimension first. Any
                trailing dimensions are flattened, so both ``(B, C)`` and
                ``(B, C, 1, 1)`` are accepted.
            captions: Token ids of shape ``(B, T)``.
            lengths: Per-sample sequence lengths, as required by
                ``pack_padded_sequence`` (sorted in decreasing order).

        Returns:
            Tensor of shape ``(sum(lengths), vocab_size)`` holding the logits
            in packed-sequence order.
        """
        # Collapse trailing dims, e.g. (B, C, 1, 1) -> (B, C); otherwise the
        # concatenation with the 3-D embeddings below cannot work.
        features = features.reshape(features.size(0), -1)
        # Features that already live in the embedding space are used as-is;
        # anything else goes through the learned projection.
        if features.size(1) != self.embed.embedding_dim:
            features = self.feature_proj(features)

        embeddings = self.embed(captions)
        # Prepend the image feature as the first step of the input sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        # hiddens[0] is the flat data tensor of the packed LSTM output.
        outputs = self.linear(hiddens[0])
        return outputs

# Define data preprocessing functions
def load_image(image_path, transform=None):
    """Load an image from disk as a 224x224 RGB image.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        transform: Optional callable applied to the resized image. Its result
            gets a leading batch dimension via ``unsqueeze(0)``.

    Returns:
        The resized PIL image when ``transform`` is None, otherwise the
        transformed result with a batch dimension of 1 prepended.
    """
    # Force three channels: grayscale or RGBA files would otherwise not match
    # a 3-channel normalisation. The context manager closes the file handle
    # once the pixel data has been copied by convert().
    with Image.open(image_path) as raw:
        image = raw.convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image

# Convert a PIL image to a tensor, then normalise each channel with a fixed
# per-channel mean and standard deviation.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

# Initialize the model and load pre-trained word embeddings (if any)
embed_size = 256
hidden_size = 512
vocab_size = len(vocab)  # You need to define your vocabulary
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size)

# Load pre-trained word embeddings if available
# word_embeddings = load_word_embeddings()
# model.embed.weight.data.copy_(torch.from_numpy(word_embeddings))

# Attach the pre-trained ResNet encoder so it is reachable as model.resnet.
# NOTE: as a registered submodule, its weights also become part of
# model.parameters() and model.state_dict().
model.resnet = resnet

# Set other hyperparameters
num_epochs = 5
batch_size = 64
learning_rate = 0.001

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
# Only hand trainable parameters to the optimizer; parameters with
# requires_grad=False (the attached ResNet) would never be updated anyway.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(data_loader):
        # Zero the gradients
        optimizer.zero_grad()

        # Pack the targets the same way the model packs its inputs, so the
        # flat targets line up element-for-element with the flat outputs.
        # (Tensors are used directly; the torch.autograd.Variable wrapper is
        # deprecated and no longer needed.)
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Forward pass. The image encoder is only used for feature
        # extraction, so no autograd graph is needed for it.
        with torch.no_grad():
            features = model.resnet(images)
        outputs = model(features, captions, lengths)

        # Calculate loss
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print log info
        if i % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, num_epochs, i, len(data_loader), loss.item()))

# Save the model
torch.save(model.state_dict(), 'image_captioning_model.pth')


Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /root/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth
100%|██████████| 230M/230M [00:03<00:00, 65.3MB/s]


NameError: ignored