In [None]:
from IPython.display import clear_output, display

In [None]:
# %pip install torch torchvision pillow spacy numpy
# %pip install torchtext
# %pip install pycocotools

In [None]:
import os
import math
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import CocoCaptions
from torchtext.data.utils import get_tokenizer

from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

from PIL import Image
import spacy

In [None]:
# Name of the COCO 2017 split to work with. It is interpolated into the image
# archive URL, the extracted image folder and the captions annotation file name.
dataset_variant = "val2017"

# Contents

- In this notebook, you are required to build an image captioning model in PyTorch whose decoder is Transformer-based. Use a custom transformer class for your decoder rather than a pre-defined model architecture. You may use nn.TransformerDecoder and nn.TransformerDecoderLayer if you want.
- Use the COCO 2017 dataset. You don't need to use the entire train variant, as it can get too big.
- After training, show your model's performance by taking a few images, using the model to generate their captions, and then visualizing the images together with their captions.
- For image feature extraction, it's recommended to use a pre-trained model's features (not the final output, but intermediate layer outputs) as a backbone model (which will be our image encoder) and then feed them to the Transformer decoder. You can set requires_grad of the backbone model's parameters to False, as we don't need to train it, and doing so will improve the training speed significantly. A good example of a backbone model is ResNet.

## Downloading the data

In [None]:
# Define paths for the dataset images and the annotations.
data_dir = './data'
images_dir = os.path.join(data_dir, dataset_variant)
annotations_dir = os.path.join(data_dir, 'annotations')

# Create the directories if they don't exist. exist_ok=True keeps the cell
# idempotent on re-run and avoids the check-then-create race of a separate
# os.path.exists() test. makedirs also creates any missing parent directory.
for _path in (data_dir, images_dir, annotations_dir):
    os.makedirs(_path, exist_ok=True)

# Download dataset
!wget http://images.cocodataset.org/zips/{dataset_variant}.zip -P {data_dir}

# Unzip dataset
!unzip {data_dir}/{dataset_variant}.zip -d {data_dir}

clear_output()


In [None]:
# Download annotations
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P {annotations_dir}

# # Unzip annotations
!unzip {annotations_dir}/annotations_trainval2017.zip -d {annotations_dir}

clear_output()

## Loading the Dataset

In [None]:
# Preprocessing applied to every image: resize to a fixed 299x299, convert the
# PIL image to a tensor, then normalize each channel with the standard ImageNet
# mean / std used by torchvision's pre-trained models.
transform = transforms.Compose(
    [
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)

# Load the MS-COCO captions dataset. The paths are derived from images_dir /
# annotations_dir (defined in the download cell) instead of being hard-coded a
# second time, so changing data_dir in one place keeps everything consistent.
# The annotations zip is extracted into annotations_dir and the captions file
# is read from a nested 'annotations' folder inside it.
captions_file = os.path.join(
    annotations_dir, 'annotations', f'captions_{dataset_variant}.json'
)
train_dataset = CocoCaptions(root=images_dir, annFile=captions_file, transform=transform)

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


In [None]:
# You might need to create a collate function (or another custom dataset class that works on top of the CocoCaptions class) to handle certain
# processing steps for your dataset, such as handling captions of different lengths or converting text tokens to a numeric representation.

## Building the tokenizer and vocabulary

It's recommended to create a tokenizer and a vocabulary for your text corpus, but you can solve it your own way if you want.

A recommended text tokenizer is spaCy, but it's not mandatory. You can use another library or write your own custom tokenizer if you want.

## Defining the Model

## Training the model

## Visualizing the results