In [None]:
# Mount Google Drive (Colab only) so the caption file and image folder that the
# later cells read from under /content/drive are accessible.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader, Sampler
from torchvision import transforms, utils

import torchvision.models as models


from skimage import io, transform

import matplotlib.pyplot as plt # for plotting
import numpy as np

import matplotlib.pyplot as plt
import numpy as np
import sys
import os
from PIL import Image
import numpy
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from collections import Counter, defaultdict

### Image Transforms

In [None]:
class Rescale(object):
    """Resize an image array to a requested size.

    Args:
        output_size (tuple or int): Requested size. A tuple is used directly
            as ``(new_height, new_width)``. An int is assigned to the shorter
            image edge and the longer edge is scaled so the aspect ratio is
            kept.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def _target_shape(self, height, width):
        """Return the integer ``(rows, cols)`` the image is resized to."""
        requested = self.output_size
        if not isinstance(requested, int):
            rows, cols = requested
        elif height > width:
            # Portrait: width is the shorter edge and receives `requested`.
            rows, cols = requested * height / width, requested
        else:
            # Landscape or square: height receives `requested`.
            rows, cols = requested, requested * width / height
        return int(rows), int(cols)

    def __call__(self, image):
        """Resize ``image`` (array whose first two axes are height, width).

        The image is cast to float32 before being handed to
        ``transform.resize``; the resized array is returned.
        """
        height, width = image.shape[:2]
        target_shape = self._target_shape(height, width)
        as_float = image.astype(np.float32)
        return transform.resize(as_float, target_shape)


class ToTensor(object):
    """Cast an H x W x C image array to float32 and reorder it to C x H x W.

    Despite the name, the result is still a numpy ndarray (no torch tensor is
    created here).
    """

    def __call__(self, image):
        # numpy image layout: H x W x C  ->  torch image layout: C x H x W
        channels_first = image.astype(np.float32).transpose((2, 0, 1))
        return channels_first

# Target (height, width) passed to Rescale; being a tuple, it is used as-is
# (aspect ratio is not preserved). 224 / 2**5 = 7, which is the 7x7 feature map
# std_EncoderCNN's fc6 layer (7*7*512 inputs) expects after its five poolings.
IMAGE_RESIZE = (224, 224)
# Sequentially compose the transforms: resize first, then float32 + C x H x W reorder
img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor()])


### Captions Preprocessing

In [None]:
class CaptionsPreprocessing:
    """Preprocess the captions, generate vocabulary and convert words to tensor tokens

    Args:
        captions_file_path (string): captions tsv file path

    Attributes set by ``__init__``:
        raw_captions_dict: ``{image_id (int): [raw caption strings]}``
        captions_dict: ``{(image_id, caption_index): [tokens]}`` where every
            token list is wrapped in ``"<SS>"`` / ``"<ES>"`` markers
        vocab: ``{token: index}`` with the unknown token ``"<UN>"`` at index 0
    """
    def __init__(self, captions_file_path):
        self.captions_file_path = captions_file_path

        # Read raw captions
        self.raw_captions_dict = self.read_raw_captions()

        # Preprocess captions
        self.captions_dict = self.process_captions()

        # Create vocabulary
        self.vocab = self.generate_vocabulary()

    def read_raw_captions(self):
        """Parse the TSV file: first column is the image id, the rest are captions.

        Blank lines (e.g. a trailing newline at the end of the file) are
        skipped instead of crashing on ``int('')``.

        Returns:
            Dictionary with raw captions list keyed by image ids (integers)
        """
        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f:
                img_captions = img_caption_line.strip().split('\t')
                if not img_captions[0]:
                    continue
                captions_dict[int(img_captions[0])] = img_captions[1:]

        return captions_dict

    def process_captions(self):
        """Lower-case and tokenize every raw caption.

        Returns:
            Dictionary keyed by ``(image_id, caption_index)`` whose values are
            token lists of the form ``["<SS>", ..., "<ES>"]``; each caption of
            an image gets its own entry.
        """
        captions_dict = {}
        for key, raw_captions in self.raw_captions_dict.items():
            for i, caption in enumerate(raw_captions):
                captions_dict[(key, i)] = ["<SS>"] + word_tokenize(caption.lower()) + ["<ES>"]
        return captions_dict

    def generate_vocabulary(self):
        """Build the token -> index mapping from all processed captions.

        Index 0 is reserved for the unknown token ``"<UN>"``; the remaining
        tokens follow in descending order of frequency.

        Returns:
            Dictionary mapping each token (string) to its integer index
        """
        all_captions = self.captions_dict.values()
        token_counts = Counter(token for caption in all_captions for token in caption)
        # <UN> unknown token starting in dict indexed at 0
        all_tokens = ["<UN>"] + [entry[0] for entry in token_counts.most_common()]
        # Generate the vocabulary
        vocab = {token: index for index, token in enumerate(all_tokens)}
        return vocab

    def captions_transform(self, img_caption):
        """Convert one tokenized caption into a 1-D tensor of vocabulary indices.

        Tokens that are missing from the vocabulary are mapped to the index of
        ``"<UN>"`` rather than raising ``KeyError``.

        Args:
            img_caption: list of tokens (strings) of a single caption

        Returns:
            ``torch.long`` tensor of shape ``(len(img_caption),)``
        """
        vocab = self.vocab
        unknown_index = vocab.get("<UN>", 0)
        token_ids = [vocab.get(token, unknown_index) for token in img_caption]
        return torch.tensor(token_ids, dtype=torch.long)

# Set the captions tsv file path
# NOTE(review): the file is named "public_test_captions" but the resulting
# captions are used to build the training dataset below — confirm this is the
# intended split.
CAPTIONS_FILE_PATH = '/content/drive/My Drive/Colab Notebooks/caption data/public_test_captions.tsv'
# Reads the file, tokenizes every caption and builds the vocabulary in __init__.
captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)

In [None]:
# Sanity checks on the preprocessing result.
# NOTE: this prints the complete token -> index dictionary, which can be long.
print(captions_preprocessing_obj.vocab)
# captions_preprocessing_obj.captions_transform(captions_preprocessing_obj.captions_dict[(257,3)])
# Number of image ids read from the file (one raw_captions_dict entry per id)
print('row caption : ',len(captions_preprocessing_obj.raw_captions_dict))
# Number of individual captions (one captions_dict entry per (image id, caption index))
print('total caption : ',len(captions_preprocessing_obj.captions_dict))
# Number of distinct tokens, including the reserved "<UN>" entry
print('vocab size : ',len(captions_preprocessing_obj.vocab))


### Dataset Class

In [None]:
class ImageCaptionsDataset(Dataset):
    """Dataset yielding one ``{'image', 'caption'}`` sample per caption."""

    def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images, stored as
                ``image_<id>.jpg``.
            captions_dict: Dictionary of captions keyed by 2-item keys whose
                first item is the image id (``__getitem__`` unpacks every key
                into two values and uses the first to build the file name).
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.
            captions_transform (callable, optional): Optional transform to be
                applied on the caption sample.
        """
        self.img_dir = img_dir
        self.captions_dict = captions_dict
        self.img_transform = img_transform
        self.captions_transform = captions_transform

        # One entry per caption; note these are the dictionary keys, not bare image ids.
        self.image_ids = list(captions_dict.keys())

    def __len__(self):
        """Number of samples (= number of keys in ``captions_dict``)."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load the image and caption belonging to the ``idx``-th key."""
        sample_key = self.image_ids[idx]
        image_id, _caption_index = sample_key

        image_file = os.path.join(self.img_dir, 'image_{}.jpg'.format(image_id))
        image = io.imread(image_file)
        caption = self.captions_dict[sample_key]

        # Transforms are optional; anything falsy leaves the value untouched.
        image = self.img_transform(image) if self.img_transform else image
        caption = self.captions_transform(caption) if self.captions_transform else caption

        return {'image': image, 'caption': caption}

In [None]:
class std_EncoderCNN(nn.Module):
    """Image encoder with a VGG-16-style layout, trained from scratch.

    Five stages of 3x3 convolutions (2, 2, 3, 3, 3 layers with 64, 128, 256,
    512, 512 channels), each followed by 2x2 max pooling, then three fully
    connected layers ending in ``embed_size`` features.

    Args:
        embed_size (int): size of the feature vector produced per image
    """

    def __init__(self, embed_size):
        super().__init__()
        # TODO (from the original author): replace with a pretrained CNN and
        # use transfer learning.
        # conv layers: (in_channel size, out_channels size, kernel_size, padding)
        # stage 1
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        # stage 2
        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        # stage 3
        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        # stage 4
        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        # stage 5
        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)

        # max pooling (kernel_size, stride), shared by all stages
        self.pool = nn.MaxPool2d(2, 2)

        # fully connected head; 7*7*512 is the flattened map for 224x224 input
        self.fc6 = nn.Linear(7*7*512, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, embed_size)

    def forward(self, x, training=True):
        """Encode a batch of images.

        Args:
            x: image batch with 3 channels; the flatten step requires the
                final feature map to hold 7*7*512 values per image.
            training (bool): whether dropout (p=0.5) is applied after fc6/fc7.
                This flag, not ``self.training``, controls dropout.

        Returns:
            Tensor with ``embed_size`` features per row.
        """
        conv_stages = (
            (self.conv1_1, self.conv1_2),
            (self.conv2_1, self.conv2_2),
            (self.conv3_1, self.conv3_2, self.conv3_3),
            (self.conv4_1, self.conv4_2, self.conv4_3),
            (self.conv5_1, self.conv5_2, self.conv5_3),
        )
        for stage in conv_stages:
            for conv in stage:
                x = F.relu(conv(x))
            x = self.pool(x)

        x = x.view(-1, 7 * 7 * 512)
        for hidden_fc in (self.fc6, self.fc7):
            x = F.dropout(F.relu(hidden_fc(x)), 0.5, training=training)
        return self.fc8(x)

In [None]:
class EncoderCNN(nn.Module):
    """Custom image encoder built from nine 5x5 convolutions and two FC layers.

    Args:
        embed_size (int): size of the feature vector produced per image
    """

    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        # TODO (from the original author): replace with a pretrained CNN and
        # use transfer learning.
        # conv layers: (in_channel size, out_channels size, kernel_size, padding)
        # Trailing comments give the spatial size for a 224x224 input.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5, padding = 1)     # 222
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding = 1)    # 220
        self.conv3 = nn.Conv2d(64, 128, kernel_size=5, padding = 2)   # 220
        # maxpool applied (220 -> 110)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=5, padding = 1)  # 108
        self.conv5 = nn.Conv2d(256, 256, kernel_size=5, padding = 2)  # 108
        # maxpool applied (108 -> 54)
        self.conv6 = nn.Conv2d(256, 512, kernel_size=5, padding = 1)  # 52
        self.conv7 = nn.Conv2d(512, 512, kernel_size=5)               # 48
        # maxpool applied (48 -> 24)
        self.conv8 = nn.Conv2d(512, 1024, kernel_size=5, padding = 1) # 22
        self.conv9 = nn.Conv2d(1024, 1024, kernel_size=5)             # 18
        # maxpool applied (18 -> 9)

        self.pool = nn.MaxPool2d(2, 2)

        self.fc1 = nn.Linear(7*7*1024, 4096)
        self.fc2 = nn.Linear(4096, embed_size)

    def forward(self, x, training=True):
        """Encode a batch of images.

        Args:
            x: image batch of shape (batch, 3, H, W).
            training (bool): whether dropout (p=0.5) is applied after fc1.

        Returns:
            Tensor of shape (batch, embed_size).
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = self.pool(x)
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv7(x))
        x = self.pool(x)
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv9(x))
        x = self.pool(x)
        # The conv stack leaves a 9x9 map for 224x224 input, but fc1 is sized
        # for 7x7; pool adaptively so the flattened size always matches fc1.
        x = F.adaptive_avg_pool2d(x, (7, 7))
        # Flatten per sample; the batch dimension must be preserved.
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5, training=training)
        x = self.fc2(x)
        return x

In [None]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder that is primed with the image feature vector.

    Args:
        embed_size (int): size of token embeddings (and of the image features)
        hidden_size (int): LSTM hidden state size
        vocab_size (int): number of tokens in the vocabulary
        num_layers (int): number of stacked LSTM layers
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        """Score the vocabulary at every step of the sequence.

        Args:
            features: image features, (batch, embed_size)
            captions: token indices, (batch, seq_len)

        Returns:
            Tensor of shape (seq_len + 1, batch, vocab_size): the image
            feature occupies the first time step, followed by the tokens.
        """
        # (batch, seq, embed) -> (seq, batch, embed): the LSTM is time-major.
        token_steps = self.embed(captions).permute(1, 0, 2)
        image_step = features.unsqueeze(0)
        lstm_input = torch.cat((image_step, token_steps), dim=0)
        hidden_steps, _ = self.lstm(lstm_input)
        return self.linear(hidden_steps)


### Model Architecture

In [None]:
class ImageCaptionsNet(nn.Module):
    """Full captioning model: CNN image encoder followed by an LSTM decoder.

    The vocabulary size is read from the module-level
    ``captions_preprocessing_obj`` when the model is constructed.
    """

    def __init__(self):
        super(ImageCaptionsNet, self).__init__()
        embed_size = 512
        vocab_size = len(captions_preprocessing_obj.vocab)
        hidden_size = 2048 # it will be changed to (2/3)*(embed_size + vocab_size)
        num_layers = 2
        self.encoderCNN = std_EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, x):
        """Run encoder and decoder on one batch.

        Args:
            x: tuple ``(image_batch, captions_batch)``

        Returns:
            The decoder output for the batch.
        """
        image_batch, captions_batch = x
        # Call the sub-modules (not .forward directly) so registered hooks run,
        # and forward the train/eval mode so the encoder's dropout is switched
        # off after model.eval().
        features = self.encoderCNN(image_batch, training=self.training)
        outputs = self.decoderRNN(features, captions_batch)
        return outputs


### Length-Bucketed Batch Sampler

In [None]:
class Sampler(Sampler):
    """Batch sampler that groups sample indices by caption length.

    Every yielded batch is a list of dataset indices whose captions all have
    the same length, so the captions of a batch can be stacked without padding.
    Intended to be passed to ``DataLoader`` as ``batch_sampler``.

    NOTE: the class deliberately keeps the name ``Sampler`` (later cells use
    it), which shadows the base class imported from ``torch.utils.data``.

    Args:
        dataset: indexable dataset whose items are dicts with a ``'caption'``
            entry that supports ``len()``
        batch_size (int): maximum number of indices per batch (>= 1)

    Raises:
        ValueError: if ``batch_size`` is smaller than 1
    """

    def __init__(self, dataset, batch_size):
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1, got {}".format(batch_size))
        self.total_indices = len(dataset)
        self.sampler_dict = self.genrate_dict(dataset)
        self.batch_size = batch_size
        self.batches_list = self.create_batch_lists()

    def genrate_dict(self, dataset):
        """Return ``{caption_length: [dataset indices]}``.

        NOTE: every item is fetched through ``dataset[i]``, so a dataset that
        loads its image in ``__getitem__`` reads each image once here.
        """
        sampler_dict_p = defaultdict(list)
        for i in range(len(dataset)):
            # append keeps this linear; the buckets stay in first-seen order
            sampler_dict_p[len(dataset[i]['caption'])].append(i)
        return dict(sampler_dict_p)

    def __len__(self):
        """Number of batches produced per epoch.

        ``DataLoader`` reports ``len(batch_sampler)`` as its own length, so
        this must count batches, not individual samples.
        """
        return len(self.batches_list)

    def create_batch_lists(self):
        """Split every length bucket into chunks of at most ``batch_size``."""
        ret_list = []
        for indices in self.sampler_dict.values():
            ret_list.extend(
                indices[b:b + self.batch_size]
                for b in range(0, len(indices), self.batch_size)
            )
        return ret_list

    def __iter__(self):
        """Yield the pre-computed batches in a fixed order (no shuffling)."""
        yield from self.batches_list

**Sampler and Data Loader**

In [None]:
# Change the working directory to the Drive root (IPython magic). The paths
# used below are absolute.
%cd /content/drive/My\ Drive/
# Folder holding the images; ImageCaptionsDataset looks for image_<id>.jpg in it.
IMAGE_DIR = '/content/drive/My Drive/Colab Notebooks/public_test_images'

# Creating the Dataset
# One sample per caption: captions_dict is keyed by (image id, caption index).
train_dataset = ImageCaptionsDataset(
    img_dir = IMAGE_DIR, 
    captions_dict = captions_preprocessing_obj.captions_dict, 
    img_transform = img_transform,
    captions_transform = captions_preprocessing_obj.captions_transform
)

# Define your hyperparameters
NUMBER_OF_EPOCHS = 3
LEARNING_RATE = 1e-1
BATCH_SIZE = 32    # upper bound; the sampler's last batch per caption length may be smaller
NUM_WORKERS = 8    # DataLoader worker processes

# Groups dataset indices by caption length so every batch holds equal-length
# captions. NOTE: building it indexes every dataset item once, which loads
# every image, so this line can take a while.
my_sampler = Sampler(train_dataset,BATCH_SIZE)
# for key in my_sampler.sampler_dict.keys():
#     print(key,' : ',my_sampler.sampler_dict[key])

# my_sampler yields whole batches (lists of indices), hence batch_sampler=...;
# batch order is fixed by the sampler, so shuffle stays False.
train_loader = DataLoader(train_dataset, shuffle=False, batch_sampler = my_sampler, num_workers=NUM_WORKERS)
# train_loader = DataLoader(train_dataset, shuffle=False, batch_size = BATCH_SIZE, sampler = sampler_dict, num_workers=NUM_WORKERS)

### Training Loop

In [None]:
model = ImageCaptionsNet()
device = torch.device("cuda:0")
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
loss_function = nn.CrossEntropyLoss()
print("GPU avalible" , torch.cuda.device_count())
is_parallel = False
if torch.cuda.device_count() >= 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # NOTE(review): the decoder output is time-major (seq, batch, vocab) while
    # DataParallel gathers replica outputs along dim 0 by default — verify the
    # output shape before running on more than one GPU.
    model = nn.DataParallel(model)
    model.to(device)
    is_parallel = True

# Make the training mode explicit (enables dropout in the encoder).
model.train()

for epoch in range(NUMBER_OF_EPOCHS):
    for batch_idx, sample in enumerate(train_loader):
        model.zero_grad()

        image_batch, captions_batch = sample['image'], sample['caption']
        # Move the batch to the GPU only when the model lives there.
        if is_parallel:
            image_batch = image_batch.to(device)
            captions_batch = captions_batch.to(device)
        output_captions = model((image_batch,captions_batch))
        # The decoder sees [image feature, w_0, ..., w_{T-1}]; the output at
        # step t is scored against token w_t, so the output of the final step
        # (which has no target) is dropped.
        output_captions = output_captions[:-1]
        # (batch, seq) -> (seq, batch) to line up with the time-major outputs
        captions_batch =  captions_batch.t()
        loss = loss_function(output_captions.reshape(-1,output_captions.shape[2]), captions_batch.reshape(-1))
        print(batch_idx,' | ',loss.item())
        # Plain backward(): passing the loss itself as the `gradient` argument
        # would multiply every gradient by the loss value.
        loss.backward()
        optimizer.step()

    print("Iteration: " + str(epoch + 1))

**Rough work**



In [None]:
# Scratch cell: show one image before and after img_transform.
%cd /content/drive/My\ Drive/
directory='/content/drive/My Drive/Colab Notebooks/public_test_images'
# Number of entries in the image folder (not used further in this cell)
l = len(os.listdir(directory))

path = os.path.join(directory, "image_280.jpg")
img = Image.open(path)
# Original image
plt.imshow(img)
plt.show()
# img_transform works on numpy arrays (it reads .shape), so convert the PIL image first
img = np.array(img)
print(img.shape)
img = img_transform(img)
# img_transform returns C x H x W; matplotlib expects H x W x C
img = img.transpose((1,2,0))
# NOTE(review): Rescale casts to float32 without rescaling the values, so this
# array is presumably still in the 0..255 range while imshow expects floats in
# 0..1 — confirm the picture is not rendered clipped.
plt.imshow(img)
plt.show()

In [None]:
%cd /content/drive/My\ Drive/
directory='/content/drive/My Drive/Colab Notebooks/public_test_images'

path = os.path.join(directory, "image_280")
img = Image.open(path)
img = img_transform(img)
plt.imshow(img)
plt.show()

# process and load images feature vector
def process_nDload_images(dir, batch_size, start, end):
    """
    load all the images's feature vector in numpy array indexed by their respective id.
    """
    # loads images belongs to dataset located in directory 'dir'.
    image_names = os.listdir(dir)
    image_names = np.array(image_names)
    feature_vector_size = 4096
    
    # Pre-allocate input-batch-array for images.
    batch = np.zeros(shape=(batch_size,224,224,3), dtype=np.float16)

    # Pre-allocate output-batch-feature-array for batch-images
    transfer_values = np.zeros(shape=(batch_size,feature_vector_size), dtype=np.float16)

    for i, imname in enumerate(image_names[start:end]):
        path = os.path.join(directory, imname)
        img = load_image(path,size=(224,224))
        batch[i] = img
    
    transfer_values = extract_features(batch)