In [134]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import os
import numpy as np
import pandas as pd
import cv2
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets, models
import torch.optim as optim
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
#nltk.download('punkt')
import ast
import pickle

from transform import *
from custom_data import ImageCaptionDataset

In [141]:
# Load the caption table from 'data/captions.csv' into a DataFrame.
data = pd.read_csv('data/captions.csv')

In [144]:
# Parse each stringified caption list into a real Python list.
# Rows that are already lists are left untouched so that re-running this cell
# is a no-op instead of raising inside ast.literal_eval.
data.captions = data.captions.apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(x)
)

In [145]:
# Show the captions stored in the first row.
data['captions'].iloc[0]

[' Two young guys with shaggy hair look at their hands while hanging out in the yard .',
 ' Two young , White males are outside near many bushes .',
 ' Two men in green shirts are standing in a yard .',
 ' A man in a blue shirt standing in a garden .',
 ' Two friends enjoy time spent together .']

```
# Tokenize every caption: split on whitespace, then lowercase each token.
data.captions = data.captions.apply(
    lambda captions: [
        [word.lower() for word in caption.split()]
        for caption in captions
    ]
)
```

In [121]:
def create_vocab_file(col_serie_pandas):
    """Build the caption vocabulary and a token -> integer index mapping.

    Args:
        col_serie_pandas: pandas Series whose elements are iterables of
            caption strings (each element is joined with ``' '.join``).

    Returns:
        tuple: ``(vocabulary, token_map_integer)``.
            ``vocabulary`` is the list of tokens returned by ``word_tokenize``
            over all captions (original casing, duplicates kept).
            ``token_map_integer`` maps every unique lowercased token, plus the
            special tokens ``'<start>'`` and ``'<end>'``, to an integer index.
    """
    joined_rows = col_serie_pandas.apply(lambda captions: ' '.join(captions))
    all_text = ' '.join(list(joined_rows))
    vocabulary = word_tokenize(all_text)

    # Sort the unique tokens so the indices are identical on every run;
    # plain set iteration order for strings changes between interpreter runs,
    # which would make a previously pickled mapping inconsistent.
    unique_tokens = sorted({token.lower() for token in vocabulary})
    token_map_integer = {token: index for index, token in enumerate(unique_tokens)}

    # Special tokens take the next free indices so that all indices are
    # contiguous and non-negative. Negative ids cannot be used as embedding
    # indices or as class targets for a cross-entropy loss.
    token_map_integer['<start>'] = len(unique_tokens)
    token_map_integer['<end>'] = len(unique_tokens) + 1

    return vocabulary, token_map_integer

```
# Persist the token -> index mapping to disk.
# `token_map_integer` must already exist in the notebook namespace
# (create_vocab_file returns it as its second value).
# The context manager closes the file even if pickling raises.
with open("mapping.pkl", "wb") as f:
    pickle.dump(token_map_integer, f)
```

```
# Read the pickled mapping back from disk (presumably a round-trip check).
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('mapping.pkl', 'rb') as mapping_file:
    test = pickle.load(mapping_file)
```

In [None]:
# Turn the captions associated with each image into a list of word tokens.
#
# Iterate over all the training captions and collect the unique words. These,
# together with the special tokens <start> and <end>, are what the
# word -> integer dictionary has to cover.
#
# Each row of data.captions is a list of captions, so the rows are flattened
# before joining (str.join on the raw array of lists raises TypeError).
# A caption may be a plain string or an already-tokenized list of words.
# Joining with a space keeps words of adjacent captions from being glued together.
words = ' '.join(
    caption if isinstance(caption, str) else ' '.join(caption)
    for captions in data.captions
    for caption in captions
)
word_tokenized = word_tokenize(words.lower())
vocabulary = sorted(set(word_tokenized))  # sorted so the order is the same on every run

In [None]:
# Locations of the caption table and of the image folder
csv_file = 'data/captions.csv'
root_dir = 'data/flickr30k_images'

# DataLoader settings
batch_size, num_workers = 10, 4

# Transforms applied in order: Rescale(224), then Normalize(), then ToTensor()
transform = transforms.Compose([Rescale(224), Normalize(), ToTensor()])

In [None]:
valid_size = 0.3

def train_valid_split(training_set, validation_size, seed=None):
    """Split a dataset's indices into training and validation samplers.

    Args:
        training_set: any object supporting ``len()``.
        validation_size: fraction of the samples (between 0 and 1) assigned to
            the validation sampler.
        seed: optional integer seed. When given, the shuffle uses a private
            ``np.random.RandomState(seed)`` so the split is reproducible and
            NumPy's global random state is not consumed. When ``None``
            (default), the global NumPy random state is used as before.

    Returns:
        tuple: ``(train_sampler, valid_sampler)``, two ``SubsetRandomSampler``
        objects over disjoint index sets.

    Raises:
        ValueError: if ``validation_size`` is outside ``[0, 1]``.
    """
    if not 0 <= validation_size <= 1:
        raise ValueError(
            f"validation_size must be between 0 and 1, got {validation_size!r}"
        )

    # shuffle all the indices, then cut off the first `split` for validation
    num_train = len(training_set)
    indices = list(range(num_train))
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)
    split = int(np.floor(validation_size * num_train))
    train_idx, valid_idx = indices[split:], indices[:split]

    # define samplers for obtaining training and validation batches
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    return train_sampler, valid_sampler




# One dataset object backs both loaders; the samplers decide which indices
# each loader draws from.
train_set = ImageCaptionDataset(csv_file=csv_file, root_dir=root_dir, transform=transform)

train_sampler, valid_sampler = train_valid_split(train_set, valid_size)

# Loader over the training indices
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    sampler=train_sampler,
    num_workers=num_workers,
)

# Loader over the validation indices (same dataset, different sampler)
valid_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    sampler=valid_sampler,
    num_workers=num_workers,
)

In [None]:
plt.figure(figsize=(7,7))

# Fetch a single batch from the training loader
batch = next(iter(train_loader))

# Show the first sample of the batch (the loop runs exactly once)
for i in range(1):

    images, labels = batch['image'], batch['captions']

    # Convert to NumPy and move the first axis last
    # (assumes channels-first tensors -- TODO confirm); no un-normalization is done
    image = images[i].numpy().transpose(1, 2, 0)

    labels = labels[i]

    plt.imshow(np.squeeze(image), cmap='gray')
    display(labels)

In [None]:
from models import EncoderCNN

# Value passed to EncoderCNN; a later cell checks it against encoder.embed.out_features
embed_size = 50

train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print('CUDA available... Training on GPU')
else:
    print('CUDA not available... Training on CPU')


# Initialize the encoder and decoder
encoder = EncoderCNN(embed_size)
# The decoder is not implemented yet. A bare `decoder =` is a SyntaxError that
# prevents the whole cell (including the encoder creation) from running, so use
# an explicit placeholder until the decoder exists.
decoder = None  # TODO: instantiate the decoder once it is implemented

In [None]:
# Report the in/out feature counts of the encoder's `embed` layer and make sure
# the output size agrees with the configured embed_size.
print(f"input features: {encoder.embed.in_features}")
print(f"output features (embed_size): {encoder.embed.out_features}")
assert encoder.embed.out_features == embed_size, (
    "The embbeding size doesn't match the output size of the Encoder"
)

In [None]:
# Cross-entropy loss used as the training criterion
criterion = nn.CrossEntropyLoss()

# Move the encoder to the GPU when CUDA was detected
if train_on_gpu:
    encoder.cuda()