In [1]:
import pandas as pd
import PIL.Image
# import torch
import numpy as np
import tensorflow as tf
import re

# Load Images & Captions
• Load, resize and normalize the images to a suitable format that can be efficiently
processed by the CNN model.\
• You should choose a standard size and normalization suitable for your CNN
model.

In [2]:
captions_file_path = "data/captions.txt"
captions = pd.read_csv(captions_file_path)
captions.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [3]:
def load_next_batch(images_path, captions, batch_size):
    for i in range(0, captions.unique().size, batch_size):
        batch = np.zeros((min(batch_size, captions.unique().size - i), 224, 224, 3), dtype=np.uint8)
        for j in range(min(batch_size, captions.unique().size - i)):
            image_name = captions.unique()[i + j]
            image = PIL.Image.open(images_path + image_name)
            image = image.resize((224, 224))
            image = np.array(image)
            batch[j] = image

        yield tf.convert_to_tensor(batch)

In [4]:
images_path = "./data/Images/"
batch_generator = load_next_batch(images_path, captions['image'], 1000)

for batch in batch_generator:
    print(batch.shape)

KeyboardInterrupt: 

# Preprocessing Captions for RNN

## Normalising & Processing Captions

In [9]:
processed_captions = captions['caption'].apply(lambda x: x.lower())                    # put into lower case
processed_captions = processed_captions.apply(lambda x: re.sub(r"[^A-Za-z ]", "", x))  # remove non-alphabetic and non-space characters
processed_captions = processed_captions.apply(lambda x: re.sub(r" +", " ", x))         # remove multiple spaces
processed_captions = processed_captions.apply(lambda x: x.strip())                     # remove leading and trailing spaces

# remove single character words except for 'a' in the middle or start of sentence
processed_captions = processed_captions.apply(lambda x: re.sub(r" [^a] ", " ", x))
processed_captions = processed_captions.apply(lambda x: re.sub(r"(^[^a] | .$)", "", x))

processed_captions = "begintag " + processed_captions + " endtag"

processed_captions

0        begintag a child in a pink dress is climbing u...
1        begintag a girl going into a wooden building e...
2        begintag a little girl climbing into a wooden ...
3        begintag a little girl climbing the stairs to ...
4        begintag a little girl in a pink dress going i...
                               ...                        
40450    begintag a man in a pink shirt climbs a rock f...
40451    begintag a man is rock climbing high in the ai...
40452    begintag a person in a red shirt climbing up a...
40453        begintag a rock climber in a red shirt endtag
40454    begintag a rock climber practices on a rock cl...
Name: caption, Length: 40455, dtype: object

## Tokenize Captions & Build Vocabulary

In [46]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(processed_captions)

sequences = tokenizer.texts_to_sequences(processed_captions)

import functools
max_length = functools.reduce(lambda x, maximum: max(maximum, x), map(len, sequences))
del functools

for sequence in sequences:
    sequence.extend([3 for _ in range(max_length - len(sequence))])

sequences

[[2,
  1,
  42,
  4,
  1,
  90,
  169,
  7,
  119,
  53,
  1,
  394,
  12,
  391,
  4,
  28,
  5193,
  692,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 [2,
  1,
  19,
  313,
  64,
  1,
  193,
  117,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 [2,
  1,
  40,
  19,
  119,
  64,
  1,
  193,
  2423,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 [2,
  1,
  40,
  19,
  119,
  5,
  391,
  20,
  60,
  2423,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 [2,
  1,
  40,
  19,
  4,
  1,
  90,
  169,
  313,
  64,
  1,
  193,
  2985,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 [2,
  1,


## Write Token Indices Into a JSON File to View

In [7]:
import json

with open('word_index.json', 'w') as json_file:
    json.dump(tokenizer.word_index, json_file)

In [42]:
x = [[1], [2, 3], [4, 5, 6]]

list(map(lambda z: z.extend([0 for _ in range(3 - len(z))]), x))

[None, None, None]