In [4]:
import os
import re

import keras
import numpy as np
import pandas as pd
import PIL.Image
import tensorflow as tf
# import torch

# Load Images & Captions
- Load, resize, and normalize the images into a format that the CNN model can
  process efficiently.
- Choose a standard input size and a normalization scheme suited to the chosen CNN
  model.

In [8]:
# Read the caption table; the cells below use its 'image' and 'caption' columns.
captions_file_path = "data/captions.txt"
captions = pd.read_csv(filepath_or_buffer=captions_file_path)

# Preview the first five rows.
captions.head(n=5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [56]:
def load_next_batch(images_path, captions, batch_size, image_size=(224, 224)):
    """Lazily yield batches of preprocessed images for the unique names in ``captions``.

    Each distinct file name is opened once, converted to 3-channel RGB, resized
    to ``image_size`` and scaled to the range [0, 1].

    Parameters
    ----------
    images_path : str
        Directory containing the image files (a trailing slash is optional).
    captions : pandas.Series
        Image file names; duplicates are ignored, first-seen order is kept.
    batch_size : int
        Maximum number of images per yielded batch.
    image_size : tuple[int, int], optional
        Target ``(width, height)`` in pixels. Defaults to ``(224, 224)``.

    Yields
    ------
    numpy.ndarray
        ``float64`` array of shape ``(n, height, width, 3)`` where
        ``n <= batch_size``; only the final batch may be smaller.
    """
    # Deduplicate once: calling unique() per image rescans the whole series.
    image_names = captions.unique()
    width, height = image_size

    for start in range(0, image_names.size, batch_size):
        names = image_names[start:start + batch_size]
        batch = np.zeros((len(names), height, width, 3), dtype=np.float64)

        for j, image_name in enumerate(names):
            # The context manager releases the file handle as soon as the
            # pixels have been copied into the batch.
            with PIL.Image.open(os.path.join(images_path, image_name)) as image:
                # Force three channels so grayscale / RGBA / palette files
                # still fit the (height, width, 3) slot.
                resized = image.convert("RGB").resize((width, height))
                batch[j] = np.asarray(resized, dtype=np.float64) / 255.0

        yield batch

In [47]:
# Walk the whole image set in chunks of 1000 and report each chunk's shape.
images_path = "./data/Images/"
batch_generator = load_next_batch(images_path, captions['image'], batch_size=1000)

for batch in batch_generator:
    print(batch.shape)

(1000, 224, 224, 3)
(1000, 224, 224, 3)
(1000, 224, 224, 3)
(1000, 224, 224, 3)
(1000, 224, 224, 3)
(1000, 224, 224, 3)
(1000, 224, 224, 3)
(1000, 224, 224, 3)
(91, 224, 224, 3)


# Preprocessing Captions for RNN

## Normalising & Processing Captions

In [4]:
def _clean_caption(text):
    """Normalise one raw caption string.

    Steps: lower-case, drop everything except letters and spaces, collapse
    runs of spaces, trim, then remove single-character words (the word 'a'
    is kept unless it is the last word).
    """
    text = text.lower()                       # put into lower case
    text = re.sub(r"[^A-Za-z ]", "", text)    # remove non-alphabetic and non-space characters
    text = re.sub(r" +", " ", text).strip()   # collapse multiple spaces, trim both ends

    # Remove single-character words (except 'a') in the middle of the sentence.
    # The lookahead leaves the following space unconsumed, so consecutive
    # single-letter words ("x y z") are all removed rather than every other one.
    text = re.sub(r" [^a](?= )", "", text)
    # Remove a leading single-character word (except 'a') and any trailing one.
    text = re.sub(r"(^[^a] | .$)", "", text)
    return text


# Wrap every caption in explicit start / end marker tokens.
processed_captions = "begintag " + captions['caption'].apply(_clean_caption) + " endtag"

processed_captions

0        begintag a child in a pink dress is climbing u...
1        begintag a girl going into a wooden building e...
2        begintag a little girl climbing into a wooden ...
3        begintag a little girl climbing the stairs to ...
4        begintag a little girl in a pink dress going i...
                               ...                        
40450    begintag a man in a pink shirt climbs a rock f...
40451    begintag a man is rock climbing high in the ai...
40452    begintag a person in a red shirt climbing up a...
40453        begintag a rock climber in a red shirt endtag
40454    begintag a rock climber practices on a rock cl...
Name: caption, Length: 40455, dtype: object

## Tokenize Captions & Build Vocabulary

In [6]:
# Build the vocabulary from the normalised captions.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(processed_captions)

# Encode every caption as a list of integer token ids.
sequences = tokenizer.texts_to_sequences(processed_captions)

# The longest caption (in tokens) sets the padded width.
max_length = max(map(len, sequences))

# padding='post' pads shorter captions at the end, after their tokens.
sequences = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')

sequences.shape

(40455, 37)

## Write Token Indices Into a JSON File to View

In [7]:
import json

# Persist the word -> index mapping so it can be inspected outside the notebook.
with open('word_index.json', 'w') as json_file:
    serialized = json.dumps(tokenizer.word_index)
    json_file.write(serialized)

# Preparing Output Labels

In [7]:
# Targets are the input sequences shifted left by one position: column j of
# `next_word` holds the token at position j + 1 of `sequences`.
# A vectorized slice replaces the element-by-element Python loop; the cast
# keeps the float64 dtype that np.zeros produced before.
next_word = sequences[:, 1:].astype(np.float64)

In [124]:
images_path = "./data/Images/"

# ImageNet-pretrained ResNet50V2 built without its classification head
# (include_top=False), for 224x224 RGB inputs.
resnet = keras.applications.resnet_v2.ResNet50V2(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# do something here with the top layers of the model

resnet.compile(loss='categorical_crossentropy')

# Pull only the first batch of 200 images from the generator.
batch = next(load_next_batch(images_path, captions['image'], 200))

In [96]:
# Run the network over the sample batch.
prediction: np.ndarray = resnet.predict(batch)

# Indices of all non-zero output entries (one index array per dimension).
np.nonzero(prediction)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2s/step


In [107]:
image_index = 1

# argsort is ascending, so the last five entries are the five highest scores.
indices = np.argsort(prediction[image_index])
indices[-5:]

# NOTE(review): decode_predictions expects per-class ImageNet scores, while the
# ResNet50V2 in this notebook is created with include_top=False — confirm that
# `prediction` comes from a model that still has its classification head.
keras.applications.resnet.decode_predictions(prediction, top=5)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/imagenet_class_index.json
[1m35363/35363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2us/step


[[('n03467068', 'guillotine', 0.858231),
  ('n03776460', 'mobile_home', 0.02827406),
  ('n03899768', 'patio', 0.026914496),
  ('n03697007', 'lumbermill', 0.01383134),
  ('n02727426', 'apiary', 0.009180675)],
 [('n02087046', 'toy_terrier', 0.31183186),
  ('n02091032', 'Italian_greyhound', 0.26057705),
  ('n02089867', 'Walker_hound', 0.1668409),
  ('n02100236', 'German_short-haired_pointer', 0.102685325),
  ('n02096585', 'Boston_bull', 0.05993147)],
 [('n04026417', 'purse', 0.30360505),
  ('n03709823', 'mailbag', 0.09338272),
  ('n02110958', 'pug', 0.0662199),
  ('n02085620', 'Chihuahua', 0.051686347),
  ('n04398044', 'teapot', 0.045142654)],
 [('n02087046', 'toy_terrier', 0.9063482),
  ('n02085620', 'Chihuahua', 0.062341396),
  ('n02096585', 'Boston_bull', 0.02767644),
  ('n02110806', 'basenji', 0.0016489134),
  ('n02113186', 'Cardigan', 0.0007351666)],
 [('n09835506', 'ballplayer', 0.20567855),
  ('n03124170', 'cowboy_hat', 0.15766764),
  ('n03763968', 'military_uniform', 0.086610205),

In [114]:
image_index = 9

# Undo the [0, 1] scaling and go back to 8-bit pixel values for display.
img = (batch[image_index] * 255).astype(np.uint8)

image = PIL.Image.fromarray(img)
image.show()