In [1]:
import pandas as pd
import PIL.Image
# import torch
import numpy as np
import keras
import re

import tensorflow as tf

# Load Images & Captions
• Load, resize and normalize the images to a suitable format that can be efficiently
processed by the CNN model.\
• You should choose a standard size and normalization suitable for your CNN
model.

In [2]:
captions_file_path = "data/captions.txt"
captions = pd.read_csv(captions_file_path)
captions.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [None]:
def read_all_images(images_path, captions):
    data = np.zeros((captions.unique().size, 224, 224, 3), dtype=np.float32)
    for i in range(captions.unique().size):
        image_name = captions.unique()[i]
        image = PIL.Image.open(images_path + image_name)
        image = image.resize((224, 224))
        image = np.array(image, dtype=np.float64)
        image /= 255.0
        data[i] = image

        if i % 100 == 0:
            print(i, end='\r')

    return data

In [None]:
images_path = "./data/Images/"
data = read_all_images(images_path, captions['image'])

# Preprocessing Captions for RNN

## Normalising & Processing Captions

In [3]:
processed_captions = captions['caption'].apply(lambda x: x.lower())                    # put into lower case
processed_captions = processed_captions.apply(lambda x: re.sub(r"[^A-Za-z ]", "", x))  # remove non-alphabetic and non-space characters
processed_captions = processed_captions.apply(lambda x: re.sub(r" +", " ", x))         # remove multiple spaces
processed_captions = processed_captions.apply(lambda x: x.strip())                     # remove leading and trailing spaces

# remove single character words except for 'a' in the middle or start of sentence
processed_captions = processed_captions.apply(lambda x: re.sub(r" [^a] ", " ", x))
processed_captions = processed_captions.apply(lambda x: re.sub(r"(^[^a] | .$)", "", x))

processed_captions = "begintag " + processed_captions + " endtag"

processed_captions

0        begintag a child in a pink dress is climbing u...
1        begintag a girl going into a wooden building e...
2        begintag a little girl climbing into a wooden ...
3        begintag a little girl climbing the stairs to ...
4        begintag a little girl in a pink dress going i...
                               ...                        
40450    begintag a man in a pink shirt climbs a rock f...
40451    begintag a man is rock climbing high in the ai...
40452    begintag a person in a red shirt climbing up a...
40453        begintag a rock climber in a red shirt endtag
40454    begintag a rock climber practices on a rock cl...
Name: caption, Length: 40455, dtype: object

## Tokenize Captions, Build Vocabulary & Prepare Output Labels

In [4]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(processed_captions)
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(processed_captions)

import functools
max_length = functools.reduce(lambda x, maximum: max(maximum, x), map(len, sequences))
del functools

# take only one caption for every image
sequences = [sequences[i] for i in range(len(sequences)) if i % 5 == 0]

caption_lengths = [len(sequence) for sequence in sequences]

next_word = np.array([sequences[i][j] for i in range(len(sequences)) for j in range(1, len(sequences[i]))])
next_word = keras.utils.to_categorical(next_word, num_classes=vocab_size)

sequences = [sequences[i][:j] for i in range(len(sequences)) for j in range(1, len(sequences[i]) + 1)]
sequences = [sequence for sequence in sequences if sequence[len(sequence) - 1] != 3]
sequences = keras.preprocessing.sequence.pad_sequences(sequences, max_length, padding='post')

next_word.shape[0] == sequences.shape[0]

True

## Write Token Indices Into a JSON File to View

In [None]:
import json

with open('word_index.json', 'w') as json_file:
    json.dump(tokenizer.word_index, json_file)

# Convolution Neural Network (ResNet)

In [None]:
resnet = keras.applications.resnet_v2.ResNet50V2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
pooling = keras.layers.GlobalAveragePooling2D()(resnet.output)
resnet = keras.Model(inputs=resnet.input, outputs=pooling)

resnet.summary()

In [None]:
image_features = resnet.predict(data, verbose=2)
image_features

In [None]:
np.save("image_features.npy", image_features)

In [5]:
image_features = np.load("image_features.npy")
image_features

array([[0.0000000e+00, 2.5279337e-01, 0.0000000e+00, ..., 2.3617539e-01,
        9.9756315e-02, 5.7161540e-02],
       [7.6399356e-02, 5.9836339e-02, 0.0000000e+00, ..., 9.5864497e-02,
        4.8636729e-01, 5.8380485e-01],
       [0.0000000e+00, 2.7479172e-01, 1.6393012e-01, ..., 8.0603585e-03,
        6.4714789e-02, 1.1330431e+00],
       ...,
       [2.4502127e-01, 1.3263669e-03, 9.2156701e-02, ..., 0.0000000e+00,
        4.5959583e-01, 2.5537786e-01],
       [1.3663623e-03, 7.6390989e-02, 1.5210731e-01, ..., 1.7262001e-01,
        1.7311759e+00, 7.9534501e-01],
       [2.2924218e-02, 0.0000000e+00, 1.3212231e-01, ..., 0.0000000e+00,
        0.0000000e+00, 8.7138914e-02]], dtype=float32)

In [16]:
image_input = keras.Input((image_features.shape[1],))
image_model_bef = keras.layers.Dense(256, activation='relu')(image_input)
image_model = keras.layers.Reshape((1, 256))(image_model_bef)

image_model = keras.Model(inputs=image_input, outputs=image_model)

image_model.summary()

In [17]:
caption_input = keras.Input((max_length,))
caption_model = keras.layers.Embedding(vocab_size, 256)(caption_input)

caption_model = keras.Model(inputs=caption_input, outputs=caption_model)

caption_model.summary()

In [20]:
image_caption_model = keras.layers.Concatenate(1)([image_model.output, caption_model.output])
image_caption_model = keras.layers.LSTM(256)(image_caption_model)
image_caption_model = keras.layers.Dropout(0.5)(image_caption_model)
image_caption_model = keras.layers.add([image_caption_model,image_model_bef])
image_caption_model = keras.layers.Dense(128, activation='relu')(image_caption_model)
image_caption_model = keras.layers.Dropout(0.5)(image_caption_model)
image_caption_model = keras.layers.Dense(vocab_size,activation='softmax')(image_caption_model)

image_caption_model = keras.Model(inputs=[image_input, caption_input], outputs=image_caption_model)

image_caption_model.compile(loss='categorical_crossentropy',optimizer="adam")
image_caption_model.summary()

In [21]:
image_input_data = np.repeat(image_features, [l - 1 for l in caption_lengths], axis=0)
caption_input_data = sequences

image_caption_model.fit([image_input_data, caption_input_data], next_word, epochs=10, batch_size=32)


Epoch 1/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 51ms/step - loss: 5.6406
Epoch 2/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 43ms/step - loss: 5.2083
Epoch 3/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 46ms/step - loss: 5.1376
Epoch 4/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 42ms/step - loss: 5.1188
Epoch 5/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 43ms/step - loss: 5.1034
Epoch 6/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 45ms/step - loss: 5.0751
Epoch 7/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 43ms/step - loss: 5.0735
Epoch 8/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 43ms/step - loss: 5.0654
Epoch 9/10
[1m3065/3065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 43ms/step - loss: 5.0415
Epoch 10/10
[1m3065/3065[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x18ca08850d0>

In [12]:
image_caption_model.save_weights("image_caption_model.weights.h5")