In [1]:
! ls ../data/

CrowdFlowerAnnotations.txt  Flickr8k.token.txt	       machine_translation
ExpertAnnotations.txt	    Flickr_8k.devImages.txt    readme.txt
Flicker8k_smaller	    Flickr_8k.testImages.txt
Flickr8k.lemma.token.txt    Flickr_8k.trainImages.txt


In [2]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image
import tensorflow as tf

THE_SIZE = 72
IMAGE_SIZE = (THE_SIZE, THE_SIZE)

# Load the Flickr8k dataset
images_dir = "../../../data/Flicker8k_smaller/"
captions_file = "../../../data/Flickr8k.token.txt"

captions = {}
with open(captions_file, "r") as f:
    for line in f:
        image_id, caption = line.strip().split("\t")[0], line.strip().split("\t")[1]
        image_id = os.path.splitext(os.path.basename(image_id))[0]
        if image_id not in captions:
            captions[image_id] = []
        captions[image_id].append(caption)


first3pairs = {k: captions[k] for k in list(captions)[:3]}
print(first3pairs)

2023-03-19 10:41:23.005711: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-19 10:41:25.662851: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-19 10:41:25.662965: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-19 10:41:30.681672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

{'1000268201_693b08cb0e': ['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .'], '1001773457_577c3a7d70': ['A black dog and a spotted dog are fighting', 'A black dog and a tri-colored dog playing with each other on the road .', 'A black dog and a white dog with brown spots are staring at each other in the street .', 'Two dogs of different breeds looking at each other on the road .', 'Two dogs on pavement moving toward each other .'], '1002674143_1b742ab4b8': ['A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .', 'A little girl is sitting in front of a large painted rainbow .', 'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it .', 'There is a girl with pigtails sitting

# Preprocess the images and captions

## why `captions_list[:SAMPLE_DATA_COUNT]`
* It will crash because of my Mac's capacity, take first 1000 sentences 

In [3]:
def decode_and_resize(img_path):
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMAGE_SIZE)
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img

SAMPLE_DATA_COUNT = 750

image_paths = sorted(os.listdir(images_dir))
image_paths = [os.path.join(images_dir, path) for path in image_paths]

images_list = [decode_and_resize(image_path) for image_path in image_paths]

captions_list = [captions[os.path.splitext(os.path.basename(image_path))[0]] for image_path in image_paths]
captions_text_all_in_one = ["<start> " + " ".join(caption_list) + " <end>" for caption_list in captions_list]

print(captions_list[0][0])

captions_text = []
for caption_list in captions_list[:SAMPLE_DATA_COUNT]:
    captions_text.append("<start> " + caption_list[0] + " <end>")

2023-03-19 10:41:43.031273: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-19 10:41:43.031654: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-19 10:41:43.032031: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f3b674af7ced): /proc/driver/nvidia/version does not exist
2023-03-19 10:41:43.047894: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


A child in a pink dress is climbing up a set of stairs in an entry way .


In [4]:
encoder_input_data = np.array(images_list[:SAMPLE_DATA_COUNT])
print(encoder_input_data.shape)

(750, 72, 72, 3)


In [5]:
captions_list[:3]

[['A child in a pink dress is climbing up a set of stairs in an entry way .',
  'A girl going into a wooden building .',
  'A little girl climbing into a wooden playhouse .',
  'A little girl climbing the stairs to her playhouse .',
  'A little girl in a pink dress going into a wooden cabin .'],
 ['A black dog and a spotted dog are fighting',
  'A black dog and a tri-colored dog playing with each other on the road .',
  'A black dog and a white dog with brown spots are staring at each other in the street .',
  'Two dogs of different breeds looking at each other on the road .',
  'Two dogs on pavement moving toward each other .'],
 ['A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .',
  'A little girl is sitting in front of a large painted rainbow .',
  'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it .',
  'There is a girl with pigtails sitting in front of a rainbow painting .',
  'Young girl w

In [6]:
captions_text[:3]

['<start> A child in a pink dress is climbing up a set of stairs in an entry way . <end>',
 '<start> A black dog and a spotted dog are fighting <end>',
 '<start> A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl . <end>']

In [7]:
SEQ_LENGTH = 25
tokenizer = keras.preprocessing.text.Tokenizer(filters="", oov_token="<OOV>")
tokenizer.fit_on_texts(captions_text)

captions_sequences = tokenizer.texts_to_sequences(captions_text)
captions_padded = keras.preprocessing.sequence.pad_sequences(captions_sequences, maxlen=SEQ_LENGTH, padding="post", truncating="post")

vocab_size = len(tokenizer.word_index) + 1 #add 1 for <OOV>

In [8]:
print(captions_padded)
print(captions_padded.shape)
print(vocab_size)

[[   3    2   34 ...    0    0    0]
 [   3    2   16 ...    0    0    0]
 [   3    2   40 ...    0    0    0]
 ...
 [   3   16   61 ...    0    0    0]
 [   3    2 1270 ...    0    0    0]
 [   3    2   17 ...    0    0    0]]
(750, 25)
1275


# Prepare the data

In [9]:
decoder_input_data = captions_padded[:, :-1]
decoder_output_data = captions_padded[:, 1:]

print(decoder_input_data[:3])
print(decoder_output_data[:3])
print(decoder_input_data.shape)
print(decoder_output_data.shape)

[[  3   2  34   6   2  85 159  11  78  49   2 365  13 366   6  31 560 367
    5   4   0   0   0   0]
 [  3   2  16  10   8   2 368  10  21 369   4   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  3   2  40  17 126   6 561  79   6  46  13   2 370 371  14  47 205   6
    2 562   5   4   0   0]]
[[  2  34   6   2  85 159  11  78  49   2 365  13 366   6  31 560 367   5
    4   0   0   0   0   0]
 [  2  16  10   8   2 368  10  21 369   4   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  2  40  17 126   6 561  79   6  46  13   2 370 371  14  47 205   6   2
  562   5   4   0   0   0]]
(750, 24)
(750, 24)


In [10]:
from tensorflow import keras
from tensorflow.keras import layers

latent_dim = 64
input_shape = (THE_SIZE, THE_SIZE, 3)

encoder_inputs = keras.Input(shape=input_shape)
x = layers.Conv2D(32, kernel_size=(3, 3), activation="relu")(encoder_inputs)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)
x = layers.Conv2D(64, kernel_size=(3, 3), activation="relu")(x)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)
x = layers.Flatten()(x)
encoder_outputs = layers.Dense(latent_dim, activation="relu")(x)

encoder = keras.Model(inputs=encoder_inputs, outputs=encoder_outputs)
encoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 72, 72, 3)]       0         
                                                                 
 conv2d (Conv2D)             (None, 70, 70, 32)        896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 35, 35, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 33, 33, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 16, 16, 64)       0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 16384)             0     

In [11]:
size_of_vector = 10

decoder_inputs_layer = keras.Input(shape=(SEQ_LENGTH-1,))
embedding_layer = layers.Embedding(vocab_size, size_of_vector, input_length=SEQ_LENGTH-1)(decoder_inputs_layer)

# initial_h_state = Input(shape=(latent_dim,))
# initial_c_state = Input(shape=(latent_dim,))
# initial_state = [initial_h_state, initial_c_state]

rnn_layer = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
tmp, _, _ = rnn_layer(embedding_layer, initial_state=[encoder_outputs, encoder_outputs])
decoder_dense = layers.Dense(vocab_size, activation='softmax')

decoder_outputs_layer = decoder_dense(tmp)
decoder = keras.Model(inputs=[decoder_inputs_layer, encoder_outputs, encoder_outputs], outputs=decoder_outputs_layer)

decoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 24)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 24, 10)       12750       ['input_2[0][0]']                
                                                                                                  
 input_4 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 24, 64),     19200       ['embedding[1][0]',              
                                 (None, 64),                      'input_4[0][0]',          

In [12]:
model = keras.Model([encoder_inputs, decoder_inputs_layer], decoder_outputs_layer)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 72, 72, 3)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 70, 70, 32)   896         ['input_1[0][0]']                
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 35, 35, 32)   0           ['conv2d[0][0]']                 
                                                                                                  
 conv2d_1 (Conv2D)              (None, 33, 33, 64)   18496       ['max_pooling2d[0][0]']          
                                                                                            

In [13]:
batch_size = 4
epochs = 50

print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_output_data.shape)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], keras.utils.to_categorical(decoder_output_data, num_classes=vocab_size),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

(750, 72, 72, 3)
(750, 24)
(750, 24)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f2fcd42eb50>

# inference model
* We can have another inference model
* The key point is to 
  1. reuse `rnn_layer` from decoder part above
  2. put `h_state` and `c_state` into `outputs`, otherwise the mode.predict() will not return those 2 hidden states

In [21]:
the_inputs_layer = keras.Input(shape=(1,))
the_embedding_layer = layers.Embedding(vocab_size, size_of_vector, input_length=SEQ_LENGTH-1)(the_inputs_layer)

initial_h_state = keras.Input(shape=(latent_dim,))
initial_c_state = keras.Input(shape=(latent_dim,))

# the_rnn_layer = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
tmp2, h_state, c_state = rnn_layer(the_embedding_layer, initial_state=[initial_h_state, initial_c_state])
the_decoder_dense = layers.Dense(vocab_size, activation='softmax')

the_outputs_layer = the_decoder_dense(tmp2)
inference_model = keras.Model(
    inputs=[the_inputs_layer, initial_h_state, initial_c_state], 
    outputs=[the_outputs_layer, h_state, c_state])

# inference_model.set_weights(decoder.get_weights())

inference_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 10)        12750       ['input_8[0][0]']                
                                                                                                  
 input_9 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 64)]         0           []                               
                                                                                            

# How to do inference 
1. encode input and retrieve initial decoder state
  * Use the `encoder` to generate hidden state for decoder
2. run one step of decoder with this initial state and a "start of sequence" token as target. Output will be the next target token
3. Repeat with the current target token and current states

In [22]:
def decode_sequence(input_seq):
    print(input_seq.shape)
    states_value = encoder.predict(np.array([input_seq]))
    print(np.array(states_value).shape)


    seed_text = "<START>"
    seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]
    num_words = 5  
    res = [seed_sequence[0]]
    h_state = states_value
    c_state = states_value
    for i in range(num_words):
        print(h_state.shape)
        print(c_state.shape)
        print(seed_sequence)

        next_word_probs, h_state, c_state = inference_model.predict([np.array(seed_sequence), h_state, c_state])
        # next_word_probs = inference_model.predict([np.array(seed_sequence), h_state, c_state])
        
        # lstm_layer = inference_model.layers[4] # assuming the LSTM layer is the third layer
        # print(lstm_layer.states)
        # print(inference_model.state_updates)
        # h_state, c_state = lstm_layer.states
        
        # print(next_word_probs)

        next_idx = np.argmax(next_word_probs[0][0])
        # print(next_idx)

        seed_sequence[0] = next_idx
        res.append(next_idx)

    generated_text = ' '.join([tokenizer.index_word[i] for i in res])
    print(generated_text)


decode_sequence(images_list[1])

(72, 72, 3)
(1, 64)
(1, 64)
(1, 64)
[3]
(1, 64)
(1, 64)
[966]
(1, 64)
(1, 64)
[350]
(1, 64)
(1, 64)
[350]
(1, 64)
(1, 64)
[350]
<start> wrapping swims swims swims goats


# inference randomly

In [23]:
import random

def select_by_prob(probs):
    print(sum(probs))
    
    total = 0.0
    select = random.random()
    for idx, prob in enumerate(probs):
        total += prob
        if total > select:
            return idx
            
    return len(probs)-1
    

def decode_sequence(input_seq):
    print(input_seq.shape)
    states_value = encoder.predict(np.array([input_seq]))
    print(np.array(states_value).shape)


    seed_text = "<START>"
    seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]
    num_words = 5  
    res = [seed_sequence[0]]
    h_state = states_value
    c_state = states_value
    for i in range(num_words):
        # print(h_state.shape)
        # print(c_state.shape)
        print(seed_sequence)

        next_word_probs, h_state, c_state = inference_model.predict([np.array(seed_sequence), h_state, c_state])
        
        # print(next_word_probs)

        next_idx = select_by_prob(next_word_probs[0][0])
        if next_idx == 0:
            break
        
        seed_sequence[0] = next_idx
        res.append(next_idx)

    generated_text = ' '.join([tokenizer.index_word[i] for i in res])
    print(generated_text)


decode_sequence(images_list[0])
    
# seed_text = "<START>"
# seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]
# num_words = 5  
# res = [seed_sequence[0]]
# for i in range(num_words):
#     print(seed_sequence)
    
#     next_word_probs = inference_model.predict(np.array(seed_sequence))
#     # print(next_word_probs)
    
#     next_idx = select_by_prob(next_word_probs[0][0])
#     # print(next_idx)
#     if next_idx == 0:
#         break
    
#     seed_sequence[0] = next_idx
#     res.append(next_idx)
    
# generated_text = ' '.join([tokenizer.index_word[i] for i in res])
# print(generated_text)

(72, 72, 3)
(1, 64)
[3]
0.9999999741849024
[83]
1.0000000575382728
[579]
1.0000000881846063
[77]
1.0000000032305252
[827]
1.000000074331183
<start> face picks girls dusk airport
