In [0]:
import pickle
import numpy as np
import os
import matplotlib.pyplot as plt

In [28]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
from keras.models import Model
from keras.layers import Input, Dense, GRU, Embedding, TimeDistributed
from keras.applications import ResNet50
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/indices_to_words.pickle', 'rb') as f:
  indices_to_words = pickle.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/words_to_indices.pickle', 'rb') as f:
  words_to_indices = pickle.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/images_train.npy', 'rb') as f:
  images_train = np.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/images_val.npy', 'rb') as f:
  images_val = np.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/captions_train.npy', 'rb') as f:
  captions_train = np.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/captions_val.npy', 'rb') as f:
  captions_val = np.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/next_words_train.npy', 'rb') as f:
  next_words_train = np.load(f)

In [0]:
with open('/content/gdrive/My Drive/Image Captioning Data/next_words_val.npy', 'rb') as f:
  next_words_val = np.load(f)

In [38]:
next_words_train.shape

(30000, 40, 1)

In [2]:
embedding_size = 128
maxLen = 40
vocab_size = 8919 
img_emb_size = 2048
state_size = 512

In [3]:
img_input = Input(shape=(img_emb_size, ), name='Image-Input')
img_output = Dense(state_size, activation='tanh', name='Image-output')

In [4]:
decoder_input = Input(shape=(40,), name='decoder-input')

In [5]:
decoder_embedding = Embedding(input_dim = vocab_size, output_dim = embedding_size, input_length = maxLen, name='decoder-embedding')

In [6]:
decoder_layer1 = GRU(state_size, name='decoder-layer-1', return_sequences = True)
decoder_layer2 = GRU(state_size, name='decoder-layer-2', return_sequences = True)
decoder_layer3 = GRU(state_size, name='decoder-layer-3', return_sequences = True)

In [7]:
decoder_time_dense = TimeDistributed(Dense(vocab_size, activation='softmax'), name='decoder-output')

In [8]:
initial_state = img_output(img_input)
net = decoder_input
net = decoder_embedding(net)
net = decoder_layer1(net, initial_state = initial_state)
net = decoder_layer2(net, initial_state = initial_state)
net = decoder_layer3(net, initial_state = initial_state)

In [9]:
decoder_output = decoder_time_dense(net)

In [10]:
model = Model(inputs=[img_input, decoder_input], output=[decoder_output])

  """Entry point for launching an IPython kernel.


In [11]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder-input (InputLayer)      (None, 40)           0                                            
__________________________________________________________________________________________________
Image-Input (InputLayer)        (None, 2048)         0                                            
__________________________________________________________________________________________________
decoder-embedding (Embedding)   (None, 40, 128)      1141632     decoder-input[0][0]              
__________________________________________________________________________________________________
Image-output (Dense)            (None, 512)          1049088     Image-Input[0][0]                
__________________________________________________________________________________________________
decoder-la

In [13]:
model.compile(optimizer='RMSprop', loss='sparse_categorical_crossentropy')
model.load_weights('./model_weights.h5')

In [0]:
x_data = {
            'decoder-input': captions_train,
            'Image-Input': images_train
         }
y_data = {
    'decoder-output': next_words_train
}

In [57]:
model.fit(x_data, y_data, batch_size=256, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb1819d5fd0>

In [58]:
model.save('image-cap-model.h5')

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


In [0]:
model.save_weights("./model_weights.h5")

In [14]:
model.get_weights()

[array([[-0.12980668, -0.20748658, -0.23258394, ...,  0.07226366,
         -0.18122517,  0.10592537],
        [-0.07265926, -0.0601264 , -0.00825674, ...,  0.04235341,
         -0.01624523,  0.06026336],
        [-0.00915301,  0.07104307, -0.04051955, ..., -0.04782885,
          0.04540734,  0.01736098],
        ...,
        [-0.04228642,  0.00067012, -0.01512671, ...,  0.00445633,
          0.0223436 ,  0.00868735],
        [-0.05606803, -0.03827476, -0.01715764, ...,  0.01013056,
         -0.02649954,  0.03117946],
        [-0.03419574, -0.00163113, -0.04230185, ...,  0.03912022,
         -0.05919104,  0.04827494]], dtype=float32),
 array([[ 0.00282478,  0.02775853,  0.06116672, ..., -0.13835436,
          0.03946351, -0.04980527],
        [-0.01870038,  0.03940734,  0.0202294 , ...,  0.01407946,
          0.0062386 ,  0.05606497],
        [ 0.04050946,  0.03650239,  0.09849489, ...,  0.02190135,
         -0.03008177,  0.01742705],
        ...,
        [-0.00047515,  0.00840705,  0.0