In [1]:
import tensorflow as tf

In [2]:
import pandas as pd
import os
import numpy as np
from keras_preprocessing import image as im


# Loading data

In [3]:
def load_data(image_path, annotations_path):
    df = pd.read_csv(annotations_path)
    df = df[~df['human_sentiment'].isna()]
    in_folder = os.listdir(image_path)
    df.loc[df['image_name'].isin(in_folder), 'image_name'] = \
    df.loc[df['image_name'].isin(in_folder), 'image_name'].apply(lambda x: os.path.join(image_path,x))
    images = []
    for image_name in df['image_name'].values:
        images.append(np.array(im.load_img(image_name, target_size=(224,224))))
    images = np.array(images)
    images = images/255
    annotations = df['annotation'].str.lower().values
    return images, annotations

In [4]:
images, annotations = load_data("../../../images/", '../../../emo-at-cap.csv')

# Processing of data

In [5]:
import nltk

In [6]:
tokenizer = nltk.tokenize.WordPunctTokenizer()

In [7]:
tokenized = [tokenizer.tokenize(i) for i in annotations]

In [8]:
unique_tokens, unique_counts = np.unique(np.hstack(tokenized), return_counts=True)

In [9]:
sorted(list(zip(unique_tokens, unique_counts)), key = lambda x: x[1])[::-1][:50]

[('the', 3261),
 ('is', 2031),
 ('man', 1905),
 ('and', 1536),
 ('woman', 1361),
 ('are', 1036),
 ('a', 963),
 ('to', 891),
 ('looks', 751),
 ('happy', 705),
 ('with', 547),
 ('something', 480),
 ('men', 478),
 ('of', 463),
 ('two', 408),
 ('other', 361),
 ('people', 343),
 ('couple', 334),
 (',', 330),
 ('in', 278),
 ('look', 270),
 ('because', 265),
 ('they', 256),
 ('about', 247),
 ('on', 246),
 ('at', 192),
 ('trying', 190),
 ('serious', 189),
 ('women', 184),
 ('each', 179),
 ('together', 176),
 ('by', 175),
 ('having', 171),
 ('flirting', 165),
 ('he', 155),
 ('smiling', 151),
 ('arguing', 141),
 ('angry', 140),
 ('scared', 138),
 ('worried', 137),
 ('hugging', 135),
 ('surprised', 134),
 ('calm', 133),
 ('company', 131),
 ('her', 129),
 ('group', 123),
 ('for', 121),
 ('she', 117),
 ('looking', 117),
 ('his', 115)]

In [10]:
len(unique_tokens)

2162

In [11]:
pad_token = '<PAD>'
start_token = '<S>'
end_token = '<E>'

In [12]:
vocab = dict(zip(unique_tokens,list(range(3,len(unique_tokens)+3))))
vocab[pad_token] = 0
vocab[start_token] = 1
vocab[end_token] = 2

In [13]:
inverse_vocab = dict([(v,k) for k,v in vocab.items()])

In [14]:
add_special_tokens = lambda x: [start_token] + x + [end_token]

In [15]:
tokenized = list(map(add_special_tokens,tokenized))

In [16]:
max_len = len(max(tokenized, key=len))

In [17]:
indexed = list(map(lambda x: [vocab[i] for i in x],tokenized))

In [18]:
padded = tf.keras.preprocessing.sequence.pad_sequences(indexed, truncating='post')

In [19]:
images.shape

(3840, 224, 224, 3)

# Simple model without additional features, transfer learning and attention

In [20]:
def calc_size(x, f, s, padding='same'):
    p = 2 if padding=='same' else 1
    return (x-f+2)/s+1

### Convolutional model

In [21]:
class ConvEncoder(tf.keras.Model):
    def __init__(self, image_shape=(224,224,3)):
        super(ConvEncoder, self).__init__()
        self.input_conv = tf.keras.layers.Conv2D(filters=128, kernel_size=7, input_shape=image_shape, activation='relu',
                           padding='same', name='input_conv', strides=(1,1))
        self.conv1 = tf.keras.layers.Conv2D(filters=64, kernel_size=7, activation='relu',
                               padding='same', name='conv1', strides=(2,2))
        self.batch_norm1 =  tf.keras.layers.BatchNormalization()

        self.conv2 = tf.keras.layers.Conv2D(filters=32, kernel_size=5, activation='relu',
                               padding='same', name='conv2', strides=(2,2))
        self.batch_norm2 =  tf.keras.layers.BatchNormalization()

        self.conv3 = tf.keras.layers.Conv2D(filters=16, kernel_size=5, activation='relu',
                               padding='same', name='conv3', strides=(2,2))
        self.batch_norm3 =  tf.keras.layers.BatchNormalization()

        self.conv4 = tf.keras.layers.Conv2D(filters=8, kernel_size=3, activation='relu',
                               padding='same', name='conv4', strides=(2,2))
        self.batch_norm4 =  tf.keras.layers.BatchNormalization()

        self.conv5 = tf.keras.layers.Conv2D(filters=4, kernel_size=3, activation='relu',
                               padding='same', name='conv5', strides=(2,2))
        self.batch_norm5 =  tf.keras.layers.BatchNormalization()
        self.conv6 = tf.keras.layers.Conv2D(filters=4, kernel_size=3, activation='relu',
                               padding='same', name='conv5', strides=(2,2))
        self.batch_norm6 =  tf.keras.layers.BatchNormalization()
       
        self.flatten = tf.keras.layers.Flatten(name='final_code')
    
    def call(self, input):
        conv1_out = self.batch_norm1(self.conv1(self.input_conv(input)))
        conv2_out = self.batch_norm2(self.conv2(conv1_out))
        conv3_out = self.batch_norm3(self.conv3(conv2_out))
        conv4_out = self.batch_norm4(self.conv4(conv3_out))
        conv5_out = self.batch_norm5(self.conv5(conv4_out))
        conv6_out = self.batch_norm6(self.conv6(conv4_out))
        result = [self.flatten(conv5_out),self.flatten(conv6_out)]
        return result

In [22]:
conv_encoder = ConvEncoder()

In [23]:
conv_res = conv_encoder(np.expand_dims(images[0],axis=0))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [24]:
conv_res

[<tf.Tensor: shape=(1, 196), dtype=float32, numpy=
 array([[0.00000000e+00, 1.27262319e-04, 5.52950427e-03, 1.28674146e-04,
         0.00000000e+00, 0.00000000e+00, 4.41587623e-03, 7.41579104e-04,
         0.00000000e+00, 0.00000000e+00, 1.40692154e-03, 9.76479845e-04,
         0.00000000e+00, 0.00000000e+00, 4.24025906e-03, 0.00000000e+00,
         6.33005984e-04, 0.00000000e+00, 6.23816717e-03, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 5.29834395e-03, 1.34039321e-03,
         2.53455737e-03, 0.00000000e+00, 2.49279244e-03, 0.00000000e+00,
         9.31133865e-04, 0.00000000e+00, 1.04052108e-02, 3.84958833e-03,
         2.17406475e-03, 0.00000000e+00, 8.96684267e-03, 2.14515137e-03,
         0.00000000e+00, 0.00000000e+00, 6.66103559e-03, 0.00000000e+00,
         2.40253098e-03, 0.00000000e+00, 9.93811991e-03, 7.70624308e-03,
         0.00000000e+00, 0.00000000e+00, 8.51054024e-03, 4.06292733e-03,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,


### LSTM model

In [25]:
class LSTMDecoder(tf.keras.Model):
    def __init__(self, max_tokens, lstm_units):
        super(LSTMDecoder, self).__init__()
        self.embeddings = tf.keras.layers.Embedding(input_dim=max_tokens+1, output_dim=128, name='embeddings')
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name='layer_normalization')
        self.lstm = tf.keras.layers.LSTM(units=lstm_units, return_state=True, name='lstm_decoder')
        self.output_dense = tf.keras.layers.Dense(units=max_tokens)
        
    def call(self, input, features):
        embedded = self.embeddings(input)
        result_lstm, state_h, state_c = self.lstm(embedded, initial_state=features)
        normalized = self.layer_norm(result_lstm)
        logits = self.output_dense(normalized)
        return logits, [state_h, state_c]

In [26]:
lstm_decoder = LSTMDecoder(len(inverse_vocab), conv_res[0].shape[1])

In [27]:
lstm_decoder(np.expand_dims(padded[0],axis=0), conv_res)

(<tf.Tensor: shape=(1, 2165), dtype=float32, numpy=
 array([[ 0.98470294, -0.2562873 , -0.15531676, ...,  0.13696869,
         -0.34102824,  0.16832274]], dtype=float32)>,
 [<tf.Tensor: shape=(1, 196), dtype=float32, numpy=
  array([[ 5.40204113e-03,  9.26242676e-03, -3.24909692e-03,
          -3.44752078e-03, -8.29503592e-03,  1.33345195e-03,
          -8.13061092e-03, -1.90062309e-03,  4.38818336e-03,
           6.07338408e-03,  1.83624052e-05,  4.12985357e-03,
           2.72139511e-03,  6.26521325e-03, -6.96459087e-03,
          -2.87974632e-04,  1.41559413e-03,  2.32430105e-03,
          -1.54397672e-03,  1.23339123e-03, -5.78529085e-04,
          -2.77572637e-03, -3.74819199e-03,  7.76082103e-04,
           6.11304946e-04,  8.84504989e-04,  1.25597836e-02,
           4.81049111e-03,  2.81226193e-03,  4.43036854e-03,
           5.70475706e-04,  1.94471190e-03, -4.27779829e-04,
           6.46081707e-03,  6.68543670e-03,  1.04081277e-02,
           8.11339263e-03, -3.23595782e-03, 

### Training

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none',
                                                                         from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)


In [30]:
def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, vocab[pad_token]))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        return loss_

In [31]:
@tf.function
def train_step(image, target, lengths, optimizer):

    with tf.GradientTape() as tape:
        initial_state = conv_encoder(image)
        # Teacher forcing - feeding the target as the next input
        batched_loss = []
        for t in range(1, target.shape[1]):
            
            dec_input = tf.expand_dims(target[:, t-1], 1)
            result, initial_state = lstm_decoder(input=dec_input, features=initial_state)

            batched_loss.append(loss_function(target[:, t], result))
        batched_loss = tf.reshape(tf.stack(batched_loss), shape=target[:,1:].shape)
        batched_loss = tf.reduce_sum(batched_loss, axis=1)
        lengths = tf.cast(lengths, dtype=batched_loss.dtype)
        loss = tf.reduce_mean(batched_loss / lengths)

    perplexity = tf.exp(loss)

    variables = conv_encoder.trainable_variables + lstm_decoder.trainable_variables
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return loss, perplexity

In [32]:
@tf.function
def evaluate(image, target, lengths, optimizer):

    initial_state = conv_encoder(image)
    # Teacher forcing - feeding the target as the next input
    batched_loss = []
    for t in range(1, target.shape[1]):

        dec_input = tf.expand_dims(target[:, t-1], 1)
        result, initial_state = lstm_decoder(input=dec_input, features=initial_state)

        batched_loss.append(loss_function(target[:, t], result))

    batched_loss = tf.reshape(tf.stack(batched_loss), shape=target[:,1:].shape)
    batched_loss = tf.reduce_sum(batched_loss, axis=1)
    lengths = tf.cast(lengths, dtype=batched_loss.dtype)
    loss = tf.reduce_mean(batched_loss / lengths)

    perplexity = tf.exp(loss)


    return loss, perplexity

In [33]:
train_images, test_images, train_sequences, test_sequences =  train_test_split(images, padded, test_size=0.1)

In [34]:
train_length = (train_sequences!=vocab[pad_token]).sum(axis=1)-2

In [35]:
test_length = (test_sequences!=vocab[pad_token]).sum(axis=1)-2

In [36]:
epochs = 50
batch_size= 64
train_images = np.array(np.array_split(train_images, len(train_images)//batch_size))
test_images = np.array(np.array_split(test_images, len(test_images)//batch_size))
train_sequences = np.array(np.array_split(train_sequences, len(train_sequences)//batch_size))
test_sequences = np.array(np.array_split(test_sequences, len(test_sequences)//batch_size))
train_length = np.array(np.array_split(train_length, len(train_length)//batch_size))
test_length = np.array(np.array_split(test_length, len(test_length)//batch_size))

In [37]:
train_images.shape, train_sequences.shape, train_length.shape

((54, 64, 224, 224, 3), (54, 64, 37), (54, 64))

In [38]:
!pip install tqdm



In [39]:
from tqdm import tqdm

In [None]:
train_loss = []
train_perplexity = []
test_loss = []
test_perplexity = []
for epoch in range(epochs):
    epoch_train_loss = 0
    epoch_train_perplexity = 0
    epoch_test_loss = 0
    epoch_test_perplexity = 0
    for batch_train_images, batch_train_sequences, batched_train_length in tqdm(zip(train_images,train_sequences,train_length)):
        batched_train_loss, batched_train_perplexity = train_step(batch_train_images,batch_train_sequences,batched_train_length, optimizer)
        epoch_train_loss+=batched_train_loss
        epoch_train_perplexity+=batched_train_perplexity
    epoch_train_loss = epoch_train_loss/batch_size
    epoch_train_perplexity =epoch_train_perplexity/batch_size
    train_loss.append(epoch_train_loss)
    train_perplexity.append(epoch_train_perplexity)
    print('Finished epoch {}; Train loss : {}; Train perplexity : {}'.format(epoch,epoch_train_loss,epoch_train_perplexity))
    for batch_test_images, batch_test_sequences, batched_test_length in zip(test_images,test_sequences,test_length):
        batched_test_loss, batched_test_perplexity = evaluate(batch_test_images,batch_test_sequences,batched_test_length, optimizer)
        epoch_test_loss+=batched_test_loss
        epoch_test_perplexity+=batched_test_perplexity
    epoch_test_loss = epoch_test_loss/batch_size
    epoch_test_perplexity =epoch_test_perplexity/batch_size
    test_perplexity.append(epoch_test_perplexity)
    test_loss.append(epoch_test_loss)
    print('Test loss : {}; Test perplexity : {}'.format(epoch_test_loss,epoch_test_perplexity))


54it [02:15,  2.51s/it]


Finished epoch 0; Train loss : 7.964125156402588; Train perplexity : 17854.46875


0it [00:00, ?it/s]

Test loss : 0.7490072250366211; Test perplexity : 319.35784912109375


54it [01:25,  1.58s/it]


Finished epoch 1; Train loss : 6.31256628036499; Train perplexity : 1864.590576171875


0it [00:00, ?it/s]

Test loss : 0.6238366365432739; Test perplexity : 80.35478210449219


54it [01:25,  1.57s/it]


Finished epoch 2; Train loss : 5.4418745040893555; Train perplexity : 606.2158813476562


0it [00:00, ?it/s]

Test loss : 0.5585781335830688; Test perplexity : 39.258155822753906


54it [01:24,  1.57s/it]


Finished epoch 3; Train loss : 4.924441814422607; Train perplexity : 320.242919921875


0it [00:00, ?it/s]

Test loss : 0.5138434171676636; Test perplexity : 24.06816291809082


54it [01:24,  1.57s/it]


Finished epoch 4; Train loss : 4.539194107055664; Train perplexity : 200.29571533203125


0it [00:00, ?it/s]

Test loss : 0.4802291691303253; Test perplexity : 16.642854690551758


54it [01:24,  1.56s/it]


Finished epoch 5; Train loss : 4.232934951782227; Train perplexity : 138.24436950683594


0it [00:00, ?it/s]

Test loss : 0.45290929079055786; Test perplexity : 12.333497047424316


54it [01:24,  1.57s/it]


Finished epoch 6; Train loss : 3.9789860248565674; Train perplexity : 101.7147445678711


0it [00:00, ?it/s]

Test loss : 0.4304272532463074; Test perplexity : 9.643580436706543


54it [01:24,  1.56s/it]


Finished epoch 7; Train loss : 3.767735004425049; Train perplexity : 78.76109313964844


0it [00:00, ?it/s]

Test loss : 0.41257762908935547; Test perplexity : 7.935382843017578


54it [01:24,  1.56s/it]


Finished epoch 8; Train loss : 3.5917928218841553; Train perplexity : 63.615379333496094


0it [00:00, ?it/s]

Test loss : 0.3979198932647705; Test perplexity : 6.762115001678467


54it [01:24,  1.56s/it]


Finished epoch 9; Train loss : 3.4454095363616943; Train perplexity : 53.262733459472656


0it [00:00, ?it/s]

Test loss : 0.38611555099487305; Test perplexity : 5.94638729095459


54it [01:24,  1.56s/it]


Finished epoch 10; Train loss : 3.320995807647705; Train perplexity : 45.7969856262207


0it [00:00, ?it/s]

Test loss : 0.3767746090888977; Test perplexity : 5.371668815612793


54it [01:24,  1.56s/it]


Finished epoch 11; Train loss : 3.2139475345611572; Train perplexity : 40.21280288696289


0it [00:00, ?it/s]

Test loss : 0.3691519498825073; Test perplexity : 4.943988800048828


54it [01:24,  1.56s/it]


Finished epoch 12; Train loss : 3.1200125217437744; Train perplexity : 35.8565559387207


0it [00:00, ?it/s]

Test loss : 0.36221247911453247; Test perplexity : 4.584655284881592


54it [01:24,  1.56s/it]


Finished epoch 13; Train loss : 3.0367374420166016; Train perplexity : 32.384788513183594


0it [00:00, ?it/s]

Test loss : 0.35776287317276; Test perplexity : 4.367166042327881


54it [01:24,  1.56s/it]


Finished epoch 14; Train loss : 2.961594581604004; Train perplexity : 29.563318252563477


0it [00:00, ?it/s]

Test loss : 0.35282278060913086; Test perplexity : 4.139503479003906


54it [01:24,  1.56s/it]


Finished epoch 15; Train loss : 2.891826868057251; Train perplexity : 27.144344329833984


0it [00:00, ?it/s]

Test loss : 0.34837034344673157; Test perplexity : 3.944704532623291


54it [01:24,  1.57s/it]


Finished epoch 16; Train loss : 2.8280301094055176; Train perplexity : 25.105634689331055


0it [00:00, ?it/s]

Test loss : 0.3451247811317444; Test perplexity : 3.808554172515869


54it [01:24,  1.57s/it]


Finished epoch 17; Train loss : 2.769484043121338; Train perplexity : 23.374792098999023


0it [00:00, ?it/s]

Test loss : 0.34226739406585693; Test perplexity : 3.6922101974487305


54it [01:24,  1.57s/it]


Finished epoch 18; Train loss : 2.7163095474243164; Train perplexity : 21.909343719482422


0it [00:00, ?it/s]

Test loss : 0.3403218984603882; Test perplexity : 3.6153910160064697


54it [01:24,  1.57s/it]


Finished epoch 19; Train loss : 2.6653335094451904; Train perplexity : 20.58625030517578


0it [00:00, ?it/s]

Test loss : 0.3384184241294861; Test perplexity : 3.541940212249756


54it [01:24,  1.57s/it]


Finished epoch 20; Train loss : 2.6197264194488525; Train perplexity : 19.474754333496094


0it [00:00, ?it/s]

Test loss : 0.33711928129196167; Test perplexity : 3.493241786956787


54it [01:24,  1.56s/it]


Finished epoch 21; Train loss : 2.5878522396087646; Train perplexity : 18.7098331451416


0it [00:00, ?it/s]

Test loss : 0.3353009819984436; Test perplexity : 3.4251060485839844


54it [01:25,  1.57s/it]


Finished epoch 22; Train loss : 2.5458481311798096; Train perplexity : 17.78765869140625


0it [00:00, ?it/s]

Test loss : 0.3348878026008606; Test perplexity : 3.409022569656372


54it [01:24,  1.57s/it]


Finished epoch 23; Train loss : 2.514137029647827; Train perplexity : 17.093046188354492


0it [00:00, ?it/s]

Test loss : 0.3348785936832428; Test perplexity : 3.4110562801361084


54it [01:24,  1.56s/it]


Finished epoch 24; Train loss : 2.4695451259613037; Train perplexity : 16.21106719970703


0it [00:00, ?it/s]

Test loss : 0.3332764506340027; Test perplexity : 3.3515939712524414


54it [01:24,  1.57s/it]


Finished epoch 25; Train loss : 2.429314613342285; Train perplexity : 15.43126392364502


0it [00:00, ?it/s]

Test loss : 0.33062103390693665; Test perplexity : 3.2569634914398193


54it [01:24,  1.57s/it]


Finished epoch 26; Train loss : 2.393317222595215; Train perplexity : 14.766615867614746


0it [00:00, ?it/s]

Test loss : 0.3299965262413025; Test perplexity : 3.2351467609405518


54it [01:24,  1.57s/it]


Finished epoch 27; Train loss : 2.3564834594726562; Train perplexity : 14.122121810913086


0it [00:00, ?it/s]

Test loss : 0.32945239543914795; Test perplexity : 3.2159688472747803


54it [01:24,  1.57s/it]


Finished epoch 28; Train loss : 2.318119525909424; Train perplexity : 13.48011302947998


0it [00:00, ?it/s]

Test loss : 0.32958969473838806; Test perplexity : 3.2199923992156982


54it [01:24,  1.57s/it]


Finished epoch 29; Train loss : 2.2831332683563232; Train perplexity : 12.92081356048584


0it [00:00, ?it/s]

Test loss : 0.3294069766998291; Test perplexity : 3.212521553039551


54it [01:24,  1.57s/it]


Finished epoch 30; Train loss : 2.2517220973968506; Train perplexity : 12.439105033874512


0it [00:00, ?it/s]

Test loss : 0.33069461584091187; Test perplexity : 3.256990909576416


54it [01:24,  1.57s/it]


Finished epoch 31; Train loss : 2.2230589389801025; Train perplexity : 12.019454002380371


0it [00:00, ?it/s]

Test loss : 0.3306044936180115; Test perplexity : 3.2580161094665527


54it [01:24,  1.57s/it]


Finished epoch 32; Train loss : 2.2030696868896484; Train perplexity : 11.745024681091309


0it [00:00, ?it/s]

Test loss : 0.32974910736083984; Test perplexity : 3.230086326599121


54it [01:24,  1.57s/it]


Finished epoch 33; Train loss : 2.197511911392212; Train perplexity : 11.678101539611816


0it [00:00, ?it/s]

Test loss : 0.342887818813324; Test perplexity : 3.7255446910858154


54it [01:24,  1.57s/it]


Finished epoch 34; Train loss : 2.1601693630218506; Train perplexity : 11.136058807373047


0it [00:00, ?it/s]

Test loss : 0.3327842354774475; Test perplexity : 3.33793306350708


54it [01:24,  1.57s/it]


Finished epoch 35; Train loss : 2.121854782104492; Train perplexity : 10.626585006713867


0it [00:00, ?it/s]

Test loss : 0.3329025208950043; Test perplexity : 3.342803716659546


54it [01:24,  1.57s/it]


Finished epoch 36; Train loss : 2.088897466659546; Train perplexity : 10.20627212524414


0it [00:00, ?it/s]

Test loss : 0.3327288031578064; Test perplexity : 3.335421323776245


54it [01:25,  1.58s/it]


Finished epoch 37; Train loss : 2.053839683532715; Train perplexity : 9.786166191101074


0it [00:00, ?it/s]

Test loss : 0.3349001407623291; Test perplexity : 3.413177728652954


54it [01:24,  1.57s/it]


Finished epoch 38; Train loss : 2.026627540588379; Train perplexity : 9.470224380493164


0it [00:00, ?it/s]

Test loss : 0.338936984539032; Test perplexity : 3.568413734436035


54it [01:24,  1.56s/it]


Finished epoch 39; Train loss : 2.014310598373413; Train perplexity : 9.320809364318848


0it [00:00, ?it/s]

Test loss : 0.33412259817123413; Test perplexity : 3.386307954788208


54it [01:24,  1.57s/it]


Finished epoch 40; Train loss : 2.0043258666992188; Train perplexity : 9.215474128723145


0it [00:00, ?it/s]

Test loss : 0.3384207487106323; Test perplexity : 3.5514698028564453


54it [01:24,  1.57s/it]


Finished epoch 41; Train loss : 1.9758613109588623; Train perplexity : 8.902186393737793


0it [00:00, ?it/s]

Test loss : 0.33758771419525146; Test perplexity : 3.514464855194092


54it [01:24,  1.57s/it]


Finished epoch 42; Train loss : 1.9720653295516968; Train perplexity : 8.872456550598145


0it [00:00, ?it/s]

Test loss : 0.3365648090839386; Test perplexity : 3.4768776893615723


54it [01:24,  1.57s/it]


Finished epoch 43; Train loss : 1.991978645324707; Train perplexity : 9.085957527160645


0it [00:00, ?it/s]

Test loss : 0.33215248584747314; Test perplexity : 3.3222453594207764


54it [01:24,  1.57s/it]


Finished epoch 44; Train loss : 1.9201436042785645; Train perplexity : 8.329696655273438


0it [00:00, ?it/s]

Test loss : 0.3341189920902252; Test perplexity : 3.3929595947265625


54it [01:24,  1.57s/it]


Finished epoch 45; Train loss : 1.8784947395324707; Train perplexity : 7.921506404876709


0it [00:00, ?it/s]

Test loss : 0.33578673005104065; Test perplexity : 3.455756664276123


54it [01:24,  1.57s/it]


Finished epoch 46; Train loss : 1.8443560600280762; Train perplexity : 7.603034973144531


0it [00:00, ?it/s]

Test loss : 0.3384881019592285; Test perplexity : 3.557844638824463


45it [01:10,  1.56s/it]

# Testing model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class Decoder:
    def __init__(self, encoder: tf.keras.Model,
                 decoder: tf.keras.Model,
                 start_token: int,
                 end_token: int,
                 max_len: int = 10, ):
        self.encoder = encoder
        self.decoder = decoder
        self.max_len = max_len
        self.start_token = start_token
        self.end_token = end_token

    def decode(self, input):
        pass


class GreedyDecoder(Decoder):
    def __init__(self, encoder: tf.keras.Model,
                 decoder: tf.keras.Model,
                 start_token: int,
                 end_token: int,
                 max_len: int = 10,
                 ):
        super().__init__(encoder,
                         decoder,
                         start_token,
                         end_token,
                         max_len
                         )

    def decode(self, input, max_len_output=50):
        initial_state = self.encoder(np.expand_dims(input, axis=0))

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.start_token

        res = []
        while True:
            # Sample a token
            output, initial_state = self.decoder(target_seq,initial_state)
            sampled_token_index = np.argmax(output)

            if len(res) > max_len_output or sampled_token_index == self.end_token:
                break

            res.append(sampled_token_index)

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

       
        return res


In [None]:
class BeamSearchDecoder(Decoder):
    def __init__(self, encoder: tf.keras.Model,
                 decoder: tf.keras.Model,
                 start_token: int,
                 end_token: int,
                 max_len: int = 10,
                 ):
        super().__init__(encoder,
                         decoder,
                         start_token,
                         end_token,
                         max_len
                         )

    def decode(self, input, beam_size=3):
        start = [self.start_token]
        initial_state = self.encoder(np.expand_dims(input,axis=0))


        start_word = [[start, 0.0, initial_state]]

        while len(start_word[0][0]) < self.max_len:
            temp = []
            for s in start_word:
                target_seq = np.array([[s[0][-1]]])
                initial_state = s[-1]
                output, initial_state= self.decoder(target_seq, initial_state)
                output = np.hstack(output)
                output = tf.nn.softmax(output).numpy()
                word_preds = np.argsort(output)[-beam_size:]

                for w in word_preds:
                    next_cap, prob = s[0][:], s[1]
                    next_cap.append(w)
                    prob += output[w]
                    temp.append([next_cap, prob, initial_state])

            start_word = temp
            # Sorting according to the probabilities
            start_word = sorted(start_word, reverse=False, key=lambda l: l[1])
            # Getting the top words
            start_word = start_word[-beam_size:]

        start_word = start_word[-1][0]

        final_caption = []

        for i in start_word:
            if i != self.end_token:
                final_caption.append(i)
            else:
                break
        return final_caption[1:]

In [None]:
def predict(data, decoder, inverse_vocab,
           beam_size=None):
    if beam_size:
        result = decoder.decode(data, beam_size=beam_size)
    else:
        result = decoder.decode(data)
    return ' '.join([inverse_vocab[i] for i in result])

In [None]:
greedy_decoder = GreedyDecoder(conv_encoder, lstm_decoder, vocab[start_token], vocab[end_token],
             max_len=39)

In [None]:
beam_decoder = BeamSearchDecoder(conv_encoder, lstm_decoder, vocab[start_token], vocab[end_token],
             max_len=39)

In [None]:
sample = np.random.choice(images)

In [None]:
plt.imshow(sample)

In [None]:
predict(sample, greedy_decoder, inverse_vocab)

In [None]:
predict(sample, beam_decoder, inverse_vocab, beam_size=3)