# importing all libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, TimeDistributed, LSTM, Input, BatchNormalization, Conv2D, MaxPooling2D, Reshape, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.losses import sparse_categorical_crossentropy
from keras.losses import categorical_crossentropy
import tensorflow_hub as hub
from PIL import Image
import gzip
from nltk.translate.bleu_score import corpus_bleu
from keras.preprocessing.sequence import pad_sequences
import spacy
import h5py
import os
import cv2
import pickle
import re
import shutil
import glob
import gzip
%matplotlib inline

In [None]:
import time

# creating path variables

In [None]:
# Training inputs, all addressed through the Google Drive mount at /content/drive.
# NOTE(review): path_gloss_df is not read anywhere in the visible notebook — confirm it is still needed.
path_gloss_df = '/content/drive/MyDrive/Colab Notebooks/gloss_vector_dataframe.zip'
# HDF5 file holding the training feature vectors (opened with h5py further down).
path_embedding_images = '/content/drive/MyDrive/Colab Notebooks/emb_comp_i3d_train_zero.h5'
# gzip-compressed pickle holding the training annotations.
path_gloss_files = '/content/drive/MyDrive/Colab Notebooks/phoenix14t.pami0.train.annotations_only.gzip'

In [None]:
# gzip-compressed pickles holding the dev and test annotations.
path_dev_gloss = '/content/drive/MyDrive/Colab Notebooks/phoenix14t.pami0.dev.annotations_only.gzip'
path_test_gloss = '/content/drive/MyDrive/Colab Notebooks/phoenix14t.pami0.test.annotations_only.gzip'

In [None]:
# HDF5 files holding the dev and test feature vectors (opened with h5py further down).
path_embedding_dev = '/content/drive/MyDrive/Colab Notebooks/emb_comp_i3d_dev_zero.h5'
path_embedding_test = '/content/drive/MyDrive/Colab Notebooks/emb_comp_i3d_test_zero.h5'

In [None]:
# Backslash-prefixed file names of the dev/test embedding files.
# NOTE(review): neither name is referenced again in the visible notebook; the
# files are opened through path_embedding_dev / path_embedding_test instead.
dev_img_embedding = "\\emb_comp_i3d_dev_zero.h5"
test_img_embedding = "\\emb_comp_i3d_test_zero.h5"

# loading annotations into dataframes

In [None]:
# Read the gzip-compressed pickle with the training annotations.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at files from a trusted source.
with gzip.open(path_gloss_files,'rb') as f:
  annotation_gloss = pickle.load(f)

In [None]:
# Read the gzip-compressed pickle with the dev annotations.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at files from a trusted source.
with gzip.open(path_dev_gloss,'rb') as f:
  annotation_dev = pickle.load(f)

In [None]:
# Read the gzip-compressed pickle with the test annotations.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at files from a trusted source.
with gzip.open(path_test_gloss,'rb')as f :
  annotation_test = pickle.load(f)

In [None]:
annotation_gloss = pd.DataFrame(annotation_gloss)

In [None]:
annotation_gloss_dev = pd.DataFrame(annotation_dev)



In [None]:
annotation_gloss_test = pd.DataFrame(annotation_test)

In [None]:
annotation_gloss.head()

In [None]:
annotation_gloss_dev.head()

In [None]:
annotation_gloss_test.head()

In [None]:
# we can use values attribute to directly create a list of all sentences of a particular column
#annotation_gloss_test["text"].values

# preprocessing the vocabulary

In [None]:
# Lower-case the sentence columns at positions 2 and 3 and wrap every sentence
# in 'startseq ' / ' endseq' markers, as whole-column operations instead of
# one .iloc write per cell.
for column in (2, 3):
    lowered = annotation_gloss.iloc[:, column].str.lower()
    # Rows that already carry both markers are left alone, so re-running this
    # cell does not stack a second 'startseq ' / ' endseq' pair.
    already_wrapped = lowered.str.startswith('startseq ') & lowered.str.endswith(' endseq')
    wrapped = lowered.where(already_wrapped, 'startseq ' + lowered + ' endseq')
    # .to_numpy() makes the write purely positional (no index alignment).
    annotation_gloss.iloc[:, column] = wrapped.to_numpy()

# Drop the ' .' tokens from column 3. regex=False keeps the match literal;
# as a regular expression ' .' would mean "space followed by any character".
annotation_gloss.iloc[:, 3] = (
    annotation_gloss.iloc[:, 3].str.replace(' .', '', regex=False).to_numpy()
)
 

In [None]:
# Lower-case the sentence columns at positions 2 and 3 of the dev frame and
# wrap every sentence in 'startseq ' / ' endseq' markers, as whole-column
# operations instead of one .iloc write per cell.
for column in (2, 3):
    lowered = annotation_gloss_dev.iloc[:, column].str.lower()
    # Rows that already carry both markers are left alone, so re-running this
    # cell does not stack a second 'startseq ' / ' endseq' pair.
    already_wrapped = lowered.str.startswith('startseq ') & lowered.str.endswith(' endseq')
    wrapped = lowered.where(already_wrapped, 'startseq ' + lowered + ' endseq')
    # .to_numpy() makes the write purely positional (no index alignment).
    annotation_gloss_dev.iloc[:, column] = wrapped.to_numpy()

# Drop the ' .' tokens from column 3. regex=False keeps the match literal;
# as a regular expression ' .' would mean "space followed by any character".
annotation_gloss_dev.iloc[:, 3] = (
    annotation_gloss_dev.iloc[:, 3].str.replace(' .', '', regex=False).to_numpy()
)

In [None]:
# Lower-case the sentence columns at positions 2 and 3 of the test frame and
# wrap every sentence in 'startseq ' / ' endseq' markers, as whole-column
# operations instead of one .iloc write per cell.
for column in (2, 3):
    lowered = annotation_gloss_test.iloc[:, column].str.lower()
    # Rows that already carry both markers are left alone, so re-running this
    # cell does not stack a second 'startseq ' / ' endseq' pair.
    already_wrapped = lowered.str.startswith('startseq ') & lowered.str.endswith(' endseq')
    wrapped = lowered.where(already_wrapped, 'startseq ' + lowered + ' endseq')
    # .to_numpy() makes the write purely positional (no index alignment).
    annotation_gloss_test.iloc[:, column] = wrapped.to_numpy()

# Drop the ' .' tokens from column 3. regex=False keeps the match literal;
# as a regular expression ' .' would mean "space followed by any character".
annotation_gloss_test.iloc[:, 3] = (
    annotation_gloss_test.iloc[:, 3].str.replace(' .', '', regex=False).to_numpy()
)

# creating a vocabulary of texts

In [None]:
# Collect every whitespace-separated word of the text and gloss columns.
# Visiting order is unchanged: train, dev, test — text first, then gloss.
vocabulary = []
for frame in (annotation_gloss, annotation_gloss_dev, annotation_gloss_test):
    for sentences in (frame.text.values, frame.gloss.values):
        for sentence in sentences:
            vocabulary += sentence.split()

# The list keeps duplicates; only the printed size is de-duplicated.
print('Vocabulary Size: %d' % len(set(vocabulary)))

# reading image embeddings for train, test and dev

In [None]:
# Load the training feature vectors stored under the 'x_1' key.
with h5py.File(path_embedding_images, 'r') as h5_file:
    ind = h5_file.keys()
    print(ind)
    # [()] copies the whole dataset into memory before the file is closed.
    data_train = h5_file['x_1'][()]

In [None]:
# Load the dev feature vectors stored under the 'x_1' key.
with h5py.File(path_embedding_dev, 'r') as h5_file:
    ind = h5_file.keys()
    print(ind)
    # [()] copies the whole dataset into memory before the file is closed.
    data_dev = h5_file['x_1'][()]

In [None]:
# Load the test feature vectors stored under the 'x_1' key.
with h5py.File(path_embedding_test, 'r') as h5_file:
    ind = h5_file.keys()
    print(ind)
    # [()] copies the whole dataset into memory before the file is closed.
    data_test = h5_file['x_1'][()]

In [None]:
data_train.shape

In [None]:
data_dev.shape

In [None]:
data_test.shape

# creating tokenizer dataset


In [None]:
# Plain Python lists of the training sentences: column 2 feeds gloss_token,
# column 3 feeds text_token (same positional columns the preprocessing uses).
gloss_token = list(annotation_gloss.iloc[:, 2])
text_token = list(annotation_gloss.iloc[:, 3])

In [None]:
# Plain Python lists of the dev sentences: column 2 feeds gloss_token_dev,
# column 3 feeds text_token_dev.
gloss_token_dev = list(annotation_gloss_dev.iloc[:, 2])
text_token_dev = list(annotation_gloss_dev.iloc[:, 3])

In [None]:
# Plain Python lists of the test sentences: column 2 feeds gloss_token_test,
# column 3 feeds text_token_test.
gloss_token_test = list(annotation_gloss_test.iloc[:, 2])
text_token_test = list(annotation_gloss_test.iloc[:, 3])

# creating tokenizer

In [None]:
def create_tokenizer(vocabulary, num_words=3606):
    """Fit a Keras Tokenizer on `vocabulary` and register a '<pad>' token at id 0.

    Args:
        vocabulary: iterable of texts handed to `Tokenizer.fit_on_texts`.
        num_words: value forwarded to the Tokenizer's `num_words` argument.
            Defaults to 3606, the value that used to be hard-coded here.

    Returns:
        The fitted tokenizer, with '<unk>' as its out-of-vocabulary token and
        '<pad>' mapped to index 0 in both `word_index` and `index_word`.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        oov_token="<unk>",
        # Raw string: the filter set contains a literal backslash, and '\]' in
        # a normal string literal is an invalid escape sequence. The
        # characters are exactly the same as before.
        filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(vocabulary)
    # Id 0 is what the padding step fills with; give it a printable name.
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    return tokenizer

In [None]:
tokenizer = create_tokenizer(vocabulary)

In [None]:
vocab_size = len(tokenizer.word_index)+1
print("vocalb size",vocab_size)

In [None]:
tokenizer.index_word[3]

In [None]:
def encode_sequences(tokenizer, length, lines):
    """Convert `lines` to integer id sequences of exactly `length` entries.

    Each line is mapped through `tokenizer.texts_to_sequences`; shorter
    sequences are filled with 0 at the end (padding='post').
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

# tokenizing train,test,dev glosses

In [None]:
# Encode the gloss sentences of each split as integer sequences padded to 35
# entries; 35 is also the decoding limit (max_length) used in evaluate().
train_y = encode_sequences(tokenizer,35,gloss_token)
dev_y = encode_sequences(tokenizer,35,gloss_token_dev)
test_y = encode_sequences(tokenizer,35,gloss_token_test)

In [None]:
train_y[0]

# hyperparameters

In [None]:
BATCH_SIZE = 128
# NOTE(review): BUFFER_SIZE is not used anywhere in the visible notebook
# (the dataset below is never shuffled).
BUFFER_SIZE = 7096
embedding_dim = 256
units = 512
vocab_size = len(tokenizer.word_index) + 1
# Full batches per epoch, derived from the loaded training set instead of a
# hard-coded 7096 so the per-epoch loss average stays correct if the data
# changes. The dataset is batched with drop_remainder=True, hence floor division.
num_steps = len(data_train) // BATCH_SIZE
features_shape = 512
attention_features_shape = 35

In [None]:
data_train.shape

In [None]:
# for i in train_y[2]:
#   print(tokenizer.index_word[i])

In [None]:
# Pair every training feature vector with its encoded gloss sequence and cut
# the stream into batches of BATCH_SIZE; an incomplete last batch is dropped.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((data_train, train_y))
    .batch(BATCH_SIZE, drop_remainder=True)
)

# dataset shape = ((128,512),(128,35))
# dataset shape = ((batchsize,feature size),(batch size ,sentence size))

In [None]:
# Sanity check: show the contents and shapes of the first batch only.
for batch, (img_tensor, target) in enumerate(dataset.take(1)):
    print("batch", batch)

    print("img_tensor", img_tensor)
    print("img tensrog shape", img_tensor.shape)
    print("target", target)
    print("tartghet shape", target.shape)

# creating model for decoder

In [None]:
class Rnn_Local_Decoder(tf.keras.Model):
    """LSTM decoder that predicts one token per call.

    The encoder output (`features`), the hidden state (`hidden`, initialised
    to zeros by `reset_state`) and the decoder input (`x`, the start token on
    the first step) are passed to the decoder on every step.
    """

    def __init__(self, embedding_dim, units, vocab_size):
        """Create the layers.

        Args:
            embedding_dim: size of the token embedding.
            units: width of the LSTM, of `fc1` and of the attention projections.
            vocab_size: number of token ids; also the size of the output logits.
        """
        super(Rnn_Local_Decoder, self).__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.units,
                                         activation='tanh',
                                         recurrent_activation='sigmoid',
                                         use_bias=True,
                                         kernel_initializer='glorot_uniform',
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform')

        self.fc1 = tf.keras.layers.Dense(self.units)

        # NOTE(review): a rate of 0.9 drops 90% of the activations whenever
        # dropout is active — confirm this is intended.
        self.dropout = tf.keras.layers.Dropout(0.9, noise_shape=None, seed=None)
        self.batchnormalization = tf.keras.layers.BatchNormalization(
            axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True,
            beta_initializer='zeros', gamma_initializer='ones',
            moving_mean_initializer='zeros', moving_variance_initializer='ones',
            beta_regularizer=None, gamma_regularizer=None,
            beta_constraint=None, gamma_constraint=None)

        self.fc2 = tf.keras.layers.Dense(vocab_size)

        # Attention projections: e = Vattn(tanh(Uattn(features) + Wattn(hidden)))
        self.Uattn = tf.keras.layers.Dense(units)
        self.Wattn = tf.keras.layers.Dense(units)
        self.Vattn = tf.keras.layers.Dense(1)

    def call(self, x, features, hidden, training=None):
        """Run one decoding step.

        Args:
            x: ids of the previous tokens, one per sample — (batch, 1) in the
                training and evaluation loops of this notebook.
            features: encoder output, one vector per sample (batch, feature_dim).
            hidden: previous hidden state (batch, units); only used for the
                attention scores, it is not fed to the LSTM as initial state.
            training: optional flag forwarded to Dropout and BatchNormalization.
                When omitted, Keras decides the mode, exactly as before this
                argument existed.
                NOTE(review): the training loop in this notebook does not pass
                it, so these two layers presumably run in inference mode while
                training — confirm for the installed Keras version.

        Returns:
            (logits, state, attention_weights): logits of shape
            (batch * steps, vocab_size), the LSTM hidden state (batch, units)
            and the attention weights (batch, units, 1).
        """
        # (batch, units) energies; every one of the `units` activations is
        # scored as its own attention position -> score is (batch, units, 1).
        # expand_dims replaces a reshape to (batch, 512, 1) that only worked
        # for units == 512 and a statically known batch size.
        energies = tf.nn.tanh(self.Uattn(features) + self.Wattn(hidden))
        score = self.Vattn(tf.expand_dims(energies, -1))

        # attention_weights(alpha(ij)) = softmax(e(ij)) over the position axis
        attention_weights = tf.nn.softmax(score, axis=1)
        # float64 so the weights can be multiplied with `features`
        # (assumes features is float64 — TODO confirm).
        weights64 = tf.cast(attention_weights, dtype=tf.float64)

        # This used to broadcast (batch, units, 1) * (batch, 1, feature_dim)
        # into a (batch, units, feature_dim) tensor and sum over axis 1. That
        # is features[b, j] * sum_i(weights[b, i]); summing the weights first
        # gives the same value without the large intermediate tensor.
        # NOTE(review): the softmax weights sum to 1, so context_vector equals
        # `features` and the attention does not re-weight anything. Fixing
        # that is a modelling decision and is left to the author.
        context_vector = features * tf.reduce_sum(weights64, axis=1)

        # (batch, 1, embedding_dim), cast to match the context vector dtype
        embedded = tf.cast(self.embedding(x), dtype=tf.float64)
        lstm_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # With return_state=True an LSTM returns three values: the output
        # sequence, the hidden state and the cell state. Unpacking into two
        # names raised "too many values to unpack".
        output, state, _cell_state = self.lstm(lstm_input)

        # (batch, steps, units) -> (batch * steps, units)
        x = self.fc1(output)
        x = tf.reshape(x, (-1, x.shape[2]))

        x = self.dropout(x, training=training)
        x = self.batchnormalization(x, training=training)

        # (batch * steps, vocab_size) logits
        x = self.fc2(x)
        return x, state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero hidden state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))


decoder = Rnn_Local_Decoder(embedding_dim, units, vocab_size)
# decoder.build((features,hidden))
# decoder.build_graph().summary()

In [None]:
# tf.keras.utils.plot_model(
#     decoder.build_graph(),                      # here is the trick (for now)
#     to_file='\content\drive\MyDrive\Colab Notebooks\model.png', dpi=96,              # saving  
#     show_shapes=True, show_layer_names=True,  # show shapes and layer name
#     expand_nested=False                       # will show nested block
# )

 # loss

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
# reduction='none' keeps one loss value per sample so that padded positions
# can be zeroed out in loss_function before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    """Masked sparse cross-entropy for one decoding step.

    Positions whose target id is 0 (padding) contribute a loss of zero; the
    mean is still taken over every position, padded ones included.
    """
    per_sample = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_sample.dtype)
    return tf.reduce_mean(per_sample * keep)

# training

In [None]:
temp = tf.expand_dims([tokenizer.word_index['startseq']] * BATCH_SIZE, 1)

In [None]:
temp.shape

In [None]:
loss_plot = []

@tf.function
def train_step(img_tensor, target):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        img_tensor: batch of encoder features handed to the decoder unchanged.
        target: batch of encoded target sequences (batch, sequence_length).

    Returns:
        (loss, total_loss): the loss summed over all decoding steps, and that
        sum divided by the target sequence length.
    """
    batch_size = target.shape[0]
    sequence_length = target.shape[1]
    loss = 0

    # Fresh zero state for every batch: the sequences of different samples
    # are unrelated. Shape is (batch_size, decoder.units).
    hidden = decoder.reset_state(batch_size=batch_size)
    # Every sequence starts from the 'startseq' token.
    dec_input = tf.expand_dims([tokenizer.word_index['startseq']] * batch_size, 1)

    with tf.GradientTape() as tape:
        for step in range(1, sequence_length):
            predictions, hidden, _ = decoder(dec_input, img_tensor, hidden)
            loss += loss_function(target[:, step], predictions)
            # Teacher forcing: the ground-truth token is the next input.
            dec_input = tf.expand_dims(target[:, step], 1)

    total_loss = loss / int(sequence_length)

    trainable_variables = decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [None]:

EPOCHS = 10
for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0

    for batch, (img_tensor, target) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        # Progress line every 100 batches, normalised by the sequence length.
        if batch % 100 == 0:
            step_loss = batch_loss.numpy() / int(target.shape[1])
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, step_loss))

    # Mean per-batch loss of this epoch; stored for the loss plot.
    epoch_loss = total_loss / num_steps
    loss_plot.append(epoch_loss)

    print('Epoch {} Loss {:.6f}'.format(epoch + 1, epoch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# One point per epoch: the mean per-batch loss collected by the training loop.
fig, ax = plt.subplots()
ax.plot(loss_plot)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Plot')
plt.show()

# evaluate using greedy

In [None]:
def evaluate(vid, max_length=35):
    """Greedy-decode a single sample into a list of tokens.

    Args:
        vid: feature vector of one sample; a batch axis of size 1 is added
            here before it is handed to the decoder.
        max_length: maximum number of tokens to generate. Defaults to 35,
            the value that used to be hard-coded (and the padded sequence
            length used when encoding the targets).

    Returns:
        The predicted tokens in order. The list ends with 'endseq' when the
        decoder produced it; otherwise it holds exactly `max_length` tokens
        and no 'endseq'.
    """
    hidden = decoder.reset_state(batch_size=1)
    features = tf.expand_dims(vid, 0)
    dec_input = tf.expand_dims([tokenizer.word_index['startseq']], 1)
    result = []

    for _ in range(max_length):
        predictions, hidden, _attention_weights = decoder(dec_input, features, hidden)
        # Greedy choice: the highest-scoring token id of the only sample.
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = tokenizer.index_word[predicted_id]
        result.append(word)

        if word == 'endseq':
            break

        # The prediction becomes the next decoder input, shape (1, 1).
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [None]:
tokenizer.word_index['startseq']

In [None]:
#!pip install jiwer

In [None]:
import jiwer

In [None]:
from nltk.translate.bleu_score import SmoothingFunction
smoothie = SmoothingFunction().method4

In [None]:
# Greedy-decode every dev sample, print prediction / reference / WER per
# sample, then report corpus-level BLEU-1..4.
# NOTE(review): the decoder is trained on gloss targets (train_y is built from
# gloss_token) while the references here come from text_token_dev — confirm
# that gloss_token_dev is not the intended reference.
actual, predicted = [], []
for i in range(len(data_dev)):
    result = evaluate(data_dev[i])

    # Build filtered copies instead of calling list.remove() while iterating,
    # which skips the element that follows every removed one.
    reference_words = [word for word in text_token_dev[i].split(' ')
                       if word not in ('startseq', 'endseq')]
    # Filtering 'endseq' by value (instead of cutting off the last word) keeps
    # the final real word when decoding stopped at the length limit.
    predicted_words = [word for word in result if word not in ('<unk>', 'endseq')]

    first = ' '.join(reference_words)
    result_final = ' '.join(predicted_words)
    print("predicted sentence", result_final)
    print("actual sentence", first)
    actual.append(first)
    predicted.append(result_final)
    print("word error rate is", jiwer.wer(first, result_final))

# corpus_bleu expects token lists: one list of reference token lists per
# hypothesis. Passing raw strings makes it score individual characters.
references = [[sentence.split()] for sentence in actual]
hypotheses = [sentence.split() for sentence in predicted]

print('BLEU-1: %f' % corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0), smoothing_function=smoothie))
print('BLEU-2: %f' % corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
# Uniform 1/3 weights: (0.3, 0.3, 0.3) does not sum to 1.
print('BLEU-3: %f' % corpus_bleu(references, hypotheses, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothie))
print('BLEU-4: %f' % corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

In [None]:

# Decode one dev sample and score it against its reference sentence.
sample_index = 125
image = data_dev[sample_index]
result = evaluate(image)

real_caption = text_token_dev[sample_index]

# Strip the sequence markers from the reference and '<unk>' / 'endseq' from
# the prediction by building filtered lists. The earlier version called
# .remove() on a str and on a list being iterated, and left 'endseq' in the
# reference.
reference_tokens = [word for word in real_caption.split(' ')
                    if word not in ('startseq', 'endseq')]
candidate_tokens = [word for word in result if word not in ('<unk>', 'endseq')]

first = ' '.join(reference_tokens)
result_final = ' '.join(candidate_tokens)

print("final result ", result_final)
print("actaul ", first)

# corpus_bleu expects token lists (one list of references per hypothesis);
# raw strings would be scored character by character.
reference = [[reference_tokens]]
candidate = [candidate_tokens]
# Lengths are reported in tokens.
print("ref length", len(reference_tokens))
print("candidate length", len(candidate_tokens))

print('BLEU-1: %f' % corpus_bleu(reference, candidate, weights=(1.0, 0, 0, 0), smoothing_function=smoothie))
print('BLEU-2: %f' % corpus_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
# Uniform 1/3 weights: (0.3, 0.3, 0.3) does not sum to 1.
print('BLEU-3: %f' % corpus_bleu(reference, candidate, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothie))
print('BLEU-4: %f' % corpus_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

print('Real Caption:', real_caption)
print('Prediction Caption:', result_final)


In [None]:
# captions on the validation set
# rid = np.random.randint(0, len(img_name_val))
# image = '/content/gdrive/My Drive/FLICKR8K/Flicker8k_Dataset/2319175397_3e586cfaf8.jpg'
# NOTE(review): this cell repeats the single-sample evaluation of the cell
# before it; consider keeping only one of the two.
sample_index = 125
image = data_dev[sample_index]
result = evaluate(image)

real_caption = text_token_dev[sample_index]

# Strip the sequence markers from the reference and '<unk>' / 'endseq' from
# the prediction by building filtered lists. The earlier version called
# .remove() on a str and on a list being iterated, and left 'endseq' in the
# reference.
reference_tokens = [word for word in real_caption.split(' ')
                    if word not in ('startseq', 'endseq')]
candidate_tokens = [word for word in result if word not in ('<unk>', 'endseq')]

first = ' '.join(reference_tokens)
result_final = ' '.join(candidate_tokens)

print("final result ", result_final)
print("actaul ", first)

# corpus_bleu expects token lists (one list of references per hypothesis);
# raw strings would be scored character by character.
reference = [[reference_tokens]]
candidate = [candidate_tokens]
# Lengths are reported in tokens.
print("ref length", len(reference_tokens))
print("candidate length", len(candidate_tokens))

print('BLEU-1: %f' % corpus_bleu(reference, candidate, weights=(1.0, 0, 0, 0), smoothing_function=smoothie))
print('BLEU-2: %f' % corpus_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
# Uniform 1/3 weights: (0.3, 0.3, 0.3) does not sum to 1.
print('BLEU-3: %f' % corpus_bleu(reference, candidate, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothie))
print('BLEU-4: %f' % corpus_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

print('Real Caption:', real_caption)
print('Prediction Caption:', result_final)
#plot_attention(image, result, attention_plot)

In [None]:
# def plot_attention(image, result, attention_plot):
#    temp_image = np.array(Image.open(image))
#    fig = plt.figure(figsize=(10, 10))
#    len_result = len(result)
#    for l in range(len_result):
#        temp_att = np.resize(attention_plot[l], (8, 8))
#        ax = fig.add_subplot(len_result//2, len_result//2, l+1)
#        ax.set_title(result[l])
#        img = ax.imshow(temp_image)
#        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

#    plt.tight_layout()
#    plt.show()

In [None]:
# captions on the validation set
# rid = np.random.randint(0, 512)
# image = data_dev[0]

# NOTE(review): scratch attempt at evaluating the dev set in slices of 128.
# As written this cell cannot run to completion:
#   * real_caption is built as a list below, but .split() is called on it;
#   * evaluate() is handed a slice of up to 128 samples although it creates
#     its state with batch_size=1;
#   * sentence_bleu is not imported anywhere in the visible notebook.
# The loop also stops after its first iteration (see the `break`) and the
# upper bound 519 is hard-coded.
for k in range(0,519,128):
  print(text_token_dev[k])
  # for j in text_token_dev[i].split(' '):
  #   print(j) 

  #real_caption = ' '.join([j for j in text_token_dev[k].split(' ')])

  # Copy of the (up to) 128 reference sentences of this slice — a list of str.
  real_caption = []
  for j in text_token_dev[k:k+128]:
    real_caption.append(j)
  #print("real caption shape",len(real_caption))
  

  #print("data_dev.shape",data_dev[k:k+128].shape)


  #print("real_caption: ",real_caption)
  #print("data dev shape",data_dev[k].shape)
  # NOTE(review): passes a whole slice; evaluate() is written for one sample.
  result = evaluate(data_dev[k:k+128,:])
  #print("result length",len(result))

  # remove <start> and <end> from the real_caption
  # NOTE(review): real_caption is a list here, so this raises AttributeError.
  first = real_caption.split(' ', 1)[1]
  #real_caption = 'Two white dogs are playing in the snow'

  #remove "<unk>" in result
  # NOTE(review): removing from a list while iterating over it skips the
  # element that follows each removed one.

  for i in result:
    if i=="<unk>":
        result.remove(i)

  for i in real_caption:
    if i=="<unk>":
        real_caption.remove(i)

  #remove <end> from result        
  result_join = ' '.join(result)
  # Cuts off the last space-separated word of the joined prediction.
  result_final = result_join.rsplit(' ', 1)[0]

  real_appn = []
  # NOTE(review): same list-has-no-split problem as above.
  real_appn.append(real_caption.split())
  reference = real_appn
  candidate = result
  # Only the first slice (k == 0) is ever processed.
  break
  


# NOTE(review): sentence_bleu is not imported in the visible notebook.
score = sentence_bleu(reference, candidate)
print('BLEU-1: %f' % corpus_bleu(reference, candidate, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(reference, candidate, weights=(0.3, 0.3, 0.3, 0)))
print('BLEU-4: %f' % corpus_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))) 
print(f"BELU score: {score*100}")

print ('Real Caption:', real_caption)
print ('Prediction Caption:', result_final)
#plot_attention(image, result, attention_plot)

# test set

In [None]:
# Greedy-decode every test sample, print prediction / reference / WER per
# sample, then report corpus-level BLEU-1..4.
# NOTE(review): the decoder is trained on gloss targets (train_y is built from
# gloss_token) while the references here come from text_token_test — confirm
# that gloss_token_test is not the intended reference.
actual, predicted = [], []
for i in range(len(data_test)):
    result = evaluate(data_test[i])

    # Build filtered copies instead of calling list.remove() while iterating,
    # which skips the element that follows every removed one.
    reference_words = [word for word in text_token_test[i].split(' ')
                       if word not in ('startseq', 'endseq')]
    # Filtering 'endseq' by value (instead of cutting off the last word) keeps
    # the final real word when decoding stopped at the length limit.
    predicted_words = [word for word in result if word not in ('<unk>', 'endseq')]

    first = ' '.join(reference_words)
    result_final = ' '.join(predicted_words)
    print("predicted sentence", result_final)
    print("actual sentence", first)
    actual.append(first)
    predicted.append(result_final)
    print("word error rate is", jiwer.wer(first, result_final))

# corpus_bleu expects token lists: one list of reference token lists per
# hypothesis. Passing raw strings makes it score individual characters.
references = [[sentence.split()] for sentence in actual]
hypotheses = [sentence.split() for sentence in predicted]

print('BLEU-1: %f' % corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0), smoothing_function=smoothie))
print('BLEU-2: %f' % corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
# Uniform 1/3 weights: (0.3, 0.3, 0.3) does not sum to 1.
print('BLEU-3: %f' % corpus_bleu(references, hypotheses, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothie))
print('BLEU-4: %f' % corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

In [None]:
# NOTE(review): this cell looks like a leftover from an image-captioning
# workflow and does not fit the rest of the visible notebook:
#   * img_name_val and cap_val are not defined in any visible cell;
#   * plot_attention exists only as commented-out code;
#   * evaluate() returns a single list of tokens, not a
#     (result, attention_plot) pair.
# Confirm whether it should be removed or ported to data_dev / text_token_dev.
rid = np.random.randint(0, len(img_name_val))
image = img_name_val[rid]
start = time.time()
# Rebuild the reference sentence from token ids, skipping padding id 0.
real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])
result, attention_plot = evaluate(image)

# Drop the first and the last space-separated word of the reference
# (presumably the start/end markers — confirm).
first = real_caption.split(' ', 1)[1]
real_caption = first.rsplit(' ', 1)[0]

#remove "<unk>" in result
# NOTE(review): removing from a list while iterating over it skips the
# element that follows each removed one.
for i in result:
   if i=="<unk>":
       result.remove(i)

#remove <end> from result        
result_join = ' '.join(result)
# Cuts off the last space-separated word of the joined prediction.
result_final = result_join.rsplit(' ', 1)[0]

real_appn = []
real_appn.append(real_caption.split())
reference = real_appn
candidate = result_final

print ('Real Caption:', real_caption)
print ('Prediction Caption:', result_final)

plot_attention(image, result, attention_plot)
print(f"time took to Predict: {round(time.time()-start)} sec")

Image.open(img_name_val[rid])