LEARNING DEEP LEARNING


Theory and practice of Neural Networks, Computer Vision, Natural Language Processing and Transformers Using TensorFlow

In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Attention
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import gzip
import logging
import os
import zipfile

tf.get_logger().setLevel(logging.ERROR)

TRAINING_FILE_DIR = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Image Caption/data/coco/'
OUTPUT_FILE_DIR = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Image Caption/tf_data/feature_vectors/'

!wget http://images.cocodataset.org/zips/train2014.zip

!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip



local_zip = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Image Caption/data/train2014.zip'

zip_ref = zipfile.ZipFile(local_zip, 'r')

zip_ref.extractall(TRAINING_FILE_DIR)
zip_ref.close()



local_zip = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Image Caption/data/annotations_trainval2014.zip'

zip_ref = zipfile.ZipFile(local_zip, 'r')

zip_ref.extractall(TRAINING_FILE_DIR)
zip_ref.close()

In [2]:
with open(TRAINING_FILE_DIR + 'annotations/captions_train2014.json') as json_file:
    data = json.load(json_file)

image_dict ={}

for image in data['images']:
  image_dict[image['id']]=[image['file_name']]

for anno in data['annotations']:
  image_dict[anno['image_id']].append(anno['caption'])

In [3]:
model=VGG19(weights='imagenet')
model.summary()

Model: "vgg19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [4]:
model_new =Model(inputs=model.input,
                 outputs=model.get_layer('block5_conv4').output)

model_new.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

!mkdir /tf_data
!mkdir /tf_data/feature_vectors

In [5]:
train_data_fnames = os.listdir( TRAINING_FILE_DIR + 'train2014')
print(train_data_fnames[:10])

['COCO_train2014_000000000009.jpg', 'COCO_train2014_000000000025.jpg', 'COCO_train2014_000000000030.jpg', 'COCO_train2014_000000000034.jpg', 'COCO_train2014_000000000036.jpg', 'COCO_train2014_000000000049.jpg', 'COCO_train2014_000000000061.jpg', 'COCO_train2014_000000000064.jpg', 'COCO_train2014_000000000071.jpg', 'COCO_train2014_000000000072.jpg']


In [7]:


READ_IMAGES = 90000

for i, key in enumerate(image_dict.keys()):
  if i % 1000 == 0:
    print('progress: ' + str(i) + 'images processed')

  if i == READ_IMAGES:
      break

  item = image_dict.get(key)
  filename = TRAINING_FILE_DIR + 'train2014/'+ item[0]

  image = load_img(filename)
  width = image.size[0]
  height = image.size[1]

  if height > width:
    image = load_img(filename, target_size=(int(height/width*256),256))
  else:
    image = load_img(filename, target_size=(256,int(width/height*256)))
  
  width = image.size[0]
  height = image.size[1]
  image_np = img_to_array(image)

  h_start = int((height-224)/2)
  w_start = int((width-224)/2)
  image_np = image_np[h_start:h_start+224,
                      w_start:w_start+224]

  image_np = np.expand_dims(image_np, axis=0)

  X = preprocess_input(image_np)
  Y = model_new.predict(X)
  save_filename = OUTPUT_FILE_DIR + item[0] + '.pickle.gzip'
  pickle_file = gzip.open(save_filename, 'wb')
  pickle.dump(Y[0],pickle_file)
  pickle_file.close()

save_filename = OUTPUT_FILE_DIR + 'caption_file.pickle.gz'
pickle_file = gzip.open(save_filename, 'wb')
pickle.dump(image_dict,pickle_file)
pickle_file.close()




progress: 0images processed
progress: 1000images processed
progress: 2000images processed
progress: 3000images processed
progress: 4000images processed
progress: 5000images processed
progress: 6000images processed
progress: 7000images processed
progress: 8000images processed
progress: 9000images processed
progress: 10000images processed
progress: 11000images processed
progress: 12000images processed
progress: 13000images processed
progress: 14000images processed
progress: 15000images processed
progress: 16000images processed
progress: 17000images processed
progress: 18000images processed
progress: 19000images processed
progress: 20000images processed
progress: 21000images processed
progress: 22000images processed
progress: 23000images processed
progress: 24000images processed
progress: 25000images processed
progress: 26000images processed
progress: 27000images processed
progress: 28000images processed
progress: 29000images processed
progress: 30000images processed
progress: 31000images

In [68]:
EPOCHS = 20
BATCH_SIZE = 128
MAX_WORDS = 10000
READ_IMAGES = 90000
LAYER_SIZE = 256
EMBEDDING_WIDTH = 128
OOV_WORD = 'UNK'
PAD_INDEX = 0
OOV_INDEX = 1
START_INDEX = MAX_WORDS - 2
STOP_INDEX = MAX_WORDS - 1
MAX_LENGTH = 60
TRAINING_FILE_DIR = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Image Caption/tf_data/feature_vectors/'
TEST_FILE_DIR = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Image Caption/data/test_images/'
TEST_IMAGES = ['boat.jpg',
               'cat.jpg',
               'table.jpg',
               'bird.jpg']



In [69]:
# funcion para leer los archivos de entrenamiento
def read_training_file(file_name, max_len):
    pickle_file = gzip.open(file_name, 'rb')
    image_dict = pickle.load(pickle_file)
    pickle_file.close()
    image_paths = []
    dest_word_sequences = []
    for i, key in enumerate(image_dict):
        if i == READ_IMAGES:
            break
        image_item = image_dict[key]
        image_paths.append(image_item[0])
        caption = image_item[1]
        word_sequence = text_to_word_sequence(caption)
        dest_word_sequence = word_sequence[0:max_len]
        dest_word_sequences.append(dest_word_sequence)
    return image_paths, dest_word_sequences



In [70]:
from numpy.lib.function_base import append

#funciones para tokenizar y destokenizar las series

def tokenize(sequences):
    tokenizer = Tokenizer(num_words=MAX_WORDS-2,
                          oov_token=OOV_WORD)
    tokenizer.fit_on_texts(sequences)
    token_sequences = tokenizer.texts_to_sequences(sequences)
    return tokenizer, token_sequences

def tokens_to_words(tokenizer, seq):
    word_seq = []
    for index in seq:
        if index == PAD_INDEX:
            word_seq.append('PAD')
        elif index == OOV_INDEX:
            word_seq.append(OOV_WORD)
        elif index == START_INDEX:
            word_seq.append('START')
        elif index == STOP_INDEX:
            word_seq.append('STOP')
        else:
            word_seq.append(tokenizer.sequences_to_texts(
                [[index]])[0])
    print(word_seq)


In [71]:
#leer los archivos
image_paths, dest_seq = read_training_file(TRAINING_FILE_DIR + 'caption_file.pickle.gz', MAX_LENGTH)
dest_tokenizer, dest_token_seq = tokenize(dest_seq)

In [72]:
dest_token_seq

[[2, 384, 70, 629, 76, 547, 8, 286],
 [2, 10, 367, 1283, 6, 2, 62, 79, 6, 968],
 [2, 181, 43, 1232, 1562, 26, 2, 10, 9, 13, 6],
 [5, 62, 9, 237, 3, 2724, 4, 5, 722],
 [2, 172, 8, 20, 22, 587, 6, 5, 62],
 [2, 40, 8, 17, 319, 3, 2, 10, 6, 2, 279, 84, 275, 50, 94, 2, 310],
 [5, 682, 89, 4, 5, 122, 9, 44, 2, 76, 23],
 [2, 27, 67, 8, 34, 18, 4, 5, 25],
 [2, 356, 66, 6, 2, 62, 7, 2, 271, 8, 444],
 [15, 18, 6, 2, 57, 119, 126, 88, 19, 16, 1927],
 [2, 31, 6, 17, 9, 13, 6, 2, 62],
 [2, 31, 9, 257, 2, 2470, 7, 2, 443, 8, 388],
 [2, 62, 7, 2, 23, 8, 34, 286],
 [2, 62, 70, 307, 444, 2, 1411, 109, 8, 239],
 [2, 1284, 367, 57, 146, 3, 2, 62, 44, 2, 120],
 [894, 301, 106, 554, 54, 12, 19, 509, 3084],
 [2, 29, 3, 73, 19, 2, 23, 367, 57, 160],
 [2, 10, 257, 36, 212, 4, 21, 3, 2, 57, 518],
 [1435, 22, 367, 57, 19, 2, 384, 145, 3085, 2277],
 [494, 46, 23, 334, 97, 2, 3988, 447, 7, 196],
 [2, 147, 832, 1140, 147, 578, 1928, 62],
 [15, 18, 72, 2, 153, 227, 1829, 131],
 [2, 62, 6, 2, 384, 7, 57, 4, 5, 167],

In [28]:
# Clase para crear batches de entrenamiento en el momento.
class ImageCaptionSequence(Sequence):
    def __init__(self, image_paths, dest_input_data,
                 dest_target_data, batch_size):
        self.image_paths = image_paths
        self.dest_input_data = dest_input_data
        self.dest_target_data = dest_target_data
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.dest_input_data) /
            float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x0 = self.image_paths[
            idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x1 = self.dest_input_data[
            idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.dest_target_data[
            idx * self.batch_size:(idx + 1) * self.batch_size]
        image_features = []
        for image_id in batch_x0:
            file_name = TRAINING_FILE_DIR + image_id + '.pickle.gzip'
            pickle_file = gzip.open(file_name, 'rb')
            feature_vector = pickle.load(pickle_file)
            pickle_file.close()
            image_features.append(feature_vector)
        return [np.array(image_features),
                np.array(batch_x1)], np.array(batch_y)


In [29]:
# Preparar la data de entrenamiento.
dest_target_token_seq = [x + [STOP_INDEX] for x in dest_token_seq]
dest_input_token_seq = [[START_INDEX] + x for x in
                        dest_target_token_seq]
dest_input_data = pad_sequences(dest_input_token_seq,
                                padding='post')
dest_target_data = pad_sequences(
    dest_target_token_seq, padding='post',
    maxlen=len(dest_input_data[0]))
image_sequence = ImageCaptionSequence(
    image_paths, dest_input_data, dest_target_data, BATCH_SIZE)

In [30]:
image_sequence.image_paths

['COCO_train2014_000000057870.jpg',
 'COCO_train2014_000000384029.jpg',
 'COCO_train2014_000000222016.jpg',
 'COCO_train2014_000000520950.jpg',
 'COCO_train2014_000000069675.jpg',
 'COCO_train2014_000000547471.jpg',
 'COCO_train2014_000000122688.jpg',
 'COCO_train2014_000000392136.jpg',
 'COCO_train2014_000000398494.jpg',
 'COCO_train2014_000000090570.jpg',
 'COCO_train2014_000000504616.jpg',
 'COCO_train2014_000000161919.jpg',
 'COCO_train2014_000000457732.jpg',
 'COCO_train2014_000000044404.jpg',
 'COCO_train2014_000000004428.jpg',
 'COCO_train2014_000000170558.jpg',
 'COCO_train2014_000000405613.jpg',
 'COCO_train2014_000000283524.jpg',
 'COCO_train2014_000000037015.jpg',
 'COCO_train2014_000000071631.jpg',
 'COCO_train2014_000000491269.jpg',
 'COCO_train2014_000000365363.jpg',
 'COCO_train2014_000000064460.jpg',
 'COCO_train2014_000000581674.jpg',
 'COCO_train2014_000000470072.jpg',
 'COCO_train2014_000000344806.jpg',
 'COCO_train2014_000000084427.jpg',
 'COCO_train2014_00000031723

# ENCODER

In [31]:
#feature Vector
feature_vector_input = Input(shape=(14, 14, 512))

#Encode Layers
enc_mean_layer = GlobalAveragePooling2D()
enc_layer_h = Dense(LAYER_SIZE)
enc_layer_c = Dense(LAYER_SIZE)

#Connect The encoding Layers
enc_mean_layer_output = enc_mean_layer(feature_vector_input)
enc_layer_h_outputs = enc_layer_h(enc_mean_layer_output)
enc_layer_c_outputs = enc_layer_c(enc_mean_layer_output)

#Output state for encoder
enc_layer_outputs = [enc_layer_h_outputs, enc_layer_c_outputs]

#Build the model
enc_model_top = Model(feature_vector_input, enc_layer_outputs)
enc_model_top.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 14, 14, 512) 0                                            
__________________________________________________________________________________________________
global_average_pooling2d_1 (Glo (None, 512)          0           input_10[0][0]                   
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 256)          131328      global_average_pooling2d_1[0][0] 
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 256)          131328      global_average_pooling2d_1[0][0] 
Total params: 262,656
Trainable params: 262,656
Non-trainable params: 0
____________________

#DECODER

In [32]:
#Build Decoder model
dec_feature_vector_input = Input(shape=(14, 14, 512))
dec_embedding_input = Input(shape=(None, ))
dec_layer1_state_input_h = Input(shape=(LAYER_SIZE,))
dec_layer1_state_input_c = Input(shape=(LAYER_SIZE,))

#Create Decoder Layers
dec_reshape_layer = Reshape((196, 512),
                            input_shape=(14, 14, 512,))
dec_attention_layer = Attention()
dec_query_layer = Dense(512)
dec_embedding_layer = Embedding(output_dim=EMBEDDING_WIDTH,
                                input_dim=MAX_WORDS,
                                mask_zero=False)
dec_layer1 = LSTM(LAYER_SIZE, return_state=True,
                  return_sequences=True)
dec_concat_layer = Concatenate()
dec_layer2 = Dense(MAX_WORDS, activation='softmax')

#connect the decoder layers
dec_embedding_layer_outputs = dec_embedding_layer(
    dec_embedding_input)
dec_reshape_layer_outputs = dec_reshape_layer(
    dec_feature_vector_input)
dec_layer1_outputs, dec_layer1_state_h, dec_layer1_state_c = \
    dec_layer1(dec_embedding_layer_outputs, initial_state=[
        dec_layer1_state_input_h, dec_layer1_state_input_c])
dec_query_layer_outputs = dec_query_layer(dec_layer1_outputs)
dec_attention_layer_outputs = dec_attention_layer(
    [dec_query_layer_outputs, dec_reshape_layer_outputs])
dec_layer2_inputs = dec_concat_layer(
    [dec_layer1_outputs, dec_attention_layer_outputs])
dec_layer2_outputs = dec_layer2(dec_layer2_inputs)

#Build the model

dec_model = Model([dec_feature_vector_input,
                   dec_embedding_input,
                   dec_layer1_state_input_h,
                   dec_layer1_state_input_c],
                  [dec_layer2_outputs, dec_layer1_state_h,
                   dec_layer1_state_c])
dec_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    1280000     input_12[0][0]                   
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 256)]        0                                            
____________________________________________________________________________________________

#Encoder - Decoder


In [33]:
#construir el modelo de entrenamiento

train_feature_vector_input = Input(shape=(14, 14, 512))
train_dec_embedding_input = Input(shape=(None, ))
intermediate_state = enc_model_top(train_feature_vector_input)
train_dec_output, _, _ = dec_model([train_feature_vector_input,
                                    train_dec_embedding_input] +
                                    intermediate_state)
training_model = Model([train_feature_vector_input,
                        train_dec_embedding_input],
                        [train_dec_output])
training_model.compile(loss='sparse_categorical_crossentropy',
                       optimizer='adam', metrics =['accuracy'])
training_model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 14, 14, 512) 0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
model_5 (Model)                 [(None, 256), (None, 262656      input_15[0][0]                   
__________________________________________________________________________________________________
model_6 (Model)                 [(None, None, 10000) 9495824     input_15[0][0]                   
                                                                 input_16[0][0]             

# ENCODER FOR INFERENCE

In [34]:
conv_model = VGG19(weights='imagenet')
conv_model_outputs = conv_model.get_layer('block5_conv4').output
intermediate_state = enc_model_top(conv_model_outputs)
inference_enc_model = Model([conv_model.input],
                            intermediate_state
                            + [conv_model_outputs])
inference_enc_model.summary()


Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   

In [38]:

for i in range(EPOCHS): # Train and evaluate model
    print('step: ' , i)
    history = training_model.fit(image_sequence, epochs=1)
    for filename in TEST_IMAGES:
        # Determine dimensions.
        image = load_img(TEST_FILE_DIR + filename)
        width = image.size[0]
        height = image.size[1]

        # Resize so shortest side is 256 pixels.
        if height > width:
            image = load_img(
                TEST_FILE_DIR + filename,
                target_size=(int(height/width*256), 256))
        else:
            image = load_img(
                TEST_FILE_DIR + filename,
                target_size=(256, int(width/height*256)))
        width = image.size[0]
        height = image.size[1]
        image_np = img_to_array(image)

        # Crop to center 224x224 region.
        h_start = int((height-224)/2)
        w_start = int((width-224)/2)
        image_np = image_np[h_start:h_start+224,
                            w_start:w_start+224]

        # Run image through encoder.
        image_np = np.expand_dims(image_np, axis=0)
        x = preprocess_input(image_np)
        dec_layer1_state_h, dec_layer1_state_c, feature_vector = \
            inference_enc_model.predict(x, verbose=0)

        # Predict sentence word for word.
        prev_word_index = START_INDEX
        produced_string = ''
        pred_seq = []
        for j in range(MAX_LENGTH):
            x = np.reshape(np.array(prev_word_index), (1, 1))
            preds, dec_layer1_state_h, dec_layer1_state_c = \
                dec_model.predict(
                    [feature_vector, x, dec_layer1_state_h,
                     dec_layer1_state_c], verbose=0)
            prev_word_index = np.asarray(preds[0][0]).argmax()
            pred_seq.append(prev_word_index)
            if prev_word_index == STOP_INDEX:
                break
        tokens_to_words(dest_tokenizer, pred_seq)
        print('\n\n')

step:  0


TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'

In [62]:
image_sequence.dest_input_data.shape

(82783, 51)

In [66]:
dest_input_data

array([[9998,    2,  384, ...,    0,    0,    0],
       [9998,    2,   10, ...,    0,    0,    0],
       [9998,    2,  181, ...,    0,    0,    0],
       ...,
       [9998,    2,   10, ...,    0,    0,    0],
       [9998,    2,   29, ...,    0,    0,    0],
       [9998,    5, 3085, ...,    0,    0,    0]])

In [43]:
image_sequence.batch_size

128

In [44]:
history = training_model.fit(image_sequence.dest_input_data,
                             image_sequence.dest_target_data,
                             batch_size=image_sequence.batch_size, 
                             epochs=1)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[9998,    2,  384, ...,    0,    0,    0],
       [9998,    2,   10, ...,    0,    0,    0],
       [9998,    2,  181, ...,    0,    0,    0],
       ...,
       [9998,    2,   10, ...,    0, ...