In [1]:
import tensorflow as tf
tf.enable_eager_execution()

In [2]:
import numpy as np
import os
import time
import json
import pickle
from glob import glob
from PIL import Image
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [3]:
import keras as K

Using TensorFlow backend.


In [4]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
session = tf.Session(config=config)

In [5]:
annotation_folder = '../Dataset/MSCOCO/annotations/'
image_folder = '../Dataset/MSCOCO/train2014/'

In [6]:
annotation_file = annotation_folder + 'captions_train2014.json'

# Read the json file
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

In [120]:
NUM_SAMPLES = 300

# Store captions and image names
all_captions = []
all_img_paths = []

for annot in annotations['annotations']:
    caption = annot['caption']
    image_id = annot['image_id']
    img_path = image_folder + 'COCO_train2014_' + '%012d.jpg' % (image_id)

    all_img_paths.append(img_path)
    all_captions.append(caption)

# Shuffle captions and image_names together
all_captions, all_img_paths = shuffle(all_captions, all_img_paths, random_state=1)
train_captions = all_captions[:NUM_SAMPLES]
train_img_paths = all_img_paths[:NUM_SAMPLES]

In [121]:
print("len train_captions :", len(train_img_paths))
print("len all_captions :", len(all_img_paths))

len train_captions : 300
len all_captions : 414113


## Encoder

In [9]:
MODEL_TYPE = "xception"


def get_encoder(model_type=MODEL_TYPE):

    if model_type == "xception":
        cnn_preprocessor = tf.keras.applications.xception
        cnn_model = tf.keras.applications.Xception(include_top=False, weights='imagenet')

    elif model_type == "inception_v3":
        cnn_preprocessor = tf.keras.applications.inception_v3
        cnn_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')

    input_layer = cnn_model.input
    output_layer = cnn_model.layers[-1].output # use last hidden layer as output
    
    encoder = tf.keras.Model(input_layer, output_layer)
    encoder_preprocessor = cnn_preprocessor
    
    return encoder, encoder_preprocessor


encoder, encoder_preprocessor = get_encoder(MODEL_TYPE)

In [10]:
IMAGE_SIZE = (299, 299)


def load_image(image_path):

    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    image = encoder_preprocessor.preprocess_input(image)
    
    return image, image_path

## Prepare Image dataset

In [11]:
BATCH_SIZE = 8

# Get unique images
unique_train_img_paths = sorted(set(train_img_paths))

# Prepare dataset
image_dataset = tf.data.Dataset.from_tensor_slices(unique_train_img_paths)
image_dataset = image_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE) # use max num of CPU
image_dataset = image_dataset.batch(BATCH_SIZE)

In [190]:
estimated_batch_count = NUM_SAMPLES / BATCH_SIZE + 1
print("estimated_batch_count", estimated_batch_count)

estimated_batch_count 38.5


In [12]:
# Preprocessed image (batch)
for batch_imgs, batch_img_paths in tqdm(image_dataset):
    
    # get context vector of batch images
    batch_features = encoder(batch_imgs)
    
    # flatten 2D cnn result into 1D for RNN decoder input
    # (batch_size, 10, 10, 2048)  => (batch_size, 100, 2048)
    batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))
    
    # Cache preprocessed image
    for image_feature, image_path in zip(batch_features, batch_img_paths):
        image_path = image_path.numpy().decode("utf-8")
        np.save(image_path, image_feature.numpy())

0it [00:00, ?it/s]

estimated_batch_count 38.5


38it [00:08,  4.31it/s]


## Prepare Tokenizer

In [141]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel
from keras.preprocessing.text import Tokenizer

In [142]:
TOKENIZER = "BERT"
VOCAB_SIZE = 5000  # Choose the top-n words from the vocabulary

In [143]:
class BertTokenizerWrapper(BertTokenizer):

    def texts_to_sequences(self, texts):
        """
        convert batch texts into indexed version
        eg: ['an apple', 'two person']
        output: [[1037,17260], [2083, 2711]] 
        """
        
        tokenized_texts = [self.tokenize(x) for x in texts]
        token_ids = [self.convert_tokens_to_ids(x) for x in tokenized_texts]
        
        return token_ids

In [144]:
if TOKENIZER == "BERT" :

    # Load pre-trained BERT tokenizer (vocabulary)
    tokenizer = BertTokenizerWrapper.from_pretrained('bert-base-uncased')
    
else : 
    
    # use default keras tokenizer
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="[UNK]")
    tokenizer.fit_on_texts(train_captions)    
    tokenizer.word_index['[PAD]'] = 0
    tokenizer.index_word[0] = '[PAD]'

In [145]:
train_captions = tokenizer.texts_to_sequences(train_captions)

## Pad sequence

In [146]:
from keras.preprocessing.sequence import pad_sequences

# If you do not provide a max_length value, pad_sequences calculates it automatically
MAX_LENGTH = None  # use <int> or None

In [147]:
train_captions = pad_sequences(train_captions, maxlen=MAX_LENGTH, padding='post', truncating="post")

## Create dataset object

In [151]:
# Load the numpy files
def load_image_npy(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8') + '.npy')
    return img_tensor, cap

In [165]:
# Create dataset object
from tensorflow.data import Dataset
dataset = Dataset.from_tensor_slices((train_img_paths, train_captions))

# Use map to load the numpy files in parallel
# wrap function into numpy function
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
          map_func, [item1, item2], [tf.float32, tf.int32]),
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [166]:
# Shuffle and batch
dataset = dataset.shuffle(1000).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# ===========================================
# dataset contains tuple of (image, captions)
# image : (batch_size, 100, 2048)
# caption : (batch_size, max_length)

## Split train eval test

In [186]:
# Split dataset 

TRAIN_SPLIT = 0.7
EVAL_SPLIT = 0.15
TEST_SPLIT = 0.15  # approx

n_batch = int(NUM_SAMPLES / BATCH_SIZE) + 1
n_train = int(n_batch * 0.7)
n_eval = int(n_batch * 0.15)
n_test = n_batch - (n_train + n_eval)

train_dataset = dataset.take(n_train)
eval_dataset = dataset.skip(n_train).take(n_eval)
test_dataset = dataset.skip(n_train + n_eval)

In [189]:
print("train: {} batches, (total : {})".format(n_train, n_train * BATCH_SIZE))
print("eval : {} batches, (total : {})".format(n_eval, n_eval * BATCH_SIZE))
print("test : {} batches, (total : {} (aprx))".format(n_test, n_test * BATCH_SIZE))

train: 26 batches, (total : 208)
eval : 5 batches, (total : 40)
test : 7 batches, (total : 56 (aprx))


# Model

## Encoder

In [None]:



class CNN_Encoder(tf.keras.Model):
    
    # Image features was preprocessed and cached before
    # This encoder passes those features through a Fully connected layer
    # This encoder concist of single Fully-connected layer with RELU activation function
    
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        
        x = self.fc(x)
        x = tf.nn.relu(x)
        
        return x

In [82]:
model = BertModel.from_pretrained('bert-base-uncased')
model.eval();