In [None]:
from IPython.display import clear_output
!pip install pycocotools
!pip install wget
!pip install gensim

clear_output()

# Downloading the COCO dataset

In [None]:
import wget
import zipfile
import os
from PIL import Image
import numpy as np
from matplotlib import pyplot as plt

# Folder that receives the extracted COCO annotations and images.
coco_root = '/content/coco data'
os.makedirs(coco_root, exist_ok=True)


def _download_and_extract(url, zip_path, marker_name):
    """Download one zip archive, unpack it into ``coco_root`` and delete the zip.

    A marker file is written inside ``coco_root`` once extraction has finished,
    so re-running the cell does not download the (multi-gigabyte) archives
    again. The marker is only created after a complete extraction, which means
    an interrupted run is simply repeated next time.

    Args:
        url: address of the zip archive.
        zip_path: local path the archive is downloaded to.
        marker_name: file name (inside ``coco_root``) of the completion marker.
    """
    marker_path = os.path.join(coco_root, marker_name)
    if os.path.exists(marker_path):
        return

    # A zip left behind by a failed run is removed first. As far as the wget
    # package is understood, it does not overwrite an existing target but saves
    # under a different name, so the stale file would be the one extracted.
    if os.path.exists(zip_path):
        os.remove(zip_path)

    wget.download(url, zip_path)
    try:
        # extractall creates the 'annotations' / 'train2017' / 'val2017'
        # sub-folders stored in the archive; no mkdir is needed beforehand.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(coco_root)
    finally:
        # the archive is not needed after extraction (or if it is unreadable)
        os.remove(zip_path)

    with open(marker_path, 'w'):
        pass


annotation_url = 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
annotation_file_path = '/content/coco data/annotations_trainval2017.zip'
_download_and_extract(annotation_url, annotation_file_path, '.annotations_trainval2017.done')

image_url = 'http://images.cocodataset.org/zips/train2017.zip'
image_zip_path = '/content/coco data/train2017.zip'
_download_and_extract(image_url, image_zip_path, '.train2017.done')

image_url = 'http://images.cocodataset.org/zips/val2017.zip'
image_zip_path = '/content/coco data/val2017.zip'
_download_and_extract(image_url, image_zip_path, '.val2017.done')
"""
"""

'\n'

In [None]:
"""
os.remove('/content/coco data/annotations_trainval2017.zip')
os.remove('/content/coco data/train2017.zip')
os.remove('/content/coco data/val2017.zip')
"""

# Loading annotations

In [None]:
import json

def _read_caption_annotations(json_path):
    """Return the list stored under 'annotations' in a COCO captions JSON file."""
    with open(json_path, 'r') as json_file:
        return json.load(json_file)['annotations']


# train_dict / val_dict are lists of records; each record is a dict holding
# the caption text together with the id of the image it describes.
train_dict = _read_caption_annotations('/content/coco data/annotations/captions_train2017.json')
val_dict = _read_caption_annotations('/content/coco data/annotations/captions_val2017.json')

train_dict[0], val_dict[0], len(train_dict), len(val_dict)

({'image_id': 203564,
  'id': 37,
  'caption': 'A bicycle replica with a clock as the front wheel.'},
 {'image_id': 179765,
  'id': 38,
  'caption': 'A black Honda motorcycle parked in front of a garage.'},
 591753,
 25014)

## Tokenizing Captions

In [None]:
import nltk
tokenizer = nltk.tokenize.WordPunctTokenizer()

# Lower-case and tokenize every caption in place: the 'caption' string of each
# record is replaced by its list of tokens.
for split_records in (train_dict, val_dict):
    for record in split_records:
        # After a first run the caption is already a token list; calling
        # .lower() on it would raise, so only raw strings are processed.
        # This keeps the cell safe to re-run.
        if isinstance(record['caption'], str):
            record['caption'] = tokenizer.tokenize(record['caption'].lower())

## Loading fastText vectors and replacing unknown words

In [None]:
from gensim.models import Word2Vec
import gensim.downloader as api
# Alternative pretrained vectors, currently disabled:
#w2v_model = api.load('word2vec-google-news-300')
# Pretrained word vectors fetched through gensim's downloader. The rest of the
# notebook uses `w2v_model` both for vocabulary checks (`token in w2v_model`)
# and for vector look-ups (`w2v_model[token]`).
w2v_model = api.load("fasttext-wiki-news-subwords-300")



In [None]:
st_unknown = set()   # words that are missing from the embedding vocabulary
all_tokens = dict()  # token -> number of occurrences over train + val


def _prepare_records(records):
    """Normalise annotation records in place and update the token statistics.

    For every record:
      * 'image_id' becomes the image file name: the id zero-padded to 12
        digits plus '.jpg' (e.g. 203564 -> '000000203564.jpg');
      * a trailing '.' token is removed from the caption;
      * tokens unknown to ``w2v_model`` are replaced by 'UNK' and remembered
        in ``st_unknown``;
      * every resulting token is counted in ``all_tokens``.

    Args:
        records: list of annotation dicts with 'image_id' and a tokenized
            'caption' (list of strings).
    """
    for record in records:
        image_id = str(record['image_id'])
        # Already-converted names are left alone so that re-running the cell
        # does not produce '...jpg.jpg'.
        if not image_id.endswith('.jpg'):
            record['image_id'] = image_id.zfill(12) + '.jpg'

        caption = record['caption']
        # `caption and` guards against an empty token list (IndexError otherwise)
        if caption and caption[-1] == '.':
            caption.pop()

        for pos, token in enumerate(caption):
            if token not in w2v_model:
                st_unknown.add(token)
                token = caption[pos] = 'UNK'
            all_tokens[token] = all_tokens.get(token, 0) + 1


_prepare_records(train_dict)
_prepare_records(val_dict)

print(len(all_tokens))

# Drop tokens seen at most 4 times from the vocabulary. The captions keep these
# words for now; the index-conversion cell maps anything outside the vocabulary
# to UNK. 'UNK' itself is never dropped because later cells look it up.
rare_word_st = {token for token, count in all_tokens.items()
                if count <= 4 and token != 'UNK'}
for token in rare_word_st:
    all_tokens.pop(token)

len(all_tokens), len(st_unknown)

22827


(10266, 4582)

# Adding Special Tokens and transforming captions into numeric representation.



In [None]:
# Sanity check: the index-conversion cell looks up word_to_ind['UNK'], so the
# UNK token has to be part of the vocabulary.
'UNK' in all_tokens

True

In [None]:
# Final vocabulary as an array: the kept tokens followed by the three special
# tokens, so START / END / PAD occupy the last three positions.
all_tokens = np.array([*all_tokens, 'START', 'END', 'PAD'])

# word_to_ind gives the position of every vocabulary entry inside all_tokens
word_to_ind = {token: position for position, token in enumerate(all_tokens)}

In [None]:
# Show the vocabulary indices assigned to the three special tokens.
word_to_ind['START'], word_to_ind['END'], word_to_ind['PAD']

(10266, 10267, 10268)

In [None]:
# Replace every caption token by its vocabulary index (in place). Tokens that
# are not part of the vocabulary (e.g. the pruned rare words) become UNK.
unk_index = word_to_ind['UNK']
for split_records in (train_dict, val_dict):
    for record in split_records:
        caption = record['caption']
        for pos, token in enumerate(caption):
            # Only string tokens are converted. On a re-run the entries are
            # already integers, which are not keys of word_to_ind and would
            # otherwise all be silently rewritten to the UNK index.
            if isinstance(token, str):
                caption[pos] = word_to_ind.get(token, unk_index)

# Defining a Generator

In [None]:
# Image folder names indexed by the 0/1 split flag computed in make_batch
# (0 -> train images, 1 -> validation images), which avoids an `if` there.
t_v_name = ['train2017', 'val2017']

# Vocabulary indices of the special tokens (plain indices, despite the _EMB suffix).
SOS_EMB, EOS_EMB, PAD_EMB = (word_to_ind[name] for name in ('START', 'END', 'PAD'))


def make_batch(batch_indxs, data_dict):
    """Build one batch of images and padded caption index sequences.

    Args:
        batch_indxs: positions (into ``data_dict``) of the records to batch.
        data_dict: list of annotation records; each has an 'image_id' file
            name and a 'caption' list of token indices.

    Returns:
        A tuple ``(imgs, captions)``:
          * ``imgs``: array of the images, each resized to 224x224 and
            converted to RGB;
          * ``captions``: int32 array with one row per record, laid out as
            START, caption tokens, END, then PAD up to the longest row.

    Raises:
        ValueError: if ``batch_indxs`` is empty.
    """
    # Split flag: fewer than 1e5 records is treated as the validation set and
    # selects 'val2017', anything larger selects 'train2017'.
    val = int(len(data_dict) < 1e5)
    img_dir = '/content/coco data/' + t_v_name[val] + "/"

    # preparing captions: every row gets START/END and is padded to the
    # length of the longest caption in the batch (+2 for START and END)
    captions = [data_dict[i]['caption'] for i in batch_indxs]
    mx_size = max(len(caption) for caption in captions) + 2
    captions = [
        [SOS_EMB] + caption + [EOS_EMB] + [PAD_EMB] * (mx_size - len(caption) - 2)
        for caption in captions
    ]

    # loading images
    # there are 500k+ images so let's not use ImageDataGenerator this time
    imgs = []
    for i in batch_indxs:
        # the context manager releases the file handle as soon as the pixels
        # have been copied into the array
        with Image.open(img_dir + data_dict[i]['image_id']) as img:
            imgs.append(np.array(img.resize((224, 224)).convert('RGB')))

    return np.array(imgs), np.array(captions, dtype='int32')


def data_gen(data_dict, batch_size=32, shuffle=True, cycle=False, **kwargs):
    """Yield training minibatches built from ``data_dict``.

    Args:
        data_dict: list of annotation records, as accepted by ``make_batch``.
        batch_size: number of records per batch (the last batch of a pass may
            be smaller).
        shuffle: if True a fresh random permutation is drawn for every pass,
            otherwise records are visited in order.
        cycle: if True the generator starts another pass after each one and
            never finishes; if False it stops after a single pass.
        **kwargs: accepted for call compatibility and ignored.

    Yields:
        ``([imgs, captions[:, :-1]], targets)``: the model inputs are the
        images and each caption without its last position; ``targets`` holds
        the ``w2v_model`` vectors of each caption without its first position,
        i.e. the word that follows every input position.
    """
    n_records = len(data_dict)
    while True:
        if shuffle:
            indices = np.random.permutation(n_records)
        else:
            indices = np.arange(n_records)

        for start in range(0, n_records, batch_size):
            imgs, captions = make_batch(indices[start : start + batch_size], data_dict)
            # look up the word vector of every target token, row by row
            targets = np.array([w2v_model[all_tokens[row]] for row in captions[:, 1:]])
            yield [imgs, captions[:, :-1]], targets

        if not cycle:
            break

In [None]:
# testing if make_batch works
# x is the (images, captions) tuple returned for three validation records;
# the print shows the shapes of both arrays.
x = make_batch([1,2,3], val_dict)
print(x[0].shape, x[1].shape)
# The string below is a disabled preview that would plot every image of the batch.
"""
for i in x[0]:
    plt.imshow(i)
    plt.show()
"""

(3, 224, 224, 3) (3, 11)


'\nfor i in x[0]:\n    plt.imshow(i)\n    plt.show()\n'

In [None]:
# Padded caption index matrix of the test batch (START ... END, then PAD).
x[1]

array([[10266,     0,  4184,   150,    21,    35,     0,   165,    92,
        10267, 10268],
       [10266,    57,   174,  1494,     3,    83,   753,  2172,   121,
          287, 10267],
       [10266,     0,    41,  1920,    66,    35,     0,   997,   175,
        10267, 10268]], dtype=int32)

# Model Architecture

In [None]:
import tensorflow as tf
from tensorflow.keras.applications.efficientnet_v2 import EfficientNetV2B0
import tensorflow.keras.layers as L

eff_net_model = EfficientNetV2B0(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
    )

In [None]:
# Shape check of the pretrained embedding matrix: one word vector per
# vocabulary entry, in the order of all_tokens.
np.array([w2v_model[i] for i in all_tokens]).shape

(10269, 300)

In [None]:
def att_block(inp, neurons=64, outp_neurons=64, causal=False):
    """Attention block with a residual connection.

    Two separate ReLU Dense projections of ``inp`` are used as query and value
    of an ``L.Attention`` layer. The attention output is added back to ``inp``,
    batch-normalised and finally projected to ``outp_neurons`` units.

    Args:
        inp: input tensor. Its last dimension has to be compatible with
            ``neurons`` because the attention output (width ``neurons``) is
            added to ``inp``; every call in this notebook passes the incoming
            width as ``neurons``.
        neurons: width of the query and value projections.
        outp_neurons: width of the final Dense projection.
        causal: forwarded to the attention call as ``use_causal_mask``.

    Returns:
        Output tensor of the final Dense layer.
    """
    query = L.Dense(neurons, activation='relu')(inp)
    value = L.Dense(neurons, activation='relu')(inp)

    attended = L.Attention()([query, value], use_causal_mask=causal)
    attended = L.Add()([attended, inp])  # residual connection
    attended = L.BatchNormalization()(attended)

    return L.Dense(outp_neurons, 'relu')(attended)

# Frozen embedding layer initialised with the pretrained word vectors
# (one row per vocabulary entry, in the order of all_tokens).
embedding_matrix = np.array([w2v_model[token] for token in all_tokens])
emb_l = L.Embedding(input_dim=len(all_tokens),
                    output_dim=300,
                    weights=[embedding_matrix],
                    trainable=False
                    )


inp1 = tf.keras.Input(shape=(224, 224, 3))

# image encoder
x = eff_net_model(inp1)
x = L.Dense(640, activation='relu')(x)
x1 = L.GlobalAveragePooling2D(name='eff_net_avgpool')(x)
x2 = L.GlobalMaxPooling2D(name='eff_net_maxpool')(x)

x = L.Concatenate()([x1, x2])
x = L.Dense(400, 'relu')(x)

# The pooled image vector is rank 2 (batch, 400). L.Attention is documented for
# (batch, steps, dim) inputs; fed with rank-2 tensors its score matrix is
# computed between the samples of a batch. A steps axis of length 1 keeps every
# image separate.
# NOTE(review): with a single image token the encoder self-attention is
# trivial; using the spatial feature map as tokens would be a larger redesign.
x = L.Reshape((1, 400))(x)

x = att_block(x, 400, 350)
x = att_block(x, 350, 300)

outp_enc = L.Dense(150, 'relu')(x)       # (batch, 1, 150)

# text decoder
inp2 = tf.keras.Input(shape=(None,))
# NOTE(review): Masking is applied to the rank-2 token-id tensor; confirm it
# masks PAD per time step as intended (it is documented for
# (batch, steps, features) inputs).
x = L.Masking(word_to_ind['PAD'])(inp2)  # masking the PAD
x = emb_l(x)                             # embedding inputs

x = L.Dense(220, 'relu')(x)
# Causal like the later decoder blocks: the model is trained to predict the
# next word, so no position may attend to the words that follow it.
x = att_block(x, 220, 150, causal=True)

# Cross-attention from the caption positions to the image encoding. No causal
# mask here (the keys are image features, not caption positions). The result is
# added to the text stream, as in att_block; otherwise the text features would
# be replaced by the image vector at every position.
cross = L.Attention()([x, outp_enc])
x = L.Add()([x, cross])
x = L.BatchNormalization()(x)
x = L.Dense(150, 'relu')(x)

x = att_block(x, 150, 220, causal=True)
x = att_block(x, 220, 300, causal=True)

# Linear output: the regression targets are word vectors, whose components can
# be negative, which a ReLU output could never produce.
outp_dec = L.Dense(300)(x)


model = tf.keras.Model([inp1, inp2], outp_dec)
print('model constructed!!!')

model.compile('adam', 'mean_squared_error', metrics=['cosine_similarity', 'mean_absolute_error'])
# model.summary()

(None, 400)
(None, 350)
(None, None, 220)
(None, None, 150)
(None, None, 220)
model constructed!!!


In [None]:
# model visualization
"""
tf.keras.utils.plot_model(model,
                          show_shapes=True,
                          show_layer_names=True,
                          rankdir='TB', #  'LR' for left to right
                          show_layer_activations=True,
                          show_trainable=True)
"""

"\ntf.keras.utils.plot_model(model,\n                          show_shapes=True,\n                          show_layer_names=True,\n                          rankdir='TB', #  'LR' for left to right\n                          show_layer_activations=True,\n                          show_trainable=True)\n"

In [None]:
# Clear the default graph
# NOTE(review): in notebook order this cell runs after `model` has been built
# and before it is trained; confirm that resetting the TF/Keras global state at
# this point is intended and not a leftover that belongs before model
# construction.
tf.compat.v1.reset_default_graph()

# Optionally, release GPU memory
tf.keras.backend.clear_session()

In [None]:
# NOTE(review): eager execution of tf.functions is normally a debugging aid and
# slows training down; kept as in the original, confirm it is still needed.
tf.config.run_functions_eagerly(True)

batch_size = 32
epochs = 1

# cycle=True on both generators: Keras pulls `steps` batches per epoch, so the
# generators must not run dry when `epochs` is raised above 1.
train_gen = data_gen(train_dict, batch_size=batch_size, cycle=True)
val_gen = data_gen(val_dict, batch_size, cycle=True)

# Number of batches in one full pass, i.e. ceil(len / batch_size), derived from
# the data instead of hard-coded dataset sizes.
steps_per_epoch = (len(train_dict) - 1) // batch_size + 1
validation_steps = (len(val_dict) - 1) // batch_size + 1

# The generators are consumed in the main process: Keras warns that a plain
# Python generator combined with use_multiprocessing=True and several workers
# may duplicate data (a keras.utils.Sequence would be required for that).
model.fit(train_gen,
          epochs=epochs,
          steps_per_epoch=steps_per_epoch,

          validation_data=val_gen,
          validation_steps=validation_steps,
          )






[1;30;43mStreaming output truncated to the last 5000 lines.[0m

In [None]:
"""
import time, psutil
uptime = time.time() - psutil.boot_time()
remain = 12*60*60 - uptime
"""

In [None]:
# Save the trained model as an .h5 file on Google Drive.
# NOTE(review): no visible cell mounts Google Drive; confirm that
# /content/drive is mounted (and the 'ADV ML' folder exists) before this runs.
model.save('/content/drive/MyDrive/ADV ML/IMG_Caption_v1.h5')