# PROJECT : IMAGE CAPTIONING 

## Importing the libreries

In [1]:
import sys
sys.path.append("..")
import grading
import download_utils

import tensorflow as tf
import keras
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
L = keras.layers
K = keras.backend
import utils
import time
import zipfile
import json
from collections import defaultdict
import re
import random
from random import choice
import grading_utils
import os
from keras_utils import reset_tf_session
import tqdm_utils

Using TensorFlow backend.


## Extracting the image features

In [2]:
IMG_SIZE = 299

# we take the last hidden layer of IncetionV3 as an image embedding

def get_cnn_encoder():
    
    K.set_learning_phase(False)
    model = keras.applications.InceptionV3(include_top = False)
    preprocess_for_model = keras.applications.inception_v3.preprocess_input
    
    model = keras.models.Model(model.inputs , keras.layers.GlobalAveragePooling2D()(model.output))
    
    return model , preprocess_for_model

In [4]:
#load pre-trained model
tf.compat.v1.Session()
encoder, preprocess_for_model = get_cnn_encoder()

#extract train features
train_img_embeds, train_img_fns = utils.apply_model("E:/Datasets/COCO/train2014.zip", encoder, preprocess_for_model, input_shape=(IMG_SIZE, IMG_SIZE))

utils.save_pickle(train_img_embeds, "train_img_embeds.pickle")
utils.save_pickle(train_img_fns, "train_img_fns.pickle")

#extract validation features
val_img_embeds, val_img_fns = utils.apply_model("E:/Datasets/COCO/val2014.zip", encoder, preprocess_for_model, input_shape=(IMG_SIZE, IMG_SIZE))

utils.save_pickle(val_img_embeds, "val_img_embeds.pickle")
utils.save_pickle(val_img_fns, "val_img_fns.pickle")

#sample images for learners
def sample_zip(fn_in, fn_out, rate=0.01, seed=42):
    np.random.seed(seed)
    with zipfile.ZipFile(fn_in) as fin, zipfile.ZipFile(fn_out, "w") as fout:
        sampled = filter(lambda _: np.random.rand() < rate, fin.filelist)
        for zInfo in sampled:
            fout.writestr(zInfo, fin.read(zInfo))
            
sample_zip("E:/Datasets/COCO/train2014.zip", "train2014_sample.zip")
sample_zip("E:/Datasets/COCO/val2014.zip", "val2014_sample.zip")

HBox(children=(FloatProgress(value=0.0, max=82784.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40505.0), HTML(value='')))




In [5]:
# load prepared embeddings
train_img_embeds = utils.read_pickle("train_img_embeds.pickle")
train_img_fns = utils.read_pickle("train_img_fns.pickle")
val_img_embeds = utils.read_pickle("val_img_embeds.pickle")
val_img_fns = utils.read_pickle("val_img_fns.pickle")
# check shapes
print(train_img_embeds.shape, len(train_img_fns))
print(val_img_embeds.shape, len(val_img_fns))

(82783, 2048) 82783
(40504, 2048) 40504


In [6]:
# check prepared samples of images
list(filter(lambda x: x.endswith("_sample.zip") , os.listdir("")))

FileNotFoundError: [WinError 3] The system cannot find the path specified: ''

## Extracting captions for image

In [None]:
#extract caption from zip file

def get_captions_for_fns(fns , zip_fn , zip_json_path):
    
    zf = zipfile.ZipFile(zip_fn)
    j = json.loads(zf.read(zip_json_path).decode('utf8'))
    
    id_to_fn = {img["id"]: img["file_name"] for img in j["images"]}
    fn_to_caps = dafaultdict(list)
    
    for cap in j['annotations']:
        fn_to_caps[id_to_fn[cap['image_id']]].append(cap['caption'])
    
    fn_to_caps = dict(fn_to_caps)
    return list(map(lambda x: fn_to_caps[x] , fns))

In [None]:
train_captions = get_captions_for_fns(train_img_fns, "captions_train-val2014.zip", 
                                      "annotations/captions_train2014.json")

val_captions = get_captions_for_free(val_img_fns , "captions_val_2014.zip" , "annotations/captions_val2014.json")

In [None]:
# check shape
print(len(train_img_fns), len(train_captions))
print(len(val_img_fns , len(val_captions)))

In [None]:
# Check the tranining example


def show_trainig_example(train_img_fns, train_captions, example_idx=0):
    """
    You can change example_idx and see different images
    """
    zf = zipfile.ZipFile("train2014_sample.zip")
    captions_by_file = dict(zip(train_img_fns, train_captions))
    all_files = set(train_img_fns)
    found_files = list(filter(lambda x: x.filename.rsplit("/")[-1] in all_files, zf.filelist))
    example = found_files[example_idx]
    img = utils.decode_image_from_buf(zf.read(example))
    plt.imshow(utils.image_center_crop(img))
    plt.title("\n".join(captions_by_file[example.filename.rsplit("/")[-1]]))
    plt.show()
    
show_trainig_example(train_img_fns, train_captions, example_idx=142)

## Prepare Caption for training

In [None]:
train_caption[:5]

In [None]:
#special tokens
PAD = "#PAD#"
UNK = "#UNK#"
START = "#START#"
END = "#END#"


# split sentence into tokens (split into lowercased words)
def split_sentence(sentence):
    return list(filter(lambda x: len(x) > 0, re.split('\W+', sentence.lower())))


def generate_vocabulary():
    
    from collections import Counter
    
    table = Counter()
    for tc in train_captions:
        for sentence in tc:
            for token in split_sentence(sentence):
                table[token] += 1
    vocab = [token for token, c in table.items() if c >= 5]
    vocab += [PAD, UNK, START, END]

    
    return {token: index for index, token in enumerate(sorted(vocab))}



def caption_tokens_to_indices(captions, vocab):
    """
    `captions` argument is an array of arrays:
    [
        [
            "image1 caption1",
            "image1 caption2",
            ...
        ],
        [
            "image2 caption1",
            "image2 caption2",
            ...
        ],
        ...
    ]
    Use `split_sentence` function to split sentence into tokens.
    Replace all tokens with vocabulary indices, use UNK for unknown words (out of vocabulary).
    Add START and END tokens to start and end of each sentence respectively.
    For the example above you should produce the following:
    [
        [
            [vocab[START], vocab["image1"], vocab["caption1"], vocab[END]],
            [vocab[START], vocab["image1"], vocab["caption2"], vocab[END]],
            ...
        ],
        ...
    ]
    """
    ### YOUR CODE HERE ###
    res = []
    for caption in captions:
        currCap = []
        for sentence in caption:
            currSen = [vocab[START]]
            for token in split_sentence(sentence):
                currSen.append(vocab.get(token, vocab[UNK]))
            currSen.append(vocab[END])
            currCap.append(currSen)
        res.append(currCap)
                
    return res    

In [None]:
# prepare vocabulary
vocab = generate_vocabulary(train_captions)
vocab_inverse = {idx: w for w, idx in vocab.items()}
print(len(vocab))

In [None]:
# replace tokens with indices
train_captions_indexed = caption_tokens_to_indices(train_captions, vocab)
val_captions_indexed = caption_tokens_to_indices(val_captions, vocab)

## Define Architecture