### Course: ID2223
### Group: Nicolas Essipova, Peter Lakatos, Marios Chatiras

# Task 1: Inception v1 Implementation

Our approach is to implement the inception module first and then assemble the full Inception architecture from it, following the structure outlined in the paper. Everything is implemented in TensorFlow 2.0 and Keras.

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model

print(tf.__version__)

In [0]:
# Setting up the input and initializers for our model,
# per what was used in the paper.

# Symbolic input of shape (224, 224, 3) that the whole network is built on.
input_layer = layers.Input(shape=(224, 224, 3))
# Shared initializers: used by every Conv2D inside inception_module and by
# the first convolution of the network.
kernel_init = tf.keras.initializers.RandomUniform(-1, 1)
bias_init = tf.keras.initializers.Constant(value=0.2)

# Defining the Inception Module
![alt text](https://i.imgur.com/coskhAk.png)

In [0]:
def inception_module(x, # previous layer
                     filters_1x1,
                     filters_3x3_reduce,
                     filters_3x3,
                     filters_5x5_reduce,
                     filters_5x5,
                     filters_pool_proj,
                     name=None):
    """Build one Inception block on top of ``x`` and return the merged tensor.

    Four branches read the same input and are concatenated along axis 3:
      * a 1x1 convolution,
      * a 1x1 convolution followed by a 3x3 convolution,
      * a 1x1 convolution followed by a 5x5 convolution,
      * a 3x3 stride-1 max pool followed by a 1x1 convolution.

    Args:
        x: Output tensor of the previous layer.
        filters_1x1: Filter count of the stand-alone 1x1 convolution.
        filters_3x3_reduce: Filter count of the 1x1 convolution before the 3x3.
        filters_3x3: Filter count of the 3x3 convolution.
        filters_5x5_reduce: Filter count of the 1x1 convolution before the 5x5.
        filters_5x5: Filter count of the 5x5 convolution.
        filters_pool_proj: Filter count of the 1x1 convolution after the pool.
        name: Name given to the concatenation layer only.

    Returns:
        The concatenated output tensor of the four branches.
    """
    def relu_conv(filters, kernel_size, inputs):
        # Every convolution in the block shares padding, activation and the
        # module-level initializers; only filter count and kernel size vary.
        conv = layers.Conv2D(filters, kernel_size, padding='same', activation='relu',
                             kernel_initializer=kernel_init, bias_initializer=bias_init)
        return conv(inputs)

    # Branch 1: plain 1x1 convolution.
    branch_1x1 = relu_conv(filters_1x1, (1, 1), x)

    # Branch 2: 1x1 reduction, then 3x3 convolution.
    reduced_3x3 = relu_conv(filters_3x3_reduce, (1, 1), x)
    branch_3x3 = relu_conv(filters_3x3, (3, 3), reduced_3x3)

    # Branch 3: 1x1 reduction, then 5x5 convolution.
    reduced_5x5 = relu_conv(filters_5x5_reduce, (1, 1), x)
    branch_5x5 = relu_conv(filters_5x5, (5, 5), reduced_5x5)

    # Branch 4: 3x3 max pool with stride 1, then 1x1 projection.
    pooled = layers.MaxPool2D((3, 3), strides=(1, 1), padding='same')(x)
    branch_pool = relu_conv(filters_pool_proj, (1, 1), pooled)

    # Merge the four branches along axis 3.
    return layers.concatenate([branch_1x1, branch_3x3, branch_5x5, branch_pool], axis=3, name=name)

# Constructing GoogLeNet

We will now construct the model according to the final topology that is presented in the paper. Our constructed model will look exactly like this:

![alt text](https://i.imgur.com/WBGzkF2.png)

In [0]:
# Stem: 7x7/2 convolution, 3x3/2 max pool, 1x1 and 3x3 convolutions, 3x3/2 max pool.
x = layers.Conv2D(64, (7, 7), padding='same', strides=(2, 2), activation='relu', name='conv_1_7x7/2',
           kernel_initializer=kernel_init,
           bias_initializer=bias_init)(input_layer)

x = layers.MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_1_3x3/2')(x)
# NOTE(review): unlike conv_1 and the inception convolutions, the next two
# convolutions do not pass kernel_init / bias_init - confirm this is intended.
x = layers.Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu', name='conv_2a_3x3/1')(x)
x = layers.Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu', name='conv_2b_3x3/1')(x)
x = layers.MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_2_3x3/2')(x)

# Our first inception module
x = inception_module(x,
                     filters_1x1=64,
                     filters_3x3_reduce=96,
                     filters_3x3=128,
                     filters_5x5_reduce=16,
                     filters_5x5=32,
                     filters_pool_proj=32,
                     name='inception_3a')

# Our second inception module
x = inception_module(x,
                     filters_1x1=128,
                     filters_3x3_reduce=128,
                     filters_3x3=192,
                     filters_5x5_reduce=32,
                     filters_5x5=96,
                     filters_pool_proj=64,
                     name='inception_3b')

x = layers.MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_3_3x3/2')(x)

# Our third inception module
x = inception_module(x,
                     filters_1x1=192,
                     filters_3x3_reduce=96,
                     filters_3x3=208,
                     filters_5x5_reduce=16,
                     filters_5x5=48,
                     filters_pool_proj=64,
                     name='inception_4a')


# Our first auxiliary output path, branching off the output of inception_4a.
# It is kept in x1 so the main path in x continues untouched.
# (The layer name below keeps its original spelling: it is a runtime string.)
x1 = layers.AveragePooling2D((5, 5), strides=3)(x)
x1 = layers.Conv2D(128, (1, 1), padding='same', activation='relu')(x1)
x1 = layers.Flatten()(x1)
x1 = layers.Dense(1024, activation='relu')(x1)
x1 = layers.Dropout(0.7)(x1) # 70% is intentional, as that's stated in the paper.
x1 = layers.Dense(10, activation='softmax', name='auxilliary_output_1')(x1)

# Our fourth inception module
x = inception_module(x,
                     filters_1x1=160,
                     filters_3x3_reduce=112,
                     filters_3x3=224,
                     filters_5x5_reduce=24,
                     filters_5x5=64,
                     filters_pool_proj=64,
                     name='inception_4b')

# Our fifth inception module
x = inception_module(x,
                     filters_1x1=128,
                     filters_3x3_reduce=128,
                     filters_3x3=256,
                     filters_5x5_reduce=24,
                     filters_5x5=64,
                     filters_pool_proj=64,
                     name='inception_4c')

# Our sixth inception module
x = inception_module(x,
                     filters_1x1=112,
                     filters_3x3_reduce=144,
                     filters_3x3=288,
                     filters_5x5_reduce=32,
                     filters_5x5=64,
                     filters_pool_proj=64,
                     name='inception_4d')

# Our second auxiliary output path, branching off the output of inception_4d.
# Same layer stack as the first auxiliary path; the result is kept in x2.
x2 = layers.AveragePooling2D((5, 5), strides=3)(x)
x2 = layers.Conv2D(128, (1, 1), padding='same', activation='relu')(x2)
x2 = layers.Flatten()(x2)
x2 = layers.Dense(1024, activation='relu')(x2)
x2 = layers.Dropout(0.7)(x2)
x2 = layers.Dense(10, activation='softmax', name='auxilliary_output_2')(x2)

# Our seventh inception module
x = inception_module(x,
                     filters_1x1=256,
                     filters_3x3_reduce=160,
                     filters_3x3=320,
                     filters_5x5_reduce=32,
                     filters_5x5=128,
                     filters_pool_proj=128,
                     name='inception_4e')

x = layers.MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_4_3x3/2')(x)

# Our eighth inception module
x = inception_module(x,
                     filters_1x1=256,
                     filters_3x3_reduce=160,
                     filters_3x3=320,
                     filters_5x5_reduce=32,
                     filters_5x5=128,
                     filters_pool_proj=128,
                     name='inception_5a')

# Our ninth and final inception module
x = inception_module(x,
                     filters_1x1=384,
                     filters_3x3_reduce=192,
                     filters_3x3=384,
                     filters_5x5_reduce=48,
                     filters_5x5=128,
                     filters_pool_proj=128,
                     name='inception_5b')

# Our final output path: global average pooling, 40% dropout, then a
# 10-way softmax classifier (the same number of classes as both auxiliary heads).
x = layers.GlobalAveragePooling2D(name='avg_pool_5_3x3/1')(x)
x = layers.Dropout(0.4)(x)
x = layers.Dense(10, activation='softmax', name='output')(x)

In [0]:
# One input; three outputs: the main classifier followed by both auxiliary heads.
model = Model(inputs=input_layer, outputs=[x, x1, x2], name='inception_v1')
model.summary()

## The model summary agrees with the topology proposed in the paper.

# Task 2: Show and Tell: A Neural Image Caption Generator

Automatically describing the content of an image is a fundamental problem in AI that connects computer vision and natural language processing. In this task, we look into how CNNs and RNNs can be combined to build an image caption generator. We use the Flickr8k dataset.

In [0]:
#Packages
import numpy as np
from PIL import Image
from tqdm import tqdm
import re
from random import shuffle
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing import image, sequence
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout, LSTM, add, RepeatVector, Activation, TimeDistributed, Bidirectional, Add
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
# Locations of the Flickr8k split lists, the caption file and the image folder.
train_dir = 'data/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt'
test_dir = 'data/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt'
captions_dir = 'data/Flickr8k/Flickr8k_text/Flickr8k.token.txt'
all_images_dir = 'data/Flickr8k/Flickr8k_Dataset/Flicker8k_Dataset/'


def _read_lines(path):
    """Return the stripped contents of the file at ``path``, split on newlines."""
    # The context manager closes the handle as soon as the text is read;
    # a bare open(path).read() leaves it open until garbage collection.
    with open(path, 'r') as handle:
        return handle.read().strip().split("\n")


train_image_names = _read_lines(train_dir)
test_image_names = _read_lines(test_dir)
all_captions = _read_lines(captions_dir)

# A full image path is the image folder prefix followed by the file name
# taken from the corresponding split list.
train_image_dir = [all_images_dir + name for name in train_image_names]
test_image_dir = [all_images_dir + name for name in test_image_names]

In [0]:
def train_captions(train_image_names, captions):
    """Group caption lines by image, keeping only images in the training split.

    Each caption line is split at its first tab into an image tag and the
    caption text. The last two characters of the tag are dropped to obtain
    the image name (presumably a "#<digit>" caption number - confirm against
    the token file).

    NOTE: a later notebook cell rebinds the name ``train_captions`` to a list,
    so this function is only callable before that cell runs.

    Args:
        train_image_names: Iterable of image names in the training split.
        captions: Iterable of tab-separated "<tag><TAB><caption>" lines.

    Returns:
        Dict mapping each training image name to the list of its captions,
        in the order they appear in ``captions``.

    Raises:
        ValueError: If a line contains no tab.
    """
    # Membership is tested once per caption line; a set makes each test O(1)
    # instead of scanning the whole list of names every time.
    wanted = set(train_image_names)

    t_dict = {}
    for line in tqdm(captions):
        # Split at the first tab only, so a tab inside the caption text
        # cannot break the two-way unpacking.
        imagename, caption = line.split("\t", 1)
        imagename = imagename[:-2]
        if imagename in wanted:
            t_dict.setdefault(imagename, []).append(caption)
    return t_dict

In [0]:
def filter_caption(train_image_names, train_dict):
    """Normalise every caption in ``train_dict`` to lowercase alphabetic words.

    For each space-separated token: lowercase it, delete every character that
    is neither a word character nor whitespace, then keep it only if it is
    purely alphabetic and is not a single letter other than "a".

    Args:
        train_image_names: Unused; kept for interface compatibility.
        train_dict: Mapping of image name to a list of raw caption strings.

    Returns:
        New dict with the same keys, each mapped to the list of cleaned
        captions (kept words joined by single spaces).
    """
    punctuation = re.compile(r'[^\w\s]')

    def normalise(raw_caption):
        kept = []
        for token in raw_caption.split(" "):
            word = punctuation.sub('', token.lower())
            stray_letter = len(word) == 1 and word != "a"
            if stray_letter or not word.isalpha():
                continue
            kept.append(word)
        return " ".join(kept)

    return {
        image_name: [normalise(raw) for raw in raw_captions]
        for image_name, raw_captions in train_dict.items()
    }

In [0]:
# Collect the raw captions of the training images, then normalise their text.
train_dict = train_captions(train_image_names, all_captions)
train_dict = filter_caption(train_image_names, train_dict)

100%|█████████████████████████████████████████████████████████████████████████| 40460/40460 [00:02<00:00, 17138.92it/s]


In [0]:
def shuffle_cap(train_dict):
    """Flatten ``train_dict`` into shuffled, index-aligned image/caption lists.

    Args:
        train_dict: Mapping of image name to a list of caption strings.

    Returns:
        Tuple ``(train_img, train_captions)`` of two equal-length lists.
        Entry ``k`` of the second list is a caption of the image named at
        entry ``k`` of the first, wrapped as ``"<start> ... <stop>"``.
    """
    # Keep each (image, caption) pair as a tuple. Gluing the two into one
    # string with "-" and splitting it again raises ValueError as soon as an
    # image name or a caption contains a "-".
    pairs = [
        (name, caption)
        for name, captions in train_dict.items()
        for caption in captions
    ]
    shuffle(pairs)

    train_img = [name for name, _ in pairs]
    train_captions = ["<start> " + caption + " <stop>" for _, caption in pairs]
    return train_img, train_captions

In [0]:
# NOTE: this rebinds the name `train_captions` from the helper function
# defined earlier to a list of caption strings.
train_img, train_captions = shuffle_cap(train_dict)

In [0]:
def max_caption(train_captions):
    """Return the largest space-separated token count among the captions.

    Returns 0 when ``train_captions`` is empty.
    """
    token_counts = [len(caption.split(" ")) for caption in train_captions]
    return max(token_counts, default=0)

In [0]:
# Token count of the longest training caption, including the <start>/<stop>
# markers added by shuffle_cap.
max_caption_length = max_caption(train_captions)

37

In [0]:
def vocabulary(train_captions):
    """Return the distinct space-separated tokens of all captions, sorted.

    Args:
        train_captions: Iterable of caption strings.

    Returns:
        List of unique tokens in ascending order.
    """
    unique_words = set()
    for caption in train_captions:
        unique_words.update(caption.split(" "))
    # Sorting pins the order. The iteration order of a set of strings can
    # differ from one interpreter run to the next, which would silently change
    # any word -> position mapping derived from this list.
    return sorted(unique_words)

In [0]:
# Unique tokens across all training captions; vocab_size is used below to
# size the one-hot targets, the Embedding layer and the output Dense layer.
vocab = vocabulary(train_captions)
vocab_size = len(vocab)

In [0]:
#Using the VGG16
def VGG():
    """Return VGG16 with ImageNet weights, cut off before its last layer."""
    full_network = VGG16(weights='imagenet')
    # layers[-2] is the layer just before the final one, so the returned
    # model outputs that layer's activations rather than the final layer's.
    return Model(inputs=full_network.input, outputs=full_network.layers[-2].output)
# Image preprocessing for VGG model

def image_proc(path, VGG_model):
    """Load one image and return its feature vector from ``VGG_model``.

    Args:
        path: Path of the image file to load; it is resized to 224x224.
        VGG_model: Model whose ``predict`` is run on the single-image batch.

    Returns:
        1-D array: the model output for the image with the batch axis removed.
    """
    # Function-scope import: the notebook's import cell only brings in the
    # VGG16 class from this module.
    from tensorflow.keras.applications.vgg16 import preprocess_input

    pil_image = image.load_img(path, target_size=(224, 224))
    # Add the leading batch axis that predict() expects.
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    # Keras documents vgg16.preprocess_input (RGB -> BGR, per-channel ImageNet
    # mean subtraction, no scaling) as the input transform for the pretrained
    # VGG16 weights. Scaling pixels to [-1, 1] instead feeds the network
    # inputs unlike the ones it was trained on.
    batch = preprocess_input(batch)
    features = VGG_model.predict(batch)
    return np.reshape(features, features.shape[1])

def encode_image(train_image_dir, VGG_model):
    """Encode every image path with ``image_proc``, keyed by file name.

    The key is the path with its first ``len(all_images_dir)`` characters
    removed, i.e. the part after the image folder prefix.

    Args:
        train_image_dir: Iterable of image paths.
        VGG_model: Model handed on to ``image_proc``.

    Returns:
        Dict mapping each key to the value returned by ``image_proc``.
    """
    return {
        image_path[len(all_images_dir):]: image_proc(image_path, VGG_model)
        for image_path in tqdm(train_image_dir)
    }

In [0]:
def make_index(vocab):
    """Map each word in ``vocab`` to its position in the sequence.

    If a word occurs more than once, its last position wins.
    """
    return {word: position for position, word in enumerate(vocab)}

In [0]:
# word -> integer id lookup used when encoding captions for the model.
index = make_index(vocab)

In [0]:
# Build the truncated VGG16 once, then run every training image through it.
# The result maps image file name -> feature vector.
VGG_model = VGG()
encoded_train_image = encode_image(train_image_dir, VGG_model)

100%|██████████████████████████████████████████████████████████████████████████████| 6000/6000 [20:27<00:00,  4.89it/s]


In [0]:
def generator(train_img, train_captions, encoded_train_image, index, vocab_size, max_caption_length, batch_size = 200):
    """Yield next-word training batches built from ``batch_size`` captions each.

    Every caption is split on spaces and expanded into one sample per
    position ``j >= 1``: the ids of the first ``j`` tokens form the input
    sequence and token ``j`` is the one-hot target. Samples accumulate until
    ``batch_size`` captions have been consumed, then one batch is yielded.

    The generator makes a single pass over ``train_captions`` and stops.
    Captions left over after the last full group of ``batch_size`` are never
    yielded.

    Args:
        train_img: Sequence of image names, index-aligned with ``train_captions``.
        train_captions: Sequence of caption strings.
        encoded_train_image: Mapping of image name to its feature vector.
        index: Mapping of token to integer id.
        vocab_size: Length of each one-hot target vector.
        max_caption_length: Length that input sequences are post-padded to.
        batch_size: Number of captions (not samples) per yielded batch.

    Yields:
        ``[[image_features, padded_sequences], one_hot_targets]``.
    """
    image_batch, prefix_batch, target_batch = [], [], []
    captions_seen = 0

    for position, caption in enumerate(train_captions):
        tokens = caption.split(" ")
        captions_seen += 1

        for cut in range(1, len(tokens)):
            one_hot = np.zeros((vocab_size,))
            # Ids of the tokens before the one being predicted.
            prefix_ids = [index[token] for token in tokens[:cut]]
            one_hot[index[tokens[cut]]] = 1

            prefix_batch.append(prefix_ids)
            target_batch.append(one_hot)
            image_batch.append(encoded_train_image[train_img[position]])

        if captions_seen != batch_size:
            continue

        yield [
            [
                np.asarray(image_batch),
                sequence.pad_sequences(prefix_batch, maxlen = max_caption_length, padding ='post'),
            ],
            np.asarray(target_batch),
        ]
        # Start collecting the next batch from scratch.
        captions_seen = 0
        image_batch, prefix_batch, target_batch = [], [], []
            
                

In [0]:
# Image branch: a 4096-value feature vector, regularised with dropout and
# projected down to 256 units.
in1 = Input(shape = (4096,))
par = Dropout(0.5)(in1)
par = Dense(256, activation = 'relu')(par)

# Caption branch: a padded sequence of word ids, embedded into 256 dimensions
# and passed through an LSTM that returns its output at every time step.
in2 = Input(shape = (max_caption_length,))
cap = Embedding(vocab_size, 256)(in2)
cap = Dropout(0.5)(cap)
cap = LSTM(256, return_sequences=True)(cap)

# Merge both branches by addition, summarise with a bidirectional LSTM and
# predict a distribution over the vocabulary for the next word.
# NOTE(review): `par` has no time axis while `cap` does, so this appears to rely
# on the add layer broadcasting `par` across time steps - confirm.
decode = Bidirectional(LSTM(256, return_sequences=False))(add([par, cap]))
out = Dense(vocab_size, activation='softmax')(decode)
# NOTE: this rebinds `model`, which previously held the Inception network.
model = Model(inputs = [in1, in2], outputs = out)

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 37)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 37, 256)      1940224     input_4[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           input_3[0][0]                    
____________________________________________________________________________________________

In [0]:
# Train for one epoch of 300 steps; each batch is built from 100 captions.
# NOTE(review): `fit_generator` is reported as deprecated in later TensorFlow 2.x
# releases in favour of `fit` - confirm against the installed version.
model.fit_generator(generator(train_img, train_captions, encoded_train_image, index, vocab_size, max_caption_length, batch_size = 100), steps_per_epoch = 300, epochs = 1)

In [0]:
#Generating captions starting with "<start>" and ending when next word predicted is "<stop>" or at max_caption_length
def gen_captions(image):
    """Greedily decode a caption for one image.

    Starting from "<start>", the highest-scoring next word is appended
    repeatedly until "<stop>" is predicted or the sequence grows longer than
    ``max_caption_length`` tokens.

    Relies on these notebook globals: ``model``, ``index``, ``index_word``,
    ``encoded_test_image``, ``max_caption_length`` and ``all_images_dir``.
    ``encoded_test_image`` must be populated before this is called.

    Args:
        image: Image path beginning with ``all_images_dir``; the remainder of
            the path is the key looked up in ``encoded_test_image``.

    Returns:
        The predicted words joined by spaces, with the first letter
        capitalised and a trailing period.
    """
    # The image features do not change between decoding steps: look them up
    # and wrap them into a one-element batch once, outside the loop.
    features = np.array([encoded_test_image[image[len(all_images_dir):]]])

    words = ["<start>"]
    while True:
        token_ids = [index[word] for word in words]
        padded = sequence.pad_sequences([token_ids], maxlen=max_caption_length, padding='post')
        preds = model.predict([features, np.array(padded)])
        next_word = index_word[np.argmax(preds[0])]
        words.append(next_word)

        if next_word == "<stop>" or len(words) > max_caption_length:
            break

    # Drop the "<start>" marker. Drop the last token only when it really is
    # "<stop>": if decoding ended on the length limit, the last token is a
    # genuine word and must be kept.
    caption_words = words[1:-1] if words[-1] == "<stop>" else words[1:]
    final_caption = ' '.join(caption_words)
    return final_caption.capitalize() + "."

In [0]:
# `randint` is used below, but the notebook's import cell only imports
# `shuffle` from random; without this line the loop raises NameError.
from random import randint

# Reverse lookup used by gen_captions: integer id -> word.
index_word = {position: word for word, position in index.items()}

# gen_captions reads `encoded_test_image`, which no other cell creates.
# Encode the test images on the first run and reuse the result afterwards.
if 'encoded_test_image' not in globals():
    encoded_test_image = encode_image(test_image_dir, VGG_model)

# Caption ten test images picked from positions 1..500 of the test list.
for _ in range(10):
    new_image = all_images_dir + test_image_names[randint(1, 500)]
    print('Caption:', gen_captions(new_image))
    # NOTE(review): the opened image is neither stored nor shown; an expression
    # inside a loop body is presumably not rendered by the notebook - confirm
    # whether an explicit display call was intended.
    Image.open(new_image)

