### Homework 3 logbook
This time I used two encoders: one for image analysis and another for sequence analysis. Since this task can be seen as a classification task, I put two dense layers on top of the net to combine the features extracted by the two encoders. This net is not able to answer location-based questions, since it only encodes the features without decoding them, so it may have trouble locating a feature. An attention mechanism may increase the overall accuracy. In conclusion, this net gains almost +0.3 accuracy points over the 0.3 baseline (since 30% of the answers in the training set are "yes").

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os
import tensorflow as tf
import numpy as np

# Set the seed for random operations.
# This makes our experiments reproducible.
SEED = 1234
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Get current working directory
cwd = os.getcwd()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

In [None]:
# Use Kaggle token to download dataset on drive
import os
# Point the Kaggle tooling at the Drive folder used as its config directory.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

In [None]:
# Work from the Kaggle folder on Drive for the download steps below.
%cd /content/gdrive/My Drive/Kaggle

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# Download the competition data; the unzip step below expects it to be
# available as anndl-2020-vqa.zip in the current directory.
!kaggle competitions download -c anndl-2020-vqa

In [None]:
%ls

In [None]:
# Extract the archive only when the dataset folder is not there yet.
if not os.path.exists("/content/VQA_Dataset"):
  !unzip "anndl-2020-vqa.zip" -d "/content"

In [None]:
# All dataset paths used later are relative to /content.
%cd /content

In [None]:
%ls VQA_Dataset/

# Dataset

In [None]:
from PIL import Image
import json

# Load the training annotations from JSON.
with open('VQA_Dataset/train_questions_annotations.json') as f:
  ds_dict = json.load(f)

ds_list = list(ds_dict.values())
# NOTE(review): unpacking .values() assumes every record stores exactly three
# fields in (question, image name, answer) order — confirm against the JSON.
qst, img, ans = ds_list[0].values()
qst
ans
img = Image.open("VQA_Dataset/Images/"+img+".png")
# Keep at most the first three channels of the last axis.
img_arr = np.array(img)[...,:3]
img_arr.shape

Image.fromarray(img_arr)

# Tokenization
## Converts words to integers

In [None]:
# Prepare dataset
# ---------------
# Split the annotation records into three parallel lists (qst, img, ans),
# preserving the order of ds_list. Each record must hold exactly three values.

records = [tuple(item.values()) for item in ds_list]

qst = [q for q, _, _ in records]
img = [i for _, i, _ in records]
ans = [a for _, _, a in records]

print('Number of sentences:', len(qst))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training questions and encode each question as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(qst)
tokenized = tokenizer.texts_to_sequences(qst)

# word -> integer id mapping built by the tokenizer
wtoi = tokenizer.word_index
print('Total question words:', len(wtoi))

# Length of the longest encoded question; used later as the padding length.
max_qst_length = max(len(q) for q in tokenized)
print('Max question sentence length:', max_qst_length)

In [None]:
# Answer vocabulary: every possible answer string mapped to its class index.
# The index of a label is its position in the ordered list below.
labels_dict = {
    label: index
    for index, label in enumerate([
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket',
        'blue', 'bone', 'book', 'boy', 'brown',
        'cat', 'chair', 'couch',
        'dog',
        'floor', 'food', 'football',
        'girl', 'grass', 'gray', 'green',
        'left', 'log',
        'man', 'monkey bars',
        'no', 'nothing',
        'orange',
        'pie', 'plant', 'playing',
        'red', 'right', 'rug',
        'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel', 'standing',
        'stool', 'sunny',
        'table', 'tree',
        'watermelon', 'white', 'wine', 'woman',
        'yellow', 'yes',
    ])
}

# One-hot encode the answers: one row per sample, one column per label.
data = np.array(list(map(labels_dict.__getitem__, ans)))
rows = np.arange(data.size)
shape = (data.size, len(labels_dict))

one_hot = np.zeros(shape)
# Paired index arrays set exactly one cell per row: (sample, class index).
one_hot[rows, data] = 1
print(one_hot.shape)

In [None]:
# Print the relative frequency of every answer label, most frequent first.
print("FREQ\t\t\tLABEL")
labels = np.array(list(labels_dict.keys()))
# Column sums of the one-hot matrix are per-label counts; dividing by the
# grand total turns them into relative frequencies.
freqs = np.array(one_hot.sum(axis=0) / one_hot.sum())

# Sort on the numeric frequencies themselves. Packing floats and strings into
# a single ndarray would cast the floats to text, and sorting that text is
# lexicographic (e.g. '5e-05' would rank above '0.3').
order = freqs.argsort()
stats = [(freqs[j], labels[j]) for j in order]

for stat in stats[::-1]:
  print("{}\t{}".format(stat[0], stat[1]))

# Padding sequences

In [None]:
# Bring every encoded question to the same length (max_qst_length) so the
# questions can be stacked into one 2-D array.
encoder_inputs = pad_sequences(tokenized, maxlen=max_qst_length)

print("Encoder inputs shape:", encoder_inputs.shape)

In [None]:
encoder_inputs

In [None]:
from PIL import Image

class CustomDataset(tf.keras.utils.Sequence):
  """Sample-by-sample view over the VQA questions, images and answers.

  Each item pairs one question with its image, which is loaded from
  ``<dataset_dir>/Images/<image name>.png`` and resized on access. For every
  subset other than 'test' the matching answer is returned as well.

  Arguments:
    dataset_dir: root folder that contains the ``Images`` directory.
    which_subset: 'test' keeps every sample; 'training' keeps the first
      ``1 - val_split`` share; any other value (in practice 'validation')
      keeps the remaining tail.
    qst: indexable collection of questions, one entry per sample.
    img: image file names without the '.png' extension, aligned with qst.
    ans: answers aligned with qst; never indexed for the 'test' subset, so
      it may be None there.
    out_shape: size handed to PIL's ``Image.resize``, i.e. (width, height).
    preprocessing_function: optional callable applied to the image array.
    val_split: fraction of the samples reserved for validation, in [0, 1].

  Raises:
    ValueError: if val_split is outside [0, 1].
  """

  def __init__(self, dataset_dir, which_subset, qst, img, ans, out_shape = (256, 256), preprocessing_function = None, val_split = 0):
    super().__init__()

    if not 0 <= val_split <= 1:
      raise ValueError('val_split must be in [0, 1], got ' + str(val_split))

    # Number of leading samples that belong to the training subset.
    size = int(len(qst)*(1-val_split))

    self.dataset_dir = dataset_dir
    self.which_subset = which_subset
    if which_subset == 'test':
      self.qst = qst
      self.img = img
      self.ans = ans
    elif which_subset == 'training':
      self.qst = qst[:size]
      self.img = img[:size]
      self.ans = ans[:size]
    else:
      self.qst = qst[size:]
      self.img = img[size:]
      self.ans = ans[size:]
    self.out_shape = out_shape
    self.preprocessing_function = preprocessing_function

    print(which_subset + ' ' + str(len(self.qst)))

  def __len__(self):
    # Number of samples (not batches): items are served one sample at a time.
    return len(self.qst)

  def __getitem__(self, index):
    # Read Image
    curr_filename = self.img[index]
    img_path = os.path.join(self.dataset_dir, 'Images', curr_filename + '.png')
    # The context manager releases the file handle right away; resize()
    # returns a new, fully loaded image that stays valid after the close.
    with Image.open(img_path) as raw_img:
      resized = raw_img.resize(self.out_shape)
    # Keep at most the first three channels of the last axis.
    img_arr = np.array(resized)[...,:3]

    if self.preprocessing_function is not None:
      img_arr = self.preprocessing_function(img_arr)

    inputs = {"input_1": self.qst[index], "input_2": img_arr}
    if self.which_subset == 'test':
      return inputs
    return inputs, self.ans[index]

In [None]:
from tensorflow.keras.applications.resnet50  import preprocess_input 

# Image size used throughout the notebook.
img_h = 400
img_w = 700

# out_shape is passed as [width, height].
# NOTE(review): val_split is left at its default of 0 for both subsets, so
# 'training' receives every sample and 'validation' ends up empty — confirm
# that training without a held-out set is intended.
dataset = CustomDataset('VQA_Dataset', 'training', encoder_inputs, img, one_hot, [img_w, img_h], preprocess_input)
dataset_valid = CustomDataset('VQA_Dataset', 'validation', encoder_inputs, img, one_hot, [img_w, img_h], preprocess_input)

In [None]:
BATCH_SIZE = 16

# Element signature shared by the training and validation pipelines:
# a dict holding the two model inputs, plus the one-hot answer vector.
_sample_types = ({"input_1": tf.float32, "input_2": tf.float32}, tf.int32)
_sample_shapes = ({"input_1": [max_qst_length], "input_2": [img_h, img_w, 3]}, [labels.size])

def _batched_forever(sample_source):
  """Turn a zero-argument sample factory into a batched, endlessly repeating tf.data pipeline."""
  pipeline = tf.data.Dataset.from_generator(sample_source,
                                            output_types=_sample_types,
                                            output_shapes=_sample_shapes)
  return pipeline.batch(BATCH_SIZE).repeat()

train_dataset = _batched_forever(lambda: dataset)

valid_dataset = _batched_forever(lambda: dataset_valid)

In [None]:
# Translation utils
# Inverse vocabulary: integer id -> word (wtoi maps word -> integer id).
itow = {v:k for k, v in wtoi.items()}

def translate(input_sentence):
  """Turn a sequence of word ids back into a space-separated sentence.

  Ids that are not strictly positive are skipped; every other id is looked
  up in the module-level `itow` table, so an unknown id raises KeyError.
  """
  words = (itow[token] for token in input_sentence if token > 0)
  return ' '.join(words)

In [None]:
# Let's test data generator
# -------------------------
import time
import matplotlib.pyplot as plt

%matplotlib inline

iterator = iter(train_dataset)

In [None]:
# Each element is one batch: (inputs dict, answers).
l, a = next(iterator)
# Relies on the dict yielding "input_1" (questions) before "input_2" (images).
q, i = l.values()

fig, ax = plt.subplots(1, 1)

# Show the first question of the batch as text, then its answer label.
translate(q[0].numpy())
print(labels[np.argmax(a[0])])
img_arr = i[0].numpy()
# NOTE(review): the image has already gone through preprocess_input, so the
# uint8 cast below presumably does not reproduce the original colours — confirm.
ax.imshow(np.uint8(img_arr))

# Model

In [None]:
# ResNet50 backbone without its classification head (include_top=False),
# used as the image feature extractor.
base_model = tf.keras.applications.ResNet50(input_shape=[img_h, img_w, 3], include_top=False)

# Freeze the whole backbone with a single assignment: setting `trainable` on
# the model applies to all of its layers, so no per-layer loop is needed.
base_model.trainable = False

base_model.summary()

In [None]:
# Build the two-encoder classification model
# ------------------------------------------
# One encoder for the question, one for the image; their features are
# concatenated and classified over the answer labels. The variable names
# still say "decoder" for the image branch; they are kept unchanged.

EMBEDDING_SIZE = 32

# ENCODER SEQ
# -----------
# Word ids -> embeddings -> LSTM. Only the final hidden state h is used
# downstream as the question representation.

encoder_input = tf.keras.Input(shape=[max_qst_length], name="input_1")
# +1 because id 0 is reserved (mask_zero=True), so the ids run up to len(wtoi).
encoder_embedding_layer = tf.keras.layers.Embedding(len(wtoi)+1, EMBEDDING_SIZE, input_length=max_qst_length, mask_zero=True)
encoder_embedding_out = encoder_embedding_layer(encoder_input)
encoder = tf.keras.layers.LSTM(units=256, return_state=True)

encoder_output, h, c = encoder(encoder_embedding_out)

# ENCODER IMG
# -----------
# Backbone -> 3x3 conv -> global average pooling gives one vector per image.

decoder_input = tf.keras.Input(shape=[img_h, img_w, 3], name="input_2")
x = base_model(decoder_input)

x = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=(3, 3),
                            strides=(1, 1),
                            padding='same',
                            activation='relu')(x)
x = tf.keras.layers.GlobalAvgPool2D()(x)

x = tf.keras.layers.Flatten()(x)

# CLASSIFIER
# ----------
# Merge image and question features with a Keras layer rather than a raw
# tf op, so the merge is a regular layer of the functional model.
x = tf.keras.layers.Concatenate(axis=-1)([x, h])
x = tf.keras.layers.Dense(128, activation='relu')(x)
decoder_dense = tf.keras.layers.Dense(len(labels), activation='softmax')
decoder = decoder_dense(x)

# MODEL
model = tf.keras.Model([encoder_input, decoder_input], decoder)

In [None]:
# Print the layer-by-layer summary using 200-character-wide lines.
model.summary(line_length=200)
# model.weights

# Prepare model for training

In [None]:
# Optimization params
# -------------------

# Loss
# Categorical cross-entropy pairs with the one-hot answers and the softmax output.
loss = tf.keras.losses.CategoricalCrossentropy()

# learning rate
lr = 1e-3
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# -------------------

# Validation metrics
# ------------------

metrics = ['accuracy']
# ------------------

# Compile Model
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
import os
from datetime import datetime

cwd = os.getcwd()

# Every run gets its own timestamped folder under <Homework3>/experiments.
exps_dir = os.path.join('/content/gdrive/My Drive/Homework3', 'experiments')

now = datetime.now().strftime('%b%d_%H-%M-%S')

exp_name = 'exp'

exp_dir = os.path.join(exps_dir, exp_name + '_' + str(now))

callbacks = []

# Model checkpoint
# ----------------
ckpt_dir = os.path.join(exp_dir, 'ckpts')
# makedirs also creates the missing parents (exps_dir and exp_dir);
# exist_ok=True replaces the separate exists() checks.
os.makedirs(ckpt_dir, exist_ok=True)

ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'),
                                                   save_weights_only=False)  # False to save the model directly
callbacks.append(ckpt_callback)

# # Early Stopping
# # --------------
# early_stop = False
# if early_stop:
#     es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
#     callbacks.append(es_callback)

# # ---------------------------------

# Both tf.data pipelines repeat forever, so the number of batches per epoch
# is given explicitly, derived from the number of samples in each Sequence.
model.fit(x=train_dataset,
          validation_data=valid_dataset,
          steps_per_epoch=len(dataset)//BATCH_SIZE,
          validation_steps=len(dataset_valid)//BATCH_SIZE,
          epochs=1,
          callbacks=callbacks)

In [None]:
from PIL import Image
import json

# Load the test questions from JSON.
with open('VQA_Dataset/test_questions.json') as f:
  dst_dict = json.load(f)

# Records only, in file order (the dict keys are read separately later).
dst_list = list(dst_dict.values())

In [None]:
# Prepare testset
# ---------------
# `key` holds the record ids; qst and img are parallel lists in the same
# order. Each record must hold exactly two values.

key = list(dst_dict)
pairs = [tuple(item.values()) for item in dst_list]

qst = [q for q, _ in pairs]
img = [i for _, i in pairs]

print('Number of sentences:', len(qst))

In [None]:
# Encode the test questions with the tokenizer that was fitted on the
# training questions.
tokenized = tokenizer.texts_to_sequences(qst)

In [None]:
# Pad to the same length that was used for the training questions.
encoder_inputs = pad_sequences(tokenized, maxlen=max_qst_length)

print("Encoder inputs shape:", encoder_inputs.shape)

In [None]:
encoder_inputs

In [None]:
from tensorflow.keras.applications.resnet50 import preprocess_input 

# 'test' subset: every sample is kept and no answers are supplied (ans=None).
dataset_test = CustomDataset('VQA_Dataset', 'test', encoder_inputs, img, None, [img_w, img_h], preprocess_input)

In [None]:
# Inputs only (no label component) and no repeat(): the pipeline ends after
# one pass over the test samples.
test_dataset = tf.data.Dataset.from_generator(lambda: dataset_test,
                                               output_types=({"input_1": tf.float32, "input_2": tf.float32}),
                                               output_shapes=({"input_1": [max_qst_length], "input_2": [img_h, img_w, 3]}))
test_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
# Model outputs for the test set; indexed per sample and argmax-ed below.
ans = model.predict(test_dataset)

In [None]:
# Let's visualize results
# ----------------------
import time
import matplotlib.pyplot as plt

%matplotlib inline

idx = 0

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 32))

k = key[idx]
q = qst[idx]
i = img[idx]
a = ans[idx]

print(q, end='')

# Original Image
i2 = Image.open(os.path.join('VQA_Dataset', 'Images', i + '.png'))
i2 = i2.resize([img_w, img_h])
i2_arr = np.array(i2)[...,:3]
idx += 1;

labels = np.array(list(labels_dict.keys()))
print(labels[np.argmax(a)])
ax.imshow(np.uint8(i2))

In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write predictions to a timestamped two-column CSV file.

    The file is named ``results_<timestamp>.csv`` and starts with the header
    line ``Id,Category``, followed by one ``<key>,<value>`` line per entry of
    `results`, in the mapping's iteration order.

    Args:
        results: Mapping from sample id to predicted category. Keys and
            values are formatted with ``str()``, so non-string ids work too.
        results_dir: Directory that receives the file; it is created when
            it does not exist yet.

    Returns:
        The path of the file that was written.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    csv_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    # An empty results_dir means "current directory": nothing to create.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    with open(csv_path, 'w') as f:
        f.write('Id,Category\n')
        f.writelines(f'{key},{value}\n' for key, value in results.items())

    return csv_path

In [None]:
# Map every test record id to its predicted class index (argmax over the
# model output for that sample). Pairing ids and predictions with zip avoids
# an index loop and leaves the notebook-level `idx` cursor untouched.
submission = {
    record_id: int(np.argmax(scores))
    for record_id, scores in zip(key, ans)
}

In [None]:
# Write the submission file into the Homework3 folder on the mounted Drive.
create_csv(submission, '/content/gdrive/My Drive/Homework3') 