In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import json
import cv2
import PIL
from datetime import datetime
from PIL import Image
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from math import ceil, floor
from tqdm.notebook import tqdm

In [2]:
#### Constants ####
TRAIN = True         # the training cell only runs when this is True
SEED = 1234          # shared seed for the TensorFlow and NumPy generators

# Geometry of the images fed to the network
img_h, img_w = 200, 350
n_channels = 3

bs = 256             # batch size of the train / validation generators
dataset_split = 0.8  # fraction of the annotated questions used for training
num_questions = 0    # placeholder, overwritten once the training json is loaded

# Seed both random number generators
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Current working directory
cwd = os.getcwd()

In [None]:
# Colab-only step: mount Google Drive under /content/drive/. The dataset archive
# that is unzipped next, and the log / result folders written later in the
# notebook, are all addressed through this mount.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!unzip drive/MyDrive/Assignment_3/anndl-2020-vqa.zip -d ./

In [5]:
# Locations of the extracted dataset: image folder plus the two question files
_DATASET_ROOT = '/content/VQA_Dataset'
imgs_path = os.path.join(_DATASET_ROOT, 'Images')
train_json_path = os.path.join(_DATASET_ROOT, 'train_questions_annotations.json')
test_json_path = os.path.join(_DATASET_ROOT, 'test_questions.json')


# Every possible answer; the position of a label in this tuple is its class code
_ANSWER_LABELS = (
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket',
        'blue', 'bone', 'book', 'boy', 'brown',
        'cat', 'chair', 'couch',
        'dog',
        'floor', 'food', 'football',
        'girl', 'grass', 'gray', 'green',
        'left', 'log',
        'man', 'monkey bars',
        'no', 'nothing',
        'orange',
        'pie', 'plant', 'playing',
        'red', 'right', 'rug',
        'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel', 'standing',
        'stool', 'sunny',
        'table', 'tree',
        'watermelon', 'white', 'wine', 'woman',
        'yellow', 'yes',
)

# direct dictionary, word => code
dictionary = {label: code for code, label in enumerate(_ANSWER_LABELS)}

# inverse dictionary, code => word
inverse_dictionary = dict(enumerate(_ANSWER_LABELS))

N_CLASSES = len(dictionary)

In [6]:
class DataGenerator(tf.keras.utils.Sequence):
  '''Keras Sequence yielding batches of images and tokenized questions.

  Each batch is a dict with the keys 'image', 'input_ids' and 'attention_mask'.
  When `training` is True the batch is returned as `(inputs, one_hot_answers)`,
  otherwise only the inputs are returned.

  Arguments:
    answers: per-sample class codes (indexed like `imageIDs`); only read when
      `training` is True, so callers may store other per-sample data here.
    imageIDs: per-sample image names (file name without the ".png" suffix).
    input_questions: per-sample tokenizer outputs exposing 'input_ids' and
      'attention_mask', each of length `max_length`.
    batch_size: number of samples per batch.
    training: whether ground-truth answers are returned with the inputs.
    max_length: length of the tokenized questions.
    shuffle: reshuffle the sample order on construction and after each epoch.
    img_h, img_w, channels: shape of the images handed to the network.
    img_generator: optional object exposing `get_random_transform` and
      `apply_transform` (e.g. an ImageDataGenerator) used for augmentation.

  Raises:
    ValueError: if `answers`, `imageIDs` and `input_questions` differ in length.
  '''
  def __init__(self, answers, imageIDs, input_questions, batch_size, training, max_length,
               shuffle=True, img_h=128, img_w=128, channels=3, img_generator=None):
    super().__init__()
    if not (len(answers) == len(imageIDs) == len(input_questions)):
      raise ValueError('answers, imageIDs and input_questions must have the same length')
    self.answers = answers
    self.imageIDs = imageIDs
    self.input_questions = input_questions
    self.batch_size = batch_size
    self.shuffle = shuffle
    # positions into the complete dataset; sized from imageIDs, like __len__
    self.indexes = np.arange(len(self.imageIDs))
    self.max_length = max_length
    self.training = training
    self.img_h = img_h
    self.img_w = img_w
    self.channels = channels # RGB image
    self.img_generator = img_generator
    self.on_epoch_end()

  def __len__(self):
    '''Number of full batches per epoch (a trailing partial batch is dropped).'''
    return len(self.imageIDs) // self.batch_size

  def __getitem__(self, index):
    '''Return batch number `index` (0-based).'''
    # with bs = 32 the second batch covers positions 32..63 (0 indexed)
    bs_index_start = index * self.batch_size
    indexes = self.indexes[bs_index_start:bs_index_start + self.batch_size]

    # dict with the image batch and the tokenized question batch
    input_x = self._generate_x(indexes)

    if self.training: # if training, return input and also ground truth
      return (input_x, self._generate_y(indexes))

    # if testing, return input only
    return input_x

  def on_epoch_end(self):
    '''Reshuffle the sample order when shuffling is enabled.'''
    if self.shuffle:
      np.random.shuffle(self.indexes)

  def _generate_x(self, indexes):
    '''Build the input dict for the samples at positions `indexes`.'''
    # containers are sized by the samples actually requested, so no
    # uninitialised rows can leak out if a slice is shorter than batch_size
    n_samples = len(indexes)
    RGBimages = np.empty((n_samples, self.img_h, self.img_w, self.channels))
    input_ids = np.empty((n_samples, self.max_length))
    attention_mask = np.empty((n_samples, self.max_length))

    for i, ID in enumerate(indexes):
      RGBimages[i, ] = self._load_image(self.imageIDs[ID], self.img_w, self.img_h)
      input_ids[i,] = self.input_questions[ID]['input_ids']
      attention_mask[i,] = self.input_questions[ID]['attention_mask']

    return {'image' : RGBimages, 'input_ids' : input_ids, 'attention_mask' : attention_mask}

  def _generate_y(self, indexes):
    '''Return the one-hot encoded answers (integer array) for `indexes`.'''
    indexed_answers = [self.answers[i] for i in indexes]
    categorical = tf.keras.utils.to_categorical(indexed_answers, num_classes=N_CLASSES)
    return categorical.astype(int)

  def _load_image(self, img_name, img_w, img_h):
    '''Load `<imgs_path>/<img_name>.png` as RGB, resize it, optionally augment
    it and scale the pixel values by 1/255.'''
    # the context manager releases the file handle as soon as the pixels are read
    with PIL.Image.open(imgs_path + '/' + img_name + ".png") as rgba_image:
      rgb_image = rgba_image.convert('RGB')
    image = cv2.resize(np.array(rgb_image), (img_w, img_h))
    if self.img_generator is not None:
      # No seed is passed on purpose: a fixed seed reseeds NumPy's global RNG on
      # every call, which applies the very same transform to every image and
      # also disturbs the epoch shuffling. The global seed set at notebook start
      # keeps runs reproducible.
      img_t = self.img_generator.get_random_transform(image.shape)
      image = self.img_generator.apply_transform(image, img_t)
    image = image/ 255.
    return image

In [7]:
# extracts (questions, imageIDs, answers) from training json
def parseTrainJson(data, first, last):
  '''Parse the annotated entries of `data` whose position is in [first, last).

  Each question is lower-cased and split on single spaces, and question marks
  are removed from its last word. Answers are mapped to their class code
  through `dictionary`.

  Returns the tuple (questions, imageIDs, answers) of parallel lists.
  '''
  questions, imageIDs, answers = [], [], []

  selected_keys = list(data)[first:last]
  for entry in (data[key] for key in selected_keys):
    words = entry['question'].lower().split(" ") # question as a list of words
    words[-1] = words[-1].replace("?", "") # drop the question mark

    questions.append(words)
    imageIDs.append(entry['image_id'])
    answers.append(dictionary[entry['answer']]) # class code of the answer word

  return questions, imageIDs, answers

# extracts (questionIDs, questions, imageIDs) from test json
def parseTestJson(data):
  '''Parse the (unannotated) test questions.

  Questions are normalised exactly like in parseTrainJson: lower-cased, split
  on single spaces, and question marks removed from the last word.

  Arguments:
    data: mapping question id => {'image_id': ..., 'question': ...}.

  Returns the tuple (questionIDs, questions, imageIDs) of parallel lists, in
  the iteration order of `data`.
  '''
  questionIDs = []
  imageIDs = []
  questions = []

  for key, entry in data.items():
    # lower-case like the training questions so both splits share one format
    question = entry['question'].lower().split(" ") # splitting question into words
    question[-1] = question[-1].replace("?", "") # removing question mark

    questionIDs.append(key)
    imageIDs.append(entry['image_id'])
    questions.append(question)

  return questionIDs, questions, imageIDs

In [8]:
# Load the annotated questions and split them, in file order, into train / validation

with open(train_json_path, 'r') as f:
  data = json.load(f)

# the first `dataset_split` share of the entries is used for training, the rest for validation
num_questions = len(data)
num_train_questions = floor(num_questions * dataset_split)
num_valid_questions = num_questions - num_train_questions

train_questions, train_imageIDs, train_answers = parseTrainJson(data, 0, num_train_questions)
valid_questions, valid_imageIDs, valid_answers = parseTrainJson(data, num_train_questions, num_questions)

# Load the test questions (no answers available)
with open(test_json_path, 'r') as f:
  test_data = json.load(f)

test_questionIDs, test_questions, test_imageIDs = parseTestJson(test_data)

In [None]:
!pip install -q transformers tensorflow_datasets

In [None]:
# Tokenizer of the 'bert-base-uncased' checkpoint, the same checkpoint the model
# cell loads further down; it is used below to encode every question.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Vocabulary as returned by the tokenizer; not referenced by the other cells shown here.
vocabulary = tokenizer.get_vocab()

In [None]:
max_length = 23 #calculated max (presumably the longest tokenized question; TODO confirm)

def encode_questions(questions, max_len):
  '''Encode every question (a list of words) with the BERT tokenizer.

  Each question is wrapped in the special tokens and padded / truncated to
  `max_len`, and the attention mask is requested alongside the ids.

  Arguments:
    questions: list of questions, each one a list of words.
    max_len: fixed length of every encoded question.

  Returns a list with one tokenizer output per question, in input order.
  '''
  return [tokenizer.encode_plus(
                                words,
                                add_special_tokens=True,
                                truncation=True,
                                max_length=max_len,
                                padding='max_length',
                                return_attention_mask=True,
                                is_split_into_words=True)
          for words in tqdm(questions)]

# One shared helper guarantees the three splits are encoded with identical settings
bert_input_train = encode_questions(train_questions, max_length)
bert_input_valid = encode_questions(valid_questions, max_length)
bert_input_test = encode_questions(test_questions, max_length)
  

In [None]:
# Sanity check on one training question: raw words, ids from a plain encode()
# call, the padded ids decoded back to text, and the padded ids themselves
_sample_words = train_questions[12]
_sample_ids = bert_input_train[12]['input_ids']
print(_sample_words)
print(tokenizer.encode(_sample_words))
print(tokenizer.decode(_sample_ids))
print(_sample_ids)

In [13]:
# Settings common to the three generators
_shared_generator_args = dict(img_h=img_h,
                              img_w=img_w,
                              channels=n_channels,
                              max_length=max_length)

# Training batches: shuffled, with ground truth
train_generator = DataGenerator(answers=train_answers,
                                imageIDs=train_imageIDs,
                                input_questions=bert_input_train,
                                batch_size=bs,
                                shuffle=True,
                                training=True,
                                **_shared_generator_args)

# Validation batches: fixed order, with ground truth
valid_generator = DataGenerator(answers=valid_answers,
                                imageIDs=valid_imageIDs,
                                input_questions=bert_input_valid,
                                batch_size=bs,
                                shuffle=False,
                                training=True,
                                **_shared_generator_args)

# Test batches: one sample at a time, fixed order, inputs only.
# The `answers` slot carries the question ids, which are read back when the
# submission file is built.
test_generator = DataGenerator(answers=test_questionIDs,
                               imageIDs=test_imageIDs,
                               input_questions=bert_input_test,
                               batch_size=1,
                               shuffle=False,
                               training=False,
                               **_shared_generator_args)

In [14]:
from transformers import TFAutoModel
from transformers import BertModel, BertConfig
import tensorflow_hub as hub

# BERT is used here purely as a question encoder, so it must not be configured
# as a decoder: with is_decoder=True, transformers versions that support decoder
# mode for TF BERT apply a causal attention mask, which does not match how the
# pretrained bidirectional checkpoint is meant to be used.
config = BertConfig(use_cache=True, output_attentions=False, is_decoder=False)

def VQA(out_dim = 768):
  '''Build the visual question answering network.

  A small CNN turns the image into a vector of size `out_dim`, the pretrained
  'bert-base-uncased' model followed by average pooling and a dense block turns
  the question into a vector of the same size, and the two are merged by
  element-wise multiplication before the softmax classifier over N_CLASSES.

  Arguments:
    out_dim: width of the image, question and merged dense layers.

  Returns the (uncompiled) Keras model with the inputs 'image', 'input_ids'
  and 'attention_mask'.
  '''
  drop_rate_conv = 0.2
  drop_rate_ffnn = 0.5
  L = tf.keras.layers

  def dense_block(tensor, drop_rate):
    # Dense -> BatchNorm -> ReLU -> Dropout
    tensor = L.Dense(units=out_dim, kernel_initializer='he_uniform')(tensor)
    tensor = L.BatchNormalization()(tensor)
    tensor = L.Activation('relu')(tensor)
    return L.Dropout(drop_rate, seed=SEED)(tensor)

  ## Inputs ##
  input_image = tf.keras.Input(shape=(img_h, img_w, n_channels), name='image')
  input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
  attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

  ## CNN image processing ##
  # Three blocks of Conv -> BatchNorm -> ReLU -> MaxPool -> Dropout
  features = input_image
  for n_filters in (32, 64, 128):
    features = L.Conv2D(filters=n_filters, kernel_size=(3, 3), padding='same',
                        kernel_initializer='he_uniform')(features)
    features = L.BatchNormalization()(features)
    features = L.Activation('relu')(features)
    features = L.MaxPool2D(pool_size=(2, 2))(features)
    features = L.Dropout(drop_rate_conv, seed=SEED)(features)

  # Image head: global average pooling followed by a dense block
  image_vector = dense_block(L.GlobalAveragePooling2D()(features), drop_rate_conv)

  ## BERT transformer ##
  bert = TFAutoModel.from_pretrained('bert-base-uncased', config=config)
  # First model output, averaged over the sequence axis.
  # NOTE(review): the attention mask is not passed to the pooling layer, so
  # padded positions take part in the average.
  token_states = bert(
        input_ids=input_ids, attention_mask=attention_mask
  )[0]
  question_vector = dense_block(L.GlobalAveragePooling1D()(token_states), drop_rate_ffnn)

  ## Merge ##
  merged = L.Multiply()([image_vector, question_vector])
  merged = dense_block(merged, drop_rate_ffnn)
  probabilities = L.Dense(N_CLASSES, activation='softmax')(merged)

  return tf.keras.models.Model(inputs=[input_image, input_ids, attention_mask],
                               outputs=probabilities)

In [None]:
VQA_net = VQA(out_dim=768)

# Freeze the pretrained transformer. The layer is looked up by name instead of
# a hard-coded position in `layers`, whose ordering depends on how Keras sorts
# the graph and would silently point at another layer if it changed.
bert_layers = [layer for layer in VQA_net.layers if 'bert' in layer.name.lower()]
if not bert_layers:
  raise RuntimeError('No BERT layer found in VQA_net: the transformer cannot be frozen')
for bert_layer in bert_layers:
  print(bert_layer)
  bert_layer.trainable = False # freezing transformer

VQA_net.summary()
tf.keras.utils.plot_model(VQA_net, expand_nested=True, show_shapes=True, show_layer_names=True)

In [16]:
# Training setup: Adam on the categorical cross-entropy, tracking accuracy
lr = 1e-3
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = ['accuracy']

VQA_net.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
from datetime import datetime

if TRAIN:
  # Experiment folder layout: <Log>/<model>_<timestamp>/{ckpts, tb_logs}
  exps_dir = os.path.join(cwd, 'drive/My Drive/Assignment_3/Log/')
  now = datetime.now().strftime('%b%d_%H-%M-%S')
  model_name = 'Baseline'
  exp_dir = os.path.join(exps_dir, model_name + '_' + str(now))
  ckpt_dir = os.path.join(exp_dir, 'ckpts')
  tb_dir = os.path.join(exp_dir, 'tb_logs')

  # exist_ok=True replaces the separate "exists?" checks, which left a window
  # between the check and the creation
  for run_dir in (exps_dir, exp_dir, ckpt_dir, tb_dir):
    os.makedirs(run_dir, exist_ok=True)

  callbacks = []

  # Model checkpoint
  # ----------------
  # weights only (save_weights_only=False would save the full model), and only
  # when the monitored quantity improves
  ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'),
                                                     save_weights_only=True, save_best_only=True)
  callbacks.append(ckpt_callback)

  # Visualize Learning on Tensorboard
  # ---------------------------------
  # By default shows losses and metrics for both training and validation
  tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir,
                                               profile_batch=0,
                                               histogram_freq=0)  # if 1 shows weights histograms
  callbacks.append(tb_callback)

  # Learning rate schedule
  # ----------------------
  # divide the learning rate by 10 after 3 epochs without val_loss improvement
  reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-7, verbose=1, cooldown=0)
  callbacks.append(reduce_lr)

  # Early Stopping
  # --------------
  early_stop = True
  if early_stop:
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
    callbacks.append(es_callback)

  VQA_net.fit(x=train_generator,
              epochs=50,
              steps_per_epoch=len(train_generator),
              validation_data=valid_generator,
              validation_steps=len(valid_generator),
              callbacks=callbacks)

In [None]:
#VQA_net.save('/content/drive/My Drive/Assignment_3/Saved Models/BertTransformerCustom')

In [None]:
def create_csv(results, results_dir='./'):
    '''Write the predictions to a timestamped CSV file.

    The file is named `results_<Mon><day>_<H>-<M>-<S>.csv`, starts with the
    header `Id,Category` and holds one `key,value` row per entry of `results`.

    Arguments:
        results: mapping question id => predicted class.
        results_dir: destination folder; created when it does not exist yet.
    '''
    csv_fname = 'results_' + datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    # Make sure the destination exists instead of failing inside open();
    # an empty string means "current directory" and needs no creation.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    with open(os.path.join(results_dir, csv_fname), 'w') as f:
        f.write('Id,Category\n')
        for key, value in results.items():
            # str() on the key as well, so non-string ids do not raise
            f.write(str(key) + ',' + str(value) + '\n')

In [None]:
# Class probabilities for every test sample (the generator keeps the file order)
pred = VQA_net.predict(test_generator)

# question id => most likely class; the ids are stored in the generator's `answers` slot
predicted_classes = np.argmax(pred, axis=1)
results = dict(zip(test_generator.answers, predicted_classes))

create_csv(results, results_dir='./drive/MyDrive/Assignment_3/Results')