In [None]:
# takes about 5-7 minutes
!wget https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip
# unzip the file takes about 3 min
!unzip -q CLEVR_v1.0.zip

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os, json
from PIL import Image
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
!pip -q install transformers


[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
[K     |████████████████████████████████| 596 kB 61.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 63.9 MB/s 
[K     |████████████████████████████████| 895 kB 69.6 MB/s 
[K     |████████████████████████████████| 77 kB 6.8 MB/s 
[?25h

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
def CLEVER(categories,split = "train",transformation = None):
    
    json_dir = os.path.join('CLEVR_v1.0/','questions',f"CLEVR_{split}_questions.json")
    image_dir = os.path.join('CLEVR_v1.0/','images',split)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    categories = categories
    data = []
    cat2id = {i:index for index,i in enumerate(categories)}
    id2cat = {index:i for index,i in enumerate(categories)}
    transformation = transformation
    json_file = json.load(open(json_dir))
    
    for index,i in enumerate(json_file["questions"]):
        text = i["question"]
        answer = self.cat2id[i["answer"]]
        label_vector = torch.zeros((len(self.categories)))
        label_vector[answer] = 1
        data.append((i["image_index"],i["image_filename"],text,answer))
        
    for index in range(len(data)):
        image_path = os.path.join(image_dir,data[index][1])
        image = Image.open(image_path).convert('RGB')
        if transformation != None:
            image = transformation(image)
        text = data[index][2]
        encoded_text = tokenizer.encode_plus(
                text, add_special_tokens = True, truncation = True, 
                max_length = 256, padding = 'max_length',
                return_attention_mask = True,
                return_tensors = 'pt')
        text = encoded_text['input_ids'][0]
        attention_mask = encoded_text['attention_mask'][0]
        answer = data[index][3]
        yield image,text,attention_mask,answer



In [None]:
categories = ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'blue', 'brown', 'cube', 'cyan', 'cylinder', 'gray', 'green', 'large', 'metal', 'no', 'purple', 'red', 'rubber', 'small', 'sphere', 'yellow', 'yes']
print(categories)

['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'blue', 'brown', 'cube', 'cyan', 'cylinder', 'gray', 'green', 'large', 'metal', 'no', 'purple', 'red', 'rubber', 'small', 'sphere', 'yellow', 'yes']


In [None]:
data_gen = CLEVER(categories,"train",None)

In [None]:
dataset = tf.data.Dataset.from_generator(data_gen,output_types = (tf.float32))

TypeError: ignored

In [None]:

def transform(image):
    x = image.resize((224,224))
    x = np.array(x)
    x = tf.image.convert_image_dtype(x, tf.float32, saturate=False, name=None)
    return x
def make_gen_callable(categories,split,transformation):
    def CLEVER():
        json_dir = os.path.join('CLEVR_v1.0/','questions',f"CLEVR_{split}_questions.json")
        image_dir = os.path.join('CLEVR_v1.0/','images',split)
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        data = []
        cat2id = {i:index for index,i in enumerate(categories)}
        id2cat = {index:i for index,i in enumerate(categories)}

        json_file = json.load(open(json_dir))
        
        for index,i in enumerate(json_file["questions"]):
            text = i["question"]
            answer = cat2id[i["answer"]]
            data.append((i["image_index"],i["image_filename"],text,answer))
            
        for index in range(len(data)):
            image_path = os.path.join(image_dir,data[index][1])
            image = Image.open(image_path).convert('RGB')
            if transformation != None:
                image = transformation(image)
            print(image.shape)
            text = data[index][2]
            encoded_text = tokenizer.encode_plus(
                    text, add_special_tokens = True, truncation = True, 
                    max_length = 256, padding = 'max_length',
                    return_attention_mask = True,
                    return_tensors = 'pt')
            text = encoded_text['input_ids'][0]
            attention_mask = encoded_text['attention_mask'][0]
            answer = data[index][3]
            yield image,text,attention_mask,answer
    return CLEVER


In [None]:
data_gen = make_gen_callable(categories,"train",transform)
dataset = tf.data.Dataset.from_generator(data_gen,output_types = (tf.float32))

In [None]:
iterator = iter(dataset)

print(iterator.get_next())

(224, 224, 3)


UnknownError: ignored

## FINAL TRY

In [None]:
training_questions = json.load(open("CLEVR_v1.0/questions/CLEVR_train_questions.json"))


In [None]:
categories = ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'blue', 'brown', 'cube', 'cyan', 'cylinder', 'gray', 'green', 'large', 'metal', 'no', 'purple', 'red', 'rubber', 'small', 'sphere', 'yellow', 'yes']
print(categories)

['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'blue', 'brown', 'cube', 'cyan', 'cylinder', 'gray', 'green', 'large', 'metal', 'no', 'purple', 'red', 'rubber', 'small', 'sphere', 'yellow', 'yes']


In [None]:
file_names = []
labels = []
questions = []
masks = []

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cat2id = {i:index for index,i in enumerate(categories)}

for index,i in enumerate(training_questions["questions"][:10000]):
    #name = tf.convert_to_tensor("CLEVR_v1.0/" + "images/"+ "train/" + i["image_filename"],dtype = tf.string)
    name = "CLEVR_v1.0/" + "images/"+ "train/" + i["image_filename"]
    file_names.append(name)


    answer = cat2id[i["answer"]]
    label_vector = np.zeros((len(categories)))
    label_vector[answer] = 1
    #label_vector = tf.convert_to_tensor(label_vector,dtype = tf.int32)
    labels.append(label_vector)
    
    encoded_text = tokenizer.encode_plus(
                i["question"], add_special_tokens = True, truncation = True, 
                max_length = 256, padding = 'max_length',
                return_attention_mask = True,
                return_tensors = 'pt')
    #input_id = tf.convert_to_tensor(encoded_text["input_ids"][0],dtype = tf.int32)
    questions.append(encoded_text["input_ids"][0])
    
    #attention = tf.convert_to_tensor(encoded_text['attention_mask'][0],dtype = tf.int32)
    masks.append(encoded_text['attention_mask'][0])



df = pd.DataFrame()
df["files"] = file_names
df["labels"] = labels
df["questions"] = questions
df["masks"] = masks

In [None]:
for i in df.columns:
    if i not in ["files"]:
        df[i] = df[i].apply(lambda x:str(x))

In [None]:
tf_dataset = tf.data.Dataset.from_tensor_slices(df)

In [None]:
def get_inputs(file_path):
    print(len(file_path))
    image = tf.io.read_file(file_path[0])
    #image = tf.image.resize(image,(224,224,3))
    image = tf.image.decode_image(image,channels = 3,dtype = tf.float32)
    #labels = tf.strings.to_number(file_path[1],out_type = tf.int32)
    labels = file_path[1]
    #questions = tf.strings.to_number(file_path[2],out_type = tf.int32)
    questions = file_path[2]
    #masks = tf.strings.to_number(file_path[3],out_type = tf.int32)
    masks = file_path[3]
    return (image,questions,masks),labels

tf_dataset = tf_dataset.map(get_inputs).batch(64)

4


In [None]:
for inputs,labels in tf_dataset:
    #print(img,label,question,mask)
    print(inputs,labels)
    break

(<tf.Tensor: shape=(64, 320, 480, 3), dtype=float32, numpy=
array([[[[0.41960785, 0.41960785, 0.41960785],
         [0.4117647 , 0.4117647 , 0.4117647 ],
         [0.41568628, 0.4117647 , 0.4117647 ],
         ...,
         [0.40784314, 0.40392157, 0.40392157],
         [0.4       , 0.4       , 0.4       ],
         [0.40392157, 0.40392157, 0.40392157]],

        [[0.4117647 , 0.4117647 , 0.4117647 ],
         [0.41568628, 0.41568628, 0.41568628],
         [0.4117647 , 0.4117647 , 0.4117647 ],
         ...,
         [0.4       , 0.4       , 0.4       ],
         [0.4       , 0.4       , 0.4       ],
         [0.40392157, 0.40392157, 0.40392157]],

        [[0.41568628, 0.4117647 , 0.4117647 ],
         [0.41568628, 0.41568628, 0.41568628],
         [0.41568628, 0.41568628, 0.4117647 ],
         ...,
         [0.40392157, 0.40392157, 0.4       ],
         [0.40784314, 0.40784314, 0.40784314],
         [0.4       , 0.4       , 0.4       ]],

        ...,

        [[0.4627451 , 0.45882353

In [None]:
from tensorflow.keras.applications import EfficientNetB0
from transformers import BertTokenizer, TFBertForSequenceClassification,TFBertModel

In [None]:
input1 = keras.layers.Input(shape = (320,480,3,))
input2 = tf.keras.layers.Input((256,), dtype=tf.int32)
attention = tf.keras.layers.Input((256,), dtype=tf.int32)

effnet = EfficientNetB0(weights='imagenet')
x = effnet(input1)

bert_model = TFBertModel.from_pretrained("bert-base-cased")
q_embedding = bert_model(input2, attention_mask=attention)[0]
q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
y = tf.keras.layers.Dropout(0.2)(q)
y = tf.keras.layers.Dense(256, activation='relu')(y)

z = tf.keras.layers.Concatenate(axis=1)([x, y])

output = layers.Dense(28,activation = "softmax")(z)

model = keras.models.Model(inputs = [input1,input2,attention],outputs = output)




Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
model.summary()

Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_137 (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 input_138 (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 tf_bert_model_7 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_137[0][0]',              
                                thPoolingAndCrossAt               'input_138[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                        

In [None]:

model.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [None]:
history = model.fit(tf_dataset, epochs=2)


Epoch 1/2








ValueError: ignored

In [None]:
# def create_model():
#     q_id = tf.keras.layers.Input((256,), dtype=tf.int32)
#     q_mask = tf.keras.layers.Input((256,), dtype=tf.int32)
    
#     bert_model = TFBertModel.from_pretrained("bert-base-cased")
    
#     # if config.output_hidden_states = True, obtain hidden states via bert_model(...)[-1]
#     q_embedding = bert_model(q_id, attention_mask=q_mask)[0]
    
#     q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    
#     x = tf.keras.layers.Dropout(0.2)(q)
    
#     x = tf.keras.layers.Dense(256, activation='relu')(x)

#     model = tf.keras.models.Model(inputs=[q_id, q_mask], outputs=x)
    
#     return model

In [None]:
model.summary()

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_122 (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 input_123 (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 tf_bert_model_3 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_122[0][0]',              
                                thPoolingAndCrossAt               'input_123[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                        