  # Download TQA

In [None]:
import requests
from tqdm import tqdm
import math
import zipfile

In [2]:
# Download the TQA archive in 1 KB chunks with a progress bar, then unpack it
# into the current directory.
url = "https://s3.amazonaws.com/ai2-vision-textbook-dataset/dataset_releases/tqa/tqa_train_val_test.zip"
archive_name = "tqa_train_val_test.zip"

r = requests.get(url, stream=True)
# Fail early on an HTTP error instead of saving the error page as a .zip file.
r.raise_for_status()
total_size = int(r.headers.get('content-length', 0))
block_size = 1024
wrote = 0
# True division so that a trailing partial block is counted by ceil();
# when the server sends no content-length, let tqdm run without a total.
n_blocks = math.ceil(total_size / block_size) if total_size else None
with open(archive_name, 'wb') as f:
  for chunk in tqdm(r.iter_content(block_size), total=n_blocks, unit='KB', desc=archive_name, leave=True):
    wrote += len(chunk)
    f.write(chunk)
# The `with` block above has already closed the file; only the size is checked here.
if total_size != 0 and wrote != total_size:
  print("ERROR, something went wrong")
with zipfile.ZipFile(archive_name, 'r') as zip_ref:
  zip_ref.extractall(".")

tqa_train_val_test.zip: 1715758KB [00:39, 43496.99KB/s]                               


In [1]:
!ls "./tqa_train_val_test"

CVPR17_TQA.pdf	README.md  test  train	val


# Corpus loading

In [2]:
import json
import sys
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import OrderedDict
from keras.applications.vgg19 import VGG19
from PIL import Image
from keras.models import Sequential
from keras.layers import InputLayer, BatchNormalization, Conv2D, MaxPooling2D

Using TensorFlow backend.


In [None]:
# Question family to load. The value is used as the key into doc["questions"]
# and decides whether questions_getter loads a diagram image
# ("diagramQuestions") or uses an all-zero placeholder ("nonDiagramQuestions").
question_type = "diagramQuestions"
# Alternative setting: question_type = "nonDiagramQuestions"

# Tag of the pretrained CNN weight file; data_refiner loads
# './weights/' + weights + '-weights.h5'.
weights = "cross"
# Alternative setting: weights = "cross-vecsi"

In [None]:
# Directory of every TQA split, in the order train, test, val.
dataset_folders = [
    "./tqa_train_val_test/train/",
    "./tqa_train_val_test/test/",
    "./tqa_train_val_test/val/",
]
train_folder, test_folder, val_folder = dataset_folders

# Lesson file of every split, index-aligned with dataset_folders.
dataset_jsons = [
    "./tqa_train_val_test/train/tqa_v1_train.json",
    "./tqa_train_val_test/test/tqa_v2_test.json",
    "./tqa_train_val_test/val/tqa_v1_val.json",
]
train_json, test_json, val_json = dataset_jsons

## Data loading

In [None]:
# Question ids that are always skipped while loading.
_EXCLUDED_QUESTION_IDS = frozenset(["NDQ_005923", "NDQ_006171", "NDQ_004046", "NDQ_016510"])

# Read the articles dataset that will be used to train and validate the model.
def data_extraction(dataset_folders, dataset_jsons):
    """Load every split and collect texts, figures and labels per question.

    Only questions of the module-level ``question_type`` are read. A question
    is kept when it has exactly four answer choices, its id is not in
    ``_EXCLUDED_QUESTION_IDS`` and its ``questionSubType`` is
    ``"Multiple Choice"``.

    Args:
        dataset_folders: folder prefix of each split, index-aligned with
            ``dataset_jsons``; it is prepended to image paths.
        dataset_jsons: path of the lesson JSON file of each split.

    Returns:
        A tuple ``(data_raw, figures, correct_answers)`` where ``data_raw`` is
        ``[paragraphs, questions, answers_a, answers_b, answers_c, answers_d]``,
        ``figures`` is ``[figures_paragraphs, figures_questions]`` and
        ``correct_answers`` holds one label per kept question. All inner lists
        are index-aligned.
    """
    count = 0
    #Prepare data
    paragraphs = []
    figures_paragraphs = []
    questions = []
    figures_questions = []
    list_answers = [[],[],[],[]]
    correct_answers = []
    for split_index, dataset_json in enumerate(dataset_jsons):
        folder = dataset_folders[split_index]
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(dataset_json, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        for doc in dataset:
            count += 1
            sys.stdout.write("\r%d lessons processed" % count)
            sys.stdout.flush()

            lesson_questions = doc["questions"][question_type]
            for question_id in list(lesson_questions):
                entry = lesson_questions[question_id]
                n_answers = len(entry["answerChoices"])

                if (n_answers != 4
                        or question_id in _EXCLUDED_QUESTION_IDS
                        or entry["questionSubType"] != "Multiple Choice"):
                    continue

                #Questions
                questions_getter(folder, doc, question_id, question_type, questions,figures_questions)

                #Context
                context_getter(folder, doc, question_id, question_type, paragraphs, figures_paragraphs)

                #Answers
                answers_getter(doc, question_id, question_type, n_answers, list_answers)

                #Correct Answer (labeling)
                label_getter(doc, question_id, question_type, correct_answers)

    data_raw = [paragraphs,questions,list_answers[0],list_answers[1],list_answers[2],list_answers[3]]
    figures = [figures_paragraphs,figures_questions]
    print("\n")

    return data_raw, figures, correct_answers
    
def questions_getter(folder, doc, question_id, question_type, questions,figures_questions):
    """Append one question's text and its figure array to the output lists.

    Args:
        folder: prefix prepended to the question's ``imagePath``.
        doc: lesson dictionary holding ``doc["questions"][question_type]``.
        question_id: key of the question inside that dictionary.
        question_type: ``"diagramQuestions"`` (the image is loaded and resized
            to 224x224) or ``"nonDiagramQuestions"`` (a zero array of shape
            (224, 224, 3) is used instead).
        questions: list that receives the question's ``processedText``.
        figures_questions: list that receives the figure array.

    Raises:
        ValueError: if ``question_type`` is neither of the two supported values.
    """
    entry = doc["questions"][question_type][question_id]
    question = entry["beingAsked"]["processedText"]
    if question_type == "diagramQuestions":
        figure_path = folder + entry["imagePath"]
        # Context managers release both handles even when decoding fails.
        with open(figure_path, 'rb') as figure_file, Image.open(figure_file) as figure:
            # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
            # NOTE(review): the array keeps the channel count of the image mode,
            # while the placeholder below assumes 3 channels -- confirm that all
            # diagrams are RGB.
            figure_array = np.array(figure.resize((224,224), Image.LANCZOS))
    elif question_type == "nonDiagramQuestions":
        figure_array = np.zeros((224,224,3))
    else:
        raise ValueError("unsupported question_type: %r" % (question_type,))
    figures_questions.append(figure_array)
    questions.append(question)
def context_getter(folder, doc, question_id, question_type, paragraphs, figures_paragraphs):
    """Pick the lesson topic most similar to the question and store it as context.

    The question and every topic text are vectorised together with TF-IDF; the
    topic whose vector has the largest product with the question vector is
    chosen (presumably cosine similarity under TfidfVectorizer's default
    normalisation -- confirm).

    Args:
        folder: prefix prepended to a topic figure's ``imagePath``.
        doc: lesson dictionary with ``"questions"`` and ``"topics"`` entries.
        question_id: key of the question inside ``doc["questions"][question_type]``.
        question_type: key selecting the question family in ``doc["questions"]``.
        paragraphs: list that receives the chosen topic text.
        figures_paragraphs: list that receives the first figure of the chosen
            topic resized to 224x224, or a zero array of shape (224, 224, 3)
            when the topic has no figure.

    Raises:
        ValueError: if the lesson has no topics.
    """
    question = doc["questions"][question_type][question_id]["beingAsked"]["processedText"]
    # Index 0 is the question itself, so row 0 of the similarity matrix
    # compares the question with every document.
    documents = [question]
    figs = [""]
    for topic in doc["topics"]:
        content = doc["topics"][topic]["content"]
        paragraph = content["text"]
        topic_figures = content["figures"]
        # Only the first figure of a topic is considered.
        figs.append(folder + topic_figures[0]["imagePath"] if len(topic_figures) > 0 else "")
        documents.append(paragraph)
    if len(documents) == 1:
        raise ValueError("lesson has no topics to use as context for question %r" % (question_id,))
    tfidf = TfidfVectorizer().fit_transform(documents)
    pairwise_similarity = tfidf * tfidf.T
    # Column 0 (question vs. itself) is skipped; +1 maps the argmax back into `documents`.
    score_max_index = np.argmax(pairwise_similarity[0,1:])+1
    chosen_paragraph = documents[score_max_index]
    chosen_figure = figs[score_max_index]
    if chosen_figure == "":
        figure_array = np.zeros((224,224,3))
    else:
        # Context managers release both handles even when decoding fails.
        with open(chosen_figure, 'rb') as figure_file, Image.open(figure_file) as figure:
            # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
            figure_array = np.array(figure.resize((224,224), Image.LANCZOS))
    figures_paragraphs.append(figure_array)
    paragraphs.append(chosen_paragraph)
def answers_getter(doc, question_id,question_type,n_answers,list_answers):
    """Append the text of choices a-d of one question to the four answer lists.

    ``list_answers[k]`` receives the ``processedText`` of the k-th letter; slots
    at or beyond ``n_answers`` receive an empty string instead.
    """
    for position, letter in enumerate(("a", "b", "c", "d")):
        if position >= n_answers:
            text = ""
        else:
            text = doc["questions"][question_type][question_id]["answerChoices"][letter]["processedText"]
        list_answers[position].append(text)
def label_getter(doc, question_id, question_type, correct_answers):
    """Append a length-4 float vector marking the correct choice of one question.

    Position k is 1 when the question's ``correctAnswer`` text equals the k-th
    letter of a, b, c, d; every other position is 0. If no letter matches, the
    vector is all zeros.
    """
    gold = doc["questions"][question_type][question_id] ["correctAnswer"]["processedText"]
    one_hot = np.array([1.0 if letter == gold else 0.0 for letter in ("a", "b", "c", "d")])
    correct_answers.append(one_hot)

def data_refiner(data_raw,figures,correct_answers,tokenizer):
    """Turn raw texts and figure arrays into the tensors fed to the network.

    A convolutional feature extractor is built, the weights in
    './weights/<weights>-weights.h5' (module-level ``weights``) are loaded into
    it, and it is applied to the paragraph figures and the question figures.
    Every text list is tokenised and post-padded/truncated: 630 tokens for the
    first list, 73 for all the others.

    Args:
        data_raw: list of text lists; index 0 and index 1 are the ones that
            are followed by figure features.
        figures: ``[figures_for_data_raw_0, figures_for_data_raw_1]``.
        correct_answers: per-sample label vectors.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.

    Returns:
        ``(data, labels)``: ``data`` holds, in order, text 0, figure features 0,
        text 1, figure features 1, then the remaining texts; ``labels`` is
        ``np.array(correct_answers)``.
    """
    # Four stages of two (Conv2D + BatchNormalization) pairs. The first three
    # end in a 2x2 pooling; the last one pools with a 28x28 window, stride 2.
    model = Sequential()
    model.add(InputLayer(input_shape=(224,224,3)))
    for n_filters in (64, 128, 256, 512):
        for _ in range(2):
            model.add(Conv2D(n_filters, (3,3), padding = "same", activation="relu"))
            model.add(BatchNormalization())
        if n_filters == 512:
            model.add(MaxPooling2D((28,28),2))
        else:
            model.add(MaxPooling2D(2))
    model.load_weights('./weights/'+weights+'-weights.h5')

    data = []
    tensor_index = 0  # running number used only in the printed shape report
    for position, texts in enumerate(data_raw):
        max_len = 630 if position == 0 else 73
        padded = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len, padding="post", truncating="post")
        data.append(padded)
        print("Shape of (" + str(tensor_index) + ") data tensor:" + str(padded.shape))
        tensor_index += 1
        if position in (0, 1):
            # The first two text inputs are each followed by their figure features.
            figure_feat = features_extraction(np.array(figures[position]),model)
            data.append(figure_feat)
            print("Shape of (" + str(tensor_index) + ") data tensor:" + str(figure_feat.shape))
            tensor_index += 1
    labels = np.array(correct_answers)
    print("Shape of labels tensor:", labels.shape)

    print("\n")

    return data, labels
def features_extraction(figure,model):
    """Return ``model.predict`` of ``figure``, run in batches of 32 with verbose output."""
    return model.predict(figure, batch_size=32, verbose=1)

In [None]:
# Load every split into parallel text lists, figure lists and labels.
data, figures, correct_answers = data_extraction(dataset_folders, dataset_jsons)

# Gather every text field of every sample (sample by sample, field by field)
# so that a single vocabulary covers paragraphs, questions and answers.
texts = [field[sample] for sample in range(len(data[0])) for field in data]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

# Tokenise and pad the texts, extract figure features, stack the labels.
x, y = data_refiner(data, figures, correct_answers,tokenizer)

1076 lessons processed

Found 16285 unique tokens.
Instructions for updating:
Colocations handled automatically by placer.


## NN architecture definition and training

In [None]:
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM, Lambda, InputLayer, Concatenate, BatchNormalization, Reshape
from keras import backend as K
from keras import optimizers
from keras.metrics import categorical_accuracy
import tensorflow as tf
from sklearn.metrics import classification_report

In [None]:
# Width used for every Embedding (output_dim), every LSTM (units) and the
# last Dense layer of the figure branches.
dim = 100
# Rate passed as `dropout` to every LSTM.
dout = 0.5
# Rate passed as `recurrent_dropout` to every LSTM.
rdout = 0.5
# Vocabulary size of the fitted tokenizer; the Embedding layers use
# n_words+1 as input_dim (presumably because id 0 is the padding value -- confirm).
n_words = len(word_index)

In [None]:
# Tensor functions wrapped by the Keras Lambda layers of the model below:
# similarityMU (context/question similarity), answerer (weighted sum of the
# context states) and similaritymC (one score per candidate answer).

def similarityMU(x):
    """Best-match score of every row of x[0] against the rows of x[1].

    Computes x[0] @ transpose(x[1]) per batch element and keeps the maximum
    over axis 2.

    NOTE(review): keepdims=True leaves a trailing axis of size 1, while the
    Lambda layer wrapping this function declares output_shape=(630+1,) --
    confirm that the two agree.
    """
    context_states, question_states = x[0], x[1]
    scores = tf.matmul(context_states, question_states, transpose_b=True)
    return tf.reduce_max(scores, axis=2, keepdims=True)

def answerer(x):
    """Weighted sum of the rows of x[1], using x[0] as the weights.

    x[1] is transposed to put its last axis second, multiplied by x[0]
    (broadcast through an inserted axis 1) and summed over axis 2; the summed
    axis is kept with size 1.
    """
    weights_per_step, memory = x[0], x[1]
    memory_t = tf.transpose(memory, [0, 2, 1])
    weighted = tf.multiply(tf.expand_dims(weights_per_step, 1), memory_t)
    return tf.reduce_sum(weighted, axis=2, keepdims=True)

def similaritymC(x):
    """Score x[0] against each of the four candidates packed in x[1].

    x[1] is split into four equal parts along axis 2. Each part is summed over
    its axis 1 and multiplied by the transpose of x[0]; the four products are
    concatenated along axis 1.
    """
    m, packed_candidates = x[0], x[1]
    scores = []
    for candidate in tf.split(packed_candidates, 4, 2):
        candidate_t = tf.transpose(candidate, [0, 2, 1])
        candidate_sum = tf.reduce_sum(candidate_t, axis=2, keepdims=True)
        scores.append(tf.matmul(m, candidate_sum, transpose_a = True))
    return tf.concat(scores, 1)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score

# 10-fold cross-validation of the full model. A fixed random_state makes the
# fold assignment identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
precisions = []
recalls = []
f1s = []


def _build_text_encoder(tag, length):
    """Return a Sequential: `length` token ids -> Embedding -> LSTM (all time steps).

    Layers are named input_<tag>, embedding_<tag> and lstm_<tag>.
    """
    encoder = Sequential()
    encoder.add(InputLayer(input_shape=(length,),name="input_"+tag))
    encoder.add(Embedding(input_dim=n_words+1, output_dim=dim, input_length=length, embeddings_initializer="uniform", trainable=True,name="embedding_"+tag))
    encoder.add(LSTM(units=dim, return_sequences=True, name="lstm_"+tag, dropout=dout, recurrent_dropout=rdout))
    return encoder


def _build_figure_encoder(tag):
    """Return a Sequential mapping (1,1,512) figure features to one `dim`-wide step.

    Layers are named input_<tag>, reshape_<tag>, perceptron_<tag>_1 and
    perceptron_<tag>_2.
    """
    encoder = Sequential()
    encoder.add(InputLayer(input_shape=(1,1,512,),name="input_"+tag))
    encoder.add(Reshape((1,512,),name="reshape_"+tag))
    encoder.add(Dense(256, activation="tanh",name="perceptron_"+tag+"_1"))
    encoder.add(Dense(dim, activation="tanh",name="perceptron_"+tag+"_2"))
    return encoder


# NOTE(review): the models of earlier folds stay alive in the backend; consider
# clearing the Keras session between folds if memory becomes a problem.
for train, test in kfold.split(x[0], y):

    # Slice every input tensor with the indices of this fold.
    x_train = [elem[train] for elem in x]
    x_test = [elem[test] for elem in x]
    y_train = y[train]
    y_test = y[test]

    # Context side: paragraph tokens (630) plus the paragraph figure as one extra step.
    modelM = _build_text_encoder("M", 630)
    modelMF = _build_figure_encoder("MF")

    modelInMMF = Concatenate(name = "concatenateMMF", axis=1)([modelM.output,modelMF.output])
    modelMMF = Model([modelM.input,modelMF.input], modelInMMF)

    # Question side: question tokens (73) plus the question figure as one extra step.
    modelU = _build_text_encoder("U", 73)
    modelUF = _build_figure_encoder("UF")

    modelInUUF = Concatenate(name = "concatenateUUF", axis=1)([modelU.output,modelUF.output])
    modelUUF = Model([modelU.input,modelUF.input], modelInUUF)

    # One encoder per candidate answer, concatenated on the last axis.
    modelC1 = _build_text_encoder("C1", 73)
    modelC2 = _build_text_encoder("C2", 73)
    modelC3 = _build_text_encoder("C3", 73)
    modelC4 = _build_text_encoder("C4", 73)

    modelInC = Concatenate(name = "concatenate")([modelC1.output,modelC2.output,modelC3.output,modelC4.output])
    modelC = Model([modelC1.input,modelC2.input,modelC3.input,modelC4.input], modelInC)

    # NOTE(review): similarityMU, answerer and similaritymC reduce with
    # keepdims=True, while the output_shape values declared below omit that
    # trailing axis -- confirm the declared shapes match the real tensors.
    modelIna = Lambda(similarityMU, output_shape=(630+1,),name="similarityMU")([modelMMF.output, modelUUF.output])
    modelIna = Dense(630+1,activation="softmax",name="softmax_a")(modelIna)
    modela = Model([modelM.input,modelMF.input,modelU.input,modelUF.input], modelIna)

    modelInm = Lambda(answerer, output_shape=(dim,),name="answerer") ([modela.output, modelMMF.output])
    modelm = Model([modelM.input,modelMF.input,modelU.input,modelUF.input], modelInm)

    modelIn = Lambda(similaritymC, output_shape=(4,),name="similaritymC")([modelm.output,modelC.output])
    modelIn = Dense(4, activation="softmax",name="softmax_y") (modelIn)
    model = Model([modelM.input,modelMF.input,modelU.input,modelUF.input,modelC1.input,modelC2.input,modelC3.input,modelC4.input], modelIn)

    adam = optimizers.Adam(lr=1e-2, decay=0.0)
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=[categorical_accuracy])
    # The held-out fold doubles as validation data during training.
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_test, y_test), verbose=1)

    # Turn the predicted probabilities into one-hot rows (arg-max per sample).
    pred = model.predict(x_test, batch_size=128)
    max_value = np.argmax(pred,axis=1)
    predNew = np.zeros(np.shape(pred))
    predNew[np.arange(len(predNew)), max_value] = 1
    print(classification_report(y_test, predNew, digits=4, target_names=["a","b","c","d"]))
    precisions.append(precision_score(y_test, predNew, average="weighted"))
    recalls.append(recall_score(y_test, predNew, average="weighted"))
    f1s.append(f1_score(y_test, predNew, average="weighted"))

# Mean and standard deviation of each weighted metric over the folds.
print("Precision: %.2f (+/- %.2f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.2f (+/- %.2f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.2f (+/- %.2f)" % (np.mean(f1s), np.std(f1s)))
    