In [1]:
## Import necessary libraries

import os
import json
import random
import re
import pandas as pd
from copy import deepcopy

RANDOM_STATE = 12345

#### Code for normalising words adapted from: https://github.com/dandelin/ViLT/blob/master/vilt/utils/glossary.py

In [2]:
## For normalising the answers
contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    "somebody'd": "somebodyd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}

manual_map = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}
articles = ["a", "an", "the"]
period_strip = re.compile("(?!<=\d)(\.)(?!\d)")
comma_strip = re.compile("(\d)(\,)(\d)")
punct = [
    ";",
    r"/",
    "[",
    "]",
    '"',
    "{",
    "}",
    "(",
    ")",
    "=",
    "+",
    "\\",
    "_",
    "-",
    ">",
    "<",
    "@",
    "`",
    ",",
    "?",
    "!",
]


def normalize_word(token):
    
    original_token = token
    _token = token
    for p in punct:
        if (p + " " in token or " " + p in token) or (
            re.search(comma_strip, token) != None
        ):
            _token = _token.replace(p, "")
        else:
            _token = _token.replace(p, " ")
    token = period_strip.sub("", _token, re.UNICODE)

    _token = []
    temp = token.lower().split()
    for word in temp:
        word = manual_map.setdefault(word, word)
        if word not in articles:
            _token.append(word)
    for i, word in enumerate(_token):
        if word in contractions:
            _token[i] = contractions[word]
    token = " ".join(_token)
    token = token.replace(",", "")
    
    if token == "":
        token = original_token
        
    return token

In [3]:
def filter_rows_outside_3129(df, vqa_answers):
    dropped_examples = []
    covered_examples = 0
    
    filtered_df = df.copy(deep=True)
    for ind, mcq_ans in enumerate(df['multiple_choice_answer']):
        
        if mcq_ans in vqa_answers:
            covered_examples += 1
        else:
            dropped_examples.append(ind)
    
    filtered_df = df.drop(dropped_examples)
    print('Covered examples:', covered_examples)
    
    return filtered_df

# Load the **VQA v2.0 Dataset**
VQA v2.0 dataset obtained from: https://www.kaggle.com/datasets/rajatkumar794/visual-based-question-answering

In [4]:
DATASET_FILEPATH = "../input/visual-based-question-answering"

TRAIN_ANNOTATIONS_FILEPATH = "v2_Annotations_Train_mscoco/v2_mscoco_train2014_annotations.json"
TRAIN_QUESTIONS_FILEPATH = "v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json"
TRAIN_IMAGES_FILEPATH = "train2014/train2014"

VAL_ANNOTATIONS_FILEPATH = "v2_Annotations_Val_mscoco/v2_mscoco_val2014_annotations.json"
VAL_QUESTIONS_FILEPATH = "v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json"
VAL_IMAGES_FILEPATH = "val2014/val2014"

In [5]:
## Change filepaths accordingly, depending on whether you processed the train or validation dataset.
data_annotations1 = json.load(open(os.path.join(DATASET_FILEPATH, TRAIN_ANNOTATIONS_FILEPATH)))
data_annotations2 = json.load(open(os.path.join(DATASET_FILEPATH, VAL_ANNOTATIONS_FILEPATH)))

train_annotations = pd.DataFrame(data_annotations1["annotations"])
val_annotations = pd.DataFrame(data_annotations2["annotations"])

print(train_annotations.shape[0])
print(val_annotations.shape[0])

443757
214354


In [6]:
## Combining the training and validation sets

df_annotations = pd.concat([train_annotations, val_annotations], axis=0)
print("Total no. of examples (training + validation):",df_annotations.shape[0])
print("No. of unique answers:", len(df_annotations['multiple_choice_answer'].unique()))

Total no. of examples (training + validation): 658111
No. of unique answers: 29332


In [7]:
## Normalise the answers, e.g. removing punctuations
train_annotations['multiple_choice_answer'] = train_annotations['multiple_choice_answer'].map(normalize_word)
val_annotations['multiple_choice_answer'] = val_annotations['multiple_choice_answer'].map(normalize_word)

df_annotations = pd.concat([train_annotations, val_annotations], axis=0)

In [8]:
## To obtain only answers that have appeared at least 9 times across the training + validation sets
## Results in 3,129 unique answers

all_answers_dict = {}
label_id = 0
answer_mapping = {}

for ans in df_annotations['multiple_choice_answer']:

    if ans not in all_answers_dict:
        all_answers_dict[ans] = 1
    else:
        all_answers_dict[ans] += 1

sorted_ans_dict = dict(sorted(all_answers_dict.items(), key=lambda item: item[1],reverse=True))
for sort_ans, sort_count in sorted_ans_dict.items():
    if sort_count == 8:
        break
    answer_mapping[sort_ans] = label_id
    label_id += 1

print(len(answer_mapping.keys()))

3129


In [9]:
## Obtain the answer to label mapping to be used

random.seed(RANDOM_STATE)
updated_answer_list = deepcopy(list(answer_mapping.keys()))
random.shuffle(updated_answer_list)

label_id = 0
new_answer_mapping = {}

for ans in updated_answer_list:
    new_answer_mapping[ans] = label_id
    label_id += 1

print(len(new_answer_mapping))
print(new_answer_mapping)

3129


In [10]:
with open('VQAv2_answer_mapping.json', 'w') as fp:
    json.dump(new_answer_mapping, fp)

In [11]:
## Keep only examples with answers that are one of the 3,129 answers.

train_annotations_filtered = filter_rows_outside_3129(train_annotations, new_answer_mapping.keys())
val_annotations_filtered = filter_rows_outside_3129(val_annotations, new_answer_mapping.keys())

Covered examples: 413433
Covered examples: 199613


In [15]:
## Verify that there are now 3,129 unique answers in the filtered dataset.
train_ans_set = set(train_annotations_filtered['multiple_choice_answer'])
val_ans_set = set(val_annotations_filtered['multiple_choice_answer'])

print(len(train_ans_set.union(val_ans_set)))

3129


In [16]:
train_annotations_processed = train_annotations_filtered.reset_index(drop=True)
train_annotations_processed = train_annotations_processed.rename(columns={'multiple_choice_answer': 'answer'})

val_annotations_processed = val_annotations_filtered.reset_index(drop=True)
val_annotations_processed = val_annotations_processed.rename(columns={'multiple_choice_answer': 'answer'})

train_annotations_processed

Unnamed: 0,question_type,answer,answers,image_id,answer_type,question_id
0,what is this,net,"[{'answer': 'net', 'answer_confidence': 'maybe...",458752,other,458752000
1,what,pitcher,"[{'answer': 'pitcher', 'answer_confidence': 'y...",458752,other,458752001
2,what color is the,orange,"[{'answer': 'orange', 'answer_confidence': 'ye...",458752,other,458752002
3,is this,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",458752,yes/no,458752003
4,what color is the,white,"[{'answer': 'white', 'answer_confidence': 'yes...",262146,other,262146000
...,...,...,...,...,...,...
413428,how many,2,"[{'answer': '2', 'answer_confidence': 'yes', '...",524286,number,524286000
413429,what color is the,black,"[{'answer': 'black', 'answer_confidence': 'yes...",524286,other,524286001
413430,is there a,no,"[{'answer': 'no', 'answer_confidence': 'yes', ...",524286,yes/no,524286002
413431,what color is the,black,"[{'answer': 'black', 'answer_confidence': 'yes...",524286,other,524286003


In [17]:
## Change filepaths accordingly, depending on whether you processed the train or validation dataset.
train_questions = json.load(open(os.path.join(DATASET_FILEPATH, TRAIN_QUESTIONS_FILEPATH)))
val_questions = json.load(open(os.path.join(DATASET_FILEPATH, VAL_QUESTIONS_FILEPATH)))

df_train_questions = pd.DataFrame(train_questions["questions"])
df_val_questions = pd.DataFrame(val_questions["questions"])

df_train_questions

Unnamed: 0,image_id,question,question_id
0,458752,What is this photo taken looking through?,458752000
1,458752,What position is this man playing?,458752001
2,458752,What color is the players shirt?,458752002
3,458752,Is this man a professional baseball player?,458752003
4,262146,What color is the snow?,262146000
...,...,...,...
443752,524286,What color is the keyboard?,524286001
443753,524286,Is there a computer mouse on the desk?,524286002
443754,524286,What color is the computer?,524286003
443755,524286,Why are there two keyboards?,524286004


In [18]:
## Merge dataframes containing the questions and answers

df_train_combined = train_annotations_processed.merge(df_train_questions, on=['question_id', 'image_id'], how="left")
df_train_combined['image_id'] = df_train_combined.loc[:, 'image_id'].apply(lambda img_id: f"{TRAIN_IMAGES_FILEPATH}/COCO_train2014_{img_id:012}.jpg")
df_train_combined

Unnamed: 0,question_type,answer,answers,image_id,answer_type,question_id,question
0,what is this,net,"[{'answer': 'net', 'answer_confidence': 'maybe...",train2014/train2014/COCO_train2014_00000045875...,other,458752000,What is this photo taken looking through?
1,what,pitcher,"[{'answer': 'pitcher', 'answer_confidence': 'y...",train2014/train2014/COCO_train2014_00000045875...,other,458752001,What position is this man playing?
2,what color is the,orange,"[{'answer': 'orange', 'answer_confidence': 'ye...",train2014/train2014/COCO_train2014_00000045875...,other,458752002,What color is the players shirt?
3,is this,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",train2014/train2014/COCO_train2014_00000045875...,yes/no,458752003,Is this man a professional baseball player?
4,what color is the,white,"[{'answer': 'white', 'answer_confidence': 'yes...",train2014/train2014/COCO_train2014_00000026214...,other,262146000,What color is the snow?
...,...,...,...,...,...,...,...
413428,how many,2,"[{'answer': '2', 'answer_confidence': 'yes', '...",train2014/train2014/COCO_train2014_00000052428...,number,524286000,How many keyboards are there?
413429,what color is the,black,"[{'answer': 'black', 'answer_confidence': 'yes...",train2014/train2014/COCO_train2014_00000052428...,other,524286001,What color is the keyboard?
413430,is there a,no,"[{'answer': 'no', 'answer_confidence': 'yes', ...",train2014/train2014/COCO_train2014_00000052428...,yes/no,524286002,Is there a computer mouse on the desk?
413431,what color is the,black,"[{'answer': 'black', 'answer_confidence': 'yes...",train2014/train2014/COCO_train2014_00000052428...,other,524286003,What color is the computer?


In [19]:
## Merge dataframes containing the questions and answers

df_val_combined = val_annotations_processed.merge(df_val_questions, on=['question_id', 'image_id'], how="left")
df_val_combined['image_id'] = df_val_combined.loc[:, 'image_id'].apply(lambda img_id: f"{VAL_IMAGES_FILEPATH}/COCO_val2014_{img_id:012}.jpg")
df_val_combined

Unnamed: 0,question_type,answer,answers,image_id,answer_type,question_id,question
0,none of the above,down,"[{'answer': 'down', 'answer_confidence': 'yes'...",val2014/val2014/COCO_val2014_000000262148.jpg,other,262148000,Where is he looking?
1,what are the,watching,"[{'answer': 'spectating', 'answer_confidence':...",val2014/val2014/COCO_val2014_000000262148.jpg,other,262148001,What are the people in the background doing?
2,what is,picnic table,"[{'answer': 'table', 'answer_confidence': 'yes...",val2014/val2014/COCO_val2014_000000262148.jpg,other,262148002,What is he on top of?
3,is this a,no,"[{'answer': 'no', 'answer_confidence': 'yes', ...",val2014/val2014/COCO_val2014_000000393225.jpg,yes/no,393225001,Is this a creamy soup?
4,is this,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",val2014/val2014/COCO_val2014_000000393225.jpg,yes/no,393225002,Is this rice noodle soup?
...,...,...,...,...,...,...,...
199608,are these,yes,"[{'answer': 'no', 'answer_confidence': 'maybe'...",val2014/val2014/COCO_val2014_000000524282.jpg,yes/no,524282006,Are these women wearing skirts?
199609,what is the,green,"[{'answer': 'green', 'answer_confidence': 'yes...",val2014/val2014/COCO_val2014_000000393212.jpg,other,393212000,What is the main color in the photo?
199610,what is on the,arrow,"[{'answer': 'arrow', 'answer_confidence': 'yes...",val2014/val2014/COCO_val2014_000000393212.jpg,other,393212002,What is on the sign?
199611,does the,left,"[{'answer': 'left', 'answer_confidence': 'yes'...",val2014/val2014/COCO_val2014_000000393212.jpg,other,393212003,Does the arrow point left or right?


In [21]:
## Prepare dataframe to be saved to a csv file
preprocessed_train_df = df_train_combined[['question', 'answer', 'answer_type', 'image_id']]
preprocessed_train_df['label'] = preprocessed_train_df['answer'].map(new_answer_mapping)
preprocessed_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,question,answer,answer_type,image_id,label
0,What is this photo taken looking through?,net,other,train2014/train2014/COCO_train2014_00000045875...,1036
1,What position is this man playing?,pitcher,other,train2014/train2014/COCO_train2014_00000045875...,2092
2,What color is the players shirt?,orange,other,train2014/train2014/COCO_train2014_00000045875...,2693
3,Is this man a professional baseball player?,yes,yes/no,train2014/train2014/COCO_train2014_00000045875...,2259
4,What color is the snow?,white,other,train2014/train2014/COCO_train2014_00000026214...,2580
...,...,...,...,...,...
413428,How many keyboards are there?,2,number,train2014/train2014/COCO_train2014_00000052428...,2013
413429,What color is the keyboard?,black,other,train2014/train2014/COCO_train2014_00000052428...,2486
413430,Is there a computer mouse on the desk?,no,yes/no,train2014/train2014/COCO_train2014_00000052428...,2194
413431,What color is the computer?,black,other,train2014/train2014/COCO_train2014_00000052428...,2486


In [22]:
## Prepare dataframe to be saved to a csv file
preprocessed_val_df = df_val_combined[['question', 'answer', 'answer_type', 'image_id']]
preprocessed_val_df['label'] = preprocessed_val_df['answer'].map(new_answer_mapping)
preprocessed_val_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,question,answer,answer_type,image_id,label
0,Where is he looking?,down,other,val2014/val2014/COCO_val2014_000000262148.jpg,207
1,What are the people in the background doing?,watching,other,val2014/val2014/COCO_val2014_000000262148.jpg,881
2,What is he on top of?,picnic table,other,val2014/val2014/COCO_val2014_000000262148.jpg,1600
3,Is this a creamy soup?,no,yes/no,val2014/val2014/COCO_val2014_000000393225.jpg,2194
4,Is this rice noodle soup?,yes,yes/no,val2014/val2014/COCO_val2014_000000393225.jpg,2259
...,...,...,...,...,...
199608,Are these women wearing skirts?,yes,yes/no,val2014/val2014/COCO_val2014_000000524282.jpg,2259
199609,What is the main color in the photo?,green,other,val2014/val2014/COCO_val2014_000000393212.jpg,368
199610,What is on the sign?,arrow,other,val2014/val2014/COCO_val2014_000000393212.jpg,174
199611,Does the arrow point left or right?,left,other,val2014/val2014/COCO_val2014_000000393212.jpg,2415


In [23]:
preprocessed_train_df.to_csv('VQAv2_train_preprocessed.csv', index=False)
preprocessed_val_df.to_csv('VQAv2_val_preprocessed.csv', index=False)

In [24]:
## To check the csv file now containing the preprocessed training set.
check_train = pd.read_csv('VQAv2_train_preprocessed.csv')
check_train

Unnamed: 0,question,answer,answer_type,image_id,label
0,What is this photo taken looking through?,net,other,train2014/train2014/COCO_train2014_00000045875...,1036
1,What position is this man playing?,pitcher,other,train2014/train2014/COCO_train2014_00000045875...,2092
2,What color is the players shirt?,orange,other,train2014/train2014/COCO_train2014_00000045875...,2693
3,Is this man a professional baseball player?,yes,yes/no,train2014/train2014/COCO_train2014_00000045875...,2259
4,What color is the snow?,white,other,train2014/train2014/COCO_train2014_00000026214...,2580
...,...,...,...,...,...
413428,How many keyboards are there?,2,number,train2014/train2014/COCO_train2014_00000052428...,2013
413429,What color is the keyboard?,black,other,train2014/train2014/COCO_train2014_00000052428...,2486
413430,Is there a computer mouse on the desk?,no,yes/no,train2014/train2014/COCO_train2014_00000052428...,2194
413431,What color is the computer?,black,other,train2014/train2014/COCO_train2014_00000052428...,2486


In [25]:
## To check the csv file now containing the preprocessed validation set.
check_val = pd.read_csv('VQAv2_val_preprocessed.csv')
check_val

Unnamed: 0,question,answer,answer_type,image_id,label
0,Where is he looking?,down,other,val2014/val2014/COCO_val2014_000000262148.jpg,207
1,What are the people in the background doing?,watching,other,val2014/val2014/COCO_val2014_000000262148.jpg,881
2,What is he on top of?,picnic table,other,val2014/val2014/COCO_val2014_000000262148.jpg,1600
3,Is this a creamy soup?,no,yes/no,val2014/val2014/COCO_val2014_000000393225.jpg,2194
4,Is this rice noodle soup?,yes,yes/no,val2014/val2014/COCO_val2014_000000393225.jpg,2259
...,...,...,...,...,...
199608,Are these women wearing skirts?,yes,yes/no,val2014/val2014/COCO_val2014_000000524282.jpg,2259
199609,What is the main color in the photo?,green,other,val2014/val2014/COCO_val2014_000000393212.jpg,368
199610,What is on the sign?,arrow,other,val2014/val2014/COCO_val2014_000000393212.jpg,174
199611,Does the arrow point left or right?,left,other,val2014/val2014/COCO_val2014_000000393212.jpg,2415


In [26]:
## To obtain the label to answer mappings

id_to_answer = dict(zip(new_answer_mapping.values(), new_answer_mapping.keys()))
print(len(id_to_answer))

with open('VQAv2_id_to_answer.json', 'w') as fp:
    json.dump(id_to_answer, fp)

3129
