In [None]:
import json
import zipfile
import time
import random
import numpy as np
from collections import Counter, defaultdict
from IPython.display import clear_output

# Mounting

In [None]:
# Mount Google Drive into the Colab filesystem at '/content/drive'; the data
# root configured later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Pathing

In [None]:
# --- Subset configuration -------------------------------------------------
# The full dataset holds 443757 datapoints. Sizes used in earlier runs were
# 10*1000 and 30*1000; 3*57*1000 is the maximum size for an equal split.
SUBSET_SIZE = 443700

# An equal split is only attempted when the requested size does not exceed
# that maximum.
EQUAL = SUBSET_SIZE <= 3 * 57 * 1000

# Portion of the dataset to process: 'Train' (active) or 'Val'.
part = 'Train'

# --- Locations on the mounted drive ---------------------------------------
root = '/content/drive/MyDrive/BOWIE/Data/'
input_path = f'{root}VQA_{part}/'
output_path = f'{root}Subset/'

# Zipped question / annotation archives for the selected part.
Qpath = f'{input_path}v2_Questions_{part}_mscoco.zip'
Apath = f'{input_path}v2_Annotations_{part}_mscoco.zip'



In [None]:
def _load_first_json(zip_path):
    """Parse the first member of the zip archive at `zip_path` as JSON.

    Both the archive and the member stream are opened in `with` blocks, so
    they are closed before returning, including when parsing fails.

    Args:
        zip_path: path of the zip archive to read.

    Returns:
        The decoded JSON content of the archive's first member.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        with archive.open(archive.namelist()[0]) as member:
            return json.load(member)


# Questions and annotations are each read from the first member of their
# respective archives.
qdata = _load_first_json(Qpath)
adata = _load_first_json(Apath)

# Full Set Statistics

In [None]:
def set_stat( data, t ):
  """Print summary statistics for an annotation set and index it by answer type.

  Args:
    data: dict with an 'annotations' list; every annotation must provide the
      keys 'question_type', 'multiple_choice_answer' and 'answer_type'.
    t: label shown in the printed header (e.g. 'FULL').

  Returns:
    dict mapping each answer type to the list of positions (indices into
    data['annotations']) of the annotations having that type. The keys
    'yes/no', 'number' and 'other' are always present; any other answer
    type that is encountered gets its own key instead of raising KeyError.
  """

  print( f'\n======== {t} SET STATISTICS ========\n' )

  question_types = set()
  answer_types = set()
  answertypes2count = Counter()

  # Answer type -> Counter of the multiple-choice answers seen for that type.
  top_answers_per_type = defaultdict(Counter)

  # Answer type -> positions of the matching annotations.
  data4type = {
      'yes/no': list(),
      'number': list(),
      'other': list()
  }

  annotations = data['annotations']
  print( f'Number of datapoints: {len(annotations)}' )
  for i, ann in enumerate( annotations ):
    question_types.add(ann['question_type'])

    mca = ann['multiple_choice_answer']
    at = ann['answer_type']

    answer_types.add( at )
    answertypes2count[ at ] += 1
    top_answers_per_type[ at ][ mca ] += 1

    # setdefault keeps an unexpected answer type from aborting the scan.
    data4type.setdefault( at, [] ).append( i )

  print("Unique Question Types: ", len(question_types))
  # random.sample needs a sequence (sets are rejected on Python >= 3.11) and
  # a non-empty population, so convert to a list and skip empty input.
  if question_types:
    print('One of them:', random.sample(list(question_types), 1))

  print("\nUnique Answer Types: ", answer_types)
  for k, v in answertypes2count.items():
    print('* ', k, '\t', v)

  print("\nTop 3 Most Common Answers per Answer Type: ", answer_types)
  for at in answer_types:
    print(f'*  {at}:\t{top_answers_per_type[ at ].most_common(3)}')
  print('\n')
  return data4type

In [None]:
# Print statistics for the full annotation set and keep the per-answer-type
# lists of annotation indices that set_stat returns.
data4type = set_stat( adata, 'FULL' )



Number of datapoints: 443757
Unique Question Types:  65
One of them: ['what time']

Unique Answer Types:  {'other', 'yes/no', 'number'}
*  other 	 219269
*  yes/no 	 166882
*  number 	 57606

Top 3 Most Common Answers per Answer Type:  {'other', 'yes/no', 'number'}
*  other:	[('white', 8915), ('blue', 5455), ('red', 5201)]
*  yes/no:	[('yes', 84615), ('no', 82263), ('africa', 1)]
*  number:	[('1', 12520), ('2', 12194), ('3', 6527)]




# Data Subset

In [None]:
# Total number of indexed datapoints across all answer types.
print(sum(len(indices) for indices in data4type.values()))

# Shuffle each answer type's index list in place with a fixed seed.
random.seed(64)
for indices in data4type.values():
    random.shuffle(indices)

# Uncomment to peek at the shuffled indices:
# print(data4type.keys())
# print(data4type['yes/no'][:10])
# print(data4type['number'][:10])
# print(data4type['other'][:10])

443757


In [None]:
def data_subset( n = 20000, try_equal=False, questions=None, annotations=None ):
    """Draw a random subset of question/annotation pairs and tag each with a split.

    Candidates are visited in a seeded random order. A candidate is skipped
    when `try_equal` is set and its answer type already holds `n // 3`
    entries, or when its answer type is 'yes/no' but its multiple-choice
    answer is neither 'yes' nor 'no'. Each accepted pair is assigned to
    'train', 'valid' or 'test' with probabilities 0.8 / 0.15 / 0.05.

    The accepted records are shallow copies, so the source questions and
    annotations are not modified by the added 'split' key.

    Note: this reseeds both the global `random` and `np.random` generators
    with 42.

    Args:
        n: target number of pairs; fewer are returned when the candidates
            run out first.
        try_equal: when true, cap every answer type at `n // 3` entries.
        questions: list of question dicts; defaults to the notebook-level
            `qdata['questions']`.
        annotations: list of annotation dicts aligned index-by-index with
            `questions`; defaults to the notebook-level `adata['annotations']`.

    Returns:
        Tuple `(adata_small, qdata_small, a_type_counts)`:
        `{'annotations': [...]}`, `{'questions': [...]}` and a dict with the
        number of selected entries per answer type.
    """

    start_time = time.time()

    # Fall back to the notebook-level data when no explicit source is given.
    if questions is None:
        questions = qdata['questions']
    if annotations is None:
        annotations = adata['annotations']

    idx = list(range(0, len(questions)))
    random.seed(42)
    random.shuffle(idx)

    np.random.seed(42)
    splits = ['train', 'valid', 'test']

    n_cat = n//3

    qdata_small = {'questions': list()}
    adata_small = {'annotations': list()}
    a_type_counts = {'yes/no': 0, 'number': 0, 'other': 0}

    # Stop when the target size is reached or the candidates are exhausted.
    while idx and len(qdata_small['questions']) < n:
        i = idx.pop()
        ann = annotations[i]
        at = ann['answer_type']

        # Enforce the per-type cap only when an equal split was requested.
        if try_equal and a_type_counts.get(at, 0) >= n_cat:
            continue

        # Drop 'yes/no' questions whose recorded answer is not yes or no.
        if at == 'yes/no' and ann['multiple_choice_answer'] not in ['yes', 'no']:
            continue

        split = np.random.choice(splits, p=(.8, .15, .05))

        # Copy before tagging so the source records stay untouched.
        adata_small['annotations'].append({**ann, 'split': split})
        qdata_small['questions'].append({**questions[i], 'split': split})

        a_type_counts[at] = a_type_counts.get(at, 0) + 1

    # Questions and annotations are appended in pairs, so they must line up.
    assert len(qdata_small['questions']) == len(adata_small['annotations']), "Inconsistent Lengths."

    print("Data Creation Looks good! Time Taken %.2f" %(time.time()-start_time))
    return adata_small, qdata_small, a_type_counts

In [None]:
# Build the subset with the configured size; an equal split across answer
# types is only requested when EQUAL is true.
adata_small, qdata_small, a_type_counts = data_subset( n=SUBSET_SIZE, try_equal=EQUAL )

Data Creation Looks good! Time Taken 10.71


In [None]:
# set_stat returns, for each answer type, the list of annotation indices
# (not counts), so store the result under a name that says so.
data4type_small = set_stat( adata_small, 'SUB' )

# Backward-compatible alias for the previous, misleading variable name.
answertypes2count = data4type_small



Number of datapoints: 395484
Unique Question Types:  65
One of them: ['what is']

Unique Answer Types:  {'other', 'yes/no', 'number'}
*  other 	 171000
*  yes/no 	 166878
*  number 	 57606

Top 3 Most Common Answers per Answer Type:  {'other', 'yes/no', 'number'}
*  other:	[('white', 6923), ('blue', 4226), ('red', 4065)]
*  yes/no:	[('yes', 84615), ('no', 82263)]
*  number:	[('1', 12520), ('2', 12194), ('3', 6527)]




In [None]:
# stop

# Splitting

In [None]:
import gzip

In [None]:
# One empty container per split, for questions and annotations respectively.
qdata_small_splits = {name: {'questions': []} for name in ('train', 'valid', 'test')}
adata_small_splits = {name: {'annotations': []} for name in ('train', 'valid', 'test')}

# The question's 'split' tag selects the bucket for both the question and
# the annotation stored at the same position.
for position, question in enumerate(qdata_small['questions']):
    bucket = question['split']
    qdata_small_splits[bucket]['questions'].append(question)
    adata_small_splits[bucket]['annotations'].append(adata_small['annotations'][position])

# Report how many questions landed in each split.
train_size = len(qdata_small_splits['train']['questions'])
valid_size = len(qdata_small_splits['valid']['questions'])
test_size = len(qdata_small_splits['test']['questions'])

print('\n======================')
print("Training Set Size: %i" % train_size)
print("Validation Set Size: %i" % valid_size)
print("Test Set Size: %i" % test_size)
print('======================\n')

# Saving

In [None]:
# Short size tag: the subset size in whole thousands followed by 'k'.
# NOTE(review): `c` is only displayed here; the saving code visible in this
# notebook does not use it in any file name — confirm whether it should.
c = f'{SUBSET_SIZE // 1000}k'
c

In [None]:
# Write every split's annotations, then its questions, as gzip-compressed
# UTF-8 JSON files inside the output directory.
for split in ['train', 'valid', 'test']:
    payloads = (
        ('adata_', adata_small_splits[split]),
        ('qdata_', qdata_small_splits[split]),
    )
    for prefix, payload in payloads:
        with gzip.GzipFile(output_path + prefix + split + '.gzip', 'w') as file:
            file.write(json.dumps(payload).encode('utf-8'))

In [None]:
# Distinct image ids referenced by the questions of the subset.
image_ids = {question['image_id'] for question in qdata_small['questions']}
print(len(image_ids))

# Store the ids as a JSON list under the 'image_ids' key.
image_ids_json = {'image_ids': list(image_ids)}
with open(root + 'Additional/image_ids_vqa.json', 'w') as file:
    json.dump(image_ids_json, file)