In [1]:
import json
import numpy as np
from collections import Counter

In [2]:
# Path to the knowledge-base JSON; its 'concepts' and 'entities' tables are read further down.
kb_json = './dataset/kb.json'

# Paths to the train / validation / test JSON files (all relative to the working directory).
train_json = './dataset/train.json'
val_json = './dataset/val.json'
test_json = './dataset/test.json'

def string_clean(s: str) -> str:
    """Normalise whitespace in ``s``.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space and leading/trailing whitespace is removed, so that names
    differing only in spacing compare equal.
    """
    tokens = s.split()
    return ' '.join(tokens)

In [3]:
# Load the knowledge base. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps non-ASCII names intact
# regardless of the platform's default locale.
with open(kb_json, encoding='utf-8') as kb_file:
    kb = json.load(kb_file)

# id -> raw name, exactly as stored in the KB.
concepts = {concept_id: concept['name'] for concept_id, concept in kb['concepts'].items()}
entities = {entity_id: entity['name'] for entity_id, entity in kb['entities'].items()}

# Whitespace-normalised names, used for membership tests against the datasets.
concepts_set = {string_clean(name) for name in concepts.values()}
entities_set = {string_clean(name) for name in entities.values()}

In [4]:
# with open('concepts.txt', 'w') as f:
#     f.write(str(concepts))

# with open('entities.txt', 'w') as f:
#     f.write(str(entities))

# with open('concepts_set.txt', 'w') as f:
#     f.write(str(concepts_set))

# with open('entities_set.txt', 'w') as f:
#     f.write(str(entities_set))

In [5]:
def _read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened as UTF-8 and closed as soon as parsing finishes, so no
    file handle is left open for the lifetime of the kernel.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


trn = _read_json(train_json)
vld = _read_json(val_json)
tst = _read_json(test_json)

In [6]:
# Per-split accumulators, filled by later cells:
#   *_find    - whitespace-normalised inputs of 'Find' program steps
#   *_choices - whitespace-normalised answer choices of 'What' questions
trn_find, trn_choices = set(), set()
vld_find, vld_choices = set(), set()
# Test-split accumulators are left disabled:
# tst_choices = set()
# tst_find = set()

### Program entities that are not included in KB

In [7]:
# Gather the (whitespace-normalised) first input of every 'Find' step, per split.
trn_find.update(
    string_clean(step['inputs'][0])
    for example in trn
    for step in example['program']
    if step['function'] == 'Find'
)
vld_find.update(
    string_clean(step['inputs'][0])
    for example in vld
    for step in example['program']
    if step['function'] == 'Find'
)

In [8]:
# with open('trn_find.txt', 'w') as f:
#     f.write(str(trn_find))
# with open('vld_find.txt', 'w') as f:
#     f.write(str(vld_find))

In [9]:
# 'Find' inputs that literally match a KB entity name.
trn_int_ent = trn_find & entities_set
vld_int_ent = vld_find & entities_set

In [10]:
# Training split: (matched Find inputs, KB entity names, distinct Find inputs,
#                  % of Find inputs matched, Find inputs left unmatched)
(
    len(trn_int_ent),
    len(entities_set),
    len(trn_find),
    len(trn_int_ent) / len(trn_find) * 100,
    len(trn_find) - len(trn_int_ent),
)

(13188, 13693, 13439, 98.13230151052905, 251)

In [11]:
# Validation split: (matched Find inputs, KB entity names, distinct Find inputs,
#                    % of Find inputs matched, Find inputs left unmatched)
(
    len(vld_int_ent),
    len(entities_set),
    len(vld_find),
    len(vld_int_ent) / len(vld_find) * 100,
    len(vld_find) - len(vld_int_ent),
)

(7602, 13693, 7733, 98.30596146385619, 131)

In [12]:
# 'Find' inputs that do NOT match any KB entity name. Set difference gives
# this directly, with no intermediate list and no None placeholder to discard.
outliers_trn_find = trn_find - trn_int_ent
outliers_vld_find = vld_find - vld_int_ent
# Disabled: KB entity names never used by a training 'Find'. The original draft
# referenced an undefined `trn_intersection`; presumably `trn_int_ent` was meant.
# outliers_entities = entities_set - trn_int_ent

In [13]:
# Number of unmatched 'Find' inputs: (train, validation).
(
    len(outliers_trn_find),
    len(outliers_vld_find),
)

(251, 131)

In [14]:
# with open('trn_find_exclude.txt', 'w') as f:
#     f.write(str(outliers_trn_find))
# with open('vld_find_exclude.txt', 'w') as f:
#     f.write(str(outliers_vld_find))

In [15]:
# Check the unmatched 'Find' inputs against KB concept names instead.
trn_int_all = outliers_trn_find & concepts_set
vld_int_all = outliers_vld_find & concepts_set
(len(trn_int_all), len(vld_int_all))

(251, 131)

In [16]:
# 'Find' inputs that match neither a KB entity name nor a KB concept name.
# Set difference gives this directly, with no intermediate list and no None
# placeholder to discard.
outliers_trn_find_all = outliers_trn_find - trn_int_all
outliers_vld_find_all = outliers_vld_find - vld_int_all
outliers_trn_find_all, outliers_vld_find_all

(set(), set())

In [17]:
# Eyeball the first 100 training examples: question, program, choices, answer.
i = 0  # keeps `i` defined (as 0) even if `trn` is empty
for i, statement in enumerate(trn, start=1):
    print(
        statement['question'], '\n',
        statement['program'], '\n',
        statement['choices'], '\n',
        statement['answer'], '\n',
    )
    if i == 100:
        break


Which town has a TOID of 4000000074573917 and has an OS grid reference of SP8778? 
 [{'function': 'FindAll', 'dependencies': [], 'inputs': []}, {'function': 'FilterStr', 'dependencies': [0], 'inputs': ['TOID', '4000000074573917']}, {'function': 'FilterConcept', 'dependencies': [1], 'inputs': ['town']}, {'function': 'FindAll', 'dependencies': [], 'inputs': []}, {'function': 'FilterStr', 'dependencies': [3], 'inputs': ['OS grid reference', 'SP8778']}, {'function': 'FilterConcept', 'dependencies': [4], 'inputs': ['town']}, {'function': 'And', 'dependencies': [2, 5], 'inputs': []}, {'function': 'What', 'dependencies': [6], 'inputs': []}] 
 ['Wigan', 'Doncaster', 'Royal Tunbridge Wells', 'Kettering', 'Edmonton', 'Macclesfield', 'Blackburn', 'Colchester', 'South Shields', 'Wimbledon'] 
 Kettering 

Who is the reviewer of the Georgia national football team, which is ranked 78th? 
 [{'function': 'Find', 'dependencies': [], 'inputs': ['Georgia national football team']}, {'function': 'QueryAttrQ

### QA Choices that are not included in KB

In [18]:
# Names of the program functions, grouped by role.
function_type = [
    # entity lookup
    'FindAll',
    'Find',
    # filters
    'FilterConcept',
    'FilterStr',
    'FilterNum',
    'FilterYear',
    'FilterDate',
    'QFilterStr',
    'QFilterNum',
    'QFilterYear',
    'QFilterDate',
    # set operations and basic queries
    # (note: in the data, 'QueryName' is replaced by 'What')
    'Relate',
    'And',
    'Or',
    'What',
    'Count',
    # attribute / relation queries
    'QueryAttr',
    'QueryAttrUnderCondition',
    'QueryRelation',
    # selection
    'SelectBetween',
    'SelectAmong',
    # verification
    'VerifyStr',
    'VerifyNum',
    'VerifyYear',
    'VerifyDate',
    # qualifier queries
    'QueryAttrQualifier',
    'QueryRelationQualifier',
]

In [19]:
# Collect the (whitespace-normalised) answer choices of every question whose
# program contains a 'What' step, per split.
for example in trn:
    if any(step['function'] == 'What' for step in example['program']):
        trn_choices.update(string_clean(choice) for choice in example['choices'])
for example in vld:
    if any(step['function'] == 'What' for step in example['program']):
        vld_choices.update(string_clean(choice) for choice in example['choices'])

In [20]:
# Answer choices that literally match a KB entity name.
trn_choice_intersection = trn_choices & entities_set
vld_choice_intersection = vld_choices & entities_set

In [21]:
# Training split: (matched choices, KB entity names, distinct choices,
#                  % of choices matched, choices left unmatched)
(
    len(trn_choice_intersection),
    len(entities_set),
    len(trn_choices),
    len(trn_choice_intersection) / len(trn_choices) * 100,
    len(trn_choices) - len(trn_choice_intersection),
)

(13291, 13693, 13393, 99.23840812364668, 102)

In [22]:
# Validation split: (matched choices, KB entity names, distinct choices,
#                    % of choices matched, choices left unmatched)
(
    len(vld_choice_intersection),
    len(entities_set),
    len(vld_choices),
    len(vld_choice_intersection) / len(vld_choices) * 100,
    len(vld_choices) - len(vld_choice_intersection),
)

(6601, 13693, 6629, 99.57761351636748, 28)

In [23]:
# with open('trn_choices.txt', 'w') as f:
#     f.write(str(trn_choices))
# with open('vld_choices.txt', 'w') as f:
#     f.write(str(vld_choices))

In [24]:
# Answer choices that do NOT match any KB entity name. Set difference gives
# this directly, with no intermediate list and no None placeholder to discard.
outliers_trn_choice = trn_choices - trn_choice_intersection
outliers_vld_choice = vld_choices - vld_choice_intersection

In [25]:
# Number of unmatched answer choices: (train, validation).
(
    len(outliers_trn_choice),
    len(outliers_vld_choice),
)

(102, 28)

In [26]:
# Check the unmatched answer choices against KB concept names instead.
trn_int = outliers_trn_choice & concepts_set
vld_int = outliers_vld_choice & concepts_set
(len(trn_int), len(vld_int))

(102, 28)

In [27]:
# Answer choices that match neither a KB entity name nor a KB concept name.
# Set difference gives this directly, with no intermediate list and no None
# placeholder to discard.
outliers_trn_choice_all = outliers_trn_choice - trn_int
outliers_vld_choice_all = outliers_vld_choice - vld_int
outliers_trn_choice_all, outliers_vld_choice_all

(set(), set())

### Extract query?