#### Exploring Raw JSON Data

In [1]:
import sys, os
import torch
# Every path below is resolved relative to the directory the notebook runs in.
pwd = os.getcwd()


class Arguments:
    """Static configuration namespace for this notebook.

    All settings are plain class-level attributes and are read through the
    module-level instance ``args``.
    """

    # JSON data file opened by the loading cells further down.
    data = os.path.join(pwd, 'DATA', 'eval_v2.1_public.json')
    # Experiment output directory; created right after this class if missing.
    # Components are passed separately so os.path.join picks the separator.
    exp_folder = os.path.join(pwd, 'Experimente', 'LightningTest')
    # Word-representation file (a GloVe text file, judging by its name).
    word_rep = os.path.join(pwd, 'DATA', 'glove.840B.300d.txt')
    # True when torch reports that CUDA is available.
    cuda = torch.cuda.is_available()
    # NOTE(review): the two flags below are not read in the visible cells;
    # presumably the baseline training code consumes them - confirm.
    use_covariance = False
    force_restart = False


args = Arguments()

# exist_ok=True replaces the separate existence check: it is a no-op when the
# folder is already there and cannot fail if the folder appears in between.
os.makedirs(args.exp_folder, exist_ok=True)

In [2]:
import json
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default locale encoding.
with open(args.data, encoding='utf-8') as f_o:
    file = json.load(f_o)

# Notes on the top-level layout; per these notes every entry is a dict keyed by qid.
# 'answers' and 'wellFormedAnswers' are not among the keys printed by
# file.keys() for this file (see the output below) - presumably they only
# exist in the train/dev files; verify.
# file['answers']            qid --> ['one long answer with several sentences in one string']
# file['passages']           qid --> [{'is_selected': ..., 'passage_text': ..., 'url': ...}, ...]
# file['query']              qid --> query text (the original note said 'Answer Sentence';
#                                    presumably this is the question - verify)
# file['query_id']           qid --> query ID
# file['query_type']         qid --> type of query, e.g. 'DESCRIPTION', 'NUMERIC', 'LOCATION', ...
# file['wellFormedAnswers']  qid --> ['one short, well-formulated answer in one string']

# Number of entries under 'query_id'.
len(file['query_id'])

101092

In [3]:
# Show which top-level keys the loaded JSON actually contains.
file.keys()

dict_keys(['passages', 'query', 'query_id', 'query_type'])

#### Exploring Loaded Data

In [2]:
# Make the baseline package and its scripts folder importable. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
for _extra_path in (
    os.path.join(pwd, 'MsmarcoQuestionAnswering', 'Baseline'),
    os.path.join(pwd, 'MsmarcoQuestionAnswering', 'Baseline', 'scripts'),
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

import MsmarcoQuestionAnswering.Baseline.mrcqa as mrcqa
import MsmarcoQuestionAnswering.Baseline.scripts.dataset as dataset
import MsmarcoQuestionAnswering.Baseline.scripts.checkpointing as checkpointing
import MsmarcoQuestionAnswering.Baseline.scripts.train as manager


In [3]:
import json
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default locale encoding.
with open(args.data, encoding='utf-8') as f_o:
    # load_data yields two values; only the first one is kept.
    # NOTE(review): loading_limit=1000 presumably caps how many examples are
    # read - confirm against dataset.load_data.
    data, _ = dataset.load_data(
        json.load(f_o),
        span_only=True,
        answered_only=True,
        loading_limit=1000,
    )

Start Organizing Data...
Organizing progress: 0.0 x 10⁴


In [34]:
# `data` is a list of validated, pre-processed tuples laid out as
# (qid, passage, query, (start_pos, end_pos)).
class DataIndizes:
    """Symbolic names for the positions inside one `data` tuple."""

    qid, passage, query, span = range(4)

# Both vocabularies start with the empty string mapped to id 0 and are handed
# to tokenize_data (which presumably fills them in place - TODO confirm).
token_to_id, char_to_id = {'': 0}, {'': 0}
tokenized_data = dataset.tokenize_data(data, token_to_id, char_to_id)

0.0 x 10⁴/0.0347 x 10⁴


In [50]:
# `tokenized_data` is the tokenized counterpart of `data`: a list of tuples
# (qid, passage, query, (start_pos_indices, end_pos_indices), token-to-char index mapping),
# where passage and query each have the form
# ([passage_tokens], [[char_tokens_per_word], ...]).
class TokenizedDataIndizes:
    """Symbolic names for the positions inside one `tokenized_data` tuple."""

    qid, passage, query, span, mapping = range(5)

# Used below as a list position into `data` / `tokenized_data`.
QID = 0

# Reverse lookups: integer id -> token / character.
id_to_token = {idx: token for token, idx in token_to_id.items()}
id_to_char = {idx: ch for ch, idx in char_to_id.items()}


# Answer taken from the untokenized data: slice the passage text with the span.
span = data[QID][DataIndizes.span]
raw_passage = data[QID][DataIndizes.passage]
print(raw_passage['passage_text'][span[0]:span[1]])

# Same answer from the tokenized data: slice the passage token ids with the
# span and map every id back to its token string.
tokenized_example = tokenized_data[QID]
span = tokenized_example[TokenizedDataIndizes.span]
passage_tokens = tokenized_example[TokenizedDataIndizes.passage][0]
print([id_to_token[tok] for tok in passage_tokens[span[0]:span[1]]])

Restorative justice that fosters dialogue between victim and offender has shown the highest rates of victim satisfaction and offender accountability.
['Restorative', 'justice', 'that', 'fosters', 'dialogue', 'between', 'victim', 'and', 'offender', 'has', 'shown', 'the', 'highest', 'rates', 'of', 'victim', 'satisfaction', 'and', 'offender', 'accountability', '.']


In [29]:
import pandas
from itertools import compress

def qid_to_index(data, qid):
    """Return the positions of all records in `data` whose first element equals str(qid).

    Args:
        data: Sequence of indexable records; element 0 of each record is compared.
        qid: Query id to look for; converted with str() before the comparison.

    Returns:
        List of integer positions in ascending order. The list is empty when
        nothing matches or when `data` itself is empty.
    """
    wanted = str(qid)
    # Scan the records directly: no DataFrame has to be built just to compare
    # one column, and empty input yields [] instead of a KeyError.
    return [pos for pos, record in enumerate(data) if record[0] == wanted]

def get_span(data,index):
    """Return the element at position 3 of the record `data[index]`."""
    record = data[index]
    return record[3]

def get_passage_text(data,index):
    """Return the 'passage_text' value of element 1 of the record `data[index]`."""
    passage = data[index][1]
    return passage['passage_text']

def span_to_answer(span,passage_text):
    """Return the slice of `passage_text` from span[0] up to (excluding) span[1]."""
    start, end = span[0], span[1]
    return passage_text[start:end]


df = pandas.DataFrame(data)

# Take the first record whose qid is 1000 and display its answer text.
matching_indices = qid_to_index(data, 1000)
idx = matching_indices[0]
span_to_answer(get_span(data, idx), get_passage_text(data, idx))

'Depona Ab is a library in Vilhelmina, Sweden.'