In [1]:
import os
import re
import json
from tqdm import tqdm

In [2]:
class Preprocess(object):
    """Rewrite SPARQL-like strings into a label / placeholder-token form.

    Each whitespace-separated token of the (lower-cased) text is looked up in
    three tables: a fixed vocabulary mapped to ``<extra_id_N>`` placeholders,
    the entity-label table and the relation-label table. Tokens found in none
    of them are left as they are.
    """

    def __init__(self, entity_path='./json/entity.json', relation_path='./json/relation.json'):
        """Load both label tables and build the vocabulary placeholder map.

        Args:
            entity_path: JSON file holding a mapping of entity token -> label.
            relation_path: JSON file holding a mapping of relation token -> label.

        Raises:
            OSError: if either file cannot be opened.
            ValueError: if either file is not valid JSON.
        """
        # `with` guarantees the handles are closed as soon as parsing is done.
        with open(entity_path, 'rb') as handle:
            ent_labels = json.load(handle)
        with open(relation_path, 'rb') as handle:
            rel_labels = json.load(handle)

        # The position in this list determines the placeholder number, so the
        # order (and every entry) must stay exactly as is.
        # NOTE(review): process() lower-cases tokens and puts a space after
        # every ':' before the lookup, so '[DEF]', '[Entity]', '[Relation]'
        # and 'rdfs:label' can never match there — confirm whether that is
        # intended or whether other code reads vocab_dict directly.
        vocab = ['"', '(', 'rdfs:label', 'by', 'ask', '>', 'select', 'que', 'limit', 'jai', 'mai',
                 '?sbj', ')', 'lang', 'year', '}', '?value', 'peint', 'desc', 'where', 'ce', 
                 'distinct',  'filter', 'lcase', 'order', 'la', '<', 'asc', 'en', 'contains', 
                 'as', ',', 'strstarts',  '{', "'", 'j', 'count', '=', '.', '?vr0', '?vr1', 
                 '?vr2', '?vr3', '?vr4', '?vr5', '?vr6',  '?vr0_label', '?vr1_label', '?vr2_label', 
                 '?vr3_label', '?vr4_label', '?vr5_label', '?vr6_label', 'wd:', 'wdt:', 'ps:', 
                 'p:', 'pq:', '?maskvar1', '[DEF]','[Entity]', '[Relation]', 'null']

        # Placeholder numbering starts at 30: vocab[0] -> '<extra_id_30>'.
        vocab_dict = {}
        for number, text in enumerate(vocab, start=30):
            vocab_dict[text] = '<extra_id_' + str(number) + '>'

        # A missing (None) label would break the final ' '.join in process(),
        # so substitute the 'null' placeholder in both tables.
        for labels in (ent_labels, rel_labels):
            for key in labels:
                if labels[key] is None:
                    labels[key] = vocab_dict['null']

        self.ent_labels = ent_labels
        self.rel_labels = rel_labels
        self.vocab_dict = vocab_dict

    def process(self, wikisparql):
        """Tokenise `wikisparql` and substitute every known token.

        Args:
            wikisparql: the text to convert (str).

        Returns:
            The converted tokens joined by single spaces, with surrounding
            whitespace stripped.
        """
        spaced = wikisparql
        # Surround these punctuation marks with spaces so each becomes a token.
        for symbol in ('(', ')', '{', '}', ',', "'", '.', '='):
            spaced = spaced.replace(symbol, ' ' + symbol + ' ')
        # A colon stays glued to its prefix ('wd:') and is split from what follows.
        spaced = spaced.replace(':', ': ')

        tokens = spaced.lower().split()
        for idx, token in enumerate(tokens):
            # Precedence: vocabulary placeholder, then entity label, then
            # relation label.
            if token in self.vocab_dict:
                tokens[idx] = self.vocab_dict[token]
            elif token in self.ent_labels:
                tokens[idx] = self.ent_labels[token]
            elif token in self.rel_labels:
                tokens[idx] = self.rel_labels[token]

        return ' '.join(tokens).strip()

    def _preprocess(self, data):
        """Apply `process` to the 'input' and 'target' fields of one example.

        Args:
            data: mapping with string values under 'input' and 'target'.

        Returns:
            A new dict with the same two keys holding the converted strings.
        """
        return {'input': self.process(data['input']), 'target': self.process(data['target'])}

In [3]:
# Load the raw test and train splits; `with` closes each file once it is parsed.
with open('./json/flip_test.json', 'rb') as handle:
    test_data = json.load(handle)
with open('./json/flip_train.json', 'rb') as handle:
    train_data = json.load(handle)

In [5]:
# Directory that the export cells below write train.json / test.json into.
path = '../../transform/transformers_cache/downloads/LC-QuAD2.0-pre/dataset'

# Shared preprocessor instance; constructing it reads the label JSON files.
pre = Preprocess()

In [8]:
# Convert every training example (tqdm shows progress), then save the list
# as indented JSON under `path`.
train = list(map(pre._preprocess, tqdm(train_data)))

with open(f'{path}/train.json','w+') as out_file:
    serialized = json.dumps(train, indent=2)
    out_file.write(serialized)

100%|██████████| 24180/24180 [00:00<00:00, 31656.27it/s]


In [9]:
# Convert every test example (tqdm shows progress), then save the list
# as indented JSON under `path`.
test = list(map(pre._preprocess, tqdm(test_data)))

with open(f'{path}/test.json','w+') as out_file:
    serialized = json.dumps(test, indent=2)
    out_file.write(serialized)

100%|██████████| 6046/6046 [00:00<00:00, 36105.13it/s]


In [33]:
# NOTE(review): `res` is not assigned in any cell visible in this notebook,
# so on a fresh kernel this cell presumably raises NameError — confirm where
# the value came from before relying on the output shown below it.
res

{'input': '<extra_id_6> <extra_id_21> <extra_id_39> <extra_id_19> <extra_id_33> <extra_id_53> manticore <extra_id_54> Theoi Project ID <extra_id_39> <extra_id_15>',
 'target': '<extra_id_6> <extra_id_21> <extra_id_39> <extra_id_19> <extra_id_33> <extra_id_53> manticore <extra_id_54> Theoi Project ID <extra_id_39> <extra_id_15>'}

In [13]:
# Number of training examples whose 'input' string differs from its 'target'.
count = sum(1 for example in train_data if example['input'] != example['target'])
count

17508

In [14]:
# Share of training examples whose 'input' differs from 'target'
# (`count` is computed in the preceding cell).
count/len(train_data)

0.7240694789081886