# First tests with the API

In [1]:
# Hand-written reference sample. Every list below has one entry per token of
# "words" (19 tokens for this sentence); "_" is the placeholder for "no value".
example_input = {
    # Head of each token. Values look 1-based with 0 marking the root
    # (token 3, "recommends", has head 0) — TODO confirm against the dataset spec.
    "dependency_heads": [3,3,0,7,6,7,3,9,7,11,7,15,15,15,11,18,18,15,3],
    "dependency_relations": ["nsubj","advmod","root","mark","det","nsubj","ccomp","amod","obj","mark","advcl","det","amod","amod","obj","case","compound","nmod","punct"],
    "lemmas": ["it","also","recommend","that","the","authority","take","appropriate","measure","to","meet","the","specific","educational","need","of","Roma","child","."],
    "pos_tags": ["PRON","ADV","VERB","SCONJ","DET","NOUN","VERB","ADJ","NOUN","PART","VERB","DET","ADJ","ADJ","NOUN","ADP","PROPN","NOUN","PUNCT"],
    # Frame label at the position of each predicate token, "_" everywhere else.
    "predicates": ["_","_","PROPOSE","_","_","_","CARRY-OUT-ACTION","_","_","_","SATISFY_FULFILL","_","_","_","_","_","_","_","_"], 
    "words": [
        "It","also","recommends","that","the","authorities","take","appropriate","measures","to",
        "meet","the","specific","educational","needs","of","Roma","children","."],
    # One entry per predicate, keyed by the predicate's 0-based token position
    # written as a string ("2", "6", "10" are the non-"_" slots of "predicates").
    # Each list tags the tokens that carry a role for that predicate.
    "roles": {
        "2": ["_","agent","_","topic","_","_","_","_","_","_","_","_","_","_","_","_","_","_","_"],
        "6": ["_","_","_","_","_","agent","_","_","patient","goal","_","_","_","_","_","_","_","_","_"],
        "10": ["_","_","_","_","_","_","_","_","_","_","_","_","_","_","theme","_","_","_","_"]
    }
}

In [2]:
# Rebuild the sentence as plain text by joining the sample's tokens with single spaces.
sample_text = str.join(' ', example_input['words'])
sample_text

'It also recommends that the authorities take appropriate measures to meet the specific educational needs of Roma children .'

In [3]:
# Endpoints of the locally hosted model servers, one port per service.
# Only `verbatlas_dependency_url` is referenced by the visible cells of this notebook.
verbatlas_span_url = 'http://127.0.0.1:3001/api/model'
amuse_url = 'http://127.0.0.1:3002/api/model'
verbatlas_dependency_url = 'http://127.0.0.1:3003/api/model'

In [4]:
import requests
# Smoke test: post one pre-tokenised sentence to the dependency endpoint
# and show the response object (its repr includes the HTTP status).
test_str = (
    "We urge the parties to refrain from the use of force and provocative acts , "
    "which only serve to undermine the peace process , and we appeal to them "
    "to take immediate steps to create the necessary environment for a restoration of peace , "
    "stability and the continuation of talks leading to a comprehensive , just and lasting peace "
    "based on Security Council resolutions 242 ( 1967 ) and 338 ( 1973 ) ."
)
# The service takes a list of requests, each with the text and its language code.
http_input = [
    {'text': test_str, 'lang': 'EN'},
]
x = requests.post(url=verbatlas_dependency_url, json=http_input)
print(x)

<Response [200]>


In [5]:
import json
# Decode the body of the response obtained in the previous cell and display it.
x_json = json.loads(s=x.text)
x_json

[{'tokens': [{'index': 0, 'rawText': 'We'},
   {'index': 1, 'rawText': 'urge'},
   {'index': 2, 'rawText': 'the'},
   {'index': 3, 'rawText': 'parties'},
   {'index': 4, 'rawText': 'to'},
   {'index': 5, 'rawText': 'refrain'},
   {'index': 6, 'rawText': 'from'},
   {'index': 7, 'rawText': 'the'},
   {'index': 8, 'rawText': 'use'},
   {'index': 9, 'rawText': 'of'},
   {'index': 10, 'rawText': 'force'},
   {'index': 11, 'rawText': 'and'},
   {'index': 12, 'rawText': 'provocative'},
   {'index': 13, 'rawText': 'acts'},
   {'index': 14, 'rawText': ','},
   {'index': 15, 'rawText': 'which'},
   {'index': 16, 'rawText': 'only'},
   {'index': 17, 'rawText': 'serve'},
   {'index': 18, 'rawText': 'to'},
   {'index': 19, 'rawText': 'undermine'},
   {'index': 20, 'rawText': 'the'},
   {'index': 21, 'rawText': 'peace'},
   {'index': 22, 'rawText': 'process'},
   {'index': 23, 'rawText': ','},
   {'index': 24, 'rawText': 'and'},
   {'index': 25, 'rawText': 'we'},
   {'index': 26, 'rawText': 'appeal

# Creating the pipeline

In [6]:
import json
import requests
from copy import deepcopy
import re
from tqdm import tqdm

def load_data(lang_data_path):
    """Read a JSON file and return its decoded content.

    Args:
        lang_data_path: path of the JSON file to load.

    Returns:
        The Python object the JSON document decodes to.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # The context manager closes the handle even when decoding fails, and the
    # explicit encoding keeps the result independent of the platform default.
    with open(lang_data_path, encoding='utf-8') as f:
        return json.load(f)

def list_to_phrase(l_w):
    """Turn a token list (or an already joined string) into the text sent to the API.

    A list is joined with single spaces; any other value is used as given.
    The text is then rewritten in three passes:
      1. the space in front of a straight apostrophe is removed;
      2. the space in front of a typographic apostrophe (’) is removed;
      3. a period that directly follows a word character and is followed by a
         space or by the end of the text becomes ' . ' (so a final period
         leaves a trailing space).

    Args:
        l_w: list of tokens, or a string.

    Returns:
        The rewritten string.
    """
    if type(l_w) == list:
        text = ' '.join(l_w)
    else:
        text = l_w

    # (pattern, replacement) pairs, applied in this order
    rewrites = (
        (r'( \')', '\''),
        (r'( ’)', '’'),
        (r'(\b\. )|(\b\.$)', ' . '),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text

def reconstruct_subwords(text_str_input, verbatlas_out):
    """Re-align the tokens returned by the service with the words of the input text.

    The returned tokens are walked in parallel with the space-separated words
    of ``text_str_input``. When a token does not match the current word
    (case-insensitively), it is concatenated with the token that follows it,
    and every later token ``index``, predicate ``tokenIndex`` and role ``span``
    boundary is shifted down by one. The comparison is then repeated on the
    merged token, so a word split into several pieces is rebuilt step by step.

    Args:
        text_str_input: the sentence that was sent, tokens separated by single spaces.
        verbatlas_out: response dict for one sentence, with ``tokens`` (dicts
            holding ``index`` and ``rawText``) and ``annotations`` (dicts
            holding ``tokenIndex`` and ``verbatlas`` -> ``roles`` -> ``span``).

    Returns:
        ``(text_str_input, aligned)`` where ``aligned`` is a deep copy of
        ``verbatlas_out`` with re-aligned tokens and with only the
        ``tokenIndex`` and ``verbatlas`` keys kept in each annotation.
        ``verbatlas_out`` itself is left unmodified.

    Raises:
        IndexError: if the alignment fails, i.e. the text runs out of words
            before the tokens do, or the last token would need to be merged
            with a successor that does not exist.
    """
    text_l = text_str_input.split(' ')

    # Work on a private copy so the caller's response object is never altered.
    r_v_i = deepcopy(verbatlas_out)

    # keep only the VerbAtlas part of every annotation
    for annotation in r_v_i['annotations']:
        for k in list(annotation.keys()):
            if k != 'tokenIndex' and k != 'verbatlas':
                annotation.pop(k, None)

    tokens = r_v_i['tokens']
    annotations = r_v_i['annotations']

    # VerbAtlas sanitisation
    text_i = 0; json_i = 0
    while json_i < len(tokens):

        raw_lower = tokens[json_i]['rawText'].lower()
        expected = text_l[text_i].lower()

        if raw_lower == expected:
            text_i += 1; json_i += 1
            continue
        elif raw_lower.strip() == expected:
            # same word once surrounding whitespace is ignored: store the normalised form
            tokens[json_i]['rawText'] = raw_lower.strip()
            text_i += 1; json_i += 1
            continue

        # Token json_i is about to absorb token json_i+1: everything that
        # points past json_i (predicates and role spans) moves one slot down.
        for annotation in annotations:
            if annotation['tokenIndex'] > json_i:
                annotation['tokenIndex'] -= 1
            for role in annotation['verbatlas']['roles']:
                for side in [0, 1]:
                    if role['span'][side] > json_i:
                        role['span'][side] -= 1

        merged = tokens[json_i]['rawText'] + tokens[json_i+1]['rawText']
        tokens[json_i]['rawText'] = merged.strip()

        for ii in range(json_i+2, len(tokens)):
            tokens[ii]['index'] -= 1

        del tokens[json_i+1]
        # text_i / json_i are not advanced: the merged token is checked again

    return text_str_input, r_v_i

def _build_sample(r_v_i, null_tag):
    """Convert one re-aligned response into the ``roles`` / ``words`` / ``predicates`` layout."""
    n_tokens = len(r_v_i['tokens'])

    sample = {'roles': {}}
    # words
    sample['words'] = [e['rawText'] for e in r_v_i['tokens']]

    # predicates + roles
    sample['predicates'] = [null_tag] * n_tokens
    for annotation in r_v_i['annotations']:
        frame_name = annotation['verbatlas']['frameName']
        if frame_name == null_tag:
            continue

        tokenIndex = annotation['tokenIndex']
        sample['predicates'][tokenIndex] = frame_name

        role_tags = [null_tag] * n_tokens
        for role_for_i in annotation['verbatlas']['roles']:
            # only the first token of the span receives the (lower-cased) role
            role_tags[role_for_i['span'][0]] = role_for_i['role'].lower()
        sample['roles'][str(tokenIndex)] = role_tags

    return sample


def generate_data(data, verbatlas_link, lang="EN", chunk_dim=1):
    """Annotate every sample of ``data`` through the HTTP service at ``verbatlas_link``.

    Sentences are rebuilt from each sample's ``words``, posted in chunks of
    ``chunk_dim`` sentences, re-aligned with ``reconstruct_subwords`` and
    converted to the dataset layout. A chunk answered with a status other than
    200 is reported, counted as skipped and left out of the result. The last
    chunk is sent even when it holds fewer than ``chunk_dim`` sentences.

    Args:
        data: mapping from sample key to a dict that has a ``words`` entry.
        verbatlas_link: URL the chunks are posted to.
        lang: language code sent along with every sentence.
        chunk_dim: number of sentences per request.

    Returns:
        Dict mapping each successfully processed sample key to a dict with
        ``roles``, ``words`` and ``predicates``.
    """
    null_tag = '_'
    result = {}

    sentences_skipped = 0
    to_send = []
    n_samples = len(data)

    # enumerate() hides the length from tqdm, so pass it explicitly
    pbar = tqdm(enumerate(data.items()), total=n_samples)

    for data_i, (data_sample_key, data_sample_value) in pbar:

        text_sampled = list_to_phrase(data_sample_value['words'])
        to_send.append({'text': text_sampled, 'lang': lang, 'key_id': data_sample_key})

        # Keep filling the chunk; the final sample always triggers a send so
        # that a trailing partial chunk is not lost.
        is_last = data_i == n_samples - 1
        if len(to_send) % chunk_dim != 0 and not is_last:
            continue

        pbar.set_description(f'percentage: {((data_i+1)/len(data)):.4f} (index={data_i})')

        # hand the chunk over and start a fresh one for the next iterations
        chunk, to_send = to_send, []

        res_v = requests.post(verbatlas_link, json=chunk)

        if res_v.status_code != 200:
            print(f'verbatlas={res_v.status_code} | skipping chunk!')
            sentences_skipped += len(chunk)
            # do not try to decode the body of a failed request
            continue

        for t_i, r_v_i in zip(chunk, json.loads(res_v.text)):
            _, r_v_i = reconstruct_subwords(t_i['text'], r_v_i)
            result[t_i['key_id']] = _build_sample(r_v_i, null_tag)

    print(f'skipped {sentences_skipped} sentences out of {len(data.values())}')
    return result

# Recreating the English dataset

In [68]:
# Load the English training split from its JSON file.
dataset_en = load_data(lang_data_path='../../../../data/EN/train.json')

In [69]:
# Wrap the sample at position 4349 in a one-entry dataset (key 'r_i') and
# show its tokens; the printed list contains an empty-string token.
err_phrase = {'r_i': [*dataset_en.values()][4349]}
print(err_phrase['r_i']['words'])

['At', 'the', 'same', 'meeting', ',', 'the', 'Committee', 'decided', ',', 'by', 'acclamation', ',', 'to', 'recommend', 'to', 'the', 'General', 'Assembly', 'that', 'it', 'appoint', 'David', 'Dutton', 'as', 'a', 'member', 'of', 'the', 'Committee', 'on', 'Contributions', 'for', 'a', 'term', 'of', 'office', 'beginning', 'on', '', '2003', 'and', 'ending', 'on', '31', 'December', '2004', '(', 'see', 'para', '.', '10', ')', '.']


In [70]:
# Annotate the whole English split through the dependency endpoint
# (the recorded run took about four and a half minutes).
dataset_en_generated = generate_data(
    data=dataset_en,
    verbatlas_link=verbatlas_dependency_url,
    lang="EN",
)

percentage: 0.0711 (index=390): : 388it [00:18, 22.23it/s]

verbatlas=500 | skipping chunk!


percentage: 0.0820 (index=450): : 448it [00:21, 22.45it/s]

verbatlas=500 | skipping chunk!


percentage: 0.2521 (index=1386): : 1385it [01:04, 23.58it/s]

verbatlas=500 | skipping chunk!


percentage: 0.2729 (index=1500): : 1499it [01:09, 22.68it/s]

verbatlas=500 | skipping chunk!
verbatlas=500 | skipping chunk!


percentage: 0.3052 (index=1678): : 1676it [01:17, 22.86it/s]

verbatlas=500 | skipping chunk!


percentage: 0.3341 (index=1837): : 1835it [01:24, 24.14it/s]

verbatlas=500 | skipping chunk!


percentage: 0.3414 (index=1877): : 1877it [01:26, 22.93it/s]

verbatlas=500 | skipping chunk!


percentage: 0.5737 (index=3155): : 3155it [02:27, 13.45it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8046 (index=4425): : 4425it [03:34, 21.99it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8135 (index=4474): : 4473it [03:36, 21.91it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8751 (index=4813): : 4813it [03:54, 22.02it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8897 (index=4893): : 4891it [03:58, 22.06it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8975 (index=4936): : 4936it [04:00, 21.29it/s]

verbatlas=500 | skipping chunk!


percentage: 0.9098 (index=5004): : 5002it [04:03, 22.05it/s]

verbatlas=500 | skipping chunk!


percentage: 0.9462 (index=5204): : 5204it [04:13, 20.56it/s]

verbatlas=500 | skipping chunk!
verbatlas=500 | skipping chunk!


percentage: 0.9509 (index=5230): : 5230it [04:14, 21.73it/s]

verbatlas=500 | skipping chunk!


percentage: 0.9829 (index=5406): : 5404it [04:23, 20.67it/s]

verbatlas=500 | skipping chunk!


percentage: 1.0000 (index=5500): : 5501it [04:27, 20.54it/s]

skipped 19 sentences out of 5501





In [71]:
import json

# Persist the generated annotations as indented JSON.
with open(file='../../../../data/EN/train_autogenerated.json', mode='w') as outfile:
    outfile.write(json.dumps(dataset_en_generated, indent=4))

# Tests

In [84]:
# Run the pipeline on the single hand-written sample and keep its generated entry.
ex_out = generate_data(
    data={'r_i': example_input},
    verbatlas_link=verbatlas_dependency_url,
    lang="EN",
)['r_i']

percentage: 1.0000 (index=0): : 1it [00:00,  6.46it/s]

skipped 0 sentences out of 1





In [86]:
# Print the reference annotation and the generated one, restricted to the
# same three fields, so they can be compared by eye.
print({field: example_input[field] for field in ('words', 'predicates', 'roles')})
print()
print({field: ex_out[field] for field in ('words', 'predicates', 'roles')})

{'words': ['It', 'also', 'recommends', 'that', 'the', 'authorities', 'take', 'appropriate', 'measures', 'to', 'meet', 'the', 'specific', 'educational', 'needs', 'of', 'Roma', 'children', '.'], 'predicates': ['_', '_', 'PROPOSE', '_', '_', '_', 'CARRY-OUT-ACTION', '_', '_', '_', 'SATISFY_FULFILL', '_', '_', '_', '_', '_', '_', '_', '_'], 'roles': {'2': ['_', 'agent', '_', 'topic', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], '6': ['_', '_', '_', '_', '_', 'agent', '_', '_', 'patient', 'goal', '_', '_', '_', '_', '_', '_', '_', '_', '_'], '10': ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'theme', '_', '_', '_', '_']}}

{'words': ['It', 'also', 'recommends', 'that', 'the', 'authorities', 'take', 'appropriate', 'measures', 'to', 'meet', 'the', 'specific', 'educational', 'needs', 'of', 'Roma', 'children', '.'], 'predicates': ['_', '_', 'PROPOSE', '_', '_', '_', 'TAKE', '_', '_', '_', 'SATISFY_FULFILL', '_', '_', '_', '_', '_', '_', '

In [None]:
# NOTE(review): verbatim repeat of the earlier loading cell (execution count
# is `In [None]`); consider deleting it.
dataset_en = load_data(lang_data_path='../../../../data/EN/train.json')

In [None]:
# NOTE(review): verbatim repeat of the earlier inspection cell for sample 4349;
# consider deleting it.
err_phrase = {'r_i': [*dataset_en.values()][4349]}
print(err_phrase['r_i']['words'])

['At', 'the', 'same', 'meeting', ',', 'the', 'Committee', 'decided', ',', 'by', 'acclamation', ',', 'to', 'recommend', 'to', 'the', 'General', 'Assembly', 'that', 'it', 'appoint', 'David', 'Dutton', 'as', 'a', 'member', 'of', 'the', 'Committee', 'on', 'Contributions', 'for', 'a', 'term', 'of', 'office', 'beginning', 'on', '', '2003', 'and', 'ending', 'on', '31', 'December', '2004', '(', 'see', 'para', '.', '10', ')', '.']


In [None]:
# NOTE(review): verbatim repeat of the earlier generation cell; running it
# repeats the full pass over the dataset. Consider deleting it.
dataset_en_generated = generate_data(
    data=dataset_en,
    verbatlas_link=verbatlas_dependency_url,
    lang="EN",
)

percentage: 0.0711 (index=390): : 388it [00:18, 22.23it/s]

verbatlas=500 | skipping chunk!


percentage: 0.0820 (index=450): : 448it [00:21, 22.45it/s]

verbatlas=500 | skipping chunk!


percentage: 0.2521 (index=1386): : 1385it [01:04, 23.58it/s]

verbatlas=500 | skipping chunk!


percentage: 0.2729 (index=1500): : 1499it [01:09, 22.68it/s]

verbatlas=500 | skipping chunk!
verbatlas=500 | skipping chunk!


percentage: 0.3052 (index=1678): : 1676it [01:17, 22.86it/s]

verbatlas=500 | skipping chunk!


percentage: 0.3341 (index=1837): : 1835it [01:24, 24.14it/s]

verbatlas=500 | skipping chunk!


percentage: 0.3414 (index=1877): : 1877it [01:26, 22.93it/s]

verbatlas=500 | skipping chunk!


percentage: 0.5737 (index=3155): : 3155it [02:27, 13.45it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8046 (index=4425): : 4425it [03:34, 21.99it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8135 (index=4474): : 4473it [03:36, 21.91it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8751 (index=4813): : 4813it [03:54, 22.02it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8897 (index=4893): : 4891it [03:58, 22.06it/s]

verbatlas=500 | skipping chunk!


percentage: 0.8975 (index=4936): : 4936it [04:00, 21.29it/s]

verbatlas=500 | skipping chunk!


percentage: 0.9098 (index=5004): : 5002it [04:03, 22.05it/s]

verbatlas=500 | skipping chunk!


percentage: 0.9462 (index=5204): : 5204it [04:13, 20.56it/s]

verbatlas=500 | skipping chunk!
verbatlas=500 | skipping chunk!


percentage: 0.9509 (index=5230): : 5230it [04:14, 21.73it/s]

verbatlas=500 | skipping chunk!


percentage: 0.9829 (index=5406): : 5404it [04:23, 20.67it/s]

verbatlas=500 | skipping chunk!


percentage: 1.0000 (index=5500): : 5501it [04:27, 20.54it/s]

skipped 19 sentences out of 5501





In [None]:
import json

# NOTE(review): verbatim repeat of the earlier export cell; it overwrites the
# same output file. Consider deleting it.
with open(file='../../../../data/EN/train_autogenerated.json', mode='w') as outfile:
    outfile.write(json.dumps(dataset_en_generated, indent=4))