In [23]:
import string
import json
import tqdm
from transformers import BertTokenizer


# Pretrained `bert-base-cased` tokenizer with lower-casing disabled.
# get_sent_partitions() below uses it to measure the encoded length of each sentence.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

def get_sent_partitions(sents, max_length=300):
    """Group consecutive sentences into partitions bounded by encoded length.

    Each sentence is encoded with the module-level ``tokenizer`` (special
    tokens included) and the sentences are greedily packed, in order, into
    half-open index ranges. A partition is closed just before the sentence
    that would push its running encoded length above ``max_length``.

    Args:
        sents: Sequence of sentences, each passed as-is to
            ``tokenizer.encode``. NOTE(review): the calling cell passes lists
            of word tokens; how pre-split input is encoded depends on the
            installed transformers version -- confirm.
        max_length: Maximum summed encoded length of one partition
            (defaults to 300, the value previously hard-coded).

    Returns:
        A list of ``(start, end)`` tuples with ``end`` exclusive. The ranges
        are contiguous, cover every sentence, and are never empty. A single
        sentence longer than ``max_length`` gets a partition of its own.
        An empty ``sents`` yields ``[]``.
    """
    tokens_lengths = []
    for sent in sents:
        token_encoding = tokenizer.encode(sent, add_special_tokens=True)
        if not token_encoding:
            # Count an empty encoding as one [UNK] id so the sentence still
            # contributes a non-zero length.
            token_encoding = [tokenizer.convert_tokens_to_ids('[UNK]')]
        tokens_lengths.append(len(token_encoding))

    partitions = []
    start = 0
    sum_length = 0
    for idx, length in enumerate(tokens_lengths):
        # `idx > start` guards against closing an empty partition: without it
        # an over-long first sentence produced a bogus (0, 0) range.
        if idx > start and sum_length + length > max_length:
            partitions.append((start, idx))
            start = idx
            sum_length = length
        else:
            sum_length += length
    # Flush the trailing partition (skipped only when there are no sentences).
    if start < len(tokens_lengths):
        partitions.append((start, len(tokens_lengths)))
    return partitions


# Split each OntoNotes document into partitions produced by
# get_sent_partitions() and write them out as independent documents.
# Switch `key` to choose the split to convert.
# key = 'dev'
key = 'train'
file_name = '../data/datasets/ontonotes/%s.json' % key

# Context manager closes the input handle as soon as parsing is done.
with open(file_name) as in_file:
    data = json.load(in_file)

docs = []

for cur_d in tqdm.tqdm(data):
    # `sents` and `ents` are sliced with the same sentence indices below, so
    # they are expected to be parallel per-sentence lists.
    sents, ents = cur_d['tokens'], cur_d['entities']

    partitions = get_sent_partitions(sents)

    for part_s, part_e in partitions:
        part_sents = sents[part_s: part_e]
        part_ents = ents[part_s: part_e]

        # print('-------->', part_ents)

        # Number of tokens in the sentences that precede this partition;
        # subtracted so entity offsets become relative to the partition.
        part_diff = sum(len(sent) for sent in sents[:part_s])

        entities = []

        # Enumerate the sentences (not the entities inside one sentence) so
        # that `sent_id` is the index of the sentence within this partition.
        for sent_id, cur_ents in enumerate(part_ents):
            for start, end, label in cur_ents:
                entities.append({
                    'start': start - part_diff,
                    # +1 presumably turns an inclusive end index into an
                    # exclusive one -- confirm against the input format.
                    'end': end - part_diff + 1,
                    'type': label,
                    'sent_id': sent_id})
        docs.append({'tokens': part_sents, 'entities': entities, 'relations': []})
print(len(docs))

# Closing the output handle guarantees the JSON is fully flushed to disk.
with open('../data/datasets/ontonotes/doc_%s.json' % key, 'w') as out_file:
    json.dump(docs, out_file)

100%|██████████| 322/322 [00:00<00:00, 985.57it/s] 


746


In [None]:
from transformers import BertTokenizer


# Pretrained `bert-base-cased` tokenizer with lower-casing disabled.
# NOTE(review): this repeats the tokenizer setup from the first cell with the
# same arguments; it looks redundant once that cell has run -- confirm before removing.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)


In [13]:
import json

# Collect entity statistics for one partitioned split: the set of entity
# types, the largest span size, and the spans whose size exceeds 8.
key = 'test'

# Context manager closes the input handle as soon as parsing is done.
with open('../data/datasets/ontonotes/doc_%s.json' % key) as in_file:
    data = json.load(in_file)
keys = set()        # distinct entity type labels
max_size = 0        # largest observed `end - start`
big = []            # every span size greater than 8
# Named `num_entities` rather than `all` so the builtin all() is not shadowed
# for the remainder of the kernel session.
num_entities = 0

for doc in data:
    # Flattened document tokens; only needed by the optional debug prints below.
    tokens = []
    for tks in doc['tokens']:
        tokens.extend(tks)
    for ent in doc['entities']:
        num_entities += 1
        size = ent['end'] - ent['start']
        if size > 8:
            big.append(size)
            # print('---------->', ent['end'] - ent['start'])
            # print(tokens[ent['start']: ent['end']])
        max_size = max(max_size, size)
        keys.add(ent['type'])

print(len(keys), keys)

# A comprehension keeps its loop variable local, so `key` (the split name
# above) is no longer overwritten with an entity type.
keys_dict = {type_name: {'short': type_name, 'verbose': type_name} for type_name in keys}

# json.dump({'entities': keys_dict, 'relations': {}}, open('../data/datasets/ontonotes/ontonotes_types.json', 'w'))

# Avoid ZeroDivisionError when the split contains no entities at all.
big_ratio = len(big) / num_entities if num_entities else 0.0
print(big, num_entities, big_ratio, max_size)

18 {'ORG', 'PERCENT', 'TIME', 'QUANTITY', 'EVENT', 'MONEY', 'PRODUCT', 'CARDINAL', 'NORP', 'ORDINAL', 'LAW', 'GPE', 'LOC', 'DATE', 'PERSON', 'WORK_OF_ART', 'FAC', 'LANGUAGE'}
[9, 9, 11, 9, 10, 11, 12, 13, 10, 9, 12, 13, 21, 18, 9, 15] 11257 0.001421337834236475 21


In [2]:
import json

# Concatenate the partitioned train and dev documents into one training file.
# Context managers close every handle; for the output this guarantees the
# merged JSON is completely flushed to disk before the cell finishes.
with open('../data/datasets/ontonotes/doc_train.json') as train_file:
    train_data = json.load(train_file)
with open('../data/datasets/ontonotes/doc_dev.json') as dev_file:
    dev_data = json.load(dev_file)

with open('../data/datasets/ontonotes/doc_train_dev.json', 'w') as out_file:
    json.dump(train_data + dev_data, out_file)