In [8]:
from tqdm import tqdm
from difflib import SequenceMatcher


def read_docs(file_name):
    """Collect ``(doc_id, title)`` pairs from a ``|``-delimited corpus file.

    A line is kept only when it has at least three ``|``-separated fields and
    the second field is exactly ``'t'``; every other line is ignored.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of ``(doc_id, title)`` tuples in file order, where ``doc_id`` is
        the first field (kept as a string) and ``title`` is everything after
        the second ``|``.
    """
    with open(file_name) as f:
        lines = f.readlines()

    docs = []
    for line in tqdm(lines):
        # Split on the first two separators only: a title that itself contains
        # '|' would otherwise yield more than three fields and make the
        # three-way unpacking below raise ValueError.
        items = line.strip().split('|', 2)
        if len(items) == 3 and items[1] == 't':
            doc_id, _, title = items
            docs.append((doc_id, title))
    return docs

def sim_score(title, tokens):
    """Score how closely ``title`` matches the space-joined ``tokens``.

    Both strings are cut to the length of the shorter one before comparison,
    so only their common-length prefixes are scored.

    Returns:
        The ``SequenceMatcher.ratio()`` of the two prefixes, a float in [0, 1].
    """
    joined = ' '.join(tokens)
    # Compare equal-length prefixes only.
    prefix_len = len(title) if len(title) < len(joined) else len(joined)
    matcher = SequenceMatcher(None, title[:prefix_len], joined[:prefix_len])
    return matcher.ratio()


def _sentence_entities(doc_sent, offset, sent_id):
    """Return the entity spans of one sentence, shifted by ``offset`` tokens.

    A span starts at a token tagged ``1`` and extends over the immediately
    following tokens tagged ``2``.  Presumably 1/2 are the B-/I-Disease labels
    of the ncbi_disease dataset -- confirm against the dataset's label list.
    ``end`` is exclusive.
    """
    tags = doc_sent['ner_tags']
    num_tokens = len(doc_sent['tokens'])
    entities = []
    word_idx = 0
    while word_idx < num_tokens:
        if tags[word_idx] != 1:
            word_idx += 1
            continue
        end = word_idx + 1
        while end < num_tokens and tags[end] == 2:
            end += 1
        entities.append({'start': word_idx + offset, 'end': end + offset, 'type': 'Disease', 'sent_id': sent_id})
        word_idx = end
    return entities


def align(docs, sents, match_threshold=0.8, warn_threshold=0.9, align_path='./align_results.txt'):
    """Assign every sentence to a document and build per-document records.

    ``docs`` and ``sents`` are walked in lockstep: for each document the
    sentence list is scanned forward until a sentence matches the document
    title (``sim_score >= match_threshold``).  Sentences skipped on the way
    are assigned to the previous document (``None`` before the first title).
    The matched title sentence and the sentence right after it are assigned to
    the current document, and scanning resumes two sentences further on.
    Sentences left over after the last title go to the last document.

    Side effects: adds a ``'doc_id'`` key to every dict in ``sents``, writes an
    alignment log to ``align_path``, and prints every match whose score is
    below ``warn_threshold``.

    Args:
        docs: Sequence of ``(doc_id, title)`` pairs, in the order their
            sentences appear in ``sents``.
        sents: List of dicts with at least ``'tokens'`` and ``'ner_tags'``.
        match_threshold: Minimum ``sim_score`` for a sentence to count as a
            document's title.
        warn_threshold: Matches scoring below this are printed for review.
        align_path: Where the alignment log is written.

    Returns:
        A list with one dict per entry of ``docs``:
        ``{'doc_id', 'tokens', 'entities', 'relations'}``, where ``tokens`` is
        a list of per-sentence token lists, ``entities`` use document-level
        token offsets, and ``relations`` is always empty.

    Raises:
        RuntimeError: If the sentences run out before a title is matched.
    """
    sent_counter = 0
    doc_id = None
    # The context manager guarantees the log is flushed and closed even when
    # alignment fails part-way through.
    with open(align_path, 'w') as align_file:
        for doc in tqdm(docs):
            cur_doc_id = doc_id
            doc_id, title = doc
            while True:
                # Check bounds before indexing so exhaustion is reported
                # explicitly rather than as a bare IndexError.
                if sent_counter >= len(sents):
                    raise RuntimeError(
                        'ran out of sentences while looking for the title of doc %s: %r'
                        % (doc_id, title))
                score = sim_score(title, sents[sent_counter]['tokens'])
                if score >= match_threshold:
                    break
                sents[sent_counter]['doc_id'] = cur_doc_id
                sent_counter += 1

            header = '-------->%s, %s, %s' % (doc_id, sent_counter, score)
            matched_text = ' '.join(sents[sent_counter]['tokens'])
            align_file.write(header + '\n')
            align_file.write(matched_text + '\n')
            align_file.write(title + '\n\n')
            if score < warn_threshold:
                print(header)
                print(matched_text)
                print(title + '\n')

            sents[sent_counter]['doc_id'] = doc_id
            # The title may be the very last sentence; only tag the follower
            # when it exists.
            if sent_counter + 1 < len(sents):
                sents[sent_counter + 1]['doc_id'] = doc_id
            sent_counter += 2

    for sent_id in range(sent_counter, len(sents)):
        sents[sent_id]['doc_id'] = doc_id

    for sent in sents:
        assert 'doc_id' in sent

    doc_sents_dict = {}
    for sent in sents:
        doc_sents_dict.setdefault(sent['doc_id'], []).append(sent)

    new_docs = []
    for doc_id, title in docs:
        tokens = []
        entities = []
        offset = 0  # number of tokens in the document's earlier sentences
        for sent_id, doc_sent in enumerate(doc_sents_dict[doc_id]):
            tokens.append(doc_sent['tokens'])
            entities.extend(_sentence_entities(doc_sent, offset, sent_id))
            offset += len(doc_sent['tokens'])

        new_docs.append({'doc_id': doc_id, 'tokens': tokens, 'entities': entities, 'relations': []})

    return new_docs


In [9]:
from tqdm import tqdm
import json
from datasets import load_dataset

dataset = load_dataset("ncbi_disease")

# Each entry is (corpus file key, dataset split name, output file key).
# keys = [('train', 'train', 'train'), ('develop', 'validation', 'dev'), ('test', 'test', 'test')]

keys = [('test', 'test', 'test')]

for file_key, dataset_key, save_key in keys:
    print('processing ------>', file_key, dataset_key, save_key)
    file_name = '../data/datasets/ncbi_disease/NCBI%sset_corpus.txt' % file_key
    docs = read_docs(file_name)
    # align() consumes docs and sentences in lockstep, so the docs are put in
    # sorted (doc_id, title) order first -- presumably the order the dataset's
    # sentences follow; verify if alignment starts failing.
    docs.sort()
    # Use context managers so every output file is flushed and closed before
    # the next step runs.
    with open('../data/datasets/ncbi_disease/%s_title.json' % file_key, 'w') as f:
        json.dump(docs, f)

    # Drop sentences with no tokens.
    sents = [sent for sent in dataset[dataset_key] if len(sent['tokens']) > 0]

    with open('../data/datasets/ncbi_disease/%s.json' % save_key, 'w') as f:
        json.dump(sents, f)
    new_docs = align(docs, sents)
    with open('../data/datasets/ncbi_disease/new_doc_%s.json' % save_key, 'w') as f:
        json.dump(new_docs, f)

# print(num_sents)
# print(len(dataset[dataset_key]))

# print(line.strip().split('\t'))

Found cached dataset ncbi_disease (/home/hygao/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03)


  0%|          | 0/3 [00:00<?, ?it/s]

processing ------> test test test


100%|██████████| 1259/1259 [00:00<00:00, 2182980.05it/s]
 40%|████      | 40/100 [00:00<00:00, 389.78it/s]

-------->9585606, 433, 0.8842105263157894
The hemochromatosis 845 G - - > A and 187 C - - > G mutations : prevalence in non - Caucasian populations .
The hemochromatosis 845 G-->A and 187 C-->G mutations: prevalence in non-Caucasian populations.



100%|██████████| 100/100 [00:00<00:00, 398.74it/s]


In [8]:
import json
from datasets import load_dataset


# Corpus split to inspect; switch the assignment below to look at another one.
# file_key = 'develop'
file_key = 'train'
file_name = '../data/datasets/ncbi_disease/NCBI%sset_corpus.txt' % file_key

dataset = load_dataset("ncbi_disease")

# docs = []
# with open(file_name) as fh:
#     lines = fh.readlines()

# num_sents = 0
# for line in tqdm(lines):
#     doc_id, title, abstract = line.strip().split('\t')
#     # sents = list(nlp(abstract).sents)
#     # num_sents += len(sents) + 1
#     docs.append((doc_id, title, abstract))

# Titles of the selected corpus file, in sorted (doc_id, title) order.
docs = sorted(read_docs(file_name))

# NOTE(review): this prints the sentence count of the 'validation' split next
# to the document count of the file selected by file_key ('train' here) --
# confirm the two are meant to refer to different splits.
num_validation_sents = len(dataset['validation'])
num_docs = len(docs)
print(num_validation_sents, num_docs)

# train_data = []
# docs = []
# keys = [('train', 'train'), ('dev', 'validation'), ('test', 'test')]

# key, org_key = keys[1]

# for sent in dataset[org_key]:
    
#     print(sent)
#     docs.append({'tokens': sents, 'entities': entities, 'relations': relations})

# print(len(docs))
# json.dump(docs, open('../data/datasets/chemdner/new_doc_%s.json' % key, 'w'))

Found cached dataset ncbi_disease (/home/hygao/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03)


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 6924/6924 [00:00<00:00, 2054425.64it/s]

924 593





In [4]:
# Concatenate the train and dev document lists into one JSON file.
train_file_name = '../data/datasets/ncbi_disease/new_doc_train.json'
dev_file_name = '../data/datasets/ncbi_disease/new_doc_dev.json'
save_file_name = '../data/datasets/ncbi_disease/new_doc_train_dev.json'

# Context managers close each file deterministically instead of leaving the
# handles to the garbage collector.
with open(train_file_name) as f:
    train_data = json.load(f)
with open(dev_file_name) as f:
    dev_data = json.load(f)

with open(save_file_name, 'w') as f:
    json.dump(train_data + dev_data, f)