In [6]:
import numpy as np
import os
import sys
import ujson
from tqdm import tqdm

In [7]:
def load_file(file_path, limit=-1):
    """Load a JSON-lines text file into a list of parsed records.

    Args:
        file_path: Path of a UTF-8 text file with one JSON document per line.
        limit: Maximum number of lines to read from the top of the file.
            ``-1`` (the default), any other negative value, or ``None``
            reads the whole file. Blank lines count towards the limit but
            produce no record.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        Propagates whatever ``ujson.loads`` raises for a non-blank line that
        is not valid JSON.
    """
    from itertools import islice

    with open(file_path, "r", encoding="utf-8") as f:
        if limit is None or limit < 0:
            raw_lines = f.readlines()
        else:
            # Stop reading after `limit` lines instead of loading the whole
            # file and slicing afterwards.
            raw_lines = list(islice(f, limit))

    records = []
    # tqdm wraps a list, so the progress bar still knows its total.
    for raw in tqdm(raw_lines):
        raw = raw.strip()
        if not raw:
            # A blank line (e.g. a trailing newline at EOF) is not a record.
            continue
        records.append(ujson.loads(raw))
    return records

In [10]:
def analysis_one_file(file_path, limit=-1):
    """Print and return summary statistics for one relation data file.

    Every record returned by ``load_file`` must provide ``record['h']['id']``,
    ``record['t']['id']`` and ``record['relation']``.

    Args:
        file_path: Path handed to ``load_file`` (also echoed to stdout).
        limit: Line limit handed to ``load_file``; ``-1`` loads everything.

    Returns:
        A 4-tuple of plain dicts
        ``(cui_count, rel_count, cui_pair_count, cui_pair_rel)``:

        * ``cui_count``: id -> number of times it occurs as head or tail
          (a record whose head and tail share one id adds 2).
        * ``rel_count``: relation label -> number of records.
        * ``cui_pair_count``: head id + TAB + tail id -> number of records.
        * ``cui_pair_rel``: head id + TAB + tail id -> set of relation labels
          seen for that pair.

    Raises:
        KeyError: if a record lacks one of the fields listed above.
    """
    print(file_path)
    lines = load_file(file_path, limit)
    cui_count = {}
    rel_count = {}
    cui_pair_count = {}
    cui_pair_rel = {}

    for line in lines:
        cui0 = line['h']['id']
        cui1 = line['t']['id']
        rel = line['relation']

        # Head first, then tail, so first-seen key order is stable.
        cui_count[cui0] = cui_count.get(cui0, 0) + 1
        cui_count[cui1] = cui_count.get(cui1, 0) + 1

        rel_count[rel] = rel_count.get(rel, 0) + 1

        cui_pair = cui0 + "\t" + cui1
        cui_pair_count[cui_pair] = cui_pair_count.get(cui_pair, 0) + 1
        cui_pair_rel.setdefault(cui_pair, set()).add(rel)

    print(f'Lines count:{len(lines)}')
    # Records labelled 'NA' are excluded from the non-NA total.
    non_na_lines = len(lines) - rel_count.get('NA', 0)
    print(f'Non-NA Lines count:{non_na_lines}')
    # entity
    print(f'CUI count:{len(cui_count)}')

    # relation
    print(f'Relation type count:{len(rel_count)}')

    # entity pair (relation count, instance count)
    print(f'CUI pair count:{len(cui_pair_count)}')

    # Pairs that were observed with more than one distinct relation label.
    multiple_label_instance = [
        cui_pair for cui_pair, rels in cui_pair_rel.items() if len(rels) > 1
    ]
    print(f'Multi label CUI pair count:{len(multiple_label_instance)}')
    return cui_count, rel_count, cui_pair_count, cui_pair_rel

#analysis_one_file('../dataset_v2/dev.txt')
#analysis_one_file('../dataset_v2/test.txt')
#analysis_one_file('../dataset_v2/train.txt')


In [13]:
# Analyse the complete training split; limit=-1 means no line cap.
cui_count, rel_count, cui_pair_count, cui_pair_rel = analysis_one_file(
    '../dataset_v2/train.txt', limit=-1
)

../dataset_v2/train.txt
100%|██████████| 18236374/18236374 [03:43<00:00, 81576.60it/s]
Lines count:18236374
Non-NA Lines count:15824099
CUI count:140443
Relation type count:127
CUI pair count:1854727
Multi label CUI pair count:13802


In [14]:
# Display the relation-label -> record-count dict returned for train.txt.
rel_count

{'RO\tmay_treat': 560795,
 'CHD\tisa': 7719133,
 'RO\thas_direct_procedure_site': 161822,
 'RO\thas_procedure_site': 241538,
 'RO\thas_pathological_process': 167299,
 'RO\thas_active_ingredient': 282371,
 'RB\thas_part': 248874,
 'RO\thas_associated_morphology': 560943,
 'RO\tmapped_to': 361405,
 'RO\thas_structural_class': 174390,
 'RO\thas_component': 223738,
 'RO\thas_measured_component': 123913,
 'RO\tpossibly_equivalent_to': 496318,
 'RO\thas_finding_site': 1240642,
 'RO\thas_causative_agent': 445328,
 'RO\thas_contraindicated_drug': 137453,
 'RO\thas_subject_of_information': 5900,
 'SY\tsame_as': 246720,
 'RO\thas_method': 370007,
 'RO\tmay_prevent': 156500,
 'RO\thas_modification': 70597,
 'RO\thas_definitional_manifestation': 139897,
 'RO\treplaces': 71734,
 'RO\thas_therapeutic_class': 154653,
 'RO\tinterprets': 335679,
 'RO\thas_mechanism_of_action': 6371,
 'RO\thas_contraindicated_class': 48424,
 'RO\thas_subject_relationship_context': 28164,
 'RO\thas_direct_device': 33017,

In [25]:
# Pair every value loaded from bert_one_f1.json with the matching count in
# rel_count (-1 when the key has no entry there), then order the pairs by the
# loaded value, largest first.
with open('bert_one_f1.json', 'r') as f:
    f1 = ujson.load(f)
new_d = {key: [f1[key], rel_count.get(key, -1)] for key in f1}
aps = list(new_d.items())
aps.sort(key=lambda entry: -entry[1][0])

In [27]:
# One (key, [loaded value, train count]) tuple per output line, in sorted order.
for row in aps:
    print(row)

('RO\thas_property_type', [1.0, 4080])
('RO\thas_subject_of_information', [0.9411764705882353, 5900])
('RO\thas_finding_site', [0.9149072296865004, 1240642])
('RO\thas_laterality', [0.9090909090909091, 6879])
('RO\thas_pathological_process', [0.899581589958159, 167299])
('RO\thas_clinical_course', [0.8750000000000001, 20600])
('RO\thas_process_output', [0.8750000000000001, 3525])
('RO\thas_property', [0.875, 5581])
('RO\thas_severity', [0.8749999999999999, 12417])
('RO\thas_access_instrument', [0.8571428571428571, 2425])
('RO\thas_extent', [0.8571428571428571, 1868])
('RO\thas_procedure_site', [0.8534906588003932, 241538])
('RB\thas_tradename', [0.8481012658227848, 44957])
('RO\thas_causative_agent', [0.8471986417657045, 445328])
('RO\thas_associated_morphology', [0.8274747474747475, 560943])
('RO\thas_interpretation', [0.8273809523809524, 147837])
('RO\thas_finding_method', [0.8214285714285715, 68058])
('RO\tmay_treat', [0.8177920685959271, 560795])
('CHD\tisa', [0.8151609553478713, 7