In [2]:
import os
import sys
from itertools import islice

import numpy as np
import ujson
from tqdm import tqdm

In [3]:
def load_file(file_path, limit=-1):
    """Load a JSON-lines file into a list of parsed records.

    The file is streamed line by line, so the raw text of the whole file is
    never held in memory alongside the parsed records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.
        limit: Maximum number of records to load. Any negative value (the
            default is -1) or None loads the whole file; 0 loads nothing.

    Returns:
        list: The parsed records, in file order. Blank or whitespace-only
        lines are skipped and do not count towards ``limit``.

    Raises:
        OSError: If the file cannot be opened.
        Whatever ``ujson.loads`` raises when a non-blank line is not valid
        JSON (presumably a ValueError subclass -- confirm against the
        installed ujson version).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        payloads = (raw.strip() for raw in f)
        # A trailing newline or a stray empty line must not abort the load.
        payloads = (text for text in payloads if text)
        if limit is not None and limit >= 0:
            payloads = islice(payloads, limit)
        # The total is unknown while streaming, so tqdm reports a running
        # count and rate instead of a percentage.
        return [ujson.loads(text) for text in tqdm(payloads)]

In [4]:
def analysis_one_file(file_path, limit=-1):
    """Print and return summary statistics for one relation-instance file.

    Every record returned by ``load_file`` is read as
    ``record['h']['id']`` (head entity id), ``record['t']['id']`` (tail entity
    id) and ``record['relation']`` (relation label). No other field is
    required.

    Args:
        file_path: Path handed to ``load_file``; it is also echoed to stdout.
        limit: Forwarded unchanged to ``load_file``.

    Returns:
        tuple: ``(cui_count, rel_count, cui_pair_count, cui_pair_rel)``, all
        plain dicts in first-seen order:

        * ``cui_count``: entity id -> number of appearances as head or tail
          (a record whose head and tail are the same id counts twice).
        * ``rel_count``: relation label -> number of records.
        * ``cui_pair_count``: ``"<head>\\t<tail>"`` -> number of records.
        * ``cui_pair_rel``: ``"<head>\\t<tail>"`` -> set of relation labels
          seen for that ordered pair.

    Raises:
        KeyError: If a record lacks ``h``/``t``/``relation`` or an ``id``.
    """
    print(file_path)
    lines = load_file(file_path, limit)
    cui_count = {}
    rel_count = {}
    cui_pair_count = {}
    cui_pair_rel = {}

    for line in lines:
        cui0 = line['h']['id']
        cui1 = line['t']['id']
        rel = line['relation']

        # Head before tail, so the dict keeps first-seen order.
        for cui in (cui0, cui1):
            cui_count[cui] = cui_count.get(cui, 0) + 1

        rel_count[rel] = rel_count.get(rel, 0) + 1

        # The pair key is ordered: (head, tail) and (tail, head) are distinct.
        cui_pair = cui0 + "\t" + cui1
        cui_pair_count[cui_pair] = cui_pair_count.get(cui_pair, 0) + 1
        cui_pair_rel.setdefault(cui_pair, set()).add(rel)

    print(f'Lines count:{len(lines)}')
    # Records labelled 'NA' are excluded from this count.
    non_na_lines = len(lines) - rel_count.get('NA', 0)
    print(f'Non-NA Lines count:{non_na_lines}')
    # entity
    print(f'CUI count:{len(cui_count)}')

    # relation
    print(f'Relation type count:{len(rel_count)}')

    # entity pair (relation count, instance count)
    print(f'CUI pair count:{len(cui_pair_count)}')

    # Pairs that were observed with more than one distinct relation label.
    multiple_label_instance = [pair for pair, rels in cui_pair_rel.items() if len(rels) > 1]
    print(f'Multi label CUI pair count:{len(multiple_label_instance)}')
    return cui_count, rel_count, cui_pair_count, cui_pair_rel

#analysis_one_file('../dataset_v2/dev.txt')
#analysis_one_file('../dataset_v2/test.txt')
#analysis_one_file('../dataset_v2/train.txt')


In [8]:
# Analyse the complete corpus file: limit=-1 means "do not cap the number of records".
(
    cui_count,
    rel_count,
    cui_pair_count,
    cui_pair_rel,
) = analysis_one_file('../../data/sentence_coder_1105.json', limit=-1)

../../data/sentence_coder_1105.json


100%|██████████| 15463273/15463273 [03:00<00:00, 85863.43it/s] 


Lines count:15463273
Non-NA Lines count:15463273
CUI count:134577
Relation type count:20
CUI pair count:251336
Multi label CUI pair count:0


In [19]:
# Peek at the first five entries of each per-key table; the relation table is
# printed in full. One value per output line.
print(
    list(cui_count.items())[:5],
    rel_count,
    list(cui_pair_count.items())[:5],
    list(cui_pair_rel.items())[:5],
    sep="\n",
)

[('CN00453821', 46723), ('CN00028545', 2603646), ('CN01014918', 33609), ('CN00002361', 1529274), ('CN00114798', 878)]
{'instance of': 5693704, 'subclass of': 4970935, 'biological process': 293891, 'cell component': 546118, 'medical condition treated': 282742, 'drug or therapy used for treatment': 248969, 'part of': 1039293, 'has part': 1212874, 'molecular function': 59023, 'parent taxon': 255052, 'symptoms and signs': 157580, 'possible treatment': 30638, 'encoded by': 148274, 'expressed in': 115156, 'subject has role': 252989, 'encodes': 34938, 'found in taxon': 80035, 'active ingredient in': 23242, 'has active ingredient': 5748, 'significant drug interaction': 12072}
[('CN00453821\tCN00028545', 46212), ('CN01014918\tCN00002361', 12952), ('CN00114798\tCN01014918', 878), ('CN00233642\tCN00125281', 96), ('CN00205267\tCN00002361', 4992)]
[('CN00453821\tCN00028545', {'instance of'}), ('CN01014918\tCN00002361', {'instance of'}), ('CN00114798\tCN01014918', {'subclass of'}), ('CN00233642\tCN0