In [1]:
from configuration import BaseConfig
from datahandler import DataReader, DataWriter
import pandas as pd
import os
import matplotlib.pyplot as plt

# Build the run configuration for the "umls" database. The resulting object's
# attributes (tui2stn, level1, umls_rel, ...) are handed to DataReader in the
# next cell — presumably they are file locations; confirm in `configuration`.
config = (
    BaseConfig(version=None)
    .get_args(db_name="umls")
)

In [2]:
# JSON resources: the two TUI lookups first, then the three level files.
tui2stn = DataReader.load_json(config.tui2stn)
tui2str = DataReader.load_json(config.tui2str)
level1, level2, level3 = (
    DataReader.load_json(path)
    for path in (config.level1, config.level2, config.level3)
)

# Tabular resources: the relation-detection and entity-detection sets.
umls_rel = DataReader.load_df(config.umls_rel)
umls_entity = DataReader.load_df(config.umls_entity)

# Report the row count of each table, using "_" as the thousands separator.
print(f"size of UMLS‌ relation detection set:{len(umls_rel):_}")
print(f"size of UMLS‌ entity detection set:{len(umls_entity):_}")

# Preview the first three rows of each table.
display(umls_rel.head(3), umls_entity.head(3))

size of UMLS‌ relation detection set:19_783_580
size of UMLS‌ entity detection set:2_093_042


Unnamed: 0,CUI1,CUI2,RELA,SAB-CUI1,STR-CUI1,SAB-CUI2,STR-CUI2
0,C0000039,C0043950,mapped_to,MSH,DIPALMITOYLPHOSPHATIDYLCHOLINE 0102,MSH,"3,5,8-Trioxa-4-phosphatetracosan-1-aminium, 4-..."
1,C0000039,C0216971,mapped_to,MSH,DIPALMITOYLPHOSPHATIDYLCHOLINE 0102,NCI,"(R)-(4-Oxido-10-oxo-7-palmitoyl-3,5,9-trioxa-4..."
2,C0000039,C0381030,mapped_to,MSH,DIPALMITOYLPHOSPHATIDYLCHOLINE 0102,MSH,DEPN-8


Unnamed: 0,CUI,STR,TUIs,STNs,SAB,level-1,level-2,level-3,level-4
0,C0000039,DIPALMITOYLPHOSPHATIDYLCHOLINE 0102,"['T109', 'T121']","['A1.4.1.2.1', 'A1.4.1.1.1']",MSH,A1,A1.4,A1.4.1,A1.4.1.2
1,C0000052,ALPHA GLUCAN BRANCHING ENZYME 01 04,"['T116', 'T126']","['A1.4.1.2.1.7', 'A1.4.1.1.3.3']",MSH,A1,A1.4,A1.4.1,A1.4.1.2
2,C0000084,CARBOXYGLUTAMIC ACID 01,"['T116', 'T123']","['A1.4.1.2.1.7', 'A1.4.1.1.3']",MSH,A1,A1.4,A1.4.1,A1.4.1.2


In [3]:
import ast

import numpy as np
import pandas as pd

# Build a (semantic type x source vocabulary) count matrix from the entity
# table and save it as 'type_sab_matrix.csv'.

# Distinct source vocabularies (SAB), in the order pandas' unique() returns them.
sabs = list(umls_entity['SAB'].unique())
tui_size = len(tui2str)
sab_size = len(sabs)

# Row position for every TUI (tui2str key order) and column position per SAB.
tui2index = {tui: index for index, tui in enumerate(tui2str.keys())}
sab2index = {sab: index for index, sab in enumerate(sabs)}

type_sab_matrix = np.zeros((tui_size, sab_size))
for sab, tuis in zip(umls_entity['SAB'], umls_entity['TUIs']):
    # Each 'TUIs' cell is the string form of a Python list (e.g. "['T109', 'T121']").
    # ast.literal_eval parses that literal without executing arbitrary code,
    # unlike eval(), which would run whatever expression the data file contains.
    for tui in ast.literal_eval(tuis):
        type_sab_matrix[tui2index[tui], sab2index[sab]] += 1

# One column with the TUI labels, followed by one count column per SAB.
df_data = {}
# The backslash is escaped explicitly: '\s' is an invalid escape sequence that
# newer Python versions warn about. The resulting header text is unchanged.
df_data['type\\source'] = list(tui2index.keys())
for sab, col in sab2index.items():
    df_data[sab] = list(type_sab_matrix[:, col])

pd.DataFrame(df_data).to_csv('type_sab_matrix.csv', index=False)

In [4]:
# Show how many entity rows fall under each level-1 class.
print(
    "Level 1 class FQs:\n",
    umls_entity['level-1'].value_counts(),
)

Level 1 class FQs:
 A1    956992
A2    559326
B2    339035
B1    237484
B        182
A         23
Name: level-1, dtype: int64


In [12]:
# Count the level-4 classes occurring under each level-3 class.
stat_df = (
    umls_entity
    .loc[:, ['level-3', 'level-4']]
    .groupby("level-3")
    .value_counts()
    .to_frame()
)

# Persist the full table, then preview its first three rows.
stat_df.to_csv("umls.stats.l3-l4.csv")

display(stat_df.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
level-3,level-4,Unnamed: 2_level_1
A1.1.2,A1.4.1.1,1
A1.1.3,A1.1.3.2,46903
A1.1.3,A1.1.3.1,36867


In [13]:
# Count the level-3 classes occurring under each level-2 class.
stat_df = (
    umls_entity
    .loc[:, ['level-2', 'level-3']]
    .groupby("level-2")
    .value_counts()
    .to_frame()
)

# Persist the full table, then preview its first three rows.
stat_df.to_csv("umls.stats.l2-l3.csv")

display(stat_df.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
level-2,level-3,Unnamed: 2_level_1
A1.1,A1.1.3,93647
A1.1,A1.1.2,26499
A1.1,A1.1.4,3387


In [14]:
# Count the level-2 classes occurring under each level-1 class.
stat_df = (
    umls_entity
    .loc[:, ['level-1', 'level-2']]
    .groupby("level-1")
    .value_counts()
    .to_frame()
)

# Persist the full table, then preview its first three rows.
stat_df.to_csv("umls.stats.l1-l2.csv")

display(stat_df.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
level-1,level-2,Unnamed: 2_level_1
A1,A1.4,385350
A1,A1.3,266777
A1,A1.2,180798
