In [5]:
import pandas as pd
import os

data_dir = "../../RxNorm_full_01032023/rrf/"

### RXNCONSO.RFF

In [6]:
# read dir
conso_dir = os.path.join(data_dir, 'RXNCONSO.RRF')
conso_df = pd.read_csv(conso_dir, delimiter='|', header=None)

print(f'Read {len(conso_df):,} lines from {conso_dir}.')

# set columns
col_list = list(conso_df.columns)
col_list[0] = 'RXCUI'
col_list[7] = 'RXAUI'
col_list[14] = 'TERM'
conso_df.columns = col_list

# filter
conso_df = conso_df[['RXCUI', 'RXAUI', 'TERM']]

# drop NaN
conso_df.dropna(inplace=True)
print(f'Read {len(conso_df):,} lines without NaN from {conso_dir}.')



Read 1,125,456 lines from ../../RxNorm_full_01032023/rrf/RXNCONSO.RRF.
Read 1,125,456 lines without NaN from ../../RxNorm_full_01032023/rrf/RXNCONSO.RRF.


### RXNREL.RFF

In [7]:
# read dir
rel_dir = os.path.join(data_dir, 'RXNREL.RRF')
rel_df = pd.read_csv(rel_dir, delimiter='|', header=None)

print(f'Read {len(rel_df):,} lines from {rel_dir}.')

# set columns
col_list = list(rel_df.columns)
col_list[0] = 'UI1_CUI'
col_list[1] = 'UI1_AUI'
col_list[2] = 'UI1_TYPE'
col_list[4] = 'UI2_CUI'
col_list[5] = 'UI2_AUI'
col_list[6] = 'UI2_TYPE'
col_list[7] = 'REL'
rel_df.columns = col_list

# filter
rel_df = rel_df[['UI1_AUI', 'UI1_CUI', 'UI1_TYPE', 'UI2_AUI', 'UI2_CUI', 'UI2_TYPE', 'REL']]

# assert all relations are between similar types
assert (rel_df['UI1_TYPE'] == rel_df['UI2_TYPE']).all()

# only keep CUI relations
rel_df = rel_df[rel_df['UI1_TYPE'] == 'CUI']
print(f'Read {len(rel_df):,} CUI relations.')

# rename again
rel_df = rel_df[['UI1_CUI', 'UI2_CUI', 'REL']]
rel_df.columns = ['CUI1', 'CUI2', 'REL']

# drop NaN
rel_df.dropna(inplace=True)
print(f'Read {len(rel_df):,} CUI relations without NaN.')

# cast column
rel_df = rel_df.astype({
    'CUI1': 'int32',
    'CUI2': 'int32',
})


  rel_df = pd.read_csv(rel_dir, delimiter='|', header=None)


Read 7,373,670 lines from ../../RxNorm_full_01032023/rrf/RXNREL.RRF.
Read 1,471,262 CUI relations.
Read 1,471,262 CUI relations without NaN.


### Some data exploration

In [8]:
# number of CUI
print(f'Found {len(conso_df["RXCUI"].unique()):,} unique RXCUIs.')
# number of AUI
print(f'Found {len(conso_df["RXAUI"].unique()):,} unique RXAUIs.')
# number of unique terms
print(f'Found {len(conso_df["TERM"].unique()):,} unique TERMs.')

# CUIs participating in relations
unique_values_in_relations = pd.concat([rel_df['CUI1'], rel_df['CUI2']]).unique()

print(f'Found {len(unique_values_in_relations):,} unique RXCUIs participating in relations.')


Found 380,965 unique RXCUIs.
Found 1,125,456 unique RXAUIs.
Found 938,113 unique TERMs.
Found 195,120 unique RXCUIs participating in relations.


In [9]:
conso_df

Unnamed: 0,RXCUI,RXAUI,TERM
0,3,8717795,"1,4-alpha-Glucan branching enzyme"
1,3,8717796,"1,4-alpha-Glucan branching enzyme (substance)"
2,3,8717808,"Amylo-(1,4,6)-transglycosylase"
3,3,8718164,Branching enzyme
4,19,10794494,17-hydrocorticosteroid
...,...,...,...
1125451,2624752,12785937,1 ML Zaire ebolavirus (strain Kikwit-95) envel...
1125452,2624752,12785941,"Ervebo 72,000,000 UNT per 1 ML Injection"
1125453,2624752,12785942,"ERVEBO 72,000,000 UNT in 1 ML Injection"
1125454,2624753,12785939,Zaire ebolavirus (strain Kikwit-95) envelope g...


In [10]:
# check that each term is only related to one concept?
grouped_df = conso_df.copy()
# get a list of all IDS per term
grouped_df.drop(columns=['RXAUI'], inplace=True)
grouped_df['RXCUI'] = grouped_df['RXCUI'].apply(lambda x: [x])
grouped_df = grouped_df.groupby('TERM').sum()
grouped_df['UNIQUE_RXCUI'] = grouped_df['RXCUI'].apply(lambda x: len(set(x)))

# check where the number of concepts is not 1
grouped_df_duplicates = grouped_df[grouped_df['UNIQUE_RXCUI'] > 1]

In [11]:
grouped_df_duplicates

Unnamed: 0_level_0,RXCUI,UNIQUE_RXCUI
TERM,Unnamed: 1_level_1,Unnamed: 2_level_1
1st Medx-Patch with Lidocaine 4% Topical Patch,"[2001686, 2107511]",2
ABILIFY MAINTENA KIT,"[1602604, 1602607, 1659814, 1659818]",4
ACETAMINOPHEN 10 mg in 1 mL INTRAVENOUS INJECTION,"[483017, 2480095]",2
"ACETAMINOPHEN 325 mg / DEXTROMETHORPHAN HYDROBROMIDE 10 mg / DOXYLAMINE SUCCINATE 6.25 mg ORAL CAPSULE, LIQUID FILLED [Cold and Flu Night Relief]","[1426334, 1426334, 1426334, 2611969]",2
ACETAMINOPHEN 325/CTM2/DM10/P-EP 5MG TAB,"[1086991, 1307244]",2
...,...,...
various,"[1429318, 1429319, 1429330, 1429340, 1429341, ...",6
various combinations,"[1447098, 1456617]",2
vitamin D3,"[2418, 1244014]",2
watermelon,"[260014, 901258]",2


In [12]:
# unique CUI relations
print(f'Found {len(rel_df["REL"].unique()):,} unique CUI relations.')
print(f'Unique CUI relations: {rel_df["REL"].unique()}')

Found 26 unique CUI relations.
Unique CUI relations: ['has_tradename' 'has_ingredient' 'tradename_of' 'has_part' 'form_of'
 'has_precise_ingredient' 'has_form' 'precise_ingredient_of'
 'reformulation_of' 'has_dose_form' 'inverse_isa' 'part_of'
 'has_ingredients' 'ingredient_of' 'dose_form_of' 'isa' 'constitutes'
 'contains' 'consists_of' 'ingredients_of' 'quantified_form_of'
 'has_quantified_form' 'reformulated_to' 'contained_in'
 'has_doseformgroup' 'doseformgroup_of']


In [13]:
# NOTE: procedure can be optimized
def lookup(term, conso_df):
    # find the concept related to the therm
    unique_concept_matches = conso_df[conso_df['TERM'] == term]['RXCUI'].unique()
    concept_to_atoms = {}
    for concept in unique_concept_matches:
        # find all atoms under this concept
        atoms = conso_df[conso_df['RXCUI'] == concept]
        concept_to_atoms.update({
            concept: atoms
        })
    return concept_to_atoms

In [14]:
concepts = lookup('Acetaminophen', conso_df)
print(concepts.keys())
for concept in concepts.keys():
    print(concept,':')
    print(concepts[concept].head(5))

dict_keys([161])
161 :
     RXCUI     RXAUI                              TERM
482    161  10280795                     Acetaminophen
483    161  10326508                     Acetaminophen
484    161  10333986                      Acetaminofén
485    161  10795555  Acetaminophen-containing product
486    161  10808671    Paracetamol-containing product


In [15]:
concepts = lookup('various', conso_df)
print(concepts.keys())
for concept in concepts.keys():
    print(concept,':')
    print(concepts[concept].head(5))

dict_keys([1429318, 1429319, 1429330, 1429340, 1429341, 1470021])
1429318 :
          RXCUI    RXAUI                                        TERM
811828  1429318  5482851                                     various
811829  1429318  5483186  various other intestinal adsorbents in ATC
1429319 :
          RXCUI    RXAUI                       TERM
811830  1429319  5482854                    various
811831  1429319  5483213  various allergen extracts
1429330 :
          RXCUI    RXAUI                                               TERM
811844  1429330  5482850                                            various
811845  1429330  5483185  various other agents for local oral treatment ...
1429340 :
          RXCUI    RXAUI                                     TERM
811860  1429340  5482852                                  various
811861  1429340  5483231  various other nasal preparations in ATC
1429341 :
          RXCUI    RXAUI                                    TERM
811862  1429341  5482853      

In [16]:
concepts = lookup('watermelon', conso_df)
print(concepts.keys())
for concept in concepts.keys():
    print(concept,':')
    print(concepts[concept].head(5))

dict_keys([260014, 901258])
260014 :
         RXCUI     RXAUI                    TERM
223259  260014  10328342              Watermelon
223260  260014  12254063  watermelon preparation
223261  260014   6809185              watermelon
901258 :
         RXCUI    RXAUI                           TERM
558810  901258  2991817                     WATERMELON
558811  901258  3060380  watermelon allergenic extract
558812  901258  4661177                     watermelon
558813  901258  5943521                     Watermelon
558814  901258  9275453                     Watermelon


In [17]:
def get_relation(cui1, cui2, rel_df):
    forward_relation = rel_df[(rel_df['CUI1'] == cui1) & (rel_df['CUI2'] == cui2)]
    backward_relation = rel_df[(rel_df['CUI2'] == cui1) & (rel_df['CUI1'] == cui2)]
    assert len(forward_relation) == len(backward_relation)
    return forward_relation, backward_relation

def get_all_relations(cui1, rel_df):
    forward_relations = rel_df[(rel_df['CUI1'] == cui1)]
    backward_relations = rel_df[(rel_df['CUI2'] == cui1)]
    assert len(forward_relations) == len(backward_relations)
    return forward_relations, backward_relations

def get_all_concepts(term, conso_df):
    unique_concept_matches = conso_df[conso_df['TERM'] == term]['RXCUI'].unique()
    return list(unique_concept_matches)

def get_all_terms(cui1, conso_df):
    atoms = conso_df[conso_df['RXCUI'] == cui1]['TERM'].unique()
    return list(atoms)

In [18]:
def get_all_related_concepts(cui, rel_df):
    forward_relations, _ = get_all_relations(cui, rel_df)
    related_concepts = forward_relations['CUI2'].unique()
    return list(related_concepts)

def get_all_related_atoms(term, conso_df, rel_df):
    parent_concepts = get_all_concepts(term, conso_df)
    rel_concepts = []
    for concept in parent_concepts:
        rel_concepts.extend(get_all_related_concepts(concept, rel_df))
    rel_concepts.extend(parent_concepts)
    rel_atoms = []
    for concept in rel_concepts:
        rel_atoms.extend(get_all_terms(concept, conso_df))
    return list(set(rel_atoms))

# def get_all_related_atoms()

In [19]:
get_all_related_atoms('paracetamol', conso_df, rel_df)

['acetaminophen / guaifenesin / pseudoephedrine Pill',
 'acetaminophen 425 MG',
 'acetaminophen / caffeine / phenylephrine Oral Product',
 'acetaminophen / PHENobarbital Pill',
 'acetaminophen / chlorpheniramine / diphenhydramine / pseudoephedrine',
 'acetaminophen / chlorpheniramine / dextromethorphan / pseudoephedrine Oral Suspension',
 'Ridenol',
 'acetaminophen / promethazine Rectal Product',
 'APAP/Dichloralphenazone/Isometheptene',
 'Dolmar',
 'St. Joseph Aspirin-Free',
 'Excedrin Tension Headache',
 'Acetaminophen- and caffeine-containing product',
 'acetaminophen / codeine Pill',
 'Painaid BRF',
 'acetaminophen/dextromethorphan/PSE',
 'acetaminophen Oral Product',
 'Arcet',
 'By-Ache',
 'Diphenhydramine- and paracetamol-containing product in oral dose form',
 'acetaminophen 16.7 MG/ML',
 'acetaminophen / dextromethorphan / guaifenesin Oral Capsule',
 'acetaminophen 20 MG/ML',
 'acetaminophen / pseudoephedrine Oral Liquid Product',
 'acetaminophen / aspirin / caffeine / calcium 

### Try to solve our problems with Join
Try to build a large LUT.

One row: term | parent cui | related cui | relation | related_term

How to build:
1. merge term | aui | cui with cui1 | rel | cui2
2. merge result with term | aui | cui

Add identity relation to the relation table to not have to build an exception here. If it proves to be too heavy, remove inverse relations.

In [163]:
# add reflexive relation
unique_cuis = conso_df["RXCUI"].unique()
rel_reflexive_df = pd.DataFrame(data= {"CUI1":unique_cuis,"CUI2":unique_cuis})
rel_reflexive_df['REL'] = 'is_same_concept'

# only keep unique (RXCUI, TERM) combinations
# conso_df_unique = conso_df.drop(columns=['RXAUI']).groupby(['RXCUI', 'TERM']).first().reset_index()

# uncomment this to remove duplicate TERMS per CONCEPT (halfs the resulting dataset size)
# conso_df_unique = conso_df.groupby(['RXCUI', 'TERM']).first().reset_index()
conso_df_unique = conso_df

In [164]:
conso_df_unique

Unnamed: 0,RXCUI,RXAUI,TERM
0,3,8717795,"1,4-alpha-Glucan branching enzyme"
1,3,8717796,"1,4-alpha-Glucan branching enzyme (substance)"
2,3,8717808,"Amylo-(1,4,6)-transglycosylase"
3,3,8718164,Branching enzyme
4,19,10794494,17-hydrocorticosteroid
...,...,...,...
1125451,2624752,12785937,1 ML Zaire ebolavirus (strain Kikwit-95) envel...
1125452,2624752,12785941,"Ervebo 72,000,000 UNT per 1 ML Injection"
1125453,2624752,12785942,"ERVEBO 72,000,000 UNT in 1 ML Injection"
1125454,2624753,12785939,Zaire ebolavirus (strain Kikwit-95) envelope g...


In [165]:
rel_df_left = rel_df.copy()
# add reflexive relation
rel_df_left = pd.concat([rel_df_left, rel_reflexive_df])
# prepare for merge
rel_df_left = rel_df_left.rename(columns={
    'CUI1':'RXCUI'
})

conso_df_right = conso_df_unique.copy()
# prepare for merge
conso_df_right = conso_df_unique.rename(columns={
    'RXCUI':'RXCUI2',
    'RXAUI':'RXAUI2',
    'TERM':'TERM2'
})

In [166]:
conso_df_right

Unnamed: 0,RXCUI2,RXAUI2,TERM2
0,3,8717795,"1,4-alpha-Glucan branching enzyme"
1,3,8717796,"1,4-alpha-Glucan branching enzyme (substance)"
2,3,8717808,"Amylo-(1,4,6)-transglycosylase"
3,3,8718164,Branching enzyme
4,19,10794494,17-hydrocorticosteroid
...,...,...,...
1125451,2624752,12785937,1 ML Zaire ebolavirus (strain Kikwit-95) envel...
1125452,2624752,12785941,"Ervebo 72,000,000 UNT per 1 ML Injection"
1125453,2624752,12785942,"ERVEBO 72,000,000 UNT in 1 ML Injection"
1125454,2624753,12785939,Zaire ebolavirus (strain Kikwit-95) envelope g...


In [167]:
merged_df = conso_df_unique.reset_index().merge(rel_df_left, on='RXCUI', how='outer').set_index('index')
merged_df = merged_df.rename(columns={
  'RXCUI': 'RXCUI1',  
  'RXAUI': 'RXAUI1',  
  'TERM': 'TERM1',  
  'CUI2': 'RXCUI2'
})
merged_df = merged_df.reset_index().merge(conso_df_right, on='RXCUI2', how='outer').set_index('index')

In [168]:
merged_df

Unnamed: 0_level_0,RXCUI1,RXAUI1,TERM1,RXCUI2,REL,RXAUI2,TERM2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,8717795,"1,4-alpha-Glucan branching enzyme",3,is_same_concept,8717795,"1,4-alpha-Glucan branching enzyme"
0,3,8717795,"1,4-alpha-Glucan branching enzyme",3,is_same_concept,8717796,"1,4-alpha-Glucan branching enzyme (substance)"
0,3,8717795,"1,4-alpha-Glucan branching enzyme",3,is_same_concept,8717808,"Amylo-(1,4,6)-transglycosylase"
0,3,8717795,"1,4-alpha-Glucan branching enzyme",3,is_same_concept,8718164,Branching enzyme
1,3,8717796,"1,4-alpha-Glucan branching enzyme (substance)",3,is_same_concept,8717795,"1,4-alpha-Glucan branching enzyme"
...,...,...,...,...,...,...,...
1125451,2624752,12785937,1 ML Zaire ebolavirus (strain Kikwit-95) envel...,2624749,constitutes,12785934,Zaire ebolavirus (strain Kikwit-95) envelope g...
1125452,2624752,12785941,"Ervebo 72,000,000 UNT per 1 ML Injection",2624749,constitutes,12785934,Zaire ebolavirus (strain Kikwit-95) envelope g...
1125453,2624752,12785942,"ERVEBO 72,000,000 UNT in 1 ML Injection",2624749,constitutes,12785934,Zaire ebolavirus (strain Kikwit-95) envelope g...
1125454,2624753,12785939,Zaire ebolavirus (strain Kikwit-95) envelope g...,2624749,constitutes,12785934,Zaire ebolavirus (strain Kikwit-95) envelope g...


In [169]:

def get_all_relations(term, merged_df):
    return merged_df[merged_df['TERM1'] == term]

def get_links(term1, term2, merged_df):
    return merged_df[(merged_df['TERM1'] == term1) & (merged_df['TERM2'] == term2)]

In [170]:
get_all_relations('ibuprofen', merged_df)

Unnamed: 0_level_0,RXCUI1,RXAUI1,TERM1,RXCUI2,REL,RXAUI2,TERM2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19501,5640,12254458,ibuprofen,372450,has_ingredient,12263690,acetaminophen / ibuprofen Oral Capsule
19506,5640,179276,ibuprofen,372450,has_ingredient,12263690,acetaminophen / ibuprofen Oral Capsule
19507,5640,179278,ibuprofen,372450,has_ingredient,12263690,acetaminophen / ibuprofen Oral Capsule
19513,5640,2599466,ibuprofen,372450,has_ingredient,12263690,acetaminophen / ibuprofen Oral Capsule
19515,5640,3605298,ibuprofen,372450,has_ingredient,12263690,acetaminophen / ibuprofen Oral Capsule
...,...,...,...,...,...,...,...
19520,5640,7717203,ibuprofen,5640,is_same_concept,9183045,2-(4-isobutylphenyl)propanoic acid
19520,5640,7717203,ibuprofen,5640,is_same_concept,9183046,4-isobutylhydratropic acid
19520,5640,7717203,ibuprofen,5640,is_same_concept,9183047,α-(4-isobutylphenyl)propionic acid
19520,5640,7717203,ibuprofen,5640,is_same_concept,9183048,α-(p-isobutylphenyl)propionic acid


In [171]:
get_all_relations('ibuprofen', merged_df)['TERM2'].unique()

array(['acetaminophen / ibuprofen Oral Capsule',
       'acetaminophen / ibuprofen Oral Suspension',
       'acetaminophen / ibuprofen Oral Tablet',
       'acetaminophen / ibuprofen', 'acetaminophen-ibuprofen',
       'ACETAMINOPHEN/IBUPROFEN',
       'Acetaminophen- and ibuprofen-containing product',
       'Ibuprofen- and paracetamol-containing product',
       'Product containing ibuprofen and paracetamol (medicinal product)',
       'acetaminophen / ibuprofen Oral Liquid Product',
       'acetaminophen / ibuprofen Oral Product',
       'Acetaminophen- and ibuprofen-containing product in oral dose form',
       'Ibuprofen- and paracetamol-containing product in oral dose form',
       'Product containing ibuprofen and paracetamol in oral dose form (medicinal product form)',
       'acetaminophen / ibuprofen Pill', 'Motrin PM',
       'caffeine / ergotamine / ibuprofen Oral Tablet',
       'caffeine / ergotamine / ibuprofen',
       'caffeine / ergotamine / ibuprofen Oral Product',
 

In [172]:
get_links('ibuprofen', 'ACETAMINOPHEN/IBUPROFEN', merged_df)

Unnamed: 0_level_0,RXCUI1,RXAUI1,TERM1,RXCUI2,REL,RXAUI2,TERM2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19501,5640,12254458,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19506,5640,179276,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19507,5640,179278,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19513,5640,2599466,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19515,5640,3605298,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19516,5640,5480549,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19517,5640,5480550,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19518,5640,5480551,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19519,5640,5480552,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN
19520,5640,7717203,ibuprofen,818102,has_part,12520763,ACETAMINOPHEN/IBUPROFEN


In [173]:
get_links('ibuprofen', 'carisoprodol / ibuprofen Pill', merged_df)

Unnamed: 0_level_0,RXCUI1,RXAUI1,TERM1,RXCUI2,REL,RXAUI2,TERM2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19501,5640,12254458,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19506,5640,179276,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19507,5640,179278,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19513,5640,2599466,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19515,5640,3605298,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19516,5640,5480549,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19517,5640,5480550,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19518,5640,5480551,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19519,5640,5480552,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill
19520,5640,7717203,ibuprofen,1151406,has_ingredient,12294059,carisoprodol / ibuprofen Pill


In [174]:
get_all_relations('watermelon', merged_df)

Unnamed: 0_level_0,RXCUI1,RXAUI1,TERM1,RXCUI2,REL,RXAUI2,TERM2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
223261,260014,6809185,watermelon,260014,is_same_concept,10328342,Watermelon
223261,260014,6809185,watermelon,260014,is_same_concept,12254063,watermelon preparation
223261,260014,6809185,watermelon,260014,is_same_concept,6809185,watermelon
558812,901258,4661177,watermelon,901263,has_ingredient,3060385,watermelon allergenic extract Injectable Solution
558812,901258,4661177,watermelon,901259,has_ingredient,3060381,watermelon allergenic extract 50 MG/ML
558812,901258,4661177,watermelon,901262,has_ingredient,3060384,watermelon allergenic extract 100 MG/ML
558812,901258,4661177,watermelon,1164599,has_ingredient,3828226,watermelon allergenic extract Injectable Product
558812,901258,4661177,watermelon,901258,is_same_concept,2991817,WATERMELON
558812,901258,4661177,watermelon,901258,is_same_concept,3060380,watermelon allergenic extract
558812,901258,4661177,watermelon,901258,is_same_concept,4661177,watermelon


In [175]:
get_all_relations('PHENYLEPHRINE', merged_df)

Unnamed: 0_level_0,RXCUI1,RXAUI1,TERM1,RXCUI2,REL,RXAUI2,TERM2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
29311,8163,2072948,PHENYLEPHRINE,214186,has_part,10308371,Product containing paracetamol and phenylephri...
29311,8163,2072948,PHENYLEPHRINE,214186,has_part,10795540,Acetaminophen- and phenylephrine-containing pr...
29311,8163,2072948,PHENYLEPHRINE,214186,has_part,10808658,Paracetamol- and phenylephrine-containing product
29311,8163,2072948,PHENYLEPHRINE,214186,has_part,1172623,acetaminophen-phenylephrine
29311,8163,2072948,PHENYLEPHRINE,214186,has_part,12353274,acetaminophen / phenylephrine
...,...,...,...,...,...,...,...
29318,8163,3279979,PHENYLEPHRINE,8163,is_same_concept,8459519,Phenylephrine
29318,8163,3279979,PHENYLEPHRINE,8163,is_same_concept,8498681,l-(3-Hydroxyphenyl)-N-methylethanolamine
29318,8163,3279979,PHENYLEPHRINE,8163,is_same_concept,8692422,Fenilefrina
29318,8163,3279979,PHENYLEPHRINE,8163,is_same_concept,8692423,Phenylephrinum


In [176]:
get_all_relations('PHENYLEPHRINE', merged_df)['REL'].unique()

array(['has_part', 'tradename_of', 'has_ingredient', 'form_of',
       'is_same_concept'], dtype=object)

### Trying to speed up the lookups with dictionaries

In [62]:
# from collections import defaultdict
from tqdm import tqdm
# merged_dict = defaultdict(lambda: defaultdict(str))
merged_dict = {} #dict[str, dict[str,set]]
for _, row in tqdm(merged_df.iterrows()):
    term1 = row['TERM1']
    term2 = row['TERM2']
    rel = row['REL']

    # if term1 does not exist
    if term1 not in merged_dict:
        # add term2 and the relation
        nested_dict = {term2: {rel}} # dct[str, set]
        merged_dict.update({term1: nested_dict})
    # if term1 does exist
    else:
        nested_dict = merged_dict[term1]
        # if term2 does not exist, add it
        if term2 not in nested_dict:
            nested_dict.update({term2: {rel}})
        # if term2 does exist, add rel if it does not exist
        else:
            rel_set = nested_dict[term2]
            if rel not in rel_set:
                rel_set.update({rel})
            else:
                pass

39818257it [10:47, 61501.56it/s]


In [63]:
merged_dict['17-hydroxycorticosteroid']

{'17-hydrocorticosteroid': {'is_same_concept'},
 '17-hydroxycorticoid': {'is_same_concept'},
 '17-hydroxycorticosteroid': {'is_same_concept'},
 '17-hydroxycorticosteroid (substance)': {'is_same_concept'}}

In [69]:
def get_all_relations_dict(term, merged_dict):
    if term in merged_dict:
        return merged_dict[term]
    return None

def get_links_dict(term1, term2, merged_dict):
    if term1 in merged_dict:
        nested_dict = merged_dict[term1]
        if term2 in nested_dict:
            return nested_dict[term2]
    return None

In [70]:
get_all_relations_dict('PHENYLEPHRINE', merged_dict)

{'ACETAMINOPHEN/PHENYLEPHRINE': {'has_part'},
 'Acetaminophen- and phenylephrine-containing product': {'has_part'},
 'Paracetamol- and phenylephrine-containing product': {'has_part'},
 'Product containing paracetamol and phenylephrine (medicinal product)': {'has_part'},
 'acetaminophen / phenylephrine': {'has_part'},
 'acetaminophen-phenylephrine': {'has_part'},
 'Colrex': {'tradename_of'},
 'Hycomine Compound': {'tradename_of'},
 'acetaminophen / phenylephrine Chewable Tablet': {'has_ingredient'},
 'acetaminophen / phenylephrine Oral Solution': {'has_ingredient'},
 'acetaminophen / phenylephrine Oral Tablet': {'has_ingredient'},
 'acetaminophen / chlorpheniramine / phenylephrine Oral Tablet': {'has_ingredient'},
 'acetaminophen / chlorpheniramine / phenylephrine / salicylamide Oral Tablet': {'has_ingredient'},
 'acetaminophen / chlorpheniramine / codeine / phenylephrine Oral Capsule': {'has_ingredient'},
 'acetaminophen / caffeine / guaiFENesin / phenylephrine Oral Tablet': {'has_ingr

In [71]:
get_links_dict('ibuprofen', 'carisoprodol / ibuprofen Pill', merged_dict)

{'has_ingredient'}

### Trying to speed up lookups with multi-index


In [177]:
multi_df = merged_df.set_index(['TERM1', 'TERM2']).sort_index()

In [178]:
# investigate duplicate indexing
multi_df[multi_df.index.duplicated(keep=False)] # interesting, multiple relations between same terms via different concepts

Unnamed: 0_level_0,Unnamed: 1_level_0,RXCUI1,RXAUI1,RXCUI2,REL,RXAUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
((((3-Bromomesityl)carbamoyl)methyl)imino)diacetic acid,MEBROFENIN,1311500,12692981,1311500,is_same_concept,10894384
((((3-Bromomesityl)carbamoyl)methyl)imino)diacetic acid,MEBROFENIN,1311500,12692981,1311500,is_same_concept,3062173
((((3-Bromomesityl)carbamoyl)methyl)imino)diacetic acid,Mebrofenin,1311500,12692981,1311500,is_same_concept,10327795
((((3-Bromomesityl)carbamoyl)methyl)imino)diacetic acid,Mebrofenin,1311500,12692981,1311500,is_same_concept,10329348
((((3-Bromomesityl)carbamoyl)methyl)imino)diacetic acid,Mebrofenin,1311500,12692981,1311500,is_same_concept,12507250
...,...,...,...,...,...,...
ω-phenylbutyric acid,Phenylbutyrate,1546447,9183291,81647,has_form,9183286
ω-phenylbutyric acid,Phenylbutyric acid,1546447,9183291,1546447,is_same_concept,10809107
ω-phenylbutyric acid,Phenylbutyric acid,1546447,9183291,1546447,is_same_concept,9183287
ω-phenylbutyric acid,phenylbutyrate,1546447,9183291,81647,has_form,668676


In [179]:
multi_df.loc[['ω-phenylbutyric acid']]

Unnamed: 0_level_0,Unnamed: 1_level_0,RXCUI1,RXAUI1,RXCUI2,REL,RXAUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ω-phenylbutyric acid,4-Phenyl-n-butyric acid,1546447,9183291,1546447,is_same_concept,9183283
ω-phenylbutyric acid,4-phenylbutyrate,1546447,9183291,81647,has_form,2143469
ω-phenylbutyric acid,4-phenylbutyrate,1546447,9183291,81647,has_form,8694069
ω-phenylbutyric acid,4-phenylbutyric acid,1546447,9183291,1546447,is_same_concept,559668
ω-phenylbutyric acid,4-phenylbutyric acid,1546447,9183291,1546447,is_same_concept,6797593
ω-phenylbutyric acid,4-phenylbutyric acid,1546447,9183291,1546447,is_same_concept,8694070
ω-phenylbutyric acid,Benzenebutyric acid,1546447,9183291,1546447,is_same_concept,9183284
ω-phenylbutyric acid,PBA,1546447,9183291,1546447,is_same_concept,9183285
ω-phenylbutyric acid,PHENYLBUTYRIC ACID,1546447,9183291,1546447,is_same_concept,6395764
ω-phenylbutyric acid,Phenylbutyrate,1546447,9183291,81647,has_form,12765754


In [180]:
multi_df.loc[[('ω-phenylbutyric acid', 'phenylbutyrate')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,RXCUI1,RXAUI1,RXCUI2,REL,RXAUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ω-phenylbutyric acid,phenylbutyrate,1546447,9183291,81647,has_form,668676
ω-phenylbutyric acid,phenylbutyrate,1546447,9183291,81647,has_form,6807297


In [181]:
multi_df.loc[[('ω-phenylbutyric acid', 'Phenylbutyrate')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,RXCUI1,RXAUI1,RXCUI2,REL,RXAUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ω-phenylbutyric acid,Phenylbutyrate,1546447,9183291,81647,has_form,12765754
ω-phenylbutyric acid,Phenylbutyrate,1546447,9183291,81647,has_form,9183286


In [182]:
multi_df.loc['watermelon', 'watermelon']

Unnamed: 0_level_0,Unnamed: 1_level_0,RXCUI1,RXAUI1,RXCUI2,REL,RXAUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
watermelon,watermelon,260014,6809185,260014,is_same_concept,6809185
watermelon,watermelon,901258,4661177,901258,is_same_concept,4661177


In [183]:
multi_df.loc[[('paracetamol', 'paracetamol')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,RXCUI1,RXAUI1,RXCUI2,REL,RXAUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
paracetamol,paracetamol,161,5481616,161,is_same_concept,5481616


### Using a normalization function
Make sure the understand what happens when two terms are mapped to the same normalized term

In [184]:
normalizer = lambda x: x.strip().lower()

In [185]:
normalizer('Hello ')

'hello'

In [186]:
normalized_multi_df = multi_df.copy().reset_index()
normalized_multi_df['TERM1'] = normalized_multi_df['TERM1'].apply(normalizer)
normalized_multi_df['TERM2'] = normalized_multi_df['TERM2'].apply(normalizer)
normalized_multi_df = normalized_multi_df.set_index(['TERM1', 'TERM2'])
normalized_multi_df = normalized_multi_df.reindex(sorted(normalized_multi_df.columns), axis=1)

In [187]:
assert len(normalized_multi_df) == len(multi_df)

In [188]:
normalized_multi_df.loc[['watermelon']]

Unnamed: 0_level_0,Unnamed: 1_level_0,REL,RXAUI1,RXAUI2,RXCUI1,RXCUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
watermelon,watermelon,is_same_concept,2991817,2991817,901258,901258
watermelon,watermelon,is_same_concept,2991817,5943521,901258,901258
watermelon,watermelon,is_same_concept,2991817,9275453,901258,901258
watermelon,watermelon,is_same_concept,2991817,4661177,901258,901258
watermelon,watermelon allergenic extract,is_same_concept,2991817,3060380,901258,901258
watermelon,watermelon allergenic extract 100 mg/ml,has_ingredient,2991817,3060384,901258,901262
watermelon,watermelon allergenic extract 50 mg/ml,has_ingredient,2991817,3060381,901258,901259
watermelon,watermelon allergenic extract injectable product,has_ingredient,2991817,3828226,901258,1164599
watermelon,watermelon allergenic extract injectable solution,has_ingredient,2991817,3060385,901258,901263
watermelon,watermelon,is_same_concept,5943521,2991817,901258,901258


In [189]:
normalized_multi_df.loc[['banana']]

Unnamed: 0_level_0,Unnamed: 1_level_0,REL,RXAUI1,RXAUI2,RXCUI1,RXCUI2
TERM1,TERM2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
banana,banana,is_same_concept,2991319,2991319,891830,891830
banana,banana,is_same_concept,2991319,9275352,891830,891830
banana,banana allergenic extract,is_same_concept,2991319,3042863,891830,891830
banana,banana allergenic extract 100 mg/ml,has_ingredient,2991319,3042864,891830,891831
banana,banana allergenic extract 50 mg/ml,has_ingredient,2991319,3056835,891830,899353
banana,banana allergenic extract injectable product,has_ingredient,2991319,3824624,891830,1159385
banana,banana allergenic extract injectable solution,has_ingredient,2991319,3042865,891830,891832
banana,banana extract,is_same_concept,2991319,8696358,891830,891830
banana,banana,is_same_concept,9275352,2991319,891830,891830
banana,banana,is_same_concept,10328695,10328695,285149,285149
