In [1]:
import pandas as pd
import os, shutil
import numpy as np
import matplotlib
import sys; sys.path.append('..')
import pickle
import torch
import dill
import utils.constant as constant

from tqdm.notebook import tqdm, tqdm_notebook
from collections import defaultdict

path_ddi_dataset = constant.PATH_DDI_DATA
path_iii_dataset = constant.PATH_MIMIC_III

---

# MAPPING

1. NDC to RXCUI (done in `preprocess_drug.ipynb`)
2. RXCUI to ATC4
3. ATC4 to CID

already done as `MAP_IDX4NDC_RXCUI_ATC4_CIDS.csv`!

In [22]:
# df_idx4ndc_rxcui_atc4_cids = pd.read_csv(os.path.join(path_ddi_dataset, "MAP_IDX4NDC_RXCUI_ATC4_CIDS.csv"))
# avoid `unnamed :0`
# <https://stackoverflow.com/questions/53988226/pd-read-csv-add-column-named-unnamed-0>
df_idx4ndc_rxcui_atc4_cids = pd.read_csv(os.path.join(path_ddi_dataset, "MAP_IDX4NDC_RXCUI_ATC4_CIDS.csv"), index_col=0)
df_idx4ndc_rxcui_atc4_cids

Unnamed: 0,NDC,RXCUI,idx,ATC4,list_cid,list_cid_idx
416,2.606320e+07,,589,,,
1351,1.320001e+08,,1524,,,
1798,3.380404e+08,,1971,,,
2569,8.290002e+09,,2742,,,
2576,8.881200e+09,,2749,,,
...,...,...,...,...,...,...
4082,6.854601e+10,637218.0,4272,N04BD,['CID000122316'],[610]
4086,6.877401e+10,197516.0,4276,J01FA,"['CID000054688', 'CID003002190', 'CID000002269...","[551, 639, 69, 185]"
4088,6.885000e+10,995605.0,4278,J04AK,"['CID000003279', 'CID000001046']","[188, 24]"
4089,6.885000e+10,995609.0,4279,J04AK,"['CID000003279', 'CID000001046']","[188, 24]"


In [23]:
mask = df_idx4ndc_rxcui_atc4_cids['ATC4'].notnull()
df_idx4ndc_rxcui_atc4_cids.loc[mask]

Unnamed: 0,NDC,RXCUI,idx,ATC4,list_cid,list_cid_idx
3,2.202402e+06,313009.0,176,A12CA,,
28,2.751001e+06,865098.0,201,A10AB,,
29,2.751101e+06,752388.0,202,A10AB,,
35,2.821501e+06,311036.0,208,A10AB,,
38,2.850101e+06,351859.0,211,A10AB,,
...,...,...,...,...,...,...
4082,6.854601e+10,637218.0,4272,N04BD,['CID000122316'],[610]
4086,6.877401e+10,197516.0,4276,J01FA,"['CID000054688', 'CID003002190', 'CID000002269...","[551, 639, 69, 185]"
4088,6.885000e+10,995605.0,4278,J04AK,"['CID000003279', 'CID000001046']","[188, 24]"
4089,6.885000e+10,995609.0,4279,J04AK,"['CID000003279', 'CID000001046']","[188, 24]"


In [24]:
df_idx4ndc_rxcui_atc4_cids = df_idx4ndc_rxcui_atc4_cids.drop(columns=['list_cid', 'list_cid_idx'])
df_idx4ndc_rxcui_atc4_cids

Unnamed: 0,NDC,RXCUI,idx,ATC4
416,2.606320e+07,,589,
1351,1.320001e+08,,1524,
1798,3.380404e+08,,1971,
2569,8.290002e+09,,2742,
2576,8.881200e+09,,2749,
...,...,...,...,...
4082,6.854601e+10,637218.0,4272,N04BD
4086,6.877401e+10,197516.0,4276,J01FA
4088,6.885000e+10,995605.0,4278,J04AK
4089,6.885000e+10,995609.0,4279,J04AK


# `med_voc` aligning

In [9]:
with open(os.path.join(path_ddi_dataset, "voc_final.pkl"), 'rb') as f:
    voc = dill.load(f)

med_voc = voc['med_voc']
med_voc.word2idx

{'N02B': 0,
 'A01A': 1,
 'A02B': 2,
 'A06A': 3,
 'B05C': 4,
 'A12A': 5,
 'A12C': 6,
 'C01C': 7,
 'A07A': 8,
 'M01A': 9,
 'N01A': 10,
 'C07A': 11,
 'C03C': 12,
 'A12B': 13,
 'N07A': 14,
 'C02D': 15,
 'N02A': 16,
 'N06A': 17,
 'A02A': 18,
 'J01M': 19,
 'C02A': 20,
 'B01A': 21,
 'A11C': 22,
 'C03A': 23,
 'A03B': 24,
 'C10A': 25,
 'C01B': 26,
 'N05C': 27,
 'C09A': 28,
 'D01A': 29,
 'H03A': 30,
 'J01D': 31,
 'B02B': 32,
 'R06A': 33,
 'J01X': 34,
 'N03A': 35,
 'N05A': 36,
 'C08C': 37,
 'D11A': 38,
 'C01D': 39,
 'A04A': 40,
 'M03A': 41,
 'A07E': 42,
 'R03A': 43,
 'B03B': 44,
 'D07A': 45,
 'N07B': 46,
 'N05B': 47,
 'R05C': 48,
 'D06A': 49,
 'A03F': 50,
 'R01A': 51,
 'G04B': 52,
 'C01E': 53,
 'L01A': 54,
 'A07D': 55,
 'D04A': 56,
 'A05A': 57,
 'P01C': 58,
 'D06B': 59,
 'L01B': 60,
 'C01A': 61,
 'C05A': 62,
 'C03D': 63,
 'P01A': 64,
 'J02A': 65,
 'J05A': 66,
 'L01X': 67,
 'H02A': 68,
 'V03A': 69,
 'J01F': 70,
 'G03A': 71,
 'J01E': 72,
 'J04A': 73,
 'D10A': 74,
 'P01B': 75,
 'R05D': 76,
 'N04B': 

In [11]:
list(med_voc.word2idx.keys())

['N02B',
 'A01A',
 'A02B',
 'A06A',
 'B05C',
 'A12A',
 'A12C',
 'C01C',
 'A07A',
 'M01A',
 'N01A',
 'C07A',
 'C03C',
 'A12B',
 'N07A',
 'C02D',
 'N02A',
 'N06A',
 'A02A',
 'J01M',
 'C02A',
 'B01A',
 'A11C',
 'C03A',
 'A03B',
 'C10A',
 'C01B',
 'N05C',
 'C09A',
 'D01A',
 'H03A',
 'J01D',
 'B02B',
 'R06A',
 'J01X',
 'N03A',
 'N05A',
 'C08C',
 'D11A',
 'C01D',
 'A04A',
 'M03A',
 'A07E',
 'R03A',
 'B03B',
 'D07A',
 'N07B',
 'N05B',
 'R05C',
 'D06A',
 'A03F',
 'R01A',
 'G04B',
 'C01E',
 'L01A',
 'A07D',
 'D04A',
 'A05A',
 'P01C',
 'D06B',
 'L01B',
 'C01A',
 'C05A',
 'C03D',
 'P01A',
 'J02A',
 'J05A',
 'L01X',
 'H02A',
 'V03A',
 'J01F',
 'G03A',
 'J01E',
 'J04A',
 'D10A',
 'P01B',
 'R05D',
 'N04B',
 'G04C',
 'J01C',
 'S01E',
 'H05B',
 'M04A',
 'C09C',
 'J01G',
 'C08D',
 'N06D',
 'H01C',
 'L04A',
 'A10B',
 'C05B',
 'B02A',
 'D08A',
 'A16A',
 'A11D',
 'C02C',
 'J01A',
 'A11G',
 'H03B',
 'L01D',
 'N06B',
 'C03B',
 'N01B',
 'G03C',
 'N04A',
 'N02C',
 'M03B',
 'A07B',
 'A11H',
 'M05B',
 'S01F',
 

# NEW `DDI` calculator testing

In [13]:
with open(os.path.join(path_ddi_dataset, "ddi_A_final.pkl"), 'rb') as f:
    ddi_adj = dill.load(f)
    
ddi_adj

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
sum(sum(ddi_adj)) / ddi_adj.size

0.05221140959151565

In [17]:
med_unique_word = list(med_voc.word2idx.keys())
med_unique_word

['N02B',
 'A01A',
 'A02B',
 'A06A',
 'B05C',
 'A12A',
 'A12C',
 'C01C',
 'A07A',
 'M01A',
 'N01A',
 'C07A',
 'C03C',
 'A12B',
 'N07A',
 'C02D',
 'N02A',
 'N06A',
 'A02A',
 'J01M',
 'C02A',
 'B01A',
 'A11C',
 'C03A',
 'A03B',
 'C10A',
 'C01B',
 'N05C',
 'C09A',
 'D01A',
 'H03A',
 'J01D',
 'B02B',
 'R06A',
 'J01X',
 'N03A',
 'N05A',
 'C08C',
 'D11A',
 'C01D',
 'A04A',
 'M03A',
 'A07E',
 'R03A',
 'B03B',
 'D07A',
 'N07B',
 'N05B',
 'R05C',
 'D06A',
 'A03F',
 'R01A',
 'G04B',
 'C01E',
 'L01A',
 'A07D',
 'D04A',
 'A05A',
 'P01C',
 'D06B',
 'L01B',
 'C01A',
 'C05A',
 'C03D',
 'P01A',
 'J02A',
 'J05A',
 'L01X',
 'H02A',
 'V03A',
 'J01F',
 'G03A',
 'J01E',
 'J04A',
 'D10A',
 'P01B',
 'R05D',
 'N04B',
 'G04C',
 'J01C',
 'S01E',
 'H05B',
 'M04A',
 'C09C',
 'J01G',
 'C08D',
 'N06D',
 'H01C',
 'L04A',
 'A10B',
 'C05B',
 'B02A',
 'D08A',
 'A16A',
 'A11D',
 'C02C',
 'J01A',
 'A11G',
 'H03B',
 'L01D',
 'N06B',
 'C03B',
 'N01B',
 'G03C',
 'N04A',
 'N02C',
 'M03B',
 'A07B',
 'A11H',
 'M05B',
 'S01F',
 

In [30]:
def calc_ddi_rate(durg_idxes_curr_admi: torch.tensor):
    # durg_idxes_curr_admi: the indeices of drugs of current patient, waiting to calculate the DDI score
    mask = df_idx4ndc_rxcui_atc4_cids.idx.isin(durg_idxes_curr_admi.tolist())  # MUST tolist !!!
    df_drugs_curr_admi = df_idx4ndc_rxcui_atc4_cids.loc[mask]

    # df_drugs_can_calc_ddi = df_drugs_curr_admi[df_drugs_curr_admi.list_cid_idx.notnull()]
    df_drugs_can_calc_ddi = df_drugs_curr_admi[df_drugs_curr_admi.ATC4.notnull()]

    # TODO: here should use the same `med_voc` as previous work
    # med_unique_word = [med_voc.idx2word[i] for i in range(med_voc_size)]
    df_drugs_can_calc_ddi['ATC3'] = df_drugs_can_calc_ddi['ATC4'].map(lambda x: x[:4])    
    df_drugs_can_calc_ddi = df_drugs_can_calc_ddi[df_drugs_can_calc_ddi.ATC3.isin(med_unique_word)]  # ATC3 = ATC4[:4]
    atc3s = df_drugs_can_calc_ddi.ATC3.unique()
    
#     print(atc3s)
    cnt_all = 0
    cnt_ddi = 0
    for i, atc3_i in enumerate(atc3s):
        idx_drug_i = med_voc.word2idx[atc3_i]

        for j, atc3_j in enumerate(atc3s):
            if j <= i: continue
            cnt_all += 1

            idx_drug_j = med_voc.word2idx[atc3_j]
            if ddi_adj[idx_drug_i, idx_drug_j] == 1 or \
               ddi_adj[idx_drug_j, idx_drug_i] == 1:
                cnt_ddi += 1

    if cnt_all == 0:
        return 0
    
    print(cnt_all)
    
    return cnt_ddi / cnt_all

durg_idxes_curr_admi = torch.tensor([2277, 2703, 3441, 3336, 3338])
ddi_score = calc_ddi_rate(durg_idxes_curr_admi)

ddi_score

['J01E' 'N06A' 'C07A' 'N03A' 'H03A']
10


0.3