In [1]:
import numpy as np
import dill
import pandas as pd

import datetime as dt
from tqdm import tqdm
from collections import defaultdict, Counter

import warnings
# Disable all warnings for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so any warning raised by later cells
# (including ones emitted by pandas) is hidden as well.
warnings.filterwarnings("ignore")

## Diagnoses

In [None]:
# ICD diagnosis dictionary; its columns are renamed positionally, so the file
# must contain exactly three columns in (code, version, text) order.
D_ICD_DIAGNOSES = pd.read_csv('input/D_ICD_DIAGNOSES.csv.gz')
D_ICD_DIAGNOSES.columns = ['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']

# Diagnosis tables for the two cohorts. Later cells treat every column other
# than HADM_ID as a per-code indicator (value 1 = code present).
III_diagnoses = pd.read_csv('output/III_D.csv')
IV_diagnoses = pd.read_csv('output/IV_D.csv')

In [None]:
# Code -> text lookup built once instead of scanning D_ICD_DIAGNOSES for every
# flagged cell. The first dictionary row wins for a repeated code, which is
# what the previous `.values[0]` selection returned.
_diag_text_by_code = (
    D_ICD_DIAGNOSES
    .drop_duplicates(subset='ICD_CODE', keep='first')
    .set_index('ICD_CODE')['ICD_TEXT']
    .to_dict()
)

# Every column except HADM_ID is treated as a per-code indicator. The frame is
# taken before the list columns are added so the list columns are not
# re-scanned as if they were indicators.
_III_diag_flags = III_diagnoses.drop('HADM_ID', axis=1)

# Per row: names of the columns whose value equals 1.
III_diagnoses['ICD_CODES'] = _III_diag_flags.apply(
    lambda row: [col for col in row.index if row[col] == 1], axis=1
)
# Per row: dictionary text for each collected code, in the same order.
# A code missing from the dictionary raises KeyError (previously IndexError).
III_diagnoses['ICD_TEXTs'] = III_diagnoses['ICD_CODES'].map(
    lambda codes: [_diag_text_by_code[code] for code in codes]
)

In [None]:
# Code -> text lookup built once instead of scanning D_ICD_DIAGNOSES for every
# flagged cell. The first dictionary row wins for a repeated code, which is
# what the previous `.values[0]` selection returned.
_diag_text_by_code = (
    D_ICD_DIAGNOSES
    .drop_duplicates(subset='ICD_CODE', keep='first')
    .set_index('ICD_CODE')['ICD_TEXT']
    .to_dict()
)

# Every column except HADM_ID is treated as a per-code indicator. The frame is
# taken before the list columns are added so the list columns are not
# re-scanned as if they were indicators.
_IV_diag_flags = IV_diagnoses.drop('HADM_ID', axis=1)

# Per row: names of the columns whose value equals 1.
IV_diagnoses['ICD_CODES'] = _IV_diag_flags.apply(
    lambda row: [col for col in row.index if row[col] == 1], axis=1
)
# Per row: dictionary text for each collected code, in the same order.
# A code missing from the dictionary raises KeyError (previously IndexError).
IV_diagnoses['ICD_TEXTs'] = IV_diagnoses['ICD_CODES'].map(
    lambda codes: [_diag_text_by_code[code] for code in codes]
)

In [None]:
# Persist the enriched diagnosis tables.
# The directory was previously spelled 'ouput/'; every other cell in this
# notebook reads from / writes to 'output/'. Context managers make sure the
# files are flushed and closed even if serialization fails.
with open('output/III_D_1.pkl', 'wb') as fh:
    dill.dump(III_diagnoses, fh)
with open('output/IV_D_1.pkl', 'wb') as fh:
    dill.dump(IV_diagnoses, fh)

## Procedure

In [None]:
# ICD procedure dictionary; its columns are renamed positionally, so the file
# must contain exactly three columns in (code, version, text) order.
p_dic = pd.read_csv('input/d_icd_procedures.csv.gz')
p_dic.columns = ['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']

# Procedure tables for the two cohorts.
III_p = pd.read_csv('output/III_P.csv')
IV_p = pd.read_csv('output/IV_P.csv')

# The procedure cells that follow operate on `III` / `IV`, but no earlier cell
# assigns those names, so a fresh-kernel run stopped with NameError. Bind them
# to the frames loaded here. They are aliases (not copies) of III_p / IV_p, so
# columns added through either name are visible through both.
III = III_p
IV = IV_p

In [None]:
# Code -> text lookup built once instead of scanning p_dic for every flagged
# cell. The first dictionary row wins for a repeated code, which is what the
# previous `.values[0]` selection returned.
_proc_text_by_code = (
    p_dic
    .drop_duplicates(subset='ICD_CODE', keep='first')
    .set_index('ICD_CODE')['ICD_TEXT']
    .to_dict()
)

# Every column except the two id columns is treated as a per-code indicator.
_III_proc_flags = III.drop(['HADM_ID','ICUSTAY_ID'], axis=1)

# Approach: walk each row and collect the names of the columns equal to 1.
III['ICD_CODES'] = _III_proc_flags.apply(
    lambda row: [col for col in row.index if row[col] == 1], axis=1
)
# Same walk, but translate each collected column name to its dictionary text.
# Column names are converted with int() before the lookup, as before; a code
# missing from the dictionary raises KeyError (previously IndexError).
III['ICD_TEXTs'] = III['ICD_CODES'].map(
    lambda codes: [_proc_text_by_code[int(code)] for code in codes]
)

In [None]:
# Code -> text lookup built once instead of scanning p_dic for every flagged
# cell. The first dictionary row wins for a repeated code, which is what the
# previous `.values[0]` selection returned.
_proc_text_by_code = (
    p_dic
    .drop_duplicates(subset='ICD_CODE', keep='first')
    .set_index('ICD_CODE')['ICD_TEXT']
    .to_dict()
)

# Every column except the two id columns is treated as a per-code indicator.
_IV_proc_flags = IV.drop(['HADM_ID','ICUSTAY_ID'], axis=1)

# Per row: names of the columns whose value equals 1.
IV['ICD_CODES'] = _IV_proc_flags.apply(
    lambda row: [col for col in row.index if row[col] == 1], axis=1
)
# Per row: dictionary text for each collected code. Column names are converted
# with int() before the lookup, as before; a code missing from the dictionary
# raises KeyError (previously IndexError).
IV['ICD_TEXTs'] = IV['ICD_CODES'].map(
    lambda codes: [_proc_text_by_code[int(code)] for code in codes]
)

In [None]:
# Persist the enriched procedure tables next to the other notebook outputs.
# NOTE(review): this used to write to an absolute 'D:/2025BMC_Mortaility/...'
# location, which only exists on one machine; the diagnosis and medication
# dumps use the relative 'output/' directory. Update any consumer that still
# reads the old location.
with open('output/III_P_1.pkl', 'wb') as fh:
    dill.dump(III, fh)
with open('output/IV_P_1.pkl', 'wb') as fh:
    dill.dump(IV, fh)

## DRUG

In [None]:
# Prescription tables for the two cohorts. NDC is forced to str so the codes
# are not parsed as numbers.
III_PRESCRIPTIONS = pd.read_csv('output/III_M_0.csv', dtype={'NDC':'str'})
IV_PRESCRIPTIONS = pd.read_csv('output/IV_M_0.csv', dtype={'NDC':'str'})

In [None]:
# Keep only the stay id, drug name and NDC code, then collapse exact repeats
# of that triple (the first occurrence is retained).
III_PRESCRIPTIONS = (
    III_PRESCRIPTIONS.loc[:, ['ICUSTAY_ID','DRUG','NDC']]
    .drop_duplicates(keep='first')
)
IV_PRESCRIPTIONS = (
    IV_PRESCRIPTIONS.loc[:, ['ICUSTAY_ID','DRUG','NDC']]
    .drop_duplicates(keep='first')
)

# Row/column counts after de-duplication.
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)

In [None]:
# NDC codes present in BOTH cohorts.
# NOTE: despite its name, `union` holds the set *intersection* (`&`); the name
# is kept because later cells refer to it.
# The former `list(pd.unique(union))` pass was dropped: the list comes from a
# set, so it is already duplicate-free, and calling pd.unique on a plain list
# is deprecated in recent pandas releases.
union = list(set(III_PRESCRIPTIONS.NDC.unique()) & set(IV_PRESCRIPTIONS.NDC.unique()))
print(len(union))

In [None]:
# An NDC is kept when it is absent from fewer than this percentage of ICU
# stays in at least one cohort.
MAX_MISSING_PCT = 95


def _ndc_presence(prescriptions, ndc_codes):
    """Build a stay x NDC presence table.

    Rows are ICUSTAY_IDs, columns are the NDC codes from ``ndc_codes`` that
    occur in ``prescriptions``. A cell holds the pair count (1 after
    de-duplication) when the stay has the NDC and NaN otherwise. ICUSTAY_ID is
    returned as a regular column.
    """
    pairs = (
        prescriptions[prescriptions.NDC.isin(ndc_codes)]
        .drop_duplicates(subset=['ICUSTAY_ID', 'NDC'], keep='first')
        [['ICUSTAY_ID', 'NDC']]
    )
    table = pairs.pivot_table(index='ICUSTAY_ID', columns='NDC', aggfunc='size', fill_value=np.nan)
    return table.reset_index()


III_m = _ndc_presence(III_PRESCRIPTIONS, union)
IV_m = _ndc_presence(IV_PRESCRIPTIONS, union)

# Percentage of stays (per NDC) in which the code does not appear.
III_m_missing = III_m[union].isna().mean() * 100
IV_m_missing = IV_m[union].isna().mean() * 100

# NDCs that are frequent enough in either cohort, de-duplicated in
# first-appearance order. dict.fromkeys replaces pd.unique(list), which is
# deprecated for plain-list input and yields the same order.
_kept_III = list(III_m_missing[III_m_missing < MAX_MISSING_PCT].index)
_kept_IV = list(IV_m_missing[IV_m_missing < MAX_MISSING_PCT].index)
keep_p_ids = list(dict.fromkeys(_kept_III + _kept_IV))
print(len(keep_p_ids),keep_p_ids)

In [None]:
# Restrict both prescription tables to the NDC codes selected in keep_p_ids.
III_PRESCRIPTIONS = III_PRESCRIPTIONS.loc[III_PRESCRIPTIONS.NDC.isin(keep_p_ids)]
IV_PRESCRIPTIONS = IV_PRESCRIPTIONS.loc[IV_PRESCRIPTIONS.NDC.isin(keep_p_ids)]

# Remaining sizes, then the number of distinct NDC values per cohort
# (dropna=False so a missing value is counted, as len(unique()) does).
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)
print(III_PRESCRIPTIONS.NDC.nunique(dropna=False),IV_PRESCRIPTIONS.NDC.nunique(dropna=False))

In [None]:
# SECURITY NOTE(review): eval() executes whatever Python expression the file
# contains, so only run this against a trusted copy of the mapping file. If
# the file is a plain dict literal, ast.literal_eval would be a safer
# replacement — TODO confirm the file format before switching.
with open('input/rxnorm2RXCUI.txt', 'r') as f:
    rxnorm2RXCUI = eval(f.read())
    
# Add an RXCUI column by looking up each NDC in the loaded mapping
# (assuming it evaluates to a dict, NDCs without an entry become NaN).
III_PRESCRIPTIONS['RXCUI'] = III_PRESCRIPTIONS['NDC'].map(rxnorm2RXCUI)
IV_PRESCRIPTIONS['RXCUI'] = IV_PRESCRIPTIONS['NDC'].map(rxnorm2RXCUI)

In [None]:
# Discard every row that has a missing value in ANY column (not just RXCUI).
III_PRESCRIPTIONS = III_PRESCRIPTIONS.dropna()
IV_PRESCRIPTIONS = IV_PRESCRIPTIONS.dropna()
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)

In [None]:
# Keep only rows whose RXCUI is not the empty string.
III_PRESCRIPTIONS = III_PRESCRIPTIONS[III_PRESCRIPTIONS.RXCUI != '']
IV_PRESCRIPTIONS = IV_PRESCRIPTIONS[IV_PRESCRIPTIONS.RXCUI != '']

In [None]:
# RXCUI -> ATC4 mapping: drop the columns that are not needed for the join and
# keep one row (the first) per RXCUI.
rxnorm2atc4 = (
    pd.read_csv('input/RXCUI2atc4.csv')
    .drop(columns=['YEAR', 'MONTH', 'NDC'])
    .drop_duplicates(subset=['RXCUI'])
)

In [None]:
# Cast RXCUI to int64 so it is comparable with the mapping table, and restart
# the row index.
III_PRESCRIPTIONS = (
    III_PRESCRIPTIONS
    .astype({'RXCUI': 'int64'})
    .reset_index(drop=True)
)
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)

# Inner-join on RXCUI (rows without a mapping entry are dropped), then expose
# the joined ATC4 column under the name ATC3.
# NOTE(review): this only renames the column; the code values themselves are
# left untouched — confirm that is intended.
III_PRESCRIPTIONS = (
    pd.merge(III_PRESCRIPTIONS, rxnorm2atc4, on=['RXCUI'])
    .rename(columns={'ATC4': 'ATC3'})
)
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)

In [None]:
# Cast RXCUI to int64 so it is comparable with the mapping table, and restart
# the row index.
IV_PRESCRIPTIONS = (
    IV_PRESCRIPTIONS
    .astype({'RXCUI': 'int64'})
    .reset_index(drop=True)
)
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)

# Inner-join on RXCUI (rows without a mapping entry are dropped), then expose
# the joined ATC4 column under the name ATC3.
# NOTE(review): this only renames the column; the code values themselves are
# left untouched — confirm that is intended.
IV_PRESCRIPTIONS = (
    pd.merge(IV_PRESCRIPTIONS, rxnorm2atc4, on=['RXCUI'])
    .rename(columns={'ATC4': 'ATC3'})
)
print(III_PRESCRIPTIONS.shape,IV_PRESCRIPTIONS.shape)

In [None]:
# Distinct NDC codes, then distinct ATC3 codes, per cohort
# (dropna=False so a missing value is counted, as len(unique()) does).
print(III_PRESCRIPTIONS.NDC.nunique(dropna=False),IV_PRESCRIPTIONS.NDC.nunique(dropna=False))
print(III_PRESCRIPTIONS.ATC3.nunique(dropna=False),IV_PRESCRIPTIONS.ATC3.nunique(dropna=False))

In [None]:
def ATC3toDrug(med_pd):
    """Group drug names by ATC3 code.

    Parameters
    ----------
    med_pd : pandas.DataFrame
        Must contain the columns ``'ATC3'`` and ``'DRUG'``.

    Returns
    -------
    dict
        Maps each ATC3 value found in ``med_pd`` to the set of ``DRUG`` values
        that appear on the same rows.
    """
    drugs_by_atc3 = {}
    for code, name in med_pd[['ATC3', 'DRUG']].values:
        # setdefault creates the bucket the first time a code is seen.
        drugs_by_atc3.setdefault(code, set()).add(name)
    return drugs_by_atc3

In [None]:
# ATC3 code -> set of drug names, built separately for each cohort.
III_atc3toDrug = ATC3toDrug(III_PRESCRIPTIONS)
IV_atc3toDrug = ATC3toDrug(IV_PRESCRIPTIONS)

# Number of distinct ATC3 codes per cohort.
print(len(III_atc3toDrug),len(IV_atc3toDrug))

In [None]:
# Drug reference table; later cells read its `name` and `moldb_smiles` columns.
druginfo = pd.read_csv('input/drugbank_drugs_info.csv')

In [None]:
def atc3toSMILES(ATC3toDrugDict, druginfo, max_smiles=3):
    """Map each ATC3 code to SMILES strings of its drugs.

    Parameters
    ----------
    ATC3toDrugDict : dict
        ATC3 code -> collection of drug names (e.g. the output of
        ``ATC3toDrug``).
    druginfo : pandas.DataFrame
        Must contain the columns ``'name'`` and ``'moldb_smiles'``. Rows whose
        SMILES value is not a string (e.g. NaN) are ignored; when a name is
        repeated, the last row wins.
    max_smiles : int, optional
        Maximum number of SMILES strings kept per ATC3 code (default 3, the
        previously hard-coded limit).

    Returns
    -------
    dict
        ATC3 code -> list of at most ``max_smiles`` SMILES strings. Codes for
        which no drug name has a SMILES entry are omitted.

    Notes
    -----
    Drug names are visited in sorted order (by their string form). Iterating
    the set directly made the truncated selection depend on string hashing,
    so results could differ from one kernel session to the next.
    """
    drug2smiles = {
        drugname: smiles
        for drugname, smiles in druginfo[['name', 'moldb_smiles']].values
        if isinstance(smiles, str)
    }

    atc3tosmiles = {}
    for atc3, drugs in ATC3toDrugDict.items():
        # Names without a SMILES entry are skipped explicitly rather than via
        # a bare `except`, so unrelated errors are no longer swallowed.
        found = [drug2smiles[d] for d in sorted(drugs, key=str) if d in drug2smiles]
        if found:
            atc3tosmiles[atc3] = found[:max_smiles]

    return atc3tosmiles

In [None]:
# ATC3 code -> list of SMILES strings, built separately for each cohort.
III_SMIL = atc3toSMILES(III_atc3toDrug,druginfo)
IV_SMIL = atc3toSMILES(IV_atc3toDrug,druginfo)
# Number of ATC3 codes that received at least one SMILES string.
print(len(III_SMIL),len(IV_SMIL))

In [None]:
# Keep prescriptions whose ATC3 code has at least one SMILES string and attach
# the tab-joined SMILES for that code.
# .copy() makes each filtered frame an independent object before a column is
# added; assigning onto the bare filter result is chained assignment, which
# pandas only reports through a warning (and warnings are silenced here).
III_med_pd = III_PRESCRIPTIONS[III_PRESCRIPTIONS.ATC3.isin(III_SMIL.keys())].copy()
III_med_pd['SMILES'] = III_med_pd['ATC3'].map(lambda x: '\t'.join(III_SMIL[x]))

IV_med_pd = IV_PRESCRIPTIONS[IV_PRESCRIPTIONS.ATC3.isin(IV_SMIL.keys())].copy()
IV_med_pd['SMILES'] = IV_med_pd['ATC3'].map(lambda x: '\t'.join(IV_SMIL[x]))

# Sizes, then distinct ATC3 and NDC counts per cohort.
print(III_med_pd.shape,IV_med_pd.shape)
print(len(III_med_pd.ATC3.unique()),len(IV_med_pd.ATC3.unique()))
print(len(III_med_pd.NDC.unique()),len(IV_med_pd.NDC.unique()))

In [None]:
# NDC codes still present in BOTH cohorts after the SMILES filter.
# NOTE: despite its name, `union` holds the set *intersection* (`&`); the name
# is kept because later cells refer to it.
# The former `list(pd.unique(union))` pass was dropped: the list comes from a
# set, so it is already duplicate-free, and calling pd.unique on a plain list
# is deprecated in recent pandas releases.
union = list(set(III_med_pd.NDC.unique()) & set(IV_med_pd.NDC.unique()))
print(len(union))

In [None]:
# Restrict both medication tables to the NDC codes listed in `union`.
III_med_pd = III_med_pd.loc[III_med_pd.NDC.isin(union)]
IV_med_pd = IV_med_pd.loc[IV_med_pd.NDC.isin(union)]

# Sizes first, then one line per column with the number of distinct values in
# each cohort (dropna=False so a missing value is counted, as len(unique())
# does).
print(III_med_pd.shape,IV_med_pd.shape)
for _col in ('ICUSTAY_ID', 'NDC', 'ATC3', 'SMILES'):
    print(III_med_pd[_col].nunique(dropna=False),IV_med_pd[_col].nunique(dropna=False))

In [None]:
# Distinct (stay, ATC3) pairs per cohort; the first occurrence of each pair is
# retained.
III_pivot = III_med_pd[['ICUSTAY_ID','ATC3']].drop_duplicates(keep='first')
IV_pivot = IV_med_pd[['ICUSTAY_ID','ATC3']].drop_duplicates(keep='first')

In [None]:
# Reshape to one row per ICU stay and one column per ATC3 code; a cell holds
# the pair count where the stay has the code and NaN where it does not.
# ICUSTAY_ID is moved back from the index to a regular column.
III_pivot = (
    III_pivot
    .pivot_table(index='ICUSTAY_ID', columns='ATC3', aggfunc='size', fill_value=np.nan)
    .reset_index()
)
III_pivot.head()

In [None]:
# Reshape to one row per ICU stay and one column per ATC3 code; a cell holds
# the pair count where the stay has the code and NaN where it does not.
# ICUSTAY_ID is moved back from the index to a regular column.
IV_pivot = (
    IV_pivot
    .pivot_table(index='ICUSTAY_ID', columns='ATC3', aggfunc='size', fill_value=np.nan)
    .reset_index()
)
IV_pivot.head()

In [None]:
# Turn the ATC3 columns into presence flags: 1 where a value exists, 0 where
# the cell is NaN. ICUSTAY_ID is left as is.
# NOTE: run this cell only once per pivot. After the first run no NaN is left,
# so a second run would mark every cell as present.
# NOTE(review): the values are written back through .loc, so the resulting
# column dtype may stay float rather than int depending on the pandas
# version — confirm if downstream code relies on an integer dtype.
_III_atc3_cols = III_pivot.columns != 'ICUSTAY_ID'
III_pivot.loc[:, _III_atc3_cols] = III_pivot.loc[:, _III_atc3_cols].notna().astype(int)

_IV_atc3_cols = IV_pivot.columns != 'ICUSTAY_ID'
IV_pivot.loc[:, _IV_atc3_cols] = IV_pivot.loc[:, _IV_atc3_cols].notna().astype(int)

In [None]:
# Per ICU stay: array of distinct ATC3 codes, joined (inner, on ICUSTAY_ID)
# with the array of distinct SMILES strings for the same stay.
III_0 = III_med_pd.groupby(by=['ICUSTAY_ID'])['ATC3'].unique().reset_index()
_III_smiles_per_stay = III_med_pd.groupby(by=['ICUSTAY_ID'])['SMILES'].unique().reset_index()
III_00 = pd.merge(III_0, _III_smiles_per_stay, on=['ICUSTAY_ID'], how='inner')

IV_0 = IV_med_pd.groupby(by=['ICUSTAY_ID'])['ATC3'].unique().reset_index()
_IV_smiles_per_stay = IV_med_pd.groupby(by=['ICUSTAY_ID'])['SMILES'].unique().reset_index()
IV_00 = pd.merge(IV_0, _IV_smiles_per_stay, on=['ICUSTAY_ID'], how='inner')

print(III_00.shape,IV_00.shape)

In [None]:
# Final medication table: the per-stay ATC3 flag columns with the per-stay
# ATC3 / SMILES arrays attached (left join, so every pivot row is kept).
III_FM = III_pivot.merge(III_00, on='ICUSTAY_ID', how='left')
IV_FM = IV_pivot.merge(IV_00, on='ICUSTAY_ID', how='left')

In [None]:
# Persist the final medication tables. Context managers make sure the files
# are flushed and closed even if serialization fails (the handles were
# previously left for the garbage collector to close).
with open('output/III_M_1.pkl', 'wb') as fh:
    dill.dump(III_FM, fh)
with open('output/IV_M_1.pkl', 'wb') as fh:
    dill.dump(IV_FM, fh)