In [1]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO
import json
import requests 

In [2]:
# Initialize S3 client
s3 = boto3.client('s3')

# Function to load data 
bucket_name = 'ml-stack-759578648427-us-west-2'

def from_s3(bucket_name, file_name):
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    ext = os.path.splitext(file_name)[1]
    if ext == '.gz':
        data = pd.read_csv(BytesIO(response['Body'].read()), compression='gzip')
    elif ext == '.csv':
        data = pd.read_csv(BytesIO(response['Body'].read()))
    else:
        data = pd.read_excel(BytesIO(response['Body'].read()), sheet_name='DX_to_CCSR_Mapping', header=1)
    return data

# Import Modeling Set

In [3]:
file_name = 'MIMIC/Cleaned/df_omr_pt_ad_diag_proc.csv'
df = from_s3(bucket_name, file_name) 
df['charttime'] = pd.to_datetime(df['charttime'])
subjects = df['subject_id'].unique()
df.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018,procedure_ct,days_since_last_proc
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,,,,,,,,,1.0,3.0
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,10000980,2185-10-24 00:00:00,63.242,173.0,150.0,90.0,F,72,,,...,,,,,,,,,,
3,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,155.0


# + Prescriptions

Mapping from: https://github.com/fabkury/n2c/tree/main
NDC formatting: https://github.com/MIT-LCP/mimic-code/issues/132
Article rxcui: https://medium.com/@pub_79863/how-to-map-fda-national-drug-codes-ndc-to-drug-classes-bd942fc52e52

In [4]:
def format_ndc(x):
    x = str(int(x))
    if len(x) < 10:
        x = '0' + x 
        x = f"{x[:4]}-{x[4:8]}-{x[8:]}"
    elif (len(x) == 10) & (x[0] == '1'):
        x = f"{x[:5]}-{x[5:8]}-{x[8:]}"
    else:
        x = f"{x[:5]}-{x[5:9]}-{x[9:]}"
    return x

def get_rxcui(ndc):
    r = requests.get(f"https://rxnav.nlm.nih.gov/REST/rxcui.json?idtype=NDC&id={ndc}", headers={"Accept": "application/json"})
    data = r.json()
    rxcui = data.get('idGroup', {}).get('rxnormId', [None])[0]
    return rxcui

def get_atc(rxcui):
    r = requests.get(f"https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json?rxcui={rxcui}", headers={"Accept": "application/json"})
    data = r.json()
    classes = data.get('rxclassDrugInfoList', {}).get('rxclassDrugInfo', [])
    # take first atc class 
    atc_class = [cls['rxclassMinConceptItem']['classId'] for cls in classes if cls['rxclassMinConceptItem']['classType'] == 'ATC1-4']
    if not atc_class:
        atc_class = None 
        atc_class_name = None
    else:
        atc_class = atc_class[0]
        atc_class_name = [cls['rxclassMinConceptItem']['className'] for cls in classes if cls['rxclassMinConceptItem']['classType'] == 'ATC1-4'][0]
    return f"{atc_class}***{atc_class_name}"

In [5]:
file_name = 'MIMIC/prescriptions.csv.gz'
pr = from_s3(bucket_name, file_name) 
# filter subjects 
pr = pr[pr['subject_id'].isin(subjects)]
# subset columns 
pr = pr[['subject_id', 'starttime', 'stoptime', 'drug_type', 'drug','formulary_drug_cd','gsn', 'ndc', 'dose_val_rx', 'dose_unit_rx']]
# fillna 
pr.ndc = pr.ndc.fillna(0)
# map codes 
pr.ndc = pr.ndc.apply(format_ndc)
pr.head()

  data = pd.read_csv(BytesIO(response['Body'].read()), compression='gzip')


Unnamed: 0,subject_id,starttime,stoptime,drug_type,drug,formulary_drug_cd,gsn,ndc,dose_val_rx,dose_unit_rx
370,10000826,2146-12-06 00:00:00,2146-12-06 08:00:00,MAIN,Morphine Sulfate,MORP4I,4072,0409-1258-30,2-4,mg
371,10000826,2146-12-06 06:00:00,2146-12-06 08:00:00,MAIN,Lorazepam,LORA2I,3753,0409-1985-30,0.5,mg
372,10000826,2146-12-05 20:00:00,2146-12-06 08:00:00,MAIN,Heparin,HEPA5I,6549,63323-0262-01,5000,UNIT
373,10000826,2146-12-07 17:00:00,2146-12-12 21:00:00,MAIN,Docusate Sodium,DOCU100,3009,0904-2244-61,100,mg
374,10000826,2146-12-07 03:00:00,2146-12-08 02:00:00,MAIN,OxycoDONE (Immediate Release),OXYC5,4225,0406-0552-62,2.5,mg


### Create NDC to ATC Mapping

In [30]:
# map to rxcui codes 
ndc_map = pr[['ndc']].drop_duplicates()
ndc_map['rxcui'] = ndc_map.ndc.apply(get_rxcui)
ndc_map.head()

Unnamed: 0,ndc,rxcui
370,0409-1258-30,
371,0409-1985-30,763028.0
372,63323-0262-01,1361615.0
373,0904-2244-61,
374,0406-0552-62,1049621.0


In [89]:
ndc_map_exists = ndc_map.dropna()
ndc_map_exists.head()

Unnamed: 0,ndc,rxcui
371,0409-1985-30,763028
372,63323-0262-01,1361615
374,0406-0552-62,1049621
376,63323-0013-02,313324
385,0338-0049-03,1807634


In [101]:
# get atc classes 
ndc_map_exists['atc_info'] = ndc_map_exists.rxcui.apply(get_atc)
ndc_map_exists[['atc_class', 'atc_class_name']] = ndc_map_exists.atc_info.str.replace('***', '---').str.split('---', expand=True)
ndc_map_exists.drop(columns=['atc_info'], axis=1, inplace=True)
ndc_map_exists = ndc_map_exists[ndc_map_exists.atc_class != 'None']
ndc_map_exists.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndc_map_exists['atc_info'] = ndc_map_exists.rxcui.apply(get_atc)


In [123]:
# save mapping in s3 
s3_key = 'MIMIC/Cleaned/ndc_act_mapping.csv'

# Save the DataFrame to a CSV in memory (using StringIO)
csv_buffer = StringIO()
ndc_map_exists.to_csv(csv_buffer, index=False)

# Upload the CSV data to S3
s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

print(f"DataFrame successfully exported to s3://{bucket_name}/{s3_key}")

DataFrame successfully exported to s3://ml-stack-759578648427-us-west-2/MIMIC/Cleaned/ndc_act_mapping.csv


### Map & Join to Final DF 

In [6]:
file_name = 'MIMIC/Cleaned/ndc_act_mapping.csv'
ndc_map_exists = from_s3(bucket_name, file_name) 
ndc_map_exists.head()

Unnamed: 0,ndc,rxcui,atc_class,atc_class_name
0,0409-1985-30,763028,N05BA,Benzodiazepine derivatives
1,63323-0262-01,1361615,B01AB,Heparin group
2,0406-0552-62,1049621,N02AA,Natural opium alkaloids
3,63323-0013-02,313324,A11DA,"Vitamin B1, plain"
4,0338-0049-03,1807634,B05XA,Electrolyte solutions


In [15]:
num_matched = ndc_map_exists.ndc.nunique() / pr.ndc.nunique()
print(f"Matched {num_matched:.2%} codes")

Matched 57.22% codes


In [7]:
pr_mapped = pr.merge(ndc_map_exists, how='left', on='ndc')
pr_mapped.atc_class.fillna('PR_99', inplace=True)
pr_mapped = pr_mapped[['subject_id', 'stoptime', 'atc_class']]

# pivot 
pr_pivot = pr_mapped.groupby(['subject_id', 'stoptime', 'atc_class']).size().reset_index()
pr_pivot = pr_pivot.pivot(index=['subject_id', 'stoptime'], columns='atc_class', values=0).reset_index()
pr_pivot.fillna(0, inplace=True)
pr_pivot.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pr_mapped.atc_class.fillna('PR_99', inplace=True)


atc_class,subject_id,stoptime,A01AA,A01AB,A01AC,A01AD,A02AA,A02AB,A02AC,A02AF,...,V03AC,V03AE,V03AF,V03AX,V04CC,V04CF,V04CG,V04CK,V06DC,V07AB
0,10000826,2146-12-06 08:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000826,2146-12-06 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000826,2146-12-06 19:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000826,2146-12-07 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000826,2146-12-07 14:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# join to final data 
df_sub = df[['subject_id', 'charttime']]
df_pr = df_sub.merge(pr_pivot, on='subject_id', how='left')
# filter dates, stop time before or equal to charttime 
df_pr['stoptime'] = pd.to_datetime(df_pr['stoptime'])
df_pr = df_pr[(df_pr.stoptime <= df_pr.charttime) | (df_pr.stoptime.isna() == True)]
df_pr.drop(columns=['stoptime'], inplace=True)

# count all presciption counts per category up to charttime i.e., mdeication history 
df_pr = df_pr.groupby(['subject_id', 'charttime']).sum().reset_index()
df_pr.fillna(0, inplace=True)

In [9]:
df_pr.shape

(35529, 441)

In [10]:
# merge to final data set 
df_omr_pt_ad_diag_proc_pr = df.merge(df_pr, on=['subject_id', 'charttime'])
df_omr_pt_ad_diag_proc_pr.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,V03AC,V03AE,V03AF,V03AX,V04CC,V04CF,V04CG,V04CK,V06DC,V07AB
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001401,2134-10-09 13:45:00,65.5,168.283,146.155,60.552,F,92,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
s3_key = 'MIMIC/Cleaned/df_omr_pt_ad_diag_proc_pr.csv'

# Save the DataFrame to a CSV in memory (using StringIO)
csv_buffer = StringIO()
df_omr_pt_ad_diag_proc_pr.to_csv(csv_buffer, index=False)

# Upload the CSV data to S3
s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

print(f"DataFrame successfully exported to s3://{bucket_name}/{s3_key}")

DataFrame successfully exported to s3://ml-stack-759578648427-us-west-2/MIMIC/Cleaned/df_omr_pt_ad_diag_proc_pr.csv


# Old Mapping 

In [10]:
# import ndc to atc4 mapping 
file_name = 'MIMIC/package_NDC_ATC4_classes.csv'
ndc_map = from_s3(bucket_name, file_name) 
# format codes 
ndc_map.NDC = ndc_map.NDC.str.replace('-', '').astype(float)
# join ATC codes to pr 
pr = pr.merge(ndc_map, left_on='ndc', right_on='NDC', how='left')

# % matched
matched = pr[~pr.ATC_class.isna()]
num_matched = matched.ndc.nunique() / pr.ndc.nunique()
print(f"Matched {num_matched:.2%} codes")

# fillna 
pr.ATC_class = pr.ATC_class.fillna('99')
# get drug count by stop time 
pr = pr.groupby(['subject_id', 'stoptime', 'ATC_class']).size().reset_index()
#pivot data 
pr = pr.pivot(index=['subject_id', 'stoptime'], columns='ATC_class', values=0).reset_index()
# filna
pr.fillna(0, inplace=True)
pr.head()

Matched 21.54% codes


ATC_class,subject_id,stoptime,99,A01AA,A01AB,A01AC,A01AD,A02AA,A02AB,A02AC,...,V03AB,V03AC,V03AE,V03AF,V04CA,V04CC,V04CG,V04CX,V06DC,V07AB
0,10000826,2146-12-06 08:00:00,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000826,2146-12-06 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000826,2146-12-06 19:00:00,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000826,2146-12-07 10:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000826,2146-12-07 14:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
