In [4]:
# !pip install --upgrade pip
# !pip install loguru
# !pip install xgboost
# !pip install mapply
# !pip install pandas>=1.5.0

In [5]:
import pandas as pd
import mapply
from datetime import datetime as dt
from loguru import logger
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

In [6]:
mapply.init(
    n_workers=-1,
    chunk_size=1000,
    max_chunks_per_worker=100,
    progressbar=True
)

In [7]:
def datestring_to_datetime(s):
    return dt.strptime(s, '%m/%d/%Y')

In [8]:
dataset = "s3://adl-core-sagemaker-studio/external/srivatsava/athena-query-results/Unsaved/2023/06/26/fab31066-4fe3-4ce0-b6f3-c5dcaa2d3a14.csv"
df = pd.read_csv(dataset)
df = df[pd.notnull(df['enrolid'])]
df.columns = ['patient_id', 'dx1', 'dx2', 'dx3', 'dx4', 'adm_dt', 'discharge_dt']

In [9]:
df["adm_dt"] = df["adm_dt"].mapply(lambda x: datestring_to_datetime(x))
df["discharge_dt"] = df["discharge_dt"].mapply(lambda x: datestring_to_datetime(x))

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=900.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=900.0), HTML(value='')), layout=Layout(di…




In [10]:
df['patient_id'] = df['patient_id'].mapply(lambda x: str(int(x)))

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=900.0), HTML(value='')), layout=Layout(di…




In [11]:
# def check_target_diseases(patient_df, start_date, end_date, target_disease):
#     relevant_visits = patient_df[(patient_df['adm_dt'] >= start_date) & (patient_df['discharge_dt'] < end_date)]
#     return int(any(code == target_disease for code in relevant_visits['diagnosis_code']))

In [12]:
ICD_COLS = ['dx1', 'dx2', 'dx3', 'dx4']

In [13]:
%%time
df2 = pd.melt(df, 
              id_vars=['patient_id', 'adm_dt', 'discharge_dt'],
              value_vars=ICD_COLS,
              value_name='diagnosis_code').drop('variable', axis=1).dropna()

CPU times: user 26.7 s, sys: 7.95 s, total: 34.7 s
Wall time: 34.5 s


In [14]:
import random
random.seed(42)

all_patients = df2.patient_id.unique().tolist()
sample_patients = random.sample(all_patients,100000)

In [15]:
df2 = df2[df2['patient_id'].isin(sample_patients)].reset_index(drop='index')

In [16]:
labels = ['N18','I50', 'J44', 'M16', 'M17']

In [17]:
df2['diag3'] = df2['diagnosis_code'].mapply(lambda x: x[:3])

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=900.0), HTML(value='')), layout=Layout(di…




In [18]:
for lab in labels:
    print(df2[df2['diag3']==lab].shape)

(16769, 5)
(27351, 5)
(10733, 5)
(5059, 5)
(7200, 5)


In [19]:
df2.head()

Unnamed: 0,patient_id,adm_dt,discharge_dt,diagnosis_code,diag3
0,837601,2019-04-10,2019-04-13,J9601,J96
1,68321501,2019-05-29,2019-06-06,N183,N18
2,68321501,2019-08-19,2019-08-29,A4189,A41
3,68321501,2019-09-13,2019-09-20,I340,I34
4,243536201,2019-09-23,2019-09-23,K922,K92


In [20]:
df2_labels = df2[df2['diag3'].isin(labels)]
df2_labels.shape

(67112, 5)

In [21]:
df2_fod = df2_labels.groupby(['patient_id','diag3']).agg({'adm_dt':'min'}).reset_index()
df2_fod.head()

Unnamed: 0,patient_id,diag3,adm_dt
0,1005559304,I50,2019-01-29
1,1005559304,N18,2019-01-29
2,1005926202,I50,2018-09-28
3,1005926202,N18,2018-09-28
4,1011351001,I50,2019-11-08


In [22]:
df2_fod.columns = ['patient_id','diag3','fod']

In [23]:
df2_fod2 = df2_fod.pivot(index='patient_id',columns = ['diag3'], values=['fod']).reset_index()

In [24]:
df2_fod2.head()

Unnamed: 0_level_0,patient_id,fod,fod,fod,fod,fod
diag3,Unnamed: 1_level_1,I50,J44,M16,M17,N18
0,1005559304,2019-01-29,NaT,NaT,NaT,2019-01-29
1,1005926202,2018-09-28,NaT,NaT,NaT,2018-09-28
2,1011351001,2019-11-08,NaT,NaT,NaT,NaT
3,1011380202,NaT,NaT,2019-10-28,NaT,NaT
4,1011388002,NaT,NaT,NaT,NaT,2018-09-14


In [25]:
df2_fod2.columns = df2_fod2.columns.droplevel()

In [26]:
df2_fod2.head()

diag3,Unnamed: 1,I50,J44,M16,M17,N18
0,1005559304,2019-01-29,NaT,NaT,NaT,2019-01-29
1,1005926202,2018-09-28,NaT,NaT,NaT,2018-09-28
2,1011351001,2019-11-08,NaT,NaT,NaT,NaT
3,1011380202,NaT,NaT,2019-10-28,NaT,NaT
4,1011388002,NaT,NaT,NaT,NaT,2018-09-14


In [27]:
label_imputations = ['target_'+lab for lab in labels]

In [28]:
df2_fod2.columns = ['patient_id']+sorted(label_imputations)

In [74]:
df2_fod2.head(10)

Unnamed: 0,patient_id,target_I50,target_J44,target_M16,target_M17,target_N18
0,1005559304,2019-01-29,2030-12-31,2030-12-31,2030-12-31,2019-01-29
1,1005926202,2018-09-28,2030-12-31,2030-12-31,2030-12-31,2018-09-28
2,1011351001,2019-11-08,2030-12-31,2030-12-31,2030-12-31,2030-12-31
3,1011380202,2030-12-31,2030-12-31,2019-10-28,2030-12-31,2030-12-31
4,1011388002,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2018-09-14
5,1011395401,2018-09-17,2030-12-31,2030-12-31,2030-12-31,2018-12-25
6,1011452701,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2019-10-02
7,1011464401,2018-01-02,2030-12-31,2030-12-31,2030-12-31,2018-01-02
8,1011509401,2019-06-05,2019-06-05,2030-12-31,2030-12-31,2019-06-05
9,1011669301,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2019-02-15


In [30]:
from datetime import datetime
datetime.today()

datetime.datetime(2023, 7, 24, 14, 5, 52, 802952)

In [31]:
df2_fod2.fillna(pd.to_datetime('2030-12-31'), inplace = True)

In [32]:
df3 = pd.merge(df2, df2_fod2, on = ['patient_id'], how = 'left')
df2.shape

(2383653, 5)

In [33]:
df3.shape

(2383653, 10)

In [34]:
df3.head(5)

Unnamed: 0,patient_id,adm_dt,discharge_dt,diagnosis_code,diag3,target_I50,target_J44,target_M16,target_M17,target_N18
0,837601,2019-04-10,2019-04-13,J9601,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2018-07-19
1,68321501,2019-05-29,2019-06-06,N183,N18,2019-08-19,2019-08-19,2030-12-31,2030-12-31,2019-05-29
2,68321501,2019-08-19,2019-08-29,A4189,A41,2019-08-19,2019-08-19,2030-12-31,2030-12-31,2019-05-29
3,68321501,2019-09-13,2019-09-20,I340,I34,2019-08-19,2019-08-19,2030-12-31,2030-12-31,2019-05-29
4,243536201,2019-09-23,2019-09-23,K922,K92,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2021-06-16


In [35]:
df3.fillna(pd.to_datetime('2030-12-31'), inplace = True)

In [36]:
df3['month_id'] = (df3['adm_dt'].dt.year-2018)*12 + df3['adm_dt'].dt.month

In [37]:
df3.head()

Unnamed: 0,patient_id,adm_dt,discharge_dt,diagnosis_code,diag3,target_I50,target_J44,target_M16,target_M17,target_N18,month_id
0,837601,2019-04-10,2019-04-13,J9601,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2018-07-19,16
1,68321501,2019-05-29,2019-06-06,N183,N18,2019-08-19,2019-08-19,2030-12-31,2030-12-31,2019-05-29,17
2,68321501,2019-08-19,2019-08-29,A4189,A41,2019-08-19,2019-08-19,2030-12-31,2030-12-31,2019-05-29,20
3,68321501,2019-09-13,2019-09-20,I340,I34,2019-08-19,2019-08-19,2030-12-31,2030-12-31,2019-05-29,21
4,243536201,2019-09-23,2019-09-23,K922,K92,2030-12-31,2030-12-31,2030-12-31,2030-12-31,2021-06-16,21


In [38]:
import numpy as np
for col in sorted(label_imputations):
    dx = col.split('_')[-1]
    new_col = 'month_'+dx
    df3[new_col] = (df3[col].dt.year-2018)*12+df3[col].dt.month

In [39]:
all_patients = pd.DataFrame(df3['patient_id'].unique().tolist())
all_months = pd.DataFrame(np.arange(13,25).tolist())

all_months.columns = ['month_id']
all_patients.columns = ['patient_id']

all_months['key'] = 1
all_patients['key'] = 1
  
# to obtain the cross join we will merge 
# on the key and drop it.
result = pd.merge(all_months, all_patients, on ='key').drop("key", 1)

result.columns = ['current_month','patient_id']

In [40]:
df4 = pd.merge(result, df3, on = 'patient_id')

In [41]:
import numpy as np
for col in sorted(label_imputations):
    dx = col.split('_')[-1]
    new_col = 'final_target_'+dx
    month_dx = 'month_'+dx
    df4[new_col] = np.where((df4[month_dx]>df4['current_month']) & (df4[month_dx]<(df4['current_month']+4)),1,0)

In [42]:
df4.head()

Unnamed: 0,current_month,patient_id,adm_dt,discharge_dt,diagnosis_code,diag3,target_I50,target_J44,target_M16,target_M17,...,month_I50,month_J44,month_M16,month_M17,month_N18,final_target_I50,final_target_J44,final_target_M16,final_target_M17,final_target_N18
0,13,837601,2019-04-10,2019-04-13,J9601,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,156,7,0,0,0,0,0
1,13,837601,2019-04-10,2019-04-13,J9621,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,156,7,0,0,0,0,0
2,13,837601,2019-04-10,2019-04-13,J9621,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,156,7,0,0,0,0,0
3,13,837601,2019-04-10,2019-04-13,R918,R91,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,156,7,0,0,0,0,0
4,13,837601,2019-04-10,2019-04-13,J9621,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,156,7,0,0,0,0,0


In [43]:
df4['adm_month'] = (df4['adm_dt'].dt.year-2018)*12+df4['adm_dt'].dt.month

In [44]:
df4_targets = df4[['current_month','patient_id'] + [f'final_target_{label}' for label in labels]]

In [45]:
df4.head()

Unnamed: 0,current_month,patient_id,adm_dt,discharge_dt,diagnosis_code,diag3,target_I50,target_J44,target_M16,target_M17,...,month_J44,month_M16,month_M17,month_N18,final_target_I50,final_target_J44,final_target_M16,final_target_M17,final_target_N18,adm_month
0,13,837601,2019-04-10,2019-04-13,J9601,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,7,0,0,0,0,0,16
1,13,837601,2019-04-10,2019-04-13,J9621,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,7,0,0,0,0,0,16
2,13,837601,2019-04-10,2019-04-13,J9621,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,7,0,0,0,0,0,16
3,13,837601,2019-04-10,2019-04-13,R918,R91,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,7,0,0,0,0,0,16
4,13,837601,2019-04-10,2019-04-13,J9621,J96,2030-12-31,2030-12-31,2030-12-31,2030-12-31,...,156,156,156,7,0,0,0,0,0,16


In [46]:
df4_ind_vars = df4[df4['current_month']>df4['adm_month']][['current_month','patient_id','diag3']].reset_index(drop='index')

In [47]:
df4_ind_vars.head()

Unnamed: 0,current_month,patient_id,diag3
0,13,837601,K43
1,13,837601,I44
2,13,837601,K56
3,13,837601,K43
4,13,837601,K56


In [48]:
df4_targets = df4_targets.drop_duplicates()

In [49]:
df4_targets.shape

(1200000, 7)

In [50]:
df4_targets['patient_id'].value_counts()

837601        12
3945780302    12
4158050103    12
4154551103    12
4121391502    12
              ..
3987425204    12
3985046901    12
3983760501    12
3954901102    12
3090280602    12
Name: patient_id, Length: 100000, dtype: int64

In [51]:
df4_targets.head()

Unnamed: 0,current_month,patient_id,final_target_N18,final_target_I50,final_target_J44,final_target_M16,final_target_M17
0,13,837601,0,0,0,0,0
82,14,837601,0,0,0,0,0
164,15,837601,0,0,0,0,0
246,16,837601,0,0,0,0,0
328,17,837601,0,0,0,0,0


In [52]:
df4_ind_vars.head()

Unnamed: 0,current_month,patient_id,diag3
0,13,837601,K43
1,13,837601,I44
2,13,837601,K56
3,13,837601,K43
4,13,837601,K56


In [53]:
df4_ind_vars['dummy'] = 1

In [54]:
t = df4_ind_vars['diag3'].value_counts(1).cumsum().reset_index()
t.columns = ['dx','cum_percent']
t[t['cum_percent']<0.8].shape, t.shape

((217, 2), (1657, 2))

In [55]:
dx_consider = t[t['cum_percent']<0.8]['dx'].tolist() + labels

In [56]:
df4_ind_vars['diag3'] = np.where(df4_ind_vars['diag3'].isin(dx_consider), df4_ind_vars['diag3'], 'other')

In [57]:
df4_ind_vars2 = df4_ind_vars.copy()

In [58]:
# for col in dx_consider:
#     print(col)
#     df4_ind_vars[col] = np.where(df4_ind_vars['diag3']==col,1,0)

# df4_ind_vars.shape

# df4_ind_vars.head()

# df4_ind_vars.drop(columns = ['diag3','dummy'], axis = 1, inplace= True)

# df4_ind_vars.shape

# df4_ind_vars.groupby(['current_month','patient_id']).sum()

In [59]:
df4_ind_vars.head()

Unnamed: 0,current_month,patient_id,diag3,dummy
0,13,837601,other,1
1,13,837601,I44,1
2,13,837601,K56,1
3,13,837601,other,1
4,13,837601,K56,1


In [60]:
pivot_df = df4_ind_vars.pivot_table(index=['patient_id', 'current_month'],
                              columns = 'diag3',
                              values = 'dummy',
                              aggfunc = 'max')

In [61]:
pivot_df = pivot_df.reset_index().fillna(0)

In [62]:
X = pd.merge(pivot_df, df4_targets, on=['patient_id', 'current_month'], how='inner')

In [63]:
X.head()

Unnamed: 0,patient_id,current_month,A04,A41,B95,B96,C18,C34,C50,C78,...,Z95,Z96,Z98,Z99,other,final_target_N18,final_target_I50,final_target_J44,final_target_M16,final_target_M17
0,1005559304,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,1005559304,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
2,1005559304,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
3,1005559304,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
4,1005559304,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [64]:
X = X.drop(columns=['patient_id', 'current_month'])

In [65]:
df_train, df_test = train_test_split(X,
                                     # stratify=X['final_target_I50'],
                                     test_size=0.2,
                                     random_state=7)

In [66]:
import xgboost as xgb

xgb_estimator = xgb.XGBClassifier(objective='binary:logistic',
                          use_label_encoder=False,
                          max_depth=1)

In [67]:
from sklearn.multioutput import MultiOutputClassifier

In [68]:
X_train, y_train = df_train.drop(columns=[f'final_target_{label}' for label in labels]), df_train[[f'final_target_{label}' for label in labels]]

In [69]:
X_test, y_test = df_test.drop(columns=[f'final_target_{label}' for label in labels]), df_train[[f'final_target_{label}' for label in labels]]

In [70]:
model = MultiOutputClassifier(xgb_estimator)
model.fit(X_train, y_train)

MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=None,
                                              gpu_id=None, grow_policy=None,
                                              importance_type=None,
                                              interaction_constraints=None,
                                              learning_rate=None, max_bin=None,
                                              max_cat_to_onehot=None,
                                              max_delta_step

In [71]:
y_pred = model.predict_proba(X_train)
for ix, label in enumerate(labels):
    label_ = f'final_target_{label}'
    print (f"{label}: {roc_auc_score(y_train[label_], y_pred[ix][:, 1])}")

N18: 0.9156434946041732
I50: 0.9033970088623829
J44: 0.8836446061301105
M16: 0.8948416280592258
M17: 0.8895435389936615


In [72]:
from IPython.display import display

In [73]:
for ix, label in enumerate(labels):
    print (label)
    df_imp = pd.DataFrame({"col": X_train.columns, "imp": model.estimators_[ix].feature_importances_})
    display (df_imp.sort_values(by='imp', ascending=False)[0:10])
    print (('-'*130))

N18


Unnamed: 0,col,imp
61,I48,0.081131
63,I50,0.070599
116,N17,0.068864
22,E11,0.064729
46,I12,0.052745
64,I51,0.052466
28,E87,0.047805
197,Z37,0.047738
117,N18,0.040967
120,N39,0.03951


----------------------------------------------------------------------------------------------------------------------------------
I50


Unnamed: 0,col,imp
116,N17,0.088022
44,I10,0.083752
22,E11,0.072175
64,I51,0.047766
153,R06,0.047598
120,N39,0.04317
50,I25,0.042891
113,M79,0.04211
197,Z37,0.036537
15,D64,0.031952


----------------------------------------------------------------------------------------------------------------------------------
J44


Unnamed: 0,col,imp
71,J18,0.069912
44,I10,0.069025
154,R07,0.052354
153,R06,0.04605
19,D72,0.045825
197,Z37,0.04419
32,F17,0.035713
38,G40,0.035657
63,I50,0.035486
170,R53,0.03516


----------------------------------------------------------------------------------------------------------------------------------
M16


Unnamed: 0,col,imp
111,M54,0.080233
42,G89,0.071768
104,M17,0.070106
25,E78,0.06954
109,M48,0.058259
197,Z37,0.043025
44,I10,0.038551
198,Z38,0.029306
70,I95,0.026706
105,M19,0.025211


----------------------------------------------------------------------------------------------------------------------------------
M17


Unnamed: 0,col,imp
24,E66,0.061076
45,I11,0.055287
113,M79,0.052257
107,M43,0.05186
210,Z87,0.044461
166,R42,0.040451
118,N20,0.040046
41,G47,0.039802
178,R74,0.037465
106,M25,0.037362


----------------------------------------------------------------------------------------------------------------------------------
