In [1]:
from gensim.models import KeyedVectors

bio_word2vec = KeyedVectors.load_word2vec_format('/scratch/spp9399/mimic/bioW2V/BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True)

In [2]:
import pandas as pd
df = pd.read_csv("/scratch/spp9399/mimic/data/los/LOS_WEEKS_adm_train_med7.csv")

In [3]:
df

Unnamed: 0,id,subject_id,gender,dob,admittime,ethnicity,text,los_label,all_entities
0,155297,41976,M,2136-07-28 00:00:00,2201-11-16 23:00:00,HISPANIC/LATINO - PUERTO RICAN,CHIEF COMPLAINT: Decreased responsiveness Maj...,0,"[('coumadin', 'DRUG'), ('Cefepime', 'DRUG'), (..."
1,168150,26316,F,2114-01-14 00:00:00,2185-12-16 15:46:00,UNKNOWN/NOT SPECIFIED,"CHIEF COMPLAINT: Fatigue, wide complex tachyca...",1,"[('propafenone', 'DRUG'), ('Digoxin', 'DRUG'),..."
2,154015,57593,M,2085-04-04 00:00:00,2144-01-13 18:55:00,WHITE,CHIEF COMPLAINT: preop CABG\n\nPRESENT ILLNESS...,2,"[('coumadin', 'DRUG'), ('Albuterol', 'DRUG'), ..."
3,145268,1217,M,2072-02-15 00:00:00,2125-04-05 16:55:00,BLACK/AFRICAN AMERICAN,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: This is ...,1,"[('Hydrochlorothiazide', 'DRUG'), ('25 mg', 'S..."
4,170521,17144,M,2123-10-03 00:00:00,2195-07-23 10:27:00,UNKNOWN/NOT SPECIFIED,CHIEF COMPLAINT: worsening shortness of breath...,2,"[('atenolol', 'DRUG'), ('25', 'STRENGTH'), ('3..."
...,...,...,...,...,...,...,...,...,...
30416,194191,5060,M,2144-09-28 00:00:00,2181-08-20 20:26:00,WHITE,CHIEF COMPLAINT: etoh w/d\n\nPRESENT ILLNESS: ...,0,"[('listerine', 'DRUG'), ('valium', 'DRUG'), ('..."
30417,158608,25326,M,2058-05-23 00:00:00,2117-05-14 20:20:00,BLACK/AFRICAN AMERICAN,CHIEF COMPLAINT: Altered mental status\n\nPRES...,0,"[('bicarb', 'DRUG'), ('10 units', 'DOSAGE'), (..."
30418,192165,28575,M,2077-02-25 00:00:00,2159-08-28 23:11:00,WHITE,CHIEF COMPLAINT: Chest Pain\n\nPRESENT ILLNESS...,1,"[('3', 'DOSAGE'), ('baby', 'FORM'), ('aspirin'..."
30419,152118,30275,M,2073-09-15 00:00:00,2106-11-17 00:13:00,HISPANIC OR LATINO,"CHIEF COMPLAINT: Status epilepticus, sepsis\n\...",3,"[('Keppra', 'DRUG'), ('1', 'DOSAGE'), ('tablet..."


In [4]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import re

docs = []
for text in tqdm( df["text"], desc="Sentence tokenization" ):
    #1. Replace "\n" with " " (spaces)
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    #2. Replace "[** - **]" data in this format with "" (mostly time and date)
    text = re.sub(r'\[\*\*.*?\*\*\]', "", text)
    #3. Removed wide spaces
    text = re.sub(' +', " ", text)
    docs.append(word_tokenize(text.lower()))

Sentence tokenization: 100%|██████████| 30421/30421 [01:18<00:00, 387.27it/s]


In [5]:
from gensim.models.doc2vec import TaggedDocument

tagged_docs = [TaggedDocument(words=doc, tags=[f'doc_{i}']) for i, doc in enumerate(docs)]

In [6]:
from gensim.models import Doc2Vec

In [7]:
doc2vec_model = Doc2Vec(vector_size=200, window=5, min_count=2, workers=4, epochs=40, dm=1)

In [8]:
doc2vec_model.build_vocab(tagged_docs)

In [9]:
doc2vec_model.wv.vectors = bio_word2vec.vectors

In [10]:
doc2vec_model.train(tagged_docs, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [11]:
y_tain = df['los_label'].values

In [12]:
train_vectors = [doc2vec_model.dv[f'doc_{i}'] for i in range(len(tagged_docs))]

In [13]:
len(train_vectors)

30421

In [14]:
def get_doc2vec_vectors(df, doc2vec):
    docs = []
    for text in tqdm( df["text"], desc="word tokenization" ):
        #1. Replace "\n" with " " (spaces)
        text = text.replace("\n", " ")
        text = text.replace("\t", " ")
        #2. Replace "[** - **]" data in this format with "" (mostly time and date)
        text = re.sub(r'\[\*\*.*?\*\*\]', "", text)
        #3. Removed wide spaces
        text = re.sub(' +', " ", text)
        docs.append(word_tokenize(text.lower()))
    tagged_docs = [TaggedDocument(words=doc, tags=[f'doc_{i}']) for i, doc in enumerate(docs)]
    print(tagged_docs[0])
    
    res = [doc2vec_model.infer_vector(doc[0], epochs=20, alpha=0.025) for doc in tagged_docs]
    return res

In [15]:
df_test = pd.read_csv("/scratch/spp9399/mimic/data/los/LOS_WEEKS_adm_test_med7.csv")

In [16]:
test_vectors = get_doc2vec_vectors(df_test, doc2vec_model)
y_test = df_test['los_label'].values

word tokenization: 100%|██████████| 8797/8797 [00:22<00:00, 385.08it/s]


TaggedDocument<['chief', 'complaint', ':', 'fall', 'from', 'syncopal', 'episode', ',', 'suffered', 'injuries', 'to', 'face', 'and', 'neck', 'including', 'facial', 'fractures', 'and', 'a', 'c2', 'dens', 'fracture', '.', 'present', 'illness', ':', '86-year-old', 'gentleman', 'who', 'is', 'transferred', 'from', 'an', 'outside', 'hospital', 'after', 'a', 'syncopal', 'episode', 'when', 'he', 'fell', 'onto', 'his', 'face', '.', 'ct', 'scan', 'at', 'the', 'outside', 'hospital', 'demonstrated', 'lefort', 'fractures', 'of', 'the', 'face', 'and', 'a', 'c2', 'fracture', '.', 'he', 'had', 'his', 'lacerations', 'repaired', 'at', 'the', 'outside', 'hospital', 'is', 'transferred', 'here', 'for', 'further', 'evaluation', '.', 'he', 'complains', 'of', 'pain', 'to', 'the', 'face', 'and', 'head', '.', 'he', 'denies', 'any', 'nausea', 'or', 'vomiting', '.', 'he', 'states', 'that', 'before', 'he', 'passed', 'out', 'he', 'had', 'lightheadedness', 'and', 'dizziness', 'medical', 'history', ':', 'htn', ',', 'h

In [18]:
from sklearn.metrics import accuracy_score, classification_report

In [19]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

logreg.fit(train_vectors, y_tain)

y_pred = logreg.predict(test_vectors)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))



Accuracy: 0.38910992383767196
              precision    recall  f1-score   support

           0       0.40      0.05      0.08      1121
           1       0.42      0.63      0.50      3328
           2       0.35      0.34      0.34      2692
           3       0.36      0.21      0.27      1656

    accuracy                           0.39      8797
   macro avg       0.38      0.31      0.30      8797
weighted avg       0.38      0.39      0.36      8797



In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42) # You can adjust the hyperparameters

rf_classifier.fit(train_vectors, y_tain)

y_pred = rf_classifier.predict(test_vectors)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.3803569398658634
              precision    recall  f1-score   support

           0       0.80      0.00      0.01      1121
           1       0.40      0.77      0.53      3328
           2       0.33      0.27      0.30      2692
           3       0.38      0.03      0.06      1656

    accuracy                           0.38      8797
   macro avg       0.48      0.27      0.22      8797
weighted avg       0.42      0.38      0.30      8797



In [21]:
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier(
    n_estimators=100, 
    learning_rate=0.1,  
    max_depth=3,        
    random_state=42
)

gbm.fit(train_vectors, y_tain)

y_pred = gbm.predict(test_vectors)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.3935432533818347
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.02      0.03      1121
           1       0.41      0.77      0.53      3328
           2       0.35      0.28      0.31      2692
           3       0.39      0.07      0.12      1656

    accuracy                           0.39      8797
   macro avg       0.40      0.29      0.25      8797
weighted avg       0.39      0.39      0.32      8797



In [22]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [23]:
dtrain = xgb.DMatrix(train_vectors, label=y_tain)
dtest = xgb.DMatrix(test_vectors, label=y_test)

In [24]:
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'eta': 0.5,
    'max_depth': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'num_class': 4
}

bst = xgb.train(params, dtrain, num_boost_round=100)

In [25]:
test_preds = bst.predict(dtest)

import numpy as np
test_preds_binary = [np.argmax(prob) for prob in test_preds]

In [26]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, test_preds_binary)
print(f"Test Accuracy: {accuracy:.4f}")


print(classification_report(y_test, test_preds_binary))

Test Accuracy: 0.3787
              precision    recall  f1-score   support

           0       0.30      0.05      0.09      1121
           1       0.42      0.59      0.49      3328
           2       0.34      0.37      0.35      2692
           3       0.33      0.20      0.25      1656

    accuracy                           0.38      8797
   macro avg       0.35      0.30      0.30      8797
weighted avg       0.36      0.38      0.35      8797



## SMOTE

In [27]:
from imblearn.over_sampling import SMOTE

In [28]:
smote = SMOTE(random_state=42)

In [29]:
train_vectors_smote, train_labels_smote = smote.fit_resample(train_vectors, y_tain)

In [30]:
dtrain_smote = xgb.DMatrix(train_vectors_smote, label=train_labels_smote)

In [31]:
# XGBoost parameters
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'num_class': 4
}

bst = xgb.train(params, dtrain_smote, num_boost_round=100)

In [32]:
dtest = xgb.DMatrix(test_vectors) 
test_preds = bst.predict(dtest)
test_preds_binary = [np.argmax(prob) for prob in test_preds]

from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, test_preds_binary)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, test_preds_binary))

Test Accuracy: 0.3401
              precision    recall  f1-score   support

           0       0.20      0.21      0.21      1121
           1       0.41      0.37      0.39      3328
           2       0.35      0.35      0.35      2692
           3       0.30      0.35      0.32      1656

    accuracy                           0.34      8797
   macro avg       0.32      0.32      0.32      8797
weighted avg       0.35      0.34      0.34      8797



In [33]:
from sklearn.ensemble import GradientBoostingClassifier

In [34]:
gb_clf = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

In [35]:
gb_clf.fit(train_vectors_smote, train_labels_smote)

In [42]:
y_test_pred = gb_clf.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.2609980675230192
Test Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.45      0.25      1121
           1       0.41      0.16      0.23      3328
           2       0.33      0.17      0.22      2692
           3       0.25      0.48      0.33      1656

    accuracy                           0.26      8797
   macro avg       0.29      0.32      0.26      8797
weighted avg       0.33      0.26      0.25      8797



In [43]:
lr = LogisticRegression(random_state=42)

In [44]:
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    multi_class='auto',
    random_state=42
)

In [45]:
lr.fit(train_vectors_smote, train_labels_smote)



In [46]:
y_test_pred = lr.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.25611003751278844
Test Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.72      0.28      1121
           1       0.44      0.07      0.11      3328
           2       0.38      0.19      0.25      2692
           3       0.32      0.44      0.37      1656

    accuracy                           0.26      8797
   macro avg       0.33      0.35      0.25      8797
weighted avg       0.36      0.26      0.22      8797



In [47]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42) 

rf_classifier.fit(train_vectors_smote, train_labels_smote)

y_pred = rf_classifier.predict(test_vectors)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.31931340229623734
              precision    recall  f1-score   support

           0       0.18      0.23      0.20      1121
           1       0.41      0.38      0.39      3328
           2       0.33      0.31      0.32      2692
           3       0.27      0.29      0.28      1656

    accuracy                           0.32      8797
   macro avg       0.30      0.30      0.30      8797
weighted avg       0.33      0.32      0.32      8797



## Simple oversampling

In [49]:
import numpy as np
import pandas as pd
from sklearn.utils import resample

def balance_classes(df, label_column):
    max_count = df[label_column].value_counts().max()
    
    oversampled_dfs = []
    
    for label in df[label_column].unique():
        class_subset = df[df[label_column] == label]
        oversampled_subset = resample(
            class_subset,
            replace=True,
            n_samples=max_count,
            random_state=42  
        )
        oversampled_dfs.append(oversampled_subset)
    
    balanced_df = pd.concat(oversampled_dfs, ignore_index=True)
    
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    return balanced_df

train_df = pd.DataFrame(train_vectors)
train_df['los_label'] = y_tain

balanced_train_df = balance_classes(train_df, "los_label")

X_train_balanced = balanced_train_df.drop(columns=['los_label']).values
y_train_balanced = balanced_train_df['los_label'].values

print("Class distribution after oversampling:")
print(pd.Series(y_train_balanced).value_counts())

Class distribution after oversampling:
0    11197
1    11197
2    11197
3    11197
Name: count, dtype: int64


In [50]:
dtrain_smote = xgb.DMatrix(X_train_balanced, label=y_train_balanced)

In [51]:
params = {
    'objective': 'multi:softprob',  
    'eval_metric': 'mlogloss', 
    'eta': 0.1,
    'max_depth': 100, 
    'subsample': 0.8,  
    'colsample_bytree': 0.8, 
    'num_class': 4  
}

bst = xgb.train(params, dtrain_smote, num_boost_round=100)

In [52]:
dtest = xgb.DMatrix(test_vectors)
test_preds = bst.predict(dtest)
test_preds_binary = [np.argmax(prob) for prob in test_preds]

from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, test_preds_binary)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, test_preds_binary))

Test Accuracy: 0.3659
              precision    recall  f1-score   support

           0       0.35      0.13      0.19      1121
           1       0.42      0.47      0.44      3328
           2       0.33      0.38      0.35      2692
           3       0.31      0.29      0.30      1656

    accuracy                           0.37      8797
   macro avg       0.35      0.32      0.32      8797
weighted avg       0.36      0.37      0.36      8797



In [53]:
from sklearn.ensemble import GradientBoostingClassifier

In [54]:
gb_clf = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

In [56]:
gb_clf.fit(X_train_balanced, y_train_balanced)

In [57]:
y_test_pred = gb_clf.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.3041946117994771
Test Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.26      0.25      1121
           1       0.42      0.28      0.34      3328
           2       0.31      0.25      0.28      2692
           3       0.24      0.47      0.32      1656

    accuracy                           0.30      8797
   macro avg       0.30      0.32      0.30      8797
weighted avg       0.33      0.30      0.30      8797



In [58]:
lr = LogisticRegression(random_state=42)

In [59]:
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    multi_class='auto',
    random_state=42
)

In [60]:
lr.fit(X_train_balanced, y_train_balanced)



In [61]:
y_test_pred = lr.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.31976810276230533
Test Classification Report:
              precision    recall  f1-score   support

           0       0.26      0.44      0.33      1121
           1       0.45      0.20      0.28      3328
           2       0.37      0.21      0.27      2692
           3       0.28      0.65      0.39      1656

    accuracy                           0.32      8797
   macro avg       0.34      0.38      0.32      8797
weighted avg       0.37      0.32      0.30      8797



In [63]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42) # You can adjust the hyperparameters

rf_classifier.fit(X_train_balanced, y_train_balanced)

y_pred = rf_classifier.predict(test_vectors)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.35807661702853244
              precision    recall  f1-score   support

           0       0.33      0.10      0.16      1121
           1       0.40      0.50      0.44      3328
           2       0.33      0.39      0.36      2692
           3       0.29      0.20      0.24      1656

    accuracy                           0.36      8797
   macro avg       0.34      0.30      0.30      8797
weighted avg       0.35      0.36      0.34      8797

