In [1]:
from gensim.models import KeyedVectors

bio_word2vec = KeyedVectors.load_word2vec_format('/scratch/spp9399/mimic/bioW2V/BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True)

In [2]:
import pandas as pd
df = pd.read_csv("/scratch/spp9399/mimic/data/cohort/mp/admission_only_true/MP_IN_adm_train.csv")

In [3]:
df

Unnamed: 0,id,subject_id,gender,dob,admittime,ethnicity,text,hospital_expire_flag
0,107384,26027,M,2166-07-13 00:00:00,2205-11-13 21:31:00,WHITE,"CHIEF COMPLAINT: AMS, concern for toxic alcoho...",0
1,101061,1578,F,2060-11-17 00:00:00,2139-05-18 22:35:00,WHITE,CHIEF COMPLAINT: abdominal pain\n\nPRESENT ILL...,0
2,127180,92652,M,2104-07-14 00:00:00,2192-06-09 14:58:00,UNKNOWN/NOT SPECIFIED,CHIEF COMPLAINT: Bilateral Sub Dural Hematoma\...,0
3,168339,20953,M,2052-08-25 00:00:00,2139-10-22 04:11:00,BLACK/AFRICAN AMERICAN,CHIEF COMPLAINT: Intracranial bleed\n\nPRESENT...,0
4,154044,19409,F,2092-09-28 00:00:00,2164-04-30 14:54:00,WHITE,CHIEF COMPLAINT: ischemic left foot\n\nPRESENT...,0
...,...,...,...,...,...,...,...,...
33949,122869,5271,M,2110-12-25 00:00:00,2159-05-31 11:01:00,UNKNOWN/NOT SPECIFIED,CHIEF COMPLAINT: Fever and neutropenia.\n\nPRE...,0
33950,145612,18439,M,2129-09-16 00:00:00,2199-06-03 20:19:00,WHITE,CHIEF COMPLAINT: Chest pain.\n\nPRESENT ILLNES...,0
33951,152495,2128,F,2112-09-30 00:00:00,2166-11-22 22:00:00,UNKNOWN/NOT SPECIFIED,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,0
33952,182652,31910,M,2029-10-16 00:00:00,2109-01-14 17:46:00,WHITE,"CHIEF COMPLAINT: Malaise, fever, chills, and L...",0


In [4]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import re

docs = []
for text in tqdm( df["text"], desc="Sentence tokenization" ):
    #1. Replace "\n" with " " (spaces)
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    #2. Replace "[** - **]" data in this format with "" (mostly time and date)
    text = re.sub(r'\[\*\*.*?\*\*\]', "", text)
    #3. Removed wide spaces
    text = re.sub(' +', " ", text)
    docs.append(word_tokenize(text.lower()))

Sentence tokenization: 100%|██████████| 33954/33954 [01:28<00:00, 383.13it/s]


In [5]:
from gensim.models.doc2vec import TaggedDocument

tagged_docs = [TaggedDocument(words=doc, tags=[f'doc_{i}']) for i, doc in enumerate(docs)]

In [6]:
from gensim.models import Doc2Vec

In [7]:
doc2vec_model = Doc2Vec(vector_size=200, window=5, min_count=2, workers=4, epochs=40, dm=1)

In [8]:
doc2vec_model.build_vocab(tagged_docs)

In [9]:
doc2vec_model.wv.vectors = bio_word2vec.vectors

In [10]:
doc2vec_model.train(tagged_docs, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [11]:
y_tain = df['hospital_expire_flag'].values

In [12]:
train_vectors = [doc2vec_model.dv[f'doc_{i}'] for i in range(len(tagged_docs))]

In [13]:
len(train_vectors)

33954

In [14]:
def get_doc2vec_vectors(df, doc2vec):
    docs = []
    for text in tqdm( df["text"], desc="word tokenization" ):
        #1. Replace "\n" with " " (spaces)
        text = text.replace("\n", " ")
        text = text.replace("\t", " ")
        #2. Replace "[** - **]" data in this format with "" (mostly time and date)
        text = re.sub(r'\[\*\*.*?\*\*\]', "", text)
        #3. Removed wide spaces
        text = re.sub(' +', " ", text)
        docs.append(word_tokenize(text.lower()))
    tagged_docs = [TaggedDocument(words=doc, tags=[f'doc_{i}']) for i, doc in enumerate(docs)]
    print(tagged_docs[0])
    
    res = [doc2vec_model.infer_vector(doc[0], epochs=20, alpha=0.025) for doc in tagged_docs]
    return res

In [15]:
df_test = pd.read_csv("/scratch/spp9399/mimic/data/cohort/mp/admission_only_true/MP_IN_adm_test.csv")

In [16]:
test_vectors = get_doc2vec_vectors(df_test, doc2vec_model)
y_test = df_test['hospital_expire_flag'].values

word tokenization: 100%|██████████| 9822/9822 [00:25<00:00, 377.84it/s]


TaggedDocument<['chief', 'complaint', ':', 'present', 'illness', ':', 'the', 'patient', 'is', 'a', '57', 'year', 'old', 'female', 'with', 'no', 'past', 'medical', 'history', 'who', 'had', 'sudden', 'onset', 'of', 'midback', 'pain', 'and', 'severe', 'headache', '.', 'she', 'said', 'it', 'felt', 'like', 'a', 'bomb', 'while', 'giving', 'a', 'speech', 'in', '.', 'she', 'finished', 'her', 'speech', 'and', 'vomited', 'once', '.', 'this', 'was', 'on', '.', 'the', 'headache', 'persisted', '.', 'she', 'returned', 'to', 'the', 'united', 'states', 'the', 'following', 'day', 'with', 'increased', 'fatigue', ',', 'headache', 'and', 'backache', '.', 'she', 'went', 'to', 'emergency', 'department', 'on', ',', 'where', 'a', 'cta', 'revealed', 'a', 'large', 'bilobed', '1.2', 'to', '2.0', 'centimeter', 'aca', 'aneurysm', ',', 'was', 'transferred', 'to', 'on', ',', 'for', 'further', 'workup', '.', 'cta', 'was', 'repeated', 'confirming', 'the', 'previously', 'mentioned', 'aneurysm', '.', 'she', 'was', 'tran

In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [21]:
lr = LogisticRegression(random_state=42)

In [22]:
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    multi_class='auto',
    random_state=42
)

In [25]:
lr.fit(train_vectors, y_tain)



In [27]:
# Evaluate on the test set
y_test_pred = lr.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.8957442476074119
Test Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.94      8797
           1       0.51      0.04      0.07      1025

    accuracy                           0.90      9822
   macro avg       0.70      0.52      0.51      9822
weighted avg       0.86      0.90      0.85      9822



In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rf_model = RandomForestClassifier(
    n_estimators=100, 
    random_state=42, 
    class_weight='balanced' 
)

rf_model.fit(train_vectors, y_tain)

In [30]:
y_test_pred = rf_model.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.8956424353492161
Test Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.94      8797
           1       0.00      0.00      0.00      1025

    accuracy                           0.90      9822
   macro avg       0.45      0.50      0.47      9822
weighted avg       0.80      0.90      0.85      9822



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
gb_clf = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

In [33]:
gb_clf.fit(train_vectors, y_tain)

In [34]:
y_test_pred = gb_clf.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.8956424353492161
Test Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.94      8797
           1       0.00      0.00      0.00      1025

    accuracy                           0.90      9822
   macro avg       0.45      0.50      0.47      9822
weighted avg       0.80      0.90      0.85      9822



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# Convert to DMatrix
dtrain = xgb.DMatrix(train_vectors, label=y_tain)
dtest = xgb.DMatrix(test_vectors, label=y_test)

In [37]:
# XGBoost parameters
params = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss', 
    'eta': 0.5,     
    'max_depth': 6, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8 
}

bst = xgb.train(params, dtrain, num_boost_round=100)

In [38]:
test_preds = bst.predict(dtest)
test_preds_binary = [1 if prob > 0.5 else 0 for prob in test_preds]

In [39]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, test_preds_binary)
print(f"Test Accuracy: {accuracy:.4f}")

print(classification_report(y_test, test_preds_binary))

Test Accuracy: 0.8928
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      8797
           1       0.35      0.03      0.06      1025

    accuracy                           0.89      9822
   macro avg       0.63      0.51      0.50      9822
weighted avg       0.84      0.89      0.85      9822



## SMOTE

In [40]:
from imblearn.over_sampling import SMOTE

In [41]:
smote = SMOTE(random_state=42)

In [42]:
train_vectors_smote, train_labels_smote = smote.fit_resample(train_vectors, y_tain)

In [43]:
lr = LogisticRegression(random_state=42)

In [44]:
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    multi_class='auto',
    random_state=42
)

In [45]:
lr.fit(train_vectors_smote, train_labels_smote)



In [46]:
# Evaluate on the test set
y_test_pred = lr.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.4600895947872124
Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58      8797
           1       0.14      0.80      0.24      1025

    accuracy                           0.46      9822
   macro avg       0.54      0.61      0.41      9822
weighted avg       0.86      0.46      0.55      9822



In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
rf_model = RandomForestClassifier(
    n_estimators=100,  
    random_state=42,
    class_weight='balanced'
)

rf_model.fit(train_vectors_smote, train_labels_smote)

In [49]:
y_test_pred = rf_model.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.804011402972918
Test Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      8797
           1       0.14      0.17      0.15      1025

    accuracy                           0.80      9822
   macro avg       0.52      0.52      0.52      9822
weighted avg       0.82      0.80      0.81      9822



In [50]:
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
gb_clf = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

In [52]:
gb_clf.fit(train_vectors_smote, train_labels_smote)

In [53]:
# Evaluate on the test set
y_test_pred = gb_clf.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.5739156994502138
Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.57      0.71      8797
           1       0.14      0.59      0.22      1025

    accuracy                           0.57      9822
   macro avg       0.53      0.58      0.46      9822
weighted avg       0.84      0.57      0.66      9822



In [54]:
dtrain_smote = xgb.DMatrix(train_vectors_smote, label=train_labels_smote)

In [55]:
params = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss', 
    'eta': 0.5,
    'max_depth': 6,    
    'subsample': 0.8, 
    'colsample_bytree': 0.8,  
    'alpha': 10
}

bst = xgb.train(params, dtrain_smote, num_boost_round=100)

In [56]:
dtest = xgb.DMatrix(test_vectors)
test_preds = bst.predict(dtest)
test_preds_binary = [1 if prob > 0.5 else 0 for prob in test_preds]

from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, test_preds_binary)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, test_preds_binary))

Test Accuracy: 0.7889
              precision    recall  f1-score   support

           0       0.92      0.84      0.88      8797
           1       0.20      0.34      0.25      1025

    accuracy                           0.79      9822
   macro avg       0.56      0.59      0.57      9822
weighted avg       0.84      0.79      0.81      9822



## Full-oversampling

In [18]:
import numpy as np
import pandas as pd
from sklearn.utils import resample

train_df = pd.DataFrame(train_vectors)
train_df['hospital_expire_flag'] = y_tain

positive_class = train_df[train_df['hospital_expire_flag'] == 1]
negative_class = train_df[train_df['hospital_expire_flag'] == 0]

positive_oversampled = resample(
    positive_class,
    replace=True, 
    n_samples=len(negative_class), 
    random_state=42  
)

balanced_train_df = pd.concat([negative_class, positive_oversampled])

balanced_train_df = balanced_train_df.sample(frac=1, random_state=42).reset_index(drop=True)

X_train_balanced = balanced_train_df.drop(columns=['hospital_expire_flag']).values
y_train_balanced = balanced_train_df['hospital_expire_flag'].values

print("Class distribution after oversampling:")
print(pd.Series(y_train_balanced).value_counts())

Class distribution after oversampling:
1    30420
0    30420
Name: count, dtype: int64


In [63]:
lr = LogisticRegression(random_state=42)

In [64]:
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    multi_class='auto',
    random_state=42
)

In [65]:
lr.fit(X_train_balanced, y_train_balanced)



In [67]:
y_test_pred = lr.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.6493585827733659
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.64      0.77      8797
           1       0.19      0.75      0.31      1025

    accuracy                           0.65      9822
   macro avg       0.57      0.69      0.54      9822
weighted avg       0.88      0.65      0.72      9822



In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42, 
    class_weight='balanced'  
)

rf_model.fit(X_train_balanced, y_train_balanced)

In [21]:
y_test_pred = rf_model.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.8957442476074119
Test Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.94      8797
           1       1.00      0.00      0.00      1025

    accuracy                           0.90      9822
   macro avg       0.95      0.50      0.47      9822
weighted avg       0.91      0.90      0.85      9822



In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
gb_clf = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

In [24]:
gb_clf.fit(X_train_balanced, y_train_balanced)

In [25]:
# Evaluate on the test set
y_test_pred = gb_clf.predict(test_vectors)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.6604561189167176
Test Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.67      0.78      8797
           1       0.17      0.58      0.26      1025

    accuracy                           0.66      9822
   macro avg       0.55      0.62      0.52      9822
weighted avg       0.85      0.66      0.73      9822



In [27]:
import xgboost as xgb

In [28]:
dtrain_smote = xgb.DMatrix(X_train_balanced, label=y_train_balanced)

In [35]:
# XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  
    'eta': 0.5,     
    'max_depth': 6,   
    'subsample': 0.8,    
    'colsample_bytree': 0.8,  
    'alpha': 10
}

bst = xgb.train(params, dtrain_smote, num_boost_round=100)

In [36]:
dtest = xgb.DMatrix(test_vectors) 
test_preds = bst.predict(dtest)
test_preds_binary = [1 if prob > 0.5 else 0 for prob in test_preds]

# Metrics
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, test_preds_binary)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, test_preds_binary))

Test Accuracy: 0.8516
              precision    recall  f1-score   support

           0       0.91      0.92      0.92      8797
           1       0.26      0.23      0.24      1025

    accuracy                           0.85      9822
   macro avg       0.59      0.58      0.58      9822
weighted avg       0.84      0.85      0.85      9822

