In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import ast
from collections import Counter

import pandas as pd
from tqdm import tqdm

In [None]:
# Load the three pre-split cohort files (train / test / val) from the mounted Drive folder.
train_df, test_df, val_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/cohort/mp/admission_only_true/train_med7.csv',
        '/content/drive/MyDrive/cohort/mp/admission_only_true/test_med7.csv',
        '/content/drive/MyDrive/cohort/mp/admission_only_true/val_med7.csv',
    )
]

In [None]:
# Preview the first rows of the training split to confirm the expected columns were loaded.
train_df.head()

Unnamed: 0,id,subject_id,gender,dob,admittime,ethnicity,text,hospital_expire_flag,all_entities
0,107384,26027,M,2166-07-13 00:00:00,2205-11-13 21:31:00,WHITE,"CHIEF COMPLAINT: AMS, concern for toxic alcoho...",0,"[('isopropyl', 'DRUG', 1013, 1022), ('Fomepizo..."
1,101061,1578,F,2060-11-17 00:00:00,2139-05-18 22:35:00,WHITE,CHIEF COMPLAINT: abdominal pain\n\nPRESENT ILL...,0,"[('O2', 'DRUG', 116, 118), ('O2', 'DRUG', 326,..."
2,127180,92652,M,2104-07-14 00:00:00,2192-06-09 14:58:00,UNKNOWN/NOT SPECIFIED,CHIEF COMPLAINT: Bilateral Sub Dural Hematoma\...,0,"[('tetracycline', 'DRUG', 322, 334), ('coumadi..."
3,168339,20953,M,2052-08-25 00:00:00,2139-10-22 04:11:00,BLACK/AFRICAN AMERICAN,CHIEF COMPLAINT: Intracranial bleed\n\nPRESENT...,0,"[('1gm', 'STRENGTH', 1168, 1171), ('dilantin',..."
4,154044,19409,F,2092-09-28 00:00:00,2164-04-30 14:54:00,WHITE,CHIEF COMPLAINT: ischemic left foot\n\nPRESENT...,0,"[('heparin', 'DRUG', 519, 526), ('bisacodyl', ..."


In [None]:
# At this point each all_entities cell is still a string holding a Python-literal list of
# (text, label, int, int) tuples — the two integers are presumably character offsets; confirm.
train_df['all_entities'].head()

Unnamed: 0,all_entities
0,"[('isopropyl', 'DRUG', 1013, 1022), ('Fomepizo..."
1,"[('O2', 'DRUG', 116, 118), ('O2', 'DRUG', 326,..."
2,"[('tetracycline', 'DRUG', 322, 334), ('coumadi..."
3,"[('1gm', 'STRENGTH', 1168, 1171), ('dilantin',..."
4,"[('heparin', 'DRUG', 519, 526), ('bisacodyl', ..."


In [None]:
# Registers tqdm's `progress_apply` on pandas objects.
# NOTE(review): the cells below call plain `.apply`, so no progress bar is actually shown.
tqdm.pandas()

def extract_drugs(all_entities):
    """Collect the text of every entity whose label is exactly 'DRUG'.

    Args:
        all_entities: Iterable of indexable entity records; item 0 is read as
            the entity text and item 1 as its label.

    Returns:
        list: Entity texts in input order, duplicates kept.
    """
    drug_mentions = []
    for record in all_entities:
        if record[1] == 'DRUG':
            drug_mentions.append(record[0])
    return drug_mentions

# Parse the stringified entity lists with ast.literal_eval rather than eval: the cells are plain
# Python literals, and text read from a CSV must never be executed as code.
# The isinstance guard keeps this cell re-runnable once the column already holds lists.
train_df['all_entities'] = train_df['all_entities'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# Keep only the DRUG mentions of each admission.
train_df['drugs_list'] = train_df['all_entities'].apply(extract_drugs)

In [None]:
# Flatten the per-admission drug lists of the training split into one list of mentions.
all_drugs = [mention for admission_drugs in train_df['drugs_list'] for mention in admission_drugs]

# Count how often each exact (case-sensitive) drug string occurs.
drug_frequency = Counter(all_drugs)

# Plain-dict view in first-seen order, plus a copy ordered from most to least frequent
# (ties keep first-seen order).
frequency_dict = dict(drug_frequency)
sorted_dict = dict(drug_frequency.most_common())

In [None]:
# Vocabulary size: number of distinct drug strings seen in the training split.
len(frequency_dict)

14392

In [None]:
# Parse the stringified entity lists of the held-out splits and extract their DRUG mentions.
# ast.literal_eval replaces eval so CSV text is never executed as code; the isinstance guard
# makes the cell safe to re-run after the column has already been converted to lists.
for split_df in (test_df, val_df):
    split_df['all_entities'] = split_df['all_entities'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
    split_df['drugs_list'] = split_df['all_entities'].apply(extract_drugs)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

# Tokenizer and encoder for Bio_ClinicalBERT, used to embed the drug strings.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Inference only: put the encoder in eval mode and place its weights on the GPU.
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT").eval().to('cuda')

def generate_embeddings(texts, batch_size=32):
    """Embed each text as the position-0 vector of the encoder's last hidden layer.

    Relies on the module-level `tokenizer` and `model`. Position 0 is the
    [CLS] token for BERT-style tokenizers.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of strings tokenized and encoded per forward pass.

    Returns:
        list: One 1-D numpy array per input text, in input order.
    """
    # Follow the device the model actually lives on instead of hard-coding 'cuda',
    # so the tokens and the weights can never end up on different devices.
    device = next(model.parameters()).device

    embeddings = []
    dataloader = DataLoader(texts, batch_size=batch_size, shuffle=False)

    for batch in tqdm(dataloader, desc="Generating embeddings"):
        tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():  # no gradients needed for feature extraction
            output = model(**tokens)

        batch_embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)

    return embeddings

# Embed every distinct drug string once; the list order follows frequency_dict's insertion order.
batch_size = 64
drug_names = list(frequency_dict)
embeddings = generate_embeddings(drug_names, batch_size=batch_size)

In [None]:
import numpy as np

# Persist the embedding matrix and the parallel array of drug strings
# (row i of the matrix belongs to entry i of the names array).
embeddings_array = np.array(embeddings)
drug_names_array = np.array(list(frequency_dict.keys()))

for npy_path, payload in (
    ('/content/drive/MyDrive/cohort/mp/admission_only_true/drug_embeddings.npy', embeddings_array),
    ('/content/drive/MyDrive/cohort/mp/admission_only_true/drug_names.npy', drug_names_array),
):
    np.save(npy_path, payload)

In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import numpy as np
import faiss
from tqdm import tqdm
import pickle
import ast

In [None]:
embeddings_array = np.array(embeddings).astype('float32')  # faiss works on float32 matrices
print(len(embeddings_array))

# Row i of the embedding matrix must belong to drug_names[i]; the mapping below depends on it.
assert len(drug_names) == len(embeddings_array), "drug_names and embeddings are out of sync"

# drugs_list cells are strings if the frames were reloaded from CSV; parse them back into lists.
for split_df in (train_df, test_df, val_df):
    split_df['drugs_list'] = split_df['drugs_list'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

cluster_variations = [8, 16, 32, 64, 128, 256, 512]

embedding_dim = embeddings_array.shape[1]

res = faiss.StandardGpuResources()
faiss_index = faiss.IndexFlatL2(embedding_dim)
gpu_index = faiss.index_cpu_to_gpu(res, 0, faiss_index)


def assign_clusters(diseases, word_to_cluster):
    """Map each known string to its cluster id, silently dropping unknown strings.

    Args:
        diseases: Iterable of strings to look up. In this notebook these are
            drug mentions; the parameter name is kept unchanged for callers.
        word_to_cluster: Dict from string to integer cluster id.

    Returns:
        list[int]: Cluster ids, in input order, for the strings present in
        `word_to_cluster`. Strings absent from the mapping produce no entry.
    """
    return [word_to_cluster[name] for name in diseases if name in word_to_cluster]


for num_clusters in cluster_variations:
    kmeans = faiss.Clustering(embedding_dim, num_clusters)
    kmeans.niter = 20
    # Large enough that k * max_points_per_centroid >= n, so every drug embedding is used for training.
    kmeans.max_points_per_centroid = len(frequency_dict)
    kmeans.train(embeddings_array, gpu_index)

    centroids = faiss.vector_to_array(kmeans.centroids).reshape(num_clusters, embedding_dim)
    # After training, gpu_index holds the final centroids, so a 1-NN search returns each
    # embedding's cluster id.
    _, cluster_ids = gpu_index.search(embeddings_array, 1)
    cluster_ids = cluster_ids.flatten()

    word_to_cluster = {name: int(cluster_id) for name, cluster_id in zip(drug_names, cluster_ids)}
    # Summarize instead of printing the whole mapping, which floods the output with
    # one entry per drug string for every cluster size.
    print(f"{num_clusters} clusters: assigned {len(word_to_cluster)} drug strings")

    # Use the same lookup for every split so no column ever contains None for unknown strings.
    for split_df in (train_df, test_df, val_df):
        split_df[f'{num_clusters}_cluster_ids'] = split_df['drugs_list'].apply(
            lambda drugs: assign_clusters(drugs, word_to_cluster)
        )

14392
{'isopropyl': 2, 'Fomepizole': 1, 'ethylene glycol vs methanol': 6, 'Diazepam': 1, 'Phenobarbital': 1, 'Gabapentin': 1, 'Trazodone': 1, 'Codeine': 5, 'opiates': 2, 'benzodiazepines': 7, 'O2': 3, 'zofran': 2, 'compazine': 1, 'amiodarone': 1, 'oxygen': 4, 'lasix': 2, 'paroxetine': 7, 'ASA': 2, 'levothyroxine': 7, 'oxycodone': 7, 'tetracycline': 1, 'coumadin': 2, 'doxycycline': 1, 'minocycline': 7, 'Reglan': 2, 'PPI': 4, 'Keppra': 2, 'Vancomycin': 1, 'Zosyn': 2, 'metoprolol': 1, 'synthroid': 1, 'flomax': 2, 'lopressor': 2, 'advair': 2, 'Zantac': 1, 'Lasix': 2, 'Zocor': 2, 'Demerol': 1, 'Polysporin': 5, 'dilantin': 7, 'labetalol': 1, 'nipride': 1, 'lisinopril': 1, 'prilosec': 2, 'heparin': 1, 'bisacodyl': 7, 'albuterol': 1, 'hydromorphone': 7, 'percocet': 1, 'Penicillins': 7, 'Sulfa (Sulfonamides)': 7, 'Aspirin': 1, 'Wellbutrin': 1, 'Lipitor': 0, 'Zestril': 2, 'Synthroid': 1, 'Percocet': 1, 'lovenox': 2, 'argatroban': 2, 'sodium': 0, 'Heparin': 1, 'steroid': 2, 'antibiotics': 2, 'pre

In [None]:
# Spot-check: the parsed drug mentions of the training row with index label 0.
train_df['drugs_list'][0]

['isopropyl',
 'Fomepizole',
 'ethylene glycol vs methanol',
 'Diazepam',
 'Phenobarbital',
 'Gabapentin',
 'Trazodone',
 'Codeine',
 'opiates',
 'benzodiazepines']

In [None]:
# Confirm that one `<k>_cluster_ids` column was added for each entry of cluster_variations.
train_df.columns

Index(['id', 'subject_id', 'gender', 'dob', 'admittime', 'ethnicity', 'text',
       'hospital_expire_flag', 'all_entities', 'drugs_list', '8_cluster_ids',
       '16_cluster_ids', '32_cluster_ids', '64_cluster_ids', '128_cluster_ids',
       '256_cluster_ids', '512_cluster_ids'],
      dtype='object')

In [None]:
def create_cluster_one_hot(cluster_ids, num_clusters):
    """Build a multi-hot indicator vector of the clusters present in `cluster_ids`.

    Args:
        cluster_ids: Iterable of cluster ids. Entries that are None, negative
            (including the -1 "unknown" sentinel) or >= num_clusters are ignored.
        num_clusters: Length of the returned vector.

    Returns:
        list[int]: Vector of length `num_clusters` with 1 at every index that
        occurs at least once in `cluster_ids` and 0 elsewhere.
    """
    one_hot_vector = [0] * num_clusters
    for cluster_id in set(cluster_ids):
        # None comes from dict.get() misses; comparing it with an int would raise TypeError.
        if cluster_id is None:
            continue
        # Require a non-negative id: a negative index would wrap around and set the wrong bit.
        if 0 <= cluster_id < num_clusters:
            one_hot_vector[cluster_id] = 1
    return one_hot_vector

# Add a `<k>_cluster_onehot` column to every split for each clustering granularity.
for num_clusters in cluster_variations:
    ids_col = f'{num_clusters}_cluster_ids'
    onehot_col = f'{num_clusters}_cluster_onehot'
    for frame in (train_df, test_df, val_df):
        frame[onehot_col] = frame[ids_col].apply(lambda ids: create_cluster_one_hot(ids, num_clusters))

In [None]:
# Inspect the 8-cluster indicator vectors of the training split.
train_df['8_cluster_onehot']

Unnamed: 0,8_cluster_onehot
0,"[0, 1, 1, 0, 0, 1, 1, 1]"
1,"[0, 1, 1, 1, 1, 0, 0, 1]"
2,"[0, 1, 1, 0, 1, 1, 0, 1]"
3,"[0, 1, 1, 0, 0, 0, 0, 1]"
4,"[0, 1, 0, 0, 0, 0, 0, 1]"
...,...
33949,"[1, 1, 1, 1, 0, 1, 0, 1]"
33950,"[1, 1, 1, 0, 0, 0, 0, 1]"
33951,"[0, 1, 1, 0, 1, 0, 0, 0]"
33952,"[0, 1, 0, 1, 0, 0, 0, 0]"


In [None]:
# Write each split, including the new cluster columns, back to Drive.
# List-valued columns are stored as their string representation in the CSV.
for split, df in (('train', train_df), ('test', test_df), ('val', val_df)):
    df.to_csv(f'/content/drive/MyDrive/cohort/mp/admission_only_true/{split}_med7_knn_onehot.csv', index=False)

In [None]:
cluster_variations = [8, 16, 32, 64, 128, 256, 512]

# Reload the exported splits; their list-valued columns come back from CSV as strings.
train_df = pd.read_csv('/content/drive/MyDrive/cohort/mp/admission_only_true/train_med7_knn_onehot.csv')
test_df = pd.read_csv('/content/drive/MyDrive/cohort/mp/admission_only_true/test_med7_knn_onehot.csv')
val_df = pd.read_csv('/content/drive/MyDrive/cohort/mp/admission_only_true/val_med7_knn_onehot.csv')


def _onehot_matrix(frame, column):
    """Stack one indicator column into an array, parsing cells that are still strings."""
    return np.array([
        ast.literal_eval(cell) if isinstance(cell, str) else cell
        for cell in frame[column].tolist()
    ])


# Feature matrices per split, keyed by the cluster count as a string (e.g. '8').
X_train, X_test, X_val = {}, {}, {}

for num_clusters in cluster_variations:
    key = f'{num_clusters}'
    column = f'{num_clusters}_cluster_onehot'
    X_train[key] = _onehot_matrix(train_df, column)
    X_test[key] = _onehot_matrix(test_df, column)
    X_val[key] = _onehot_matrix(val_df, column)

# Targets come from the hospital_expire_flag column of each split.
y_train = train_df['hospital_expire_flag'].values
y_test = test_df['hospital_expire_flag'].values
y_val = val_df['hospital_expire_flag'].values

In [None]:
# Sanity check: shape of the 8-cluster feature matrix for train, test and val, in that order.
for split_features in (X_train, X_test, X_val):
    print(split_features['8'].shape)

(33954, 8)
(9822, 8)
(4908, 8)


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report


# RandomForestClassifier is used below but was not imported before this cell,
# which raises NameError on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# Classifiers compared on the cluster indicator features, keyed by display name.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forests': RandomForestClassifier(),
    # 'SVM Classifier': SVC(),
    'XGBoost': XGBClassifier(),
    # NOTE(review): max_depth=200 leaves the boosted trees effectively unbounded — confirm this is intended.
    'Gradient Boosted Trees': GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.01,
        max_depth=200,
        random_state=42
    )
}

def store_results(num_clusters, results, y_test, y_pred):
    """Append one row of classification metrics to `results` (mutated in place).

    Row layout: [num_clusters, precision class 0, precision class 1,
    recall class 0, recall class 1, accuracy, macro F1, weighted F1].

    Args:
        num_clusters: Value stored in the first column of the row.
        results: List that receives the new row.
        y_test: True labels.
        y_pred: Predicted labels; per-class scores at index 0 and 1 are read.

    Returns:
        None.
    """
    per_class_precision = precision_score(y_test, y_pred, average=None)
    per_class_recall = recall_score(y_test, y_pred, average=None)

    metrics_row = [
        num_clusters,
        per_class_precision[0], per_class_precision[1],
        per_class_recall[0], per_class_recall[1],
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='macro'),
        f1_score(y_test, y_pred, average='weighted'),
    ]
    results.append(metrics_row)
cluster_variations = [64, 256, 512]

# Rows are accumulated across every cluster size and model and written once at the end;
# resetting the list and rewriting the CSV inside the loop kept only the last cluster size.
results = []

for num_clusters in cluster_variations:
    print(f"Results for {num_clusters} clusters:")

    # The indicator columns are strings after a CSV round trip; parse them so the
    # feature arrays are numeric rather than arrays of text.
    onehot_col = f'{num_clusters}_cluster_onehot'
    X_train = np.array([ast.literal_eval(x) if isinstance(x, str) else x for x in train_df[onehot_col].tolist()])
    X_test = np.array([ast.literal_eval(x) if isinstance(x, str) else x for x in test_df[onehot_col].tolist()])
    y_train = train_df['hospital_expire_flag'].values
    y_test = test_df['hospital_expire_flag'].values

    # SMOTE is seeded and does not depend on the classifier, so resample once per cluster size.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # (SMOTE flag, printed label, training features, training labels)
    experiments = [
        (0, "Without SMOTE", X_train, y_train),
        (1, "With SMOTE", X_train_resampled, y_train_resampled),
    ]

    for model_name, model in models.items():
        print(f"  {model_name}:")

        for smote_flag, label, X_fit, y_fit in experiments:
            print(f"    {label}:")
            model.fit(X_fit, y_fit)
            # One prediction variable per experiment: the SMOTE row can no longer be
            # scored against the baseline predictions by mistake.
            y_pred = model.predict(X_test)
            print(classification_report(y_test, y_pred))

            precision_per_class = precision_score(y_test, y_pred, average=None)
            recall_per_class = recall_score(y_test, y_pred, average=None)
            accuracy = accuracy_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average='macro')
            f1_weighted = f1_score(y_test, y_pred, average='weighted')

            results.append([
                model_name,
                num_clusters,
                smote_flag,
                precision_per_class[0], precision_per_class[1],
                recall_per_class[0], recall_per_class[1],
                accuracy,
                f1_macro,
                f1_weighted
            ])

# The 'model' column makes rows from different classifiers distinguishable.
results_df = pd.DataFrame(results, columns=[
    'model',
    'num_clusters',
    'SMOTE',
    'precision_class_0', 'precision_class_1',
    'recall_class_0', 'recall_class_1',
    'accuracy', 'f1_macro', 'f1_weighted'
])

# NOTE(review): this path contains 'ML Final Project/', unlike the other Drive paths in this notebook — confirm.
results_df.to_csv('/content/drive/MyDrive/ML Final Project/cohort/mp/admission_only_true/classification_results.csv', index=False)


Results for 64 clusters:
  Logistic Regression:
    Without SMOTE:
              precision    recall  f1-score   support

           0       0.90      1.00      0.94      8797
           1       0.00      0.00      0.00      1025

    accuracy                           0.90      9822
   macro avg       0.45      0.50      0.47      9822
weighted avg       0.80      0.90      0.85      9822

    With SMOTE:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.90      0.57      0.69      8797
           1       0.11      0.44      0.17      1025

    accuracy                           0.55      9822
   macro avg       0.50      0.50      0.43      9822
weighted avg       0.81      0.55      0.64      9822

  XGBoost:
    Without SMOTE:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.90      1.00      0.94      8797
           1       0.13      0.01      0.01      1025

    accuracy                           0.89      9822
   macro avg       0.51      0.50      0.48      9822
weighted avg       0.82      0.89      0.85      9822

    With SMOTE:
              precision    recall  f1-score   support

           0       0.90      0.62      0.73      8797
           1       0.11      0.40      0.17      1025

    accuracy                           0.60      9822
   macro avg       0.50      0.51      0.45      9822
weighted avg       0.82      0.60      0.68      9822

  Gradient Boosted Trees:
    Without SMOTE:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      8797
           1       0.15      0.11      0.13      1025

    accuracy                           0.84      9822
   macro avg       0.53      0.52      0.52      9822
weighted avg 

In [None]:
from sklearn.utils import resample

def balance_classes(df, label_column):
    """Randomly oversample minority classes until every class matches the largest one.

    Every original row is kept exactly once. Classes smaller than the largest
    class additionally receive rows drawn with replacement from that class
    (pandas `DataFrame.sample`, fixed seed) to make up the shortfall. The
    largest class is left untouched, so no real rows are dropped or duplicated
    by bootstrapping it.

    Args:
        df: Input DataFrame.
        label_column: Name of the column holding the class labels.

    Returns:
        pandas.DataFrame: Shuffled frame with a fresh 0..n-1 index in which
        every label occurs the same number of times. An empty input yields an
        empty copy. Rows whose label is NaN match no class and are not included.
    """
    if df.empty:
        # value_counts().max() is NaN and pd.concat([]) raises on empty input.
        return df.copy()

    max_count = int(df[label_column].value_counts().max())

    balanced_parts = []
    for label in df[label_column].unique():
        class_subset = df[df[label_column] == label]
        if class_subset.empty:
            # A NaN label never compares equal to itself; there is nothing to sample from.
            continue

        balanced_parts.append(class_subset)

        shortfall = max_count - len(class_subset)
        if shortfall > 0:
            balanced_parts.append(
                class_subset.sample(n=shortfall, replace=True, random_state=42)
            )

    balanced_df = pd.concat(balanced_parts, ignore_index=True)

    # Shuffle so the classes are interleaved rather than stored in blocks.
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    return balanced_df

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

# Classifiers compared on the cluster indicator features, keyed by display name.
# Insertion order determines the order in which they are trained and reported.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000)
models['Random Forests'] = RandomForestClassifier()
# models['SVM Classifier'] = SVC()
models['XGBoost'] = XGBClassifier()
models['Gradient Boosted Trees'] = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=200,
    random_state=42,
)

def store_results(num_clusters, results, y_test, y_pred):
    """Append one row of classification metrics to `results` (mutated in place).

    Row layout: [num_clusters, precision class 0, precision class 1,
    recall class 0, recall class 1, accuracy, macro F1, weighted F1].

    NOTE(review): the loop later in this cell builds its rows inline and does
    not call this helper.

    Args:
        num_clusters: Value stored in the first column of the row.
        results: List that receives the new row.
        y_test: True labels.
        y_pred: Predicted labels; per-class scores at index 0 and 1 are read.

    Returns:
        None.
    """
    per_class_precision = precision_score(y_test, y_pred, average=None)
    per_class_recall = recall_score(y_test, y_pred, average=None)

    metrics_row = [
        num_clusters,
        per_class_precision[0], per_class_precision[1],
        per_class_recall[0], per_class_recall[1],
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='macro'),
        f1_score(y_test, y_pred, average='weighted'),
    ]
    results.append(metrics_row)
cluster_variations = [64, 256, 512]

# Accumulate rows across all cluster sizes; resetting the list inside the loop
# discarded every cluster size except the last one.
results = []

for num_clusters in cluster_variations:
    print(f"Results for {num_clusters} clusters:")

    # Parse indicator columns that are still strings (CSV round trip) into numeric arrays.
    onehot_col = f'{num_clusters}_cluster_onehot'
    X_train = np.array([ast.literal_eval(x) if isinstance(x, str) else x for x in train_df[onehot_col].tolist()])
    X_test = np.array([ast.literal_eval(x) if isinstance(x, str) else x for x in test_df[onehot_col].tolist()])
    y_train = train_df['hospital_expire_flag'].values
    y_test = test_df['hospital_expire_flag'].values

    # Oversample on a frame that carries features and label together so they stay aligned.
    train_data = pd.DataFrame(X_train)
    train_data['hospital_expire_flag'] = y_train
    balanced_train_df = balance_classes(train_data, 'hospital_expire_flag')
    # X_train is already numeric here, so the balanced features need no further parsing.
    X_train_balanced = balanced_train_df.drop('hospital_expire_flag', axis=1).values
    y_train_balanced = balanced_train_df['hospital_expire_flag'].values

    for model_name, model in models.items():
        print(f"  {model_name}:")

        # Experiment 3: With Balanced Classes
        print("    With Balanced Classes:")
        model.fit(X_train_balanced, y_train_balanced)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        precision_per_class = precision_score(y_test, y_pred, average=None)
        recall_per_class = recall_score(y_test, y_pred, average=None)
        accuracy = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average='macro')
        f1_weighted = f1_score(y_test, y_pred, average='weighted')
        # Row layout is unchanged; rows follow the iteration order of `models` within each cluster size.
        # NOTE(review): the constant 1 presumably marks "resampled training data" — confirm.
        results.append([
            num_clusters,
            1,
            precision_per_class[0], precision_per_class[1],
            recall_per_class[0], recall_per_class[1],
            accuracy,
            f1_macro,
            f1_weighted
        ])

Results for 64 clusters:
  Logistic Regression:
    With Balanced Classes:
              precision    recall  f1-score   support

           0       0.92      0.64      0.76      8797
           1       0.15      0.54      0.23      1025

    accuracy                           0.63      9822
   macro avg       0.54      0.59      0.50      9822
weighted avg       0.84      0.63      0.70      9822

  Random Forests:
    With Balanced Classes:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8797
           1       0.14      0.08      0.10      1025

    accuracy                           0.85      9822
   macro avg       0.52      0.51      0.51      9822
weighted avg       0.82      0.85      0.84      9822

  XGBoost:
    With Balanced Classes:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82      8797
           1       0.15      0.38      0.22      1025

    accuracy            