In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# pandas is imported here because this load cell runs before the notebook's later
# `import pandas as pd` cell; without it a fresh-kernel run fails with NameError on `pd`.
import pandas as pd

# Read the dataset CSV into a DataFrame. The path is relative, so it only resolves to the
# Drive mounted at /content/gdrive when the working directory is /content.
readmission_dir=pd.read_csv('gdrive/MyDrive/Colab_notebook/NOT_CLEANED_DATA.csv')

In [None]:
readmission_dir['TEXT_FILE'][0]

In [None]:
!pip install pyspark==3.1.3 spark-nlp==3.4.2

In [None]:
import sparknlp

spark = sparknlp.start(gpu=True)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

In [None]:
#Defining functions
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import StringIndexer

In [None]:
import pandas as pd
import numpy as np

In [None]:
import string
punc_numb=string.punctuation+'0123456789'
punc_numb=punc_numb.replace('!','')
punc_numb=punc_numb.replace('.','')
punc_numb=punc_numb.replace('?','')

In [None]:
punc_numb

In [None]:
# Strip line breaks and the characters in punc_numb (digits and punctuation other than ! . ?) from the text
def clean_text(text):
    """Return `text` with line breaks and every character of `punc_numb` removed.

    Newline and carriage-return characters are deleted (not replaced by a space),
    then each character listed in the module-level `punc_numb` string is deleted.
    """
    without_breaks = text.replace("\n", "").replace("\r", "")

    # Three-argument maketrans builds a table that deletes every listed character.
    removal_table = str.maketrans("", "", punc_numb)
    return without_breaks.translate(removal_table)

In [None]:
from tqdm import tqdm
# Clean every note and write the whole column back in one assignment.
# This avoids the chained `df[col][i] = ...` write (which pandas may apply to a temporary copy)
# and does not assume the index is exactly 0..n-1. tqdm still shows per-row progress.
readmission_dir['TEXT_FILE']=[clean_text(note) for note in tqdm(readmission_dir['TEXT_FILE'])]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
df_train, df_test=train_test_split(readmission_dir,test_size=0.3, random_state=49)

# Sub-sample the non-readmitted rows of the training set down to the number of readmitted rows,
# so both classes contribute the same number of training examples.
df_train_readm=df_train[df_train.READMISSION_STATUS=='Readmitted']
df_train_non_readm=df_train[df_train.READMISSION_STATUS=='Non-readmitted']
# axis=0 stacks the two class subsets row-wise; 1.5 is not a valid axis and makes pd.concat raise.
df_train_sub = pd.concat([df_train_readm, df_train_non_readm.sample(n = len(df_train_readm), random_state = 50)],axis = 0)

In [None]:
# Convert the pandas DataFrames to Spark DataFrames.
# Arrow-based conversion is switched on with Spark's own configuration key; "arrow.enabling"
# is not a Spark setting, so setting it had no effect on the conversion.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
train = spark.createDataFrame(df_train_sub)
test= spark.createDataFrame(df_test)

### Build up Pipelines

In [None]:
%%time
# Produce pipeline for data cleaning and sentence(discharge summary) embedding
document_assembler = DocumentAssembler() \
      .setInputCol("TEXT_FILE") \
      .setOutputCol("document")
    
sentence = SentenceDetector() \
    .setInputCols("document") \
    .setOutputCol("sentence")

# Embed each detected sentence with the default pretrained BERT sentence model.
# Only the "sentence" column is wired in: the annotator takes a single DOCUMENT-type input, and
# also passing "document" would presumably embed the whole note as one extra item — TODO confirm.
Bert_sentence_embedding = BertSentenceEmbeddings.pretrained() \
  .setInputCols(["sentence"]) \
  .setOutputCol("sentence_bert_embeddings")

embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["sentence_bert_embeddings"]) \
      .setOutputCols(["finished_sentence_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx_label = StringIndexer(inputCol = "READMISSION_STATUS", outputCol = "label")

nlp_pipeline_Bert = Pipeline(
stages=[document_assembler, 
        sentence,
        Bert_sentence_embedding,
        embeddings_finisher,
        explodeVectors,
        label_stringIdx_label])

In [None]:
nlp_Bert_model= nlp_pipeline_Bert.fit(train)

In [None]:
nlp_Bert_model.write().overwrite().save('gdrive/MyDrive/Colab_Notebooks/Models_Pipelines/bert')

In [None]:
from pyspark.ml.pipeline import PipelineModel
nlp_Bert_model= PipelineModel.load("gdrive/MyDrive/Colab_Notebooks/Models_Pipelines/bert/")

In [None]:
processed_bert=nlp_Bert_model.transform(train)

In [None]:
processed_bert.show()

In [None]:
processed_bert_test=nlp_Bert_model.transform(test)

In [None]:
from pyspark.sql.functions import collect_list
from pyspark.sql import functions as F
processed_bert_combined = processed_bert.groupby('HADM_ID').agg(collect_list('features').alias("features"),F.min(processed_bert.label))

In [None]:
processed_bert_test_combined = processed_bert_test.groupby('HADM_ID').agg(collect_list('features').alias("features"),F.min(processed_bert_test.label))

In [None]:
processed_bert_combined.show()

In [None]:
processed_bert_combined.write.orc("gdrive/MyDrive/Colab_notebook/transformed/sentence_train")

In [None]:
processed_bert_test_combined.write.orc("gdrive/MyDrive/Colab_notebook/transformed/sentence_test")

In [None]:
# Load in the saved transformed data
processed_bert_combined = spark.read.orc('gdrive/MyDrive/Colab_notebook/transformed_data/bert_sentence_train/')
processed_bert_test_combined = spark.read.orc('gdrive/MyDrive/Colab_notebook/transformed_data/bert_sentence_test/')

In [None]:
# Separately analysing the datasets as they are too big:
# split the training data into five roughly equal random parts so each can be converted to pandas on its own.
bert_sen_tr1,bert_sen_tr2, bert_sen_tr3, bert_sen_tr4, bert_sen_tr5=processed_bert_combined.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])

In [None]:
# Define function to compute average embedding vector
def average_emb(df):
  """Replace each row's list of sentence embeddings with their element-wise mean.

  Args:
    df: pandas DataFrame with a 'features' column. Each cell holds a sequence of
      per-sentence records; the embedding vector is taken from position 3 of each
      record (presumably the `values` field of a Spark vector struct — TODO confirm).

  Returns:
    The same DataFrame object, with 'features' overwritten in place by a plain list
    of floats per row (an empty list when the row had no sentence records).
  """
  averaged=[]
  # Iterate over the column values directly so the function works for any index,
  # not only a default 0..n-1 RangeIndex.
  for embedding_list in df['features']:
    sentence_vectors=[sentence_record[3] for sentence_record in embedding_list]
    # zip(*...) groups the i-th component of every sentence vector together.
    averaged.append([sum(dim_values) / len(dim_values) for dim_values in zip(*sentence_vectors)])
  # One whole-column assignment instead of chained `df['features'][i] = ...` writes,
  # which pandas may apply to a temporary copy.
  df['features']=averaged
  return(df)

In [None]:
pd_tr1=bert_sen_tr1.toPandas()
pd_tr1=average_emb(pd_tr1)

In [None]:
pd_tr2=bert_sen_tr2.toPandas()
pd_tr2=average_emb(pd_tr2)

In [None]:
pd_tr3=bert_sen_tr3.toPandas()
pd_tr3=average_emb(pd_tr3)

In [None]:
pd_tr4=bert_sen_tr4.toPandas()
pd_tr4=average_emb(pd_tr4)

In [None]:
pd_tr5=bert_sen_tr5.toPandas()
pd_tr5=average_emb(pd_tr5)

In [None]:
pd_train=pd.concat([pd_tr1,pd_tr2,pd_tr3,pd_tr4,pd_tr5]).reset_index(drop=True)

In [None]:
pd_train.to_csv("gdrive/MyDrive/Colab_notebook/transformed_data/sentence_train.csv")

In [None]:
# Randomly sample 30% of the original test set, which should be enough for the prediction evaluation.
# A fixed seed is passed so the sampling does not draw a fresh random seed on every run.
processed_bert_test_combined=processed_bert_test_combined.sample(withReplacement=False, fraction=0.3, seed=49)

In [None]:
bert_sen_te1,bert_sen_te2, bert_sen_te3, bert_sen_te4=processed_bert_test_combined.randomSplit([0.25, 0.25, 0.25, 0.25])

In [None]:
pd_te1=bert_sen_te1.toPandas()
pd_te1=average_emb(pd_te1)

In [None]:
pd_te2=bert_sen_te2.toPandas()
pd_te2=average_emb(pd_te2)

In [None]:
pd_te3=bert_sen_te3.toPandas()
pd_te3=average_emb(pd_te3)

In [None]:
pd_te4=bert_sen_te4.toPandas()
pd_te4=average_emb(pd_te4)

In [None]:
pd_test=pd.concat([pd_te1,pd_te2,pd_te3,pd_te4])

In [None]:
pd_test.to_csv("gdrive/MyDrive/Colab_notebook/transformed_data/sentence_test.csv")

In [None]:
pd_train=pd.read_csv("gdrive/MyDrive/Colab_notebook/transformed_data/sentence_train.csv")
pd_test=pd.read_csv("gdrive/MyDrive/Colab_notebook/transformed_data/sentence_test.csv")

In [None]:
# Store the integer label as a real column: assigning to `pd_train.label` only sets a Python
# attribute on the object and does not add a 'label' column to the DataFrame.
pd_train['label']=pd_train['min(label)'].astype("int")

In [None]:
# Store the integer label as a real column: assigning to `pd_test.label` only sets a Python
# attribute on the object and does not add a 'label' column to the DataFrame.
pd_test['label']=pd_test['min(label)'].astype("int")

In [None]:
X_train=pd_train.features
y_train=pd_train['min(label)']

In [None]:
X_test=pd_test.features
y_test=pd_test['min(label)']

In [None]:
# Each stored feature is a bracketed, comma-separated string of numbers:
# drop the first and last character, split on commas, and convert every piece to float.
X_train_trans=[
    [float(component) for component in serialized[1:-1].split(",")]
    for serialized in X_train
]

In [None]:
# Each stored feature is a bracketed, comma-separated string of numbers:
# drop the first and last character, split on commas, and convert every piece to float.
X_test_trans=[
    [float(component) for component in serialized[1:-1].split(",")]
    for serialized in X_test
]

## Modeling

In [None]:
# Accumulators filled in by each model section below and combined into the final results table.
# The model-name list must be called `model_bert`, the name every later cell appends to.
model_bert=[]
model_bert_sentence=model_bert  # alias for the original name, in case other cells refer to it
Roc_auc_cv=[]
Roc_auc_test=[]

### Logistic Regression

In [None]:
!pip install scikit-optimize

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# bert vs Logistic regression
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

param= dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

logistic_clf_bert = BayesSearchCV(estimator=LogisticRegression(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [None]:
logistic_clf_bert.fit(X_train_trans,y_train)

In [None]:
import pickle
with open('gdrive/MyDrive/Colab_notebook/Models_Pipelines/logistic_regression.pkl','wb') as f:
    pickle.dump(logistic_clf_bert,f)

In [None]:
with open('gdrive/MyDrive/Colab_notebook/Models_Pipelines/logistic_regression.pkl', 'rb') as f:
    logistic_clf_bert = pickle.load(f)

In [None]:
logistic_regressionbest=logistic_regression.best_score_

In [None]:
logistic_regression_best

In [None]:
y_prob_logistic_clf_bert = logistic_clf_bert.predict_proba(X_test_trans)
roc_auc_y_prob_logistic_clf_bert=roc_auc_score(y_test, y_prob_logistic_clf_bert[:,1])

In [None]:
roc_auc_y_prob_logistic_clf_bert

In [None]:
model_bert.append("Logistic Regression")
Roc_auc_cv.append(logistic_clf_bert_best)
Roc_auc_test.append(roc_auc_y_prob_logistic_clf_bert)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

max_depth=[2, 3, 5, 10, 20]
min_samples_leaf=[5, 10, 20, 50, 100]
criterion=["gini", "entropy"]

param= dict(max_depth=max_depth,min_samples_leaf=min_samples_leaf,criterion=criterion)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

dec_tree_clf_bert = BayesSearchCV(estimator=DecisionTreeClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [None]:
dec_tree_clf_bert.fit(X_train_trans,y_train)

In [None]:
import pickle
with open('/Models_Pipelines/dec_tree_clf_bert.pkl','wb') as f:
    pickle.dump(dec_tree_clf_bert,f)

In [None]:
with open('/Models_Pipelines/dec_tree_clf_bert.pkl', 'rb') as f:
    dec_tree_clf_bert = pickle.load(f)

In [None]:
dec_tree_clf_bert_best=dec_tree_clf_bert.best_score_

In [None]:
dec_tree_clf_bert_best

In [None]:
y_prob_dec_tree_clf_bert = dec_tree_clf_bert.predict_proba(X_test_trans)
roc_auc_dec_tree_clf_bert=roc_auc_score(y_test, y_prob_dec_tree_clf_bert[:,1])

In [None]:
roc_auc_dec_tree_clf_bert

In [None]:
model_bert.append("Decision Tree")
Roc_auc_cv.append(dec_tree_clf_bert_best)
Roc_auc_test.append(roc_auc_dec_tree_clf_bert)

### Linear SVM

In [None]:
c_values=[100, 10, 1.0, 0.1, 0.01]

param= dict(C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

lsvc_clf_bert = BayesSearchCV(estimator=LinearSVC(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [None]:
lsvc_clf_bert.fit(X_train_trans,y_train)

In [None]:
with open('gdrive/MyDrive/Colab_notebook/Models_Pipelines/lsvc_clf_bert_sent.pkl','wb') as f:
    pickle.dump(lsvc_clf_bert,f)

In [None]:
with open('gdrive/MyDrive/Colab_notebook/Models_Pipelines/lsvc_clf_bert_sent.pkl', 'rb') as f:
    lsvc_clf_bert = pickle.load(f)

In [None]:
lsvc_clf_bert_best=lsvc_clf_bert.best_score_

In [None]:
lsvc_clf_bert_best

In [None]:
y_dec_func_lsvc_clf_bert=lsvc_clf_bert.decision_function(X_test_trans)
roc_auc_lsvc_clf_bert=roc_auc_score(y_test, y_dec_func_lsvc_clf_bert)

In [None]:
roc_auc_lsvc_clf_bert

In [None]:
model_bert.append("Linear SVM")
Roc_auc_cv.append(lsvc_clf_bert_best)
Roc_auc_test.append(roc_auc_lsvc_clf_bert)

### KNN

In [None]:
k_range = list(range(1, 31))
param= dict(n_neighbors=k_range)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

knn_clf_bert = BayesSearchCV(estimator=KNeighborsClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [None]:
knn_clf_bert.fit(X_train_trans,y_train)

In [None]:
with open('/Models_Pipelines/knn_clf_bert.pkl','wb') as f:
    pickle.dump(knn_clf_bert,f)

In [None]:
with open('/Models_Pipelines/knn_clf_bert.pkl', 'rb') as f:
    knn_clf_bert = pickle.load(f)

In [None]:
knn_clf_bert_best=knn_clf_bert.best_score_

In [None]:
knn_clf_bert_best

In [None]:
y_prob_knn_clf_bert = knn_clf_bert.predict_proba(X_test_trans)
roc_auc_knn_clf_bert=roc_auc_score(y_test,y_prob_knn_clf_bert[:,1])

In [None]:
roc_auc_knn_clf_bert

In [None]:
model_bert.append("KNN")
Roc_auc_cv.append(knn_clf_bert_best)
Roc_auc_test.append(roc_auc_knn_clf_bert)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
max_depth=[5, 10, 15, 20]
min_samples_leaf=[5, 10, 20, 50, 100]
criterion=["gini", "entropy"]
n_estimators=[10,50,100,150]

param= dict(max_depth=max_depth,min_samples_leaf=min_samples_leaf,criterion=criterion,n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

rand_for_clf_bert = BayesSearchCV(estimator=RandomForestClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [None]:
rand_for_clf_bert.fit(X_train_trans,y_train)

In [None]:
with open('gdrive/MyDrive/Colab_notebook/Models_Pipelines/rand_for_clf_bert_sent.pkl','wb') as f:
    pickle.dump(rand_for_clf_bert,f)

In [None]:
# Reload the random-forest search from the same Drive path the save cell writes to;
# the root-level '/Models_Pipelines/' path pointed at a different file than the one saved.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('gdrive/MyDrive/Colab_notebook/Models_Pipelines/rand_for_clf_bert_sent.pkl', 'rb') as f:
    rand_for_clf_bert = pickle.load(f)

In [None]:
rand_for_clf_bert_best=rand_for_clf_bert.best_score_

In [None]:
rand_for_clf_bert_best

In [None]:
y_prob_rand_for_clf_bert = rand_for_clf_bert.predict_proba(X_test_trans)
roc_auc_rand_for_clf_bert=roc_auc_score(y_test,y_prob_rand_for_clf_bert[:,1])

In [None]:
roc_auc_rand_for_clf_bert

In [None]:
model_bert.append("Random Forest")
Roc_auc_cv.append(rand_for_clf_bert_best)
Roc_auc_test.append(roc_auc_rand_for_clf_bert)

In [None]:
result_bert=pd.DataFrame({'model_bert': model_bert, 'Roc_auc_cross_val': Roc_auc_cv,'Roc_auc_test':Roc_auc_test})
result_bert=result_bert.sort_values('Roc_auc_test')
result_bert.reset_index(drop = True)