In [None]:
! pip install -q pyspark==3.1.3 spark-nlp

In [None]:
# Switch the working directory so the relative paths used below resolve.
# NOTE(review): machine-specific absolute Windows path; adjust before running
# this notebook on another machine.
%cd C:\Users\NEHA\nlp_project\

In [None]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import pandas as pd

In [None]:
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import StringIndexer

In [None]:
import pandas as pd
import numpy as np

In [None]:
import sparknlp

# Start a Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use.
for title, value in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(title, value)

# Display the session object as the cell output.
spark

In [None]:
# Load the readmission dataset (path is relative to the current working directory).
readmission = pd.read_csv("NOT_CLEANED.csv")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for testing; the seed keeps the split reproducible.
df_train, df_test = train_test_split(readmission, test_size=0.2, random_state=42)

# Balance the training set: keep every readmitted record and down-sample the
# non-readmitted records to the same count.
df_train_readm = df_train[df_train.READMISSION_STATUS == 'Readmitted']
df_train_non_readm = df_train[df_train.READMISSION_STATUS == 'Non-readmitted']
df_train_sub = pd.concat(
    [
        df_train_readm,
        df_train_non_readm.sample(n=len(df_train_readm), random_state=45),
    ],
    axis=0,  # stack rows; concat only accepts axis 0 (rows) or 1 (columns)
)

# Convert the pandas df to a spark df
# Arrow-based conversion is switched on through the Spark SQL configuration key.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
train = spark.createDataFrame(df_train_sub)
test = spark.createDataFrame(df_test)

In [None]:
%%time
# Produce pipeline for data cleaning and sentence(discharge summary) embedding
document_assembler = DocumentAssembler() \
      .setInputCol("TEXT_AGG") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

lemmatizer = Lemmatizer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma") \
    .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")

glove_embeddings = WordEmbeddingsModel().pretrained() \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")
    embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["sentence_embeddings"]) \
      .setOutputCols(["finished_sentence_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol = "READMISSION_STATUS", outputCol = "label")

nlp_pipeline_GloVe = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemmatizer,
            glove_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
            label_stringIdx])

In [None]:
from pyspark.ml.pipeline import PipelineModel

# Load the fitted pipeline from disk.
# NOTE(review): no visible cell fits or saves this pipeline, so the saved model
# is expected to already exist at this path.
glove_readmission = PipelineModel.load("/Models_Pipelines/glove_readmission")

# Transform training set
processed_GloVe = glove_readmission.transform(train)

# Transform test set with the same fitted pipeline; a later cell reads it as
# `processed_GloVe_test`.
processed_GloVe_test = glove_readmission.transform(test)

In [None]:
# Preview the text, embedding and label columns of the transformed training
# rows, truncating each cell to 40 characters.
(
    processed_GloVe
    .select('TEXT_AGG', 'features', 'label')
    .show(truncate=40)
)

In [None]:
# Collect the embedding and label columns of the training set into a pandas frame.
pd_train = (
    processed_GloVe
    .select('features', 'label')
    .toPandas()
)

In [None]:
# Collect the embedding and label columns of the test set into a pandas frame.
pd_test = (
    processed_GloVe_test
    .select('features', 'label')
    .toPandas()
)

In [None]:
# Cast the labels to integers BEFORE persisting; casting after the write would
# be discarded by the reload below.
pd_train["label"] = pd_train["label"].astype("int")
pd_test["label"] = pd_test["label"].astype("int")

# index=False keeps the row index out of the files, so reloading does not
# create an extra "Unnamed: 0" column.
pd_train.to_csv("Glove_train.csv", index=False)
pd_test.to_csv("glove_test.csv", index=False)

# Reload from disk; the `features` column comes back as text and is parsed
# into numeric lists by the cells that follow.
pd_train = pd.read_csv("Glove_train.csv")
pd_test = pd.read_csv("glove_test.csv")

In [None]:
# Separate model inputs (`features`) from targets (`label`) for both sets.
X_train, y_train = pd_train["features"], pd_train["label"]
X_test, y_test = pd_test["features"], pd_test["label"]

In [None]:
# Parse each serialized embedding back into a list of floats.
# Assumes every entry is a bracketed, comma-separated list of numbers.
# Brackets are stripped by character rather than by fixed offsets, so both
# "[...]" and "[[...]]" parse without cutting into the first or last number.
X_train_trans = [
    [float(value) for value in doc.strip().strip("[]").split(",")]
    for doc in X_train
]

In [None]:
# Parse each serialized embedding back into a list of floats.
# Assumes every entry is a bracketed, comma-separated list of numbers.
# Brackets are stripped by character rather than by fixed offsets, so both
# "[...]" and "[[...]]" parse without cutting into the first or last number.
X_test_trans = [
    [float(value) for value in doc.strip().strip("[]").split(",")]
    for doc in X_test
]

In [None]:
# Parallel accumulators filled by the model sections below: model name,
# cross-validated ROC AUC and test-set ROC AUC.
model_Glove, Roc_auc_cv, Roc_auc_test = [], [], []

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Search space for logistic regression.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

param = dict(solver=solvers, penalty=penalty, C=c_values)
# 10-fold stratified CV repeated 3 times, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring = 'roc_auc'

# Bound to `logistic_glove`, the name the following cells pickle and score.
# The search is seeded so repeated runs explore the same candidates.
logistic_glove = BayesSearchCV(
    estimator=LogisticRegression(),
    search_spaces=param,
    scoring=scoring,
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
logistic_glove.fit(X_train_trans, y_train)

In [None]:
import pickle

# Persist the fitted logistic-regression search object to disk.
with open('/Models_Pipelines/logistic_glove.pkl', 'wb') as model_file:
    pickle.dump(logistic_glove, model_file)

In [None]:
# Reload the persisted search object and show its best cross-validation score.
# NOTE: pickle.load executes arbitrary code; only load files you trust.
with open('/Models_Pipelines/logistic_glove.pkl', 'rb') as model_file:
    logistic_glove = pickle.load(model_file)

logistic_glove_best = logistic_glove.best_score_
logistic_glove_best

In [None]:
# Test-set ROC AUC for logistic regression, scored on the predicted
# probability in column 1 (the second class in `classes_`), as done for the
# random-forest model.
y_prob_logistic_glove = logistic_glove.predict_proba(X_test_trans)
roc_auc_y_prob_logistic_glove = roc_auc_score(y_test, y_prob_logistic_glove[:, 1])

# Record the model name with its cross-validated and test scores.
model_Glove.append("Logistic Regression")
Roc_auc_cv.append(logistic_glove_best)
Roc_auc_test.append(roc_auc_y_prob_logistic_glove)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Search space for the decision tree.
max_depth = [2, 3, 5, 10, 20]
min_samples_leaf = [5, 10, 20, 50, 100]
criterion = ["gini", "entropy"]

param = dict(max_depth=max_depth, min_samples_leaf=min_samples_leaf, criterion=criterion)
# 10-fold stratified CV repeated 3 times, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring = 'roc_auc'

# Bound to `dec_tree_glove`, the name the following cells fit, pickle and score.
# Estimator and search are seeded so repeated runs are reproducible.
dec_tree_glove = BayesSearchCV(
    estimator=DecisionTreeClassifier(random_state=1),
    search_spaces=param,
    scoring=scoring,
    n_jobs=-1,
    cv=cv,
    random_state=1,
)

In [None]:
# Run the decision-tree hyper-parameter search on the training embeddings.
dec_tree_glove.fit(X=X_train_trans, y=y_train)

In [None]:
# Persist the fitted decision-tree search object to disk.
with open('/Models_Pipelines/dec_tree_glove.pkl', 'wb') as model_file:
    pickle.dump(dec_tree_glove, model_file)

# Reload it and show its best cross-validation score.
# NOTE: pickle.load executes arbitrary code; only load files you trust.
with open('/Models_Pipelines/dec_tree_glove.pkl', 'rb') as model_file:
    dec_tree_glove = pickle.load(model_file)

dec_tree_glove_best = dec_tree_glove.best_score_
dec_tree_glove_best

In [None]:
# Test-set ROC AUC for the decision tree, scored on the predicted probability
# in column 1 (the second class in `classes_`), as done for the random-forest
# model.
y_prob_dec_tree_glove = dec_tree_glove.predict_proba(X_test_trans)
roc_auc_dec_tree_glove = roc_auc_score(y_test, y_prob_dec_tree_glove[:, 1])

# Record the model name with its cross-validated and test scores.
model_Glove.append("Decision Tree")
Roc_auc_cv.append(dec_tree_glove_best)
Roc_auc_test.append(roc_auc_dec_tree_glove)

SVM

In [None]:
# Search space for the linear SVM: regularization strength only.
c_values = [100, 10, 1.0, 0.1, 0.01]

param = dict(C=c_values)
# 10-fold stratified CV repeated 3 times, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring = 'roc_auc'

# Bound to `lsvc_glove`, the name the following cells pickle and score.
# Estimator and search are seeded so repeated runs are reproducible.
lsvc_glove = BayesSearchCV(
    estimator=LinearSVC(random_state=1),
    search_spaces=param,
    scoring=scoring,
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
lsvc_glove.fit(X_train_trans, y_train)

In [None]:
# Persist the fitted linear-SVM search object to disk.
with open('/Models_Pipelines/lsvc_glove.pkl', 'wb') as model_file:
    pickle.dump(lsvc_glove, model_file)

# Reload it and show its best cross-validation score.
# NOTE: pickle.load executes arbitrary code; only load files you trust.
with open('/Models_Pipelines/lsvc_glove.pkl', 'rb') as model_file:
    lsvc_glove = pickle.load(model_file)

lsvc_glove_best = lsvc_glove.best_score_
lsvc_glove_best

In [None]:
# LinearSVC has no predict_proba, so the test ROC AUC is computed from the
# decision-function scores.
y_dec_func_lsvc_glove = lsvc_glove.decision_function(X_test_trans)
roc_auc_lsvc_glove = roc_auc_score(y_test, y_dec_func_lsvc_glove)

# Record the SVM with its cross-validated and test scores so it appears in the
# final comparison table alongside the other models.
model_Glove.append("SVM")
Roc_auc_cv.append(lsvc_glove_best)
Roc_auc_test.append(roc_auc_lsvc_glove)

roc_auc_lsvc_glove

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest.
max_depth = [5, 10, 15, 20]
min_samples_leaf = [5, 10, 20, 50, 100]
criterion = ["gini", "entropy"]
n_estimators = [10, 50, 100, 150]

param = dict(
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    criterion=criterion,
    n_estimators=n_estimators,
)
# 10-fold stratified CV repeated 3 times, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring = 'roc_auc'

# Both the forest and the search are stochastic; seeding them makes the
# reported scores reproducible across runs.
rand_for_glove = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    search_spaces=param,
    scoring=scoring,
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
rand_for_glove.fit(X_train_trans, y_train)

In [None]:
with open('/Models_Pipelines/rand_for_glove.pkl','wb') as f:
    pickle.dump(rand_for_glove,f)
with open('/Models_Pipelines/rand_for_glove.pkl', 'rb') as f:
    rand_for_glove = pickle.load(f)
rand_for_glove_best=rand_for_glove.best_score_
rand_for_glove_best

In [None]:
# Test-set ROC AUC for the random forest, scored on the predicted probability
# in column 1 (the second class in `classes_`).
y_prob_rand_for_glove = rand_for_glove.predict_proba(X_test_trans)
roc_auc_rand_for_glove = roc_auc_score(
    y_true=y_test,
    y_score=y_prob_rand_for_glove[:, 1],
)
roc_auc_rand_for_glove

In [None]:
# Record the random-forest scores alongside the earlier models.
model_Glove.append("Random Forest")
Roc_auc_cv.append(rand_for_glove_best)
Roc_auc_test.append(roc_auc_rand_for_glove)

# Build the comparison table, ordered by ascending test ROC AUC.
result_Glove = pd.DataFrame(
    {
        'model_GloVe': model_Glove,
        'Roc_auc_cross_val': Roc_auc_cv,
        'Roc_auc_test': Roc_auc_test,
    }
).sort_values('Roc_auc_test')

# Display with a fresh 0..n-1 index (result_Glove itself keeps its sorted index).
result_Glove.reset_index(drop=True)