In [14]:
# create spark and sparkcontext objects
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np
import pandas as pd
from pyspark.sql.functions import isnan, isnull, when, count, col
from pyspark.sql import functions as fn
import matplotlib.pyplot as plt
from pyspark.ml import feature
# Funcionality for classification
from pyspark.ml import Pipeline
import seaborn as sns
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler,OneHotEncoder
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier, DecisionTreeClassifier,GBTClassifier
from pyspark.sql.types import IntegerType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [15]:
# Do not delete or change this cell

# grading import statements
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
import os

# Define a function to determine if we are running on data bricks
# Return true if running in the data bricks environment, false otherwise
def is_databricks():
    # get the databricks runtime version
    db_env = os.getenv("DATABRICKS_RUNTIME_VERSION")
    
    # if running on data bricks
    if db_env != None:
        return True
    else:
        return False

# Define a function to read the data file.  The full path data file name is constructed
# by checking runtime environment variables to determine if the runtime environment is 
# databricks, or a student's personal computer.  The full path file name is then
# constructed based on the runtime env.
# 
# Params
#   data_file_name: The base name of the data file to load
# 
# Returns the full path file name based on the runtime env
#
# Correct Usage Example (pass ONLY the full file name):
#   file_name_to_load = get_training_filename("sms_spam.csv") # correct - pass ONLY the full file name  
#   
# Incorrect Usage Example
#   file_name_to_load = get_training_filename("/sms_spam.csv") # incorrect - pass ONLY the full file name
#   file_name_to_load = get_training_filename("sms_spam.csv/") # incorrect - pass ONLY the full file name
#   file_name_to_load = get_training_filename("c:/users/will/data/sms_spam.csv") incorrect -pass ONLY the full file name
def get_training_filename(data_file_name):    
    # if running on data bricks
    if is_databricks():
        # build the full path file name assuming data brick env
        full_path_name = "/FileStore/tables/%s" % data_file_name
    # else the data is assumed to be in the same dir as this notebook
    else:
        # Assume the student is running on their own computer and load the data
        # file from the same dir as this notebook
        full_path_name = data_file_name
    
    # return the full path file name to the caller
    return full_path_name

In [16]:
file_path = "W:/Syracuse/Assignements/BigData/FinalProject/Hospital-Readmission-based-on-Diabetes/Data/"
file_name = "hospital_readmission_cleaned.csv"
diabetes_df = spark.read.csv(get_training_filename('hospital_readmission_cleaned.csv'), header=True, inferSchema=True)
diabetes_df.toPandas().shape

(101766, 35)

In [17]:
categorical_columns= ['race','diag_1','diag_2','diag_3','admission_type_name','dischage_disposition_name','admission_source_name',
               'gender','age','max_glu_serum','A1Cresult','metformin','repaglinide',
               'nateglinide','chlorpropamide','glimepiride','glipizide','glyburide','pioglitazone','rosiglitazone','acarbose',
               'miglitol','insulin','glyburide-metformin','change','diabetesMed']

# The index of string vlaues multiple columns
indexers = [feature.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("keep") for c in categorical_columns]

# The encode of indexed values multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers]

assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features")

pipeline1 = Pipeline(stages=indexers + encoders)
#diabetes_df = pipeline1.fit(diabetes_df).transform(diabetes_df)
#diabetes_df.toPandas().head()


In [18]:
# Split the dataset into training and testing
train, test = diabetes_df.randomSplit([0.6, 0.4], 0)
train, test2 = train.randomSplit([0.6, 0.4], 0)

train.toPandas().shape
'''
numeric_features = [t[0] for t in train.dtypes if t[1] == 'int']
numeric_data = train.select(numeric_features).toPandas()
axs = pd.plotting.scatter_matrix(numeric_data, figsize=(15, 15));
n = len(numeric_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())
plt.tight_layout()
plt.show()
'''


"\nnumeric_features = [t[0] for t in train.dtypes if t[1] == 'int']\nnumeric_data = train.select(numeric_features).toPandas()\naxs = pd.plotting.scatter_matrix(numeric_data, figsize=(15, 15));\nn = len(numeric_data.columns)\nfor i in range(n):\n    v = axs[i, 0]\n    v.yaxis.label.set_rotation(0)\n    v.yaxis.label.set_ha('right')\n    v.set_yticks(())\n    h = axs[n-1, i]\n    h.xaxis.label.set_rotation(90)\n    h.set_xticks(())\nplt.tight_layout()\nplt.show()\n"

In [19]:
# build the pipelines
va = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features")
sc = feature.StandardScaler(withMean=True, inputCol='features',outputCol = 'zfeatures') 

pipeline2 = Pipeline(stages= [va,sc])
#pipe_model = Pipeline(stages = [VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features"),
#                                feature.StandardScaler(withMean=True, inputCol='features',outputCol = 'zfeatures')])
#train = pipeline2.fit(train).transform(train)

In [20]:
# Logistic regression pipeline
pipe_logit = Pipeline(stages = [pipeline1, pipeline2,LogisticRegression(labelCol='readmitted',featuresCol = 'zfeatures')])

In [11]:
# Fitiing logitstic model
#fitted_model = pipe_logit.fit(train)
#fitted_model.transform(train).toPandas().head()

fitted_pipe1 = pipe_logit.fit(train)
'''
plt.figure(figsize=(5,5))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(fitted_pipe1.summary.roc.select('FPR').collect(),
         fitted_pipe1.summary.roc.select('TPR').collect())
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.show()
'''

"\nplt.figure(figsize=(5,5))\nplt.plot([0, 1], [0, 1], 'r--')\nplt.plot(fitted_pipe1.summary.roc.select('FPR').collect(),\n         fitted_pipe1.summary.roc.select('TPR').collect())\nplt.xlabel('FPR')\nplt.ylabel('TPR')\nplt.title('ROC')\nplt.show()\n"

In [12]:
# validate logistic model
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC",rawPredictionCol='prediction', labelCol='readmitted')
logit_predicted = fitted_pipe1.transform(test)
evaluator.evaluate(logit_predicted)
#fitted_model.transform(test).select(fn.avg(fn.expr('readmitted = prediction').cast('float'))).show()

0.5964306563461123

In [13]:
logit_predicted.select(
(fn.sum(fn.when(fn.col('readmitted')==fn.col('prediction'),1).otherwise(0))/fn.count(fn.col('readmitted'))).alias('Accuracy'),
(fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==1),1).otherwise(0))/
(fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==1),1).otherwise(0))+ 
fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==0),1).otherwise(0)))).alias('Recall')).show()

+------------------+------------------+
|          Accuracy|            Recall|
+------------------+------------------+
|0.6010705212007598|0.5396185871608917|
+------------------+------------------+



In [9]:
# Random forest pipeline
rf = RandomForestClassifier(labelCol="readmitted", featuresCol="zfeatures", numTrees=10)
pipe_rf = Pipeline(stages = [pipeline1,pipeline2,rf])

In [10]:
# Fitiing RF model
#fitted_model = pipe_rf.fit(train)
#fitted_model.transform(train).toPandas().head()
grid = (ParamGridBuilder().addGrid(rf.featureSubsetStrategy, ['auto']).addGrid(rf.impurity,['GINI']).addGrid(rf.numTrees, [80]).build())
evaluator = BinaryClassificationEvaluator(labelCol = 'readmitted',metricName = 'areaUnderROC')
cv1 = CrossValidator(estimator=pipe_rf, estimatorParamMaps=grid, evaluator=evaluator, numFolds = 3, seed = 65)
fitted_cv1 = cv1.fit(train)


In [11]:
train.columns

['race',
 'diag_1',
 'diag_2',
 'diag_3',
 'admission_type_name',
 'dischage_disposition_name',
 'admission_source_name',
 'gender',
 'age',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'glipizide',
 'glyburide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'insulin',
 'glyburide-metformin',
 'change',
 'diabetesMed',
 'readmitted']

In [15]:
# validate random forest model
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC",rawPredictionCol='prediction', labelCol='readmitted')
rf_predicted = fitted_cv1.transform(test)
evaluator.evaluate(rf_predicted)
print("Test Area Under ROC: " + str(evaluator.evaluate(rf_predicted)))

Test Area Under ROC: 0.5012928703077153


In [16]:
rf_predicted.select(
(fn.sum(fn.when(fn.col('readmitted')==fn.col('prediction'),1).otherwise(0))/fn.count(fn.col('readmitted'))).alias('Accuracy'),
(fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==1),1).otherwise(0))/
(fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==1),1).otherwise(0))+ 
fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==0),1).otherwise(0)))).alias('Recall')).show()

+------------------+--------------------+
|          Accuracy|              Recall|
+------------------+--------------------+
|0.5412159494670351|0.004851242862662602|
+------------------+--------------------+



In [None]:
a = pd.DataFrame(list(zip(train.columns[0:10], fitted_cv1.bestModel.stages[2].featureImportances.toArray())),
           columns = ['feature', 'importance']).sort_values('importance', ascending = False)
a.reset_index(drop=True, inplace=True)
feature_list = a['feature'].head(10)

bestPipeline = fitted_cv1.bestModel
bestModel = bestPipeline.stages[2]

importances = bestModel.featureImportances

x_values = list(range(len(importances)))
#feature_list= list(train.columns[0:-1])
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation=40)
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Importances')
plt.show()
x_values

In [None]:
importances = fitted_model.featureImportances

x_values = list(range(len(importances)))

plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation=40)
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Importances')

In [33]:
# MLP pipeline
layers = [4,5,5,2]
pipe_mlp = Pipeline(stages = [pipe_model,MultilayerPerceptronClassifier(maxIter=100, layers=layers,labelCol = 'target',featuresCol = 'zfeatures', blockSize=128)])


In [36]:
# Fitiing MLP model
fitted_model = pipe_mlp.fit(train)
fitted_model.transform(test).toPandas().head()

Py4JJavaError: An error occurred while calling o3678.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 641.0 failed 1 times, most recent failure: Lost task 4.0 in stage 641.0 (TID 3089, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.IllegalArgumentException: requirement failed: A & B Dimension mismatch!
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.ann.BreezeUtil$.dgemm(BreezeUtil.scala:41)
	at org.apache.spark.ml.ann.AffineLayerModel.eval(Layer.scala:164)
	at org.apache.spark.ml.ann.FeedForwardModel.forward(Layer.scala:508)
	at org.apache.spark.ml.ann.FeedForwardModel.predictRaw(Layer.scala:561)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:323)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:280)
	at org.apache.spark.ml.classification.ProbabilisticClassificationModel$$anonfun$1.apply(ProbabilisticClassifier.scala:117)
	at org.apache.spark.ml.classification.ProbabilisticClassificationModel$$anonfun$1.apply(ProbabilisticClassifier.scala:116)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.lang.IllegalArgumentException: requirement failed: A & B Dimension mismatch!
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.ann.BreezeUtil$.dgemm(BreezeUtil.scala:41)
	at org.apache.spark.ml.ann.AffineLayerModel.eval(Layer.scala:164)
	at org.apache.spark.ml.ann.FeedForwardModel.forward(Layer.scala:508)
	at org.apache.spark.ml.ann.FeedForwardModel.predictRaw(Layer.scala:561)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:323)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:280)
	at org.apache.spark.ml.classification.ProbabilisticClassificationModel$$anonfun$1.apply(ProbabilisticClassifier.scala:117)
	at org.apache.spark.ml.classification.ProbabilisticClassificationModel$$anonfun$1.apply(ProbabilisticClassifier.scala:116)
	... 18 more


In [37]:
gbt = GBTClassifier(labelCol="target",featuresCol="zfeatures")
gbt_pipeline = Pipeline(stages=[pipe_model, gbt]).fit(train)
gbt_pipeline.transform(test).select(fn.avg(fn.expr('target = prediction').cast('float'))).show()

+-----------------------------------------+
|avg(CAST((target = prediction) AS FLOAT))|
+-----------------------------------------+
|                       0.6189003436426117|
+-----------------------------------------+



In [22]:
dt = DecisionTreeClassifier(labelCol="readmitted", featuresCol="zfeatures")
dt_pipeline = Pipeline(stages=[pipeline1,pipeline2, dt]).fit(train)
dt_prediction = dt_pipeline.transform(test)

In [25]:
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC",rawPredictionCol='prediction', labelCol='readmitted')
print("Test Area Under ROC: " + str(evaluator.evaluate(dt_prediction)))

Test Area Under ROC: 0.5569429075392875


In [23]:
dt_prediction.select(
(fn.sum(fn.when(fn.col('readmitted')==fn.col('prediction'),1).otherwise(0))/fn.count(fn.col('readmitted'))).alias('Accuracy'),
(fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==1),1).otherwise(0))/
(fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==1),1).otherwise(0))+ 
fn.sum(fn.when((fn.col('readmitted')==1) & (fn.col('prediction')==0),1).otherwise(0)))).alias('Recall')).show()

+------------------+-------------------+
|          Accuracy|             Recall|
+------------------+-------------------+
|0.5689795515650823|0.40956218103679826|
+------------------+-------------------+



In [None]:
#diabetes_df = diabetes_df.withColumn("metformin", diabetes_df["metformin"].cast(IntegerType()))
#diabetes_df = diabetes_df.withColumn("diag_2", hosp_readmit_id_mapped_df.diag_2.cast('Int'))
#print(diabetes_df.count())
#diabetes_df = diabetes_df.na.drop()
#diabetes_df.where(diabetes_df.metformin.isNull()).count()



'''
NULL VALUES IN 
diag_1
diag_2
diag_3
max_glu_serum
A1Cresult
metformin_en
repaglinide_en
'''
#diabetes_df = diabetes_df.withColumn('readmitted',when((diabetes_df.readmitted == '<30') | (diabetes_df.readmitted == '>30'),'YES').otherwise('NO'))
#diabetes_df.toPandas().to_csv('Data/diabetes.csv')

#52.169
#discharge_en reduces to 50.73
#source_en 55.24
#time in hosp 55.95
# number_outpatient 57.69
#number_emergency 59.31
# number_inpatient 63.46
# number_diagnoses 63.92