In [35]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under a descriptive application name.
spark = (
    SparkSession.builder
    .appName("Heart Failure Predictions")
    .getOrCreate()
)

# Load the raw CSV, treating its first line as the column header.
# No schema is supplied here; the columns are cast explicitly in a later cell.
df = spark.read.csv("heart_failure.csv", header=True)

In [36]:
# Continuous measurements. These are the columns cast to DoubleType further
# down in the notebook.
numerical_features = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",  # NOTE: dropped with .drop("time") in the casting cell, so it never reaches the model
]

# Flag-like columns. These are the columns cast to IntegerType further down in
# the notebook.
# NOTE(review): the two lists were previously assigned to each other's names
# (flags under "numerical", measurements under "categorical"); swapped here so
# the names match the contents.
categorical_features = [
    "anaemia",
    "diabetes",
    "high_blood_pressure",
    "sex",
    "smoking",
]

In [37]:
from pyspark.sql.types import DoubleType, BooleanType, IntegerType
import pyspark.sql.functions as f

# Target Spark type -> columns that should be cast to it.
# Each column appears exactly once: "high_blood_pressure" used to be cast to
# Double and then immediately re-cast to Integer; only the final Integer cast
# is kept.
_casts = {
    DoubleType: [
        "age",
        "creatinine_phosphokinase",
        "ejection_fraction",
        "platelets",
        "serum_creatinine",
        "serum_sodium",
        "DEATH_EVENT",
    ],
    IntegerType: [
        "anaemia",
        "diabetes",
        "high_blood_pressure",
        "sex",
        "smoking",
    ],
}

# withColumn on an existing column replaces it in place, so the column order
# of the frame is unchanged by these casts.
for _spark_type, _names in _casts.items():
    for _name in _names:
        df = df.withColumn(_name, f.col(_name).cast(_spark_type()))

# Rename the target to "label" and discard the "time" column.
df = df.withColumnRenamed("DEATH_EVENT", "label").drop("time")

In [38]:
from pyspark.sql import Window
import pyspark.sql.functions as f

# Rows are numbered separately inside each "label" partition, ordered by
# serum_creatinine, so that the hold-out flag below is applied per class.
# NOTE(review): rows sharing the same serum_creatinine have no tie-breaker in
# this ordering, so their relative row numbers may differ between runs;
# consider adding further orderBy columns if a fixed split is required.
window = Window.partitionBy("label").orderBy("serum_creatinine")


def udf(column):
    """Return a boolean Column that is true where ``column`` is a multiple of 5.

    Implemented as a native Column expression instead of a Python UDF, so the
    check runs inside Spark without shipping rows to a Python worker. The name
    ``udf`` is kept so existing call sites of the form ``udf(f.col(...))``
    continue to work.

    Parameters
    ----------
    column : pyspark.sql.Column
        Integer-valued column (here: the per-class row number).

    Returns
    -------
    pyspark.sql.Column
        Boolean column, true for every fifth value.
    """
    return (column % 5) == 0

In [39]:
# Step 1: store each row's position within its window partition in "_test_set".
# Step 2: overwrite that position with the boolean produced by `udf`, which
#         marks the rows that go to the test set.
row_position = f.row_number().over(window)
df = (
    df
    .withColumn("_test_set", row_position)
    .withColumn("_test_set", udf(f.col("_test_set")))
)

In [67]:
# Split on the boolean helper column and remove it from both halves so it is
# not picked up as a feature later on.
is_test_row = f.col("_test_set")
test = df.where(is_test_row).drop("_test_set")
train = df.where(~is_test_row).drop("_test_set")

In [69]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline

# Select the model inputs by name rather than by position: every column except
# the target. The previous `train.columns[:-1]` only worked while "label"
# happened to be the last column of the frame.
feature_columns = [name for name in train.columns if name != "label"]

# Pack the input columns into a single vector column "vec".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="vec")

# Index vector slots with at most 4 distinct values as categorical and write
# the result to "features".
indexer = VectorIndexer(inputCol="vec", outputCol="features", maxCategories=4)

pipeline = Pipeline(stages=[assembler, indexer])

In [70]:
# Fit the feature pipeline on the training rows only, then apply the fitted
# stages to both splits.
# NOTE: `pipeline` is rebound from the unfitted Pipeline to the fitted model
# here, and `train`/`test` are overwritten with their transformed versions, so
# this cell is meant to be run once per fresh pipeline definition.
pipeline = pipeline.fit(train)
train, test = (pipeline.transform(split) for split in (train, test))

In [66]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

# Fixed seed so that the cross-validation fold assignment (and any randomness
# inside the tree learner) is repeatable across kernel restarts.
CV_SEED = 42

# Column names are spelled out so the dependency on the feature pipeline's
# "features" output and the renamed "label" target is visible here.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", seed=CV_SEED)

# 3 x 3 grid over tree depth and number of bins.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [2, 3, 4])
    .addGrid(dt.maxBins, [10, 15, 32])
    .build()
)

# 3-fold cross-validation, selecting the grid point by area under the
# precision-recall curve.
crossval = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR"),
    numFolds=3,
    seed=CV_SEED,
)
cvModel = crossval.fit(train)

In [71]:
# Take the model chosen by cross-validation and score the held-out rows with it.
# NOTE: `test` is overwritten with the scored frame, so this cell is meant to
# be run once per freshly transformed `test`.
bestModel = cvModel.bestModel
test = bestModel.transform(dataset=test)

In [72]:
# Report area under the precision-recall curve on the scored test rows; the
# bare last expression is what the notebook displays as the cell output.
evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderPR")
evaluator.evaluate(test)

0.1994903189360176