In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import DoubleType, BooleanType
from pyspark.sql import Window

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Heart Failure Predictions")
    .getOrCreate()
)

In [2]:
# Load the raw dataset, using the first line of the file as column names.
# No schema is requested here; numeric casts are applied in a later cell.
df = spark.read.csv("heart_failure.csv", header=True)

In [3]:
# Columns kept for modelling: four predictors followed by the outcome column.
columns = [
    "age",
    "ejection_fraction",
    "serum_creatinine",
    "serum_sodium",
    "DEATH_EVENT",
]
df = df.select(*columns)

In [4]:
# Cast each predictor and the outcome column to double, one column at a time.
for column_name in (
    "age",
    "ejection_fraction",
    "serum_creatinine",
    "serum_sodium",
    "DEATH_EVENT",
):
    df = df.withColumn(column_name, f.col(column_name).cast(DoubleType()))

# Expose the outcome under the name "label" for the modelling cells below.
df = df.withColumnRenamed("DEATH_EVENT", "label")

In [5]:
# Hold-out flag: number the rows inside each label group in order of
# serum_creatinine and mark every fifth row (row numbers 5, 10, 15, ...) as
# test data, so each label contributes roughly 20% of its rows to the test set.
#
# NOTE(review): serum_creatinine values can repeat, and row_number() orders
# tied rows arbitrarily, so which of the tied rows gets flagged is not
# guaranteed to be stable between evaluations — confirm whether a tie-breaking
# column should be added to the ordering.
window = Window.partitionBy("label").orderBy("serum_creatinine")

# The modulo test is written as a native Column expression rather than a
# Python UDF, so Spark evaluates it without shipping rows to Python workers.
df = df.withColumn("test_set", (f.row_number().over(window) % 5) == 0)

In [6]:
# Preview the first rows to sanity-check the casts and the test_set flag.
df.show(n=10)

+----+-----------------+----------------+------------+-----+--------+
| age|ejection_fraction|serum_creatinine|serum_sodium|label|test_set|
+----+-----------------+----------------+------------+-----+--------+
|50.0|             30.0|             0.5|       139.0|  0.0|   false|
|60.0|             40.0|             0.6|       138.0|  0.0|   false|
|75.0|             38.0|             0.6|       131.0|  0.0|   false|
|51.0|             50.0|             0.7|       140.0|  0.0|   false|
|44.0|             40.0|             0.7|       139.0|  0.0|    true|
|58.0|             38.0|             0.7|       142.0|  0.0|   false|
|60.0|             60.0|             0.7|       136.0|  0.0|   false|
|61.0|             30.0|             0.7|       136.0|  0.0|   false|
|53.0|             60.0|             0.7|       138.0|  0.0|   false|
|50.0|             30.0|             0.7|       141.0|  0.0|    true|
+----+-----------------+----------------+------------+-----+--------+
only showing top 10 rows

In [7]:
# Split on the boolean flag, then drop the flag so that both frames carry
# only the predictors and the label.
test = df.filter(f.col("test_set")).drop("test_set")

train = df.filter(~f.col("test_set")).drop("test_set")

In [8]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [11]:
# Preprocessing stage 1: pack the four predictors into a single vector column.
assembler = VectorAssembler(
    inputCols=["age", "ejection_fraction", "serum_creatinine", "serum_sodium"],
    outputCol="vec",
)
# Preprocessing stage 2: min-max scale that vector into the "features" column.
scaler = MinMaxScaler(inputCol="vec", outputCol="features")

# The two stages run in order: assemble, then scale.
pipeline = Pipeline(stages=[assembler, scaler])

# Classifier, created with its default settings; tuned by the grid search below.
lr = LogisticRegression()

In [13]:
# Fit the preprocessing stages on the training rows only, then apply the
# fitted stages to those same rows.
# After this cell, `pipeline` refers to the object returned by fit() rather
# than the unfitted Pipeline, and `train` refers to the transformed frame.
# NOTE(review): both names are rebound in place, so running this cell a second
# time without re-running the cells above will probably fail — confirm, and
# consider separate names for the fitted pipeline and the transformed frame.
# NOTE(review): `test` is not passed through the fitted pipeline in this cell.
pipeline = pipeline.fit(train)
train = pipeline.transform(train)

In [None]:
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01])\
    .addGrid(lr.maxIter, [50, 100])\
    .build()
#.addGrid(lr.elasticNetParam, [0.5, 0.8])\
crossval = CrossValidator(estimator =lr,
                          estimatorParamMaps = paramGrid,
                          evaluator = BinaryClassificationEvaluator(),
                          numFolds = 3)
crossval.fit(train)

In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

test = crossval.transform(test)
metrics = BinaryClassificationMetrics(test[["prediction", "label"]].rdd)
print(metrics.areaUnderPR)