In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName('UCI Heart disease')
    .getOrCreate()
)

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
# Load the heart-disease CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
heart = spark.read.csv(
    'heart.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the raw data.
heart.show(n=5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 5 rows



In [6]:
# Print each column's name, inferred type and nullability.
heart.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; these are the candidates for the feature vector built next.
heart.columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

In [9]:
# Pack the thirteen predictor columns (every column except the `target` label)
# into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    ],
    outputCol="features",
)

In [10]:
# Run the assembler over `heart`; the result keeps the original columns and adds "features".
output = assembler.transform(heart)

In [11]:
# Preview the first five rows, now including the assembled "features" column.
output.show(n=5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|            features|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|[63.0,1.0,3.0,145...|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|[37.0,1.0,2.0,130...|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|[41.0,0.0,1.0,130...|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|[56.0,1.0,1.0,120...|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|[57.0,0.0,0.0,120...|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
only showing top 5 rows



In [12]:
# Keep only the two columns the model consumes: the feature vector and the label.
final_data = output.select("features", "target")

In [13]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook
# draws the same split; without it every run trains and evaluates on different
# rows and the reported metrics are not reproducible.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Logistic-regression estimator that reads the "features" vector and learns the "target" label.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="target",
)





In [15]:
# Fit the estimator on the training split.
model = lr.fit(train)

# Score both splits with the fitted model.
predict_train = model.transform(train)
predict_test = model.transform(test)

# Compare true and predicted labels for the first ten held-out rows.
predict_test.select("target", "prediction").show(n=10)

+------+----------+
|target|prediction|
+------+----------+
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     0|       1.0|
|     1|       1.0|
|     1|       1.0|
+------+----------+
only showing top 10 rows



# Evaluating the Model

In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate with area under the ROC curve. This is the evaluator's default
# metric, but it is named explicitly so the computation cannot drift away from
# the "area under ROC" wording used when the scores are printed.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='target',
    metricName='areaUnderROC',
)

# Inspect the raw scores and class probabilities behind the first five test predictions.
predict_test.select("target", "rawPrediction", "prediction", "probability").show(n=5)



+------+--------------------+----------+--------------------+
|target|       rawPrediction|prediction|         probability|
+------+--------------------+----------+--------------------+
|     1|[-4.9029758643091...|       1.0|[0.00736973968213...|
|     1|[-3.0050374421050...|       1.0|[0.04719881649726...|
|     1|[-3.4942066082016...|       1.0|[0.02947752052940...|
|     1|[-0.0013464484671...|       1.0|[0.49966338793406...|
|     1|[-1.0943945063581...|       1.0|[0.25079166778008...|
+------+--------------------+----------+--------------------+
only showing top 5 rows



In [17]:
# Report the evaluator's score on the training split, then on the held-out split.
train_auc = evaluator.evaluate(predict_train)
print("The area under ROC for train set is {}".format(train_auc))

test_auc = evaluator.evaluate(predict_test)
print("The area under ROC for test set is {}".format(test_auc))

The area under ROC for train set is 0.9362463069507075
The area under ROC for test set is 0.8701393983859136
