# In this notebook, we will take some health indicators and train a binary classifier that predict wether a patient will have or not a heart disease
## Keep in mind that this notebook will run only with PySpark installed correctly on a machine and with the required dependencies installed. 
## The data has been imported from kaggle and preprocessed in the other notebook. You can find the repository here: https://www.kaggle.com/datasets/yasserh/heart-disease-dataset

In [48]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('heart_disease').getOrCreate()

24/05/07 23:52:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [49]:
!kaggle datasets download -d yasserh/heart-disease-dataset

Dataset URL: https://www.kaggle.com/datasets/yasserh/heart-disease-dataset
License(s): CC0-1.0
heart-disease-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [50]:
import zipfile
zipf = zipfile.ZipFile("heart-disease-dataset.zip")
zipf.extractall("data/heart_disease.csv")

In [51]:
data = spark.read.csv("data/heart_disease.csv",inferSchema=True,header=True)

In [52]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [53]:
data.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
| 57|  1|  0|     140| 192|  0|      1|    148|    0|    0.4|    1|  0|   1|     1|
| 56|  0|  1|     140| 294|  0|      0|    153|    0|    1.3|    1|  0|   2|     1|
| 44|  1|  1|     120| 263|  0|      1|    173|    0|    0.0|    2|  0|   3|     1|
| 52|  1|  2|     172| 199|  1|      1|    162|    0|    0.5|    2|  0|   3|

In [54]:
# Prepare the necessary columns for the vector assembler
# Given that all columns are relevant and numeric, we will drop our target column
# because we will use it for our predictions

inputCols = data.columns
inputCols.pop()
inputCols

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [55]:
from pyspark.ml.feature import VectorAssembler

In [56]:
assembler = VectorAssembler(inputCols=inputCols, outputCol="features")

In [57]:
# create a new df with the transformed data and the targets
final_data = assembler.transform(data).select("features","target")

In [58]:
final_data.show()

+--------------------+------+
|            features|target|
+--------------------+------+
|[63.0,1.0,3.0,145...|     1|
|[37.0,1.0,2.0,130...|     1|
|[41.0,0.0,1.0,130...|     1|
|[56.0,1.0,1.0,120...|     1|
|[57.0,0.0,0.0,120...|     1|
|[57.0,1.0,0.0,140...|     1|
|[56.0,0.0,1.0,140...|     1|
|[44.0,1.0,1.0,120...|     1|
|[52.0,1.0,2.0,172...|     1|
|[57.0,1.0,2.0,150...|     1|
|[54.0,1.0,0.0,140...|     1|
|[48.0,0.0,2.0,130...|     1|
|[49.0,1.0,1.0,130...|     1|
|[64.0,1.0,3.0,110...|     1|
|[58.0,0.0,3.0,150...|     1|
|[50.0,0.0,2.0,120...|     1|
|[58.0,0.0,2.0,120...|     1|
|[66.0,0.0,3.0,150...|     1|
|[43.0,1.0,0.0,150...|     1|
|[69.0,0.0,3.0,140...|     1|
+--------------------+------+
only showing top 20 rows



In [79]:
train_data,test_data = final_data.randomSplit([0.8,0.2],seed=42)

In [80]:
train_data.describe().show()

+-------+------------------+
|summary|            target|
+-------+------------------+
|  count|               256|
|   mean|        0.52734375|
| stddev|0.5002297266374368|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [81]:
test_data.describe().show()

+-------+-------------------+
|summary|             target|
+-------+-------------------+
|  count|                 47|
|   mean| 0.6382978723404256|
| stddev|0.48568785444140605|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [82]:
from pyspark.ml.classification import LogisticRegression

In [83]:
hd_model = LogisticRegression(labelCol='target').fit(train_data)

In [84]:
hd_model.summary.predictions.show()

+--------------------+------+--------------------+--------------------+----------+
|            features|target|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(13,[0,1,3,4,7,10...|   0.0|[-0.6665158753989...|[0.33927743303048...|       1.0|
|(13,[0,1,3,4,7,10...|   1.0|[-1.5558022856812...|[0.17424981711244...|       1.0|
|(13,[0,1,3,4,7,10...|   1.0|[-0.9900292933484...|[0.27090629170471...|       1.0|
|(13,[0,2,3,4,7,10...|   1.0|[-5.1873617323059...|[0.00555568855598...|       1.0|
|(13,[0,2,3,4,7,10...|   1.0|[-4.7321746059193...|[0.00873040650821...|       1.0|
|(13,[0,3,4,6,7,10...|   1.0|[-3.0903241177085...|[0.04350814474066...|       1.0|
|(13,[0,3,4,7,8,10...|   1.0|[-0.6314027100272...|[0.34719254563749...|       1.0|
|(13,[0,3,4,7,9,10...|   1.0|[-1.4681200630579...|[0.18722852330237...|       1.0|
|(13,[0,3,4,7,9,10...|   1.0|[-1.4157300250329...|[0.19533185382643...|       1.0|
|(13

In [85]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [86]:
predictions = hd_model.evaluate(test_data)
predictions.predictions.show()

+--------------------+------+--------------------+--------------------+----------+
|            features|target|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(13,[0,1,3,4,7,10...|     1|[-1.6021468818335...|[0.16768177236435...|       1.0|
|(13,[0,2,3,4,7,10...|     1|[-4.3402549471055...|[0.01286552596458...|       1.0|
|(13,[0,3,4,6,7,10...|     1|[-3.5228396193126...|[0.02866931403198...|       1.0|
|(13,[0,3,4,7,9,10...|     1|[-0.5148753833399...|[0.37405131913760...|       1.0|
|(13,[0,3,4,7,10,1...|     0|[-2.5441013689582...|[0.07282376145972...|       1.0|
|[35.0,0.0,0.0,138...|     1|[-2.9248284012044...|[0.05093976554131...|       1.0|
|[38.0,1.0,2.0,138...|     1|[0.05986394693776...|[0.51496151887815...|       0.0|
|[39.0,1.0,2.0,140...|     1|[-2.1607643397096...|[0.10332961191995...|       1.0|
|[41.0,1.0,1.0,135...|     1|[-1.6538765505959...|[0.16058571074126...|       1.0|
|[41

In [87]:
# Evaluate our results using a Binary Evaluator provided by PySpark
evaluator = BinaryClassificationEvaluator(labelCol="target",rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions.predictions)

In [88]:
print(f"Out model has an accuracy of {accuracy}")

Out model has an accuracy of 0.8450980392156862
