In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.0'
spark_version = 'spark-3.5.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
import matplotlib.pyplot as plt

# Import packages
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

url = "https://ale-work-123.s3.us-west-1.amazonaws.com/heart_statlog_cleveland_hungary_final_clean.csv"
spark.sparkContext.addFile(url)

schema = StructType(
    [
        StructField("age", IntegerType(), False),
        StructField("sex", IntegerType(), False),
        StructField("chest_pain_type", IntegerType(), False),
        StructField("resting_bp_s", IntegerType(), False),
        StructField("cholesterol", IntegerType(), False),
        StructField("fasting_blood_sugar", IntegerType(), False),
        StructField("resting_ecg", IntegerType(), False),
        StructField("max_heart_rate", IntegerType(), False),
        StructField("exercise_angina", IntegerType(), False),
        StructField("oldpeak", FloatType(), False),
        StructField("ST_slope", IntegerType(), False),
        StructField("target", IntegerType(), False),
    ]
)

df = spark.read.csv(SparkFiles.get("heart_statlog_cleveland_hungary_final_clean.csv"), sep=",", header=True, schema=schema)

df.show()

+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+------+
|age|sex|chest_pain_type|resting_bp_s|cholesterol|fasting_blood_sugar|resting_ecg|max_heart_rate|exercise_angina|oldpeak|ST_slope|target|
+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+------+
| 40|  1|              2|         140|        289|                  0|          0|           172|              0|    0.0|       1|     0|
| 49|  0|              3|         160|        180|                  0|          0|           156|              0|    1.0|       2|     1|
| 37|  1|              2|         130|        283|                  0|          1|            98|              0|    0.0|       1|     0|
| 48|  0|              4|         138|        214|                  0|          0|           108|              1|    1.5|       2|     1|
| 54|  1|              3|         

In [4]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- chest_pain_type: integer (nullable = true)
 |-- resting_bp_s: integer (nullable = true)
 |-- cholesterol: integer (nullable = true)
 |-- fasting_blood_sugar: integer (nullable = true)
 |-- resting_ecg: integer (nullable = true)
 |-- max_heart_rate: integer (nullable = true)
 |-- exercise_angina: integer (nullable = true)
 |-- oldpeak: float (nullable = true)
 |-- ST_slope: integer (nullable = true)
 |-- target: integer (nullable = true)



In [5]:
from pyspark.ml.feature import VectorAssembler

df = df.withColumnRenamed('target', 'label')

assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol='features')
df = assembler.transform(df)

df.show(5)

+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+-----+--------------------+
|age|sex|chest_pain_type|resting_bp_s|cholesterol|fasting_blood_sugar|resting_ecg|max_heart_rate|exercise_angina|oldpeak|ST_slope|label|            features|
+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+-----+--------------------+
| 40|  1|              2|         140|        289|                  0|          0|           172|              0|    0.0|       1|    0|[40.0,1.0,2.0,140...|
| 49|  0|              3|         160|        180|                  0|          0|           156|              0|    1.0|       2|    1|[49.0,0.0,3.0,160...|
| 37|  1|              2|         130|        283|                  0|          1|            98|              0|    0.0|       1|    0|[37.0,1.0,2.0,130...|
| 48|  0|              4|         138|        214|  

In [6]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [7]:
from pyspark.ml.classification import LogisticRegression
logistic_regression = LogisticRegression(featuresCol="features", labelCol="label")
model = logistic_regression.fit(train_data)

In [8]:
predictions = model.transform(test_data)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy, Precision, and Recall
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedRecall"})

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.8389
Precision: 0.8389
Recall: 0.8389


# Analysis

The model for predicting heart disease based upon the acquired data is about 84% accurate. This isn't too bad, overall, though I don't think this model would be acceptable for general use, as a recall of 84% means that 16% heart disease patients could go undiagnosed.