## Heart Attack Prediction

In [1]:
from pyspark.sql import SparkSession 

In [2]:
spark =  SparkSession.builder.appName('Heart_attack').getOrCreate()

In [7]:
heart = spark.read.csv(r'C:\Users\Murillo\Desktop\Python\Portfolio\Heart_Attack\heart.csv', header=True, inferSchema=True)
heart.show(5)

+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
|age|sex| cp|trtbps|chol|fbs|restecg|thalachh|exng|oldpeak|slp|caa|thall|output|
+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
| 63|  1|  3|   145| 233|  1|      0|     150|   0|    2.3|  0|  0|    1|     1|
| 37|  1|  2|   130| 250|  0|      1|     187|   0|    3.5|  0|  0|    2|     1|
| 41|  0|  1|   130| 204|  0|      0|     172|   0|    1.4|  2|  0|    2|     1|
| 56|  1|  1|   120| 236|  0|      1|     178|   0|    0.8|  2|  0|    2|     1|
| 57|  0|  0|   120| 354|  0|      1|     163|   1|    0.6|  2|  0|    2|     1|
+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
only showing top 5 rows



In [None]:
heart.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trtbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalachh: integer (nullable = true)
 |-- exng: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slp: integer (nullable = true)
 |-- caa: integer (nullable = true)
 |-- thall: integer (nullable = true)
 |-- output: integer (nullable = true)



In [None]:
heart.columns

['age',
 'sex',
 'cp',
 'trtbps',
 'chol',
 'fbs',
 'restecg',
 'thalachh',
 'exng',
 'oldpeak',
 'slp',
 'caa',
 'thall',
 'output']

In [None]:
df = heart.toPandas()
df.describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [8]:
heart.drop_duplicates().show(5)

+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
|age|sex| cp|trtbps|chol|fbs|restecg|thalachh|exng|oldpeak|slp|caa|thall|output|
+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
| 69|  0|  3|   140| 239|  0|      1|     151|   0|    1.8|  2|  2|    2|     1|
| 53|  0|  0|   130| 264|  0|      0|     143|   0|    0.4|  1|  0|    2|     1|
| 54|  1|  2|   125| 273|  0|      0|     152|   0|    0.5|  0|  1|    2|     1|
| 51|  1|  0|   140| 298|  0|      1|     122|   1|    4.2|  1|  3|    3|     0|
| 58|  0|  1|   136| 319|  1|      0|     152|   0|    0.0|  2|  2|    2|     0|
+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
only showing top 5 rows



In [19]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

vec = VectorAssembler(inputCols=['sex', 'cp', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall'], outputCol='features_vec')
scaled = StandardScaler(inputCol='features_vec', outputCol='features_scaled')
pipeline = Pipeline(stages=[vec, scaled])
model = pipeline.fit(heart)
df = model.transform(heart)

In [23]:
heart_final = df.select('features_scaled', 'output')
heart_final.show(5)

+--------------------+------+
|     features_scaled|output|
+--------------------+------+
|[2.14587290662820...|     1|
|[2.14587290662820...|     1|
|[0.0,0.9689429657...|     1|
|[2.14587290662820...|     1|
|[0.0,0.0,7.116300...|     1|
+--------------------+------+
only showing top 5 rows



In [102]:
from pyspark.ml.classification import LogisticRegression

train_data, test_data = heart_final.randomSplit([0.8,0.2])
logreg = LogisticRegression(featuresCol='features_scaled', labelCol='output')
heart_train = logreg.fit(train_data)

In [103]:
test_data.groupBy('output').count().show()

+------+-----+
|output|count|
+------+-----+
|     1|   38|
|     0|   32|
+------+-----+



In [104]:
pred_result = heart_train.evaluate(test_data)

In [105]:
pred_result.predictions.show()

+--------------------+------+--------------------+--------------------+----------+
|     features_scaled|output|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(8,[0,1,2,7],[2.1...|     1|[-1.1895172930756...|[0.23334527863659...|       1.0|
|(8,[0,2,5,7],[2.1...|     0|[-0.8439886959521...|[0.30069538412487...|       1.0|
|(8,[0,2,5,7],[2.1...|     1|[-1.6098045308960...|[0.16661575365829...|       1.0|
|(8,[1,2,5],[1.937...|     1|[-4.5976660085410...|[0.00997482445483...|       1.0|
|(8,[1,2,5,7],[0.9...|     1|[-3.4208944552239...|[0.03164880438226...|       1.0|
|(8,[1,2,5,7],[0.9...|     1|[-3.6150570937158...|[0.02620993781563...|       1.0|
|(8,[1,2,5,7],[1.9...|     1|[-4.2957310491018...|[0.01344341816959...|       1.0|
|(8,[2,3,5,7],[6.2...|     0|[-1.0323638610565...|[0.26262607711967...|       1.0|
|(8,[2,4,5,7],[5.3...|     1|[-1.0265441928745...|[0.26375463277847...|       1.0|
|(8,

In [106]:
# Accuracy
pred_result.accuracy

0.9142857142857143

In [107]:
# True Positive Rate
pred_result.weightedTruePositiveRate

0.9142857142857143

In [108]:
# False Positive Rate
pred_result.weightedFalsePositiveRate

0.10178571428571428

In [109]:
# AUC ROC
pred_result.areaUnderROC

0.9629934210526316