In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Aula Interativa 2 - ML") \
        .getOrCreate()

spark.version

'3.3.0'

In [2]:
titanic_df = spark.read.csv('stroke_data.csv', header='True', inferSchema='True')

titanic_df.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

gender_indexer = StringIndexer(inputCol='gender', outputCol='GenderIndex')
gender_encoder = OneHotEncoder(inputCol='GenderIndex', outputCol='GenderVector')


In [4]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['age', 'hypertension', 'bmi', 'GenderVector'], outputCol='features')


In [5]:
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(labelCol='stroke', featuresCol='features')

classifier

DecisionTreeClassifier_a6c1ac34f248

In [6]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[gender_indexer, gender_encoder, assembler, classifier])

In [7]:
train_data, test_data = titanic_df.randomSplit([0.7, 0.3])

In [25]:
# predictSurvivedModel = pipeline.fit(train_data)

In [12]:
mean_age = titanic_df.agg({'Age': 'mean'}).collect()[0][0]
mean_age

29.69911764705882

In [13]:
titanic_df = titanic_df.fillna(mean_age, subset=['Age'])

In [9]:
train_data, test_data = titanic_df.randomSplit([0.7, 0.3])
predictSurvivedModel = pipeline.fit(train_data)

titanic_df.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|Female|39530|
| Other|   11|
|  Male|27594|
+------+-----+



In [14]:
predictions = predictSurvivedModel.transform(test_data)
predictions.select('features').show(5)



+--------------------+
|            features|
+--------------------+
|[36.0,0.0,24.7,1....|
|[62.0,0.0,31.2,1....|
|[41.0,0.0,33.8,1....|
|[70.0,0.0,24.4,1....|
|[41.0,0.0,34.6,0....|
+--------------------+
only showing top 5 rows



In [11]:
predictions.show()

+---+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+-----------+-------------+--------------------+----------------+--------------------+----------+
|  0|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|GenderIndex| GenderVector|            features|   rawPrediction|         probability|prediction|
+---+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+-----------+-------------+--------------------+----------------+--------------------+----------+
|  3|Female|36.0|           0|            0|         Yes|     Govt_job|         Urban|            72.63|24.7|         smokes|     0|        0.0|(2,[0],[1.0])|[36.0,0.0,24.7,1....| [5294.0,5647.0]|[0.48386801937665...|       1.0|
|  4|Female|62.0|           0|            0|         Yes|Self-employed|         Rura

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='stroke', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)

accuracy



0.6935131918270184

In [13]:
predictSurvivedModel.stages[-1]


DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a6c1ac34f248, depth=5, numNodes=15, numClasses=2, numFeatures=5

In [27]:
decisionTreeModel = predictSurvivedModel.stages[-1]
decisionTreeModel.depth

5

In [22]:
decisionTreeModel.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_41c1d1b89e13, depth=5, numNodes=25, numClasses=2, numFeatures=4\n  If (feature 3 in {1.0})\n   If (feature 0 <= 14.5)\n    If (feature 1 <= 2.5)\n     Predict: 1.0\n    Else (feature 1 > 2.5)\n     If (feature 2 <= 20.7875)\n      Predict: 1.0\n     Else (feature 2 > 20.7875)\n      Predict: 0.0\n   Else (feature 0 > 14.5)\n    Predict: 0.0\n  Else (feature 3 not in {1.0})\n   If (feature 1 <= 2.5)\n    If (feature 0 <= 2.5)\n     If (feature 1 <= 1.5)\n      Predict: 0.0\n     Else (feature 1 > 1.5)\n      Predict: 1.0\n    Else (feature 0 > 2.5)\n     Predict: 1.0\n   Else (feature 1 > 2.5)\n    If (feature 2 <= 25.527099999999997)\n     If (feature 0 <= 33.5)\n      Predict: 1.0\n     Else (feature 0 > 33.5)\n      If (feature 0 <= 48.5)\n       Predict: 0.0\n      Else (feature 0 > 48.5)\n       Predict: 1.0\n    Else (feature 2 > 25.527099999999997)\n     If (feature 0 <= 7.5)\n      If (feature 2 <= 31.331249999999997)\

In [23]:
decisionTreeModel.featureImportances

SparseVector(4, {0: 0.0989, 1: 0.1736, 2: 0.0803, 3: 0.6473})

In [24]:
list(zip(assembler.getInputCols(), decisionTreeModel.featureImportances))

[('Age', 0.09885218781275795),
 ('Pclass', 0.17357724994481527),
 ('Fare', 0.08027004487359246),
 ('SexVector', 0.6473005173688343)]