In [31]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell runs against.
spark = (
    SparkSession.builder
    .appName("Aula Interativa 2 - ML")
    .getOrCreate()
)

# Echo the running Spark version as the cell output.
spark.version

'3.3.0'

In [32]:
# Load the Titanic passenger data: the first row holds the column names and the
# column types are inferred from the values. Real booleans replace the string
# 'True' for the reader options, which is the documented, idiomatic form.
titanic_df = spark.read.csv('titanic.csv', header=True, inferSchema=True)

# Show the inferred column names and types.
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [33]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Sex is a string column, so it is first mapped to a numeric index (SexIndex)
# and that index is then one-hot encoded into a vector column (SexVector).
sex_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
sex_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVector',
)


In [34]:
from pyspark.ml.feature import VectorAssembler

# Gather the model inputs into the single vector column ('features') that the
# classifier reads. SexVector is produced by the encoder stage of the pipeline.
assembler = VectorAssembler(
    inputCols=['Age', 'Pclass', 'Fare', 'SexVector'],
    outputCol='features',
)


In [35]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree that learns the Survived label from the assembled feature vector.
classifier = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Survived',
)

# Display the (still unfitted) estimator.
classifier

DecisionTreeClassifier_f116875a6bc2

In [36]:
from pyspark.ml import Pipeline

# Stage order matters: each stage consumes a column produced by an earlier one.
pipeline = Pipeline(
    stages=[
        sex_indexer,  # Sex -> SexIndex
        sex_encoder,  # SexIndex -> SexVector
        assembler,    # Age, Pclass, Fare, SexVector -> features
        classifier,   # features -> prediction
    ]
)

In [37]:
# 70/30 train/test split. A fixed seed keeps the partition repeatable across
# kernel restarts; without it every run draws a different random split.
# Note: this split is taken before the missing Age values are filled in, and
# the same two names are reassigned by a second split further down.
train_data, test_data = titanic_df.randomSplit([0.7, 0.3], seed=42)

In [38]:
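# Fit intentionally disabled at this point. NOTE(review): presumably it failed
# because Age still has missing values here (they are filled with the mean in
# the cells below) and VectorAssembler rejects nulls by default - confirm.
# predictSurvivedModel = pipeline.fit(train_data)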

In [39]:
# Average of the Age column, used below to fill in the missing ages.
# The aggregation yields a single row with a single value.
# NOTE(review): computed over the whole dataset, including rows that later end
# up in the test split - confirm this is acceptable for the exercise.
mean_age = titanic_df.agg({'Age': 'mean'}).first()[0]
mean_age

29.69911764705882

In [43]:
# Replace missing Age values with the dataset mean; other columns are untouched.
titanic_df = titanic_df.na.fill(value=mean_age, subset=['Age'])

In [44]:
# Split again now that Age has been imputed, then fit the whole pipeline on the
# training part. The split is seeded so that the fitted tree and the metrics
# derived from it can be reproduced on a fresh kernel; an unseeded split gives a
# different model on every run.
train_data, test_data = titanic_df.randomSplit([0.7, 0.3], seed=42)
predictSurvivedModel = pipeline.fit(train_data)

# Row counts per Sex value over the full dataset.
titanic_df.groupBy('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [54]:
# Score the held-out rows with the fitted pipeline.
predictions = predictSurvivedModel.transform(test_data)

# Preview the inputs next to the model output. Columns are referenced with the
# exact names from the schema and the pipeline stages, so the cell does not
# depend on case-insensitive column resolution.
predictions.select(
    'Age', 'Sex', 'SexIndex', 'SexVector', 'rawPrediction', 'prediction', 'features'
).show(5)



+----+------+--------+-------------+-------------+----------+--------------------+
| age|   sex|sexIndex|    sexVector|rawPrediction|prediction|            features|
+----+------+--------+-------------+-------------+----------+--------------------+
|22.0|  male|     0.0|(1,[0],[1.0])| [255.0,27.0]|       0.0| [22.0,3.0,7.25,1.0]|
|26.0|female|     1.0|    (1,[],[])|  [26.0,21.0]|       0.0|[26.0,3.0,7.925,0.0]|
|35.0|  male|     0.0|(1,[0],[1.0])| [255.0,27.0]|       0.0| [35.0,3.0,8.05,1.0]|
|39.0|  male|     0.0|(1,[0],[1.0])| [255.0,27.0]|       0.0|[39.0,3.0,31.275,...|
|34.0|  male|     0.0|(1,[0],[1.0])| [255.0,27.0]|       0.0| [34.0,2.0,13.0,1.0]|
+----+------+--------+-------------+-------------+----------+--------------------+
only showing top 5 rows



In [50]:
# List the distinct encoded values of the Sex column. The column is referenced
# by the exact name the encoder stage produces ('SexVector'), so the lookup does
# not rely on case-insensitive column resolution.
predictions.select('SexVector').distinct().show()

+-------------+
|    sexVector|
+-------------+
|(1,[0],[1.0])|
|    (1,[],[])|
+-------------+



In [47]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy: compares the 'prediction' column against the 'Survived' label on
# the scored test rows.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Survived',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)

accuracy



0.8235294117647058

In [48]:
# The classifier was the last stage handed to the Pipeline, so the last stage of
# the fitted pipeline model is the fitted decision tree.
predictSurvivedModel.stages[-1]


DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f116875a6bc2, depth=5, numNodes=35, numClasses=2, numFeatures=4

In [49]:
# Keep a handle on the fitted decision tree (last pipeline stage) for inspection.
decisionTreeModel = predictSurvivedModel.stages[-1]
# Depth of the fitted tree.
decisionTreeModel.depth

5

In [22]:
# Print the learned decision rules. toDebugString is a multi-line string; as a
# bare cell expression it is shown as a quoted repr with literal "\n" escapes,
# so print() is used here to render the tree one rule per line.
print(decisionTreeModel.toDebugString)

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_41c1d1b89e13, depth=5, numNodes=25, numClasses=2, numFeatures=4\n  If (feature 3 in {1.0})\n   If (feature 0 <= 14.5)\n    If (feature 1 <= 2.5)\n     Predict: 1.0\n    Else (feature 1 > 2.5)\n     If (feature 2 <= 20.7875)\n      Predict: 1.0\n     Else (feature 2 > 20.7875)\n      Predict: 0.0\n   Else (feature 0 > 14.5)\n    Predict: 0.0\n  Else (feature 3 not in {1.0})\n   If (feature 1 <= 2.5)\n    If (feature 0 <= 2.5)\n     If (feature 1 <= 1.5)\n      Predict: 0.0\n     Else (feature 1 > 1.5)\n      Predict: 1.0\n    Else (feature 0 > 2.5)\n     Predict: 1.0\n   Else (feature 1 > 2.5)\n    If (feature 2 <= 25.527099999999997)\n     If (feature 0 <= 33.5)\n      Predict: 1.0\n     Else (feature 0 > 33.5)\n      If (feature 0 <= 48.5)\n       Predict: 0.0\n      Else (feature 0 > 48.5)\n       Predict: 1.0\n    Else (feature 2 > 25.527099999999997)\n     If (feature 0 <= 7.5)\n      If (feature 2 <= 31.331249999999997)\

In [23]:
# Importance the fitted tree assigns to each slot of the 'features' vector.
# Slot indices follow the order of the assembler's input columns.
decisionTreeModel.featureImportances

SparseVector(4, {0: 0.0989, 1: 0.1736, 2: 0.0803, 3: 0.6473})

In [24]:
# Label each importance with the input column it belongs to.
# The pairing is positional: it lines up only because every assembler input
# contributes exactly one slot here (4 input columns, numFeatures=4). A wider
# one-hot vector would shift the importances that follow it.
[
    (column_name, importance)
    for column_name, importance in zip(
        assembler.getInputCols(),
        decisionTreeModel.featureImportances,
    )
]

[('Age', 0.09885218781275795),
 ('Pclass', 0.17357724994481527),
 ('Fare', 0.08027004487359246),
 ('SexVector', 0.6473005173688343)]