In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The builder chain is
# wrapped in parentheses so no backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName("Aula Interativa 2 - ML")
    .getOrCreate()
)

# Display the Spark version the session is running on.
spark.version

'3.3.0'

In [5]:
# Load the Titanic passenger data. The first row holds the column names and
# column types are inferred from the data. Real booleans are passed for the
# reader options instead of the strings 'True'.
titanic_df = spark.read.csv('titanic.csv', header=True, inferSchema=True)

# Show the inferred column names and types.
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Two-step encoding of the categorical 'Sex' column:
#   1. StringIndexer maps each label to a numeric index ('SexIndex').
#   2. OneHotEncoder turns that index into a sparse vector ('SexVector').
sex_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
sex_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVector',
)


In [7]:
from pyspark.ml.feature import VectorAssembler

# Columns combined, in this order, into the single 'features' vector that the
# classifier consumes.
feature_columns = ['Age', 'Pclass', 'Fare', 'SexVector']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')


In [8]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree that predicts 'Survived' from the assembled 'features' vector.
classifier = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Survived',
)

# Echo the estimator so its uid is visible in the cell output.
classifier

DecisionTreeClassifier_41c1d1b89e13

In [9]:
from pyspark.ml import Pipeline

# Stage order matters: index 'Sex', one-hot encode it, assemble the feature
# vector, then train the classifier on that vector.
pipeline_stages = [sex_indexer, sex_encoder, assembler, classifier]
pipeline = Pipeline(stages=pipeline_stages)

In [10]:
# 70/30 train/test split. A fixed seed keeps the split, and everything derived
# from it, reproducible across kernel restarts.
# NOTE(review): 'Age' is only mean-imputed in a later cell, so this split is
# taken on data that may still contain null ages; confirm whether this early
# split is still needed.
train_data, test_data = titanic_df.randomSplit([0.7, 0.3], seed=42)

In [25]:
# NOTE(review): this fit is left disabled. Presumably it was commented out
# because 'Age' has not been imputed yet at this point in the notebook; the
# pipeline is fitted in a later cell, after the missing ages are filled.
# Confirm, and drop this cell if it is no longer needed.
# predictSurvivedModel = pipeline.fit(train_data)

In [12]:
# Average passenger age, used to fill missing 'Age' values.
# The aggregation yields a single row with a single column holding the mean.
# Note that it is computed over the whole dataset, not just the training split.
age_summary_rows = titanic_df.agg({'Age': 'mean'}).collect()
mean_age = age_summary_rows[0][0]
mean_age

29.69911764705882

In [13]:
# Replace missing ages with the dataset-wide mean; only 'Age' is touched.
titanic_df = titanic_df.fillna(value=mean_age, subset=['Age'])

In [14]:
# Split again now that 'Age' has been imputed. The fixed seed makes the split,
# and therefore the fitted model, the predictions and the accuracy reported
# below, reproducible on a fresh kernel.
train_data, test_data = titanic_df.randomSplit([0.7, 0.3], seed=42)

# Fit every pipeline stage (indexer, encoder, assembler, tree) on the training split.
predictSurvivedModel = pipeline.fit(train_data)

# Class balance of the categorical feature, for reference.
titanic_df.groupBy('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [26]:
# Score the held-out rows with the fitted pipeline.
predictions = predictSurvivedModel.transform(test_data)

# Preview the encoding of 'Sex' next to the model output for a few rows.
preview_columns = ['passengerId', 'sex', 'sexIndex', 'sexVector', 'rawPrediction', 'prediction']
predictions.select(*preview_columns).show(5)



+-----------+------+--------+-------------+-------------+----------+
|passengerId|   sex|sexIndex|    sexVector|rawPrediction|prediction|
+-----------+------+--------+-------------+-------------+----------+
|          4|female|     1.0|    (1,[],[])|  [4.0,115.0]|       1.0|
|          6|  male|     0.0|(1,[0],[1.0])| [304.0,58.0]|       0.0|
|          7|  male|     0.0|(1,[0],[1.0])| [304.0,58.0]|       0.0|
|         10|female|     1.0|    (1,[],[])|  [4.0,115.0]|       1.0|
|         17|  male|     0.0|(1,[0],[1.0])|   [10.0,1.0]|       0.0|
+-----------+------+--------+-------------+-------------+----------+
only showing top 5 rows



In [30]:
# Show the first 20 scored rows with long cell values truncated (the defaults,
# spelled out explicitly).
predictions.show(n=20, truncate=True)

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------+-------+-----+--------+--------+-------------+--------------------+-------------+--------------------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|SexIndex|    SexVector|            features|rawPrediction|         probability|prediction|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------+-------+-----+--------+--------+-------------+--------------------+-------------+--------------------+----------+
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|    113803|   53.1| C123|       S|     1.0|    (1,[],[])| [35.0,1.0,53.1,0.0]|  [4.0,115.0]|[0.03361344537815...|       1.0|
|          6|       0|     3|    Moran, Mr. James|  male|29.69911764705882|    0|    0|    330877| 8.4583| null|       Q|     0.0|(1,[0],[1.0])|[29.6991

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy: fraction of test rows whose 'prediction' equals 'Survived'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Survived',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(predictions)

accuracy



0.8057553956834532

In [28]:
# The classifier is the final pipeline stage, so the last fitted stage is the
# trained decision tree model.
fitted_stages = predictSurvivedModel.stages
fitted_stages[-1]


DecisionTreeClassificationModel: uid=DecisionTreeClassifier_41c1d1b89e13, depth=5, numNodes=25, numClasses=2, numFeatures=4

In [27]:
# Keep a handle on the fitted tree (the last stage of the fitted pipeline) so
# the following cells can inspect it.
decisionTreeModel = predictSurvivedModel.stages[-1]
# Depth of the learned tree.
decisionTreeModel.depth

5

In [22]:
# Print the learned decision rules. Echoing the bare string shows its repr with
# literal '\n' escapes; print() renders the tree as indented, readable lines.
# 'feature N' refers to position N in the assembled 'features' vector.
print(decisionTreeModel.toDebugString)

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_41c1d1b89e13, depth=5, numNodes=25, numClasses=2, numFeatures=4\n  If (feature 3 in {1.0})\n   If (feature 0 <= 14.5)\n    If (feature 1 <= 2.5)\n     Predict: 1.0\n    Else (feature 1 > 2.5)\n     If (feature 2 <= 20.7875)\n      Predict: 1.0\n     Else (feature 2 > 20.7875)\n      Predict: 0.0\n   Else (feature 0 > 14.5)\n    Predict: 0.0\n  Else (feature 3 not in {1.0})\n   If (feature 1 <= 2.5)\n    If (feature 0 <= 2.5)\n     If (feature 1 <= 1.5)\n      Predict: 0.0\n     Else (feature 1 > 1.5)\n      Predict: 1.0\n    Else (feature 0 > 2.5)\n     Predict: 1.0\n   Else (feature 1 > 2.5)\n    If (feature 2 <= 25.527099999999997)\n     If (feature 0 <= 33.5)\n      Predict: 1.0\n     Else (feature 0 > 33.5)\n      If (feature 0 <= 48.5)\n       Predict: 0.0\n      Else (feature 0 > 48.5)\n       Predict: 1.0\n    Else (feature 2 > 25.527099999999997)\n     If (feature 0 <= 7.5)\n      If (feature 2 <= 31.331249999999997)\

In [23]:
# Importance of each slot of the assembled feature vector, indexed by position
# in that vector.
decisionTreeModel.featureImportances

SparseVector(4, {0: 0.0989, 1: 0.1736, 2: 0.0803, 3: 0.6473})

In [24]:
# Pair each assembler input column with its importance.
# The positional pairing is valid here because every input column contributes
# exactly one slot to the feature vector (4 input columns, 4 features); an
# encoded column that expands to several slots would misalign the pairs.
input_columns = assembler.getInputCols()
importances = decisionTreeModel.featureImportances
list(zip(input_columns, importances))

[('Age', 0.09885218781275795),
 ('Pclass', 0.17357724994481527),
 ('Fare', 0.08027004487359246),
 ('SexVector', 0.6473005173688343)]