# Machine Learning con la librería MLlib  de Spark

En el presente script construiremos un algoritmo basado en `RandomForest` para clasificación binaria

Todos los procesos serán coleccionados en un Pipeline explícito 

## Load the libraries

In [1]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import * #<-- importa todos los tipos de dato como: StringType, FloatType DoubleType, DateType, etc.
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, MinMaxScaler

## Initialize Spark Session

In [2]:
spark = SparkSession.builder.appName('deep_learning').getOrCreate()

In [3]:
spark

## Carga y limpieza de datos

In [4]:
data = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("./data/Titanic/titanic-train.csv"))

In [5]:
data.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

### Seleccionamos variables para trabajar:

In [6]:
# Columnas que no usaremos:
unused_cols = ['PassengerId','Name','SibSp','Parch','Ticket','Cabin']

In [7]:
# Columnas que usaremos:
selected_cols = [ c for c in data.columns if c not in unused_cols ]
selected_cols

['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

In [8]:
data = data.select( selected_cols )
data.show(5)

+--------+------+------+---+-------+--------+
|Survived|Pclass|   Sex|Age|   Fare|Embarked|
+--------+------+------+---+-------+--------+
|       0|     3|  male| 22|   7.25|       S|
|       1|     1|female| 38|71.2833|       C|
|       1|     3|female| 26|  7.925|       S|
|       1|     1|female| 35|   53.1|       S|
|       0|     3|  male| 35|   8.05|       S|
+--------+------+------+---+-------+--------+
only showing top 5 rows



In [9]:
data.printSchema()

root
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Ajustamos tipos de datos:

In [10]:
# Columnas categóricas:
categ_cols = ['Sex','Embarked']

In [11]:
# A las columnas categóricas les asignamos el tipo "String"
# A las columnas numéricas les asignamos el tipo "Double"

for col in data.columns:    
    if col in categ_cols:
        # Asignamos el tipo "String"
        data = data.withColumn( col , data[col].cast( StringType() ) )
    else:
        # Asignamos el tipo "Double"
        data = data.withColumn( col , data[col].cast( DoubleType() ) )

In [12]:
data.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- Pclass: double (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



### Identificamos columnas numéricas y categóricas

In [13]:
data_types = data.dtypes
numerical_columns = [ item[0] for item in data.dtypes if item[1] != 'string' ]
categoric_columns = [ item[0] for item in data.dtypes if item[1].startswith('string') ]

In [14]:
print('Columnas numéricas: \n', numerical_columns)
print('\nColumnas categóric: \n', categoric_columns)

Columnas numéricas: 
 ['Survived', 'Pclass', 'Age', 'Fare']

Columnas categóric: 
 ['Sex', 'Embarked']


### Trabajamos con campos nulos:

In [15]:
from pyspark.sql.functions import isnull, when, count, col

# Mostramos el número de campos vacíos en cada columna:
df_nulls = data.select([count(when(isnull(c), True)).alias(c) for c in data.columns]).toPandas()
df_nulls

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,0,0,177,0,2


In [16]:
# Extraemos la edad promedio:
mu_age = data.agg( {"Age": "avg"} ).collect()[0][0]
mu_age = round(mu_age)
mu_age

30

In [17]:
# Rellenamos las edades faltantes con el promedio:
data = data.fillna( mu_age, ["Age"])

In [18]:
# Mostramos el número de campos vacíos en cada columna:
df_nulls = data.select([count(when(isnull(c), True)).alias(c) for c in data.columns]).toPandas()
df_nulls

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,0,0,0,0,2


In [19]:
# Eliminamos el resto de filas con algún campo vacío:

#data.filter( data['Embarked'].isNull() ).show()

data = data.replace('null', None)\
    .dropna(how='any')

In [20]:
# Mostramos el número de campos vacíos en cada columna:
df_nulls = data.select([count(when(isnull(c), True)).alias(c) for c in data.columns]).toPandas()
df_nulls

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,0,0,0,0,0


for c in df_nulls.columns:
    # Extraemos el valor de la celda:
    n_nulls = df_nulls[c][0]
    if ( n_nulls > 0):
        print('Error!!! La columna',c,'tiene',n_nulls,'campos nulos' )
        stop()
    else:
        print('No se encontraron campos nulos en la columna:',c)

### Obtenemos algunos gráficos

In [21]:
#from pandas_profiling import ProfileReport

#df_pandas = data.toPandas()

#pfr = ProfileReport(df_pandas)
#pfr.to_notebook_iframe()
#pfr

## Extraemos las clases de datos que hay en cada columna

In [22]:
# Extraemos el número de clases de datos en cada columna

from pyspark.sql.functions import countDistinct, col

data.select( [ countDistinct( col(c) ).alias(c) for c in data.columns ] ).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,2,3,2,88,247,3


## Identificamos la columna de labels y las columnas de features:

In [23]:
label_col = "Survived"#Pclass" # <-- nombre de la columna que usaremos como labels

# Obtenemos las clases de valores de la columna de labels:
classes = data.select([ label_col ]).distinct().toPandas()
classes

Unnamed: 0,Survived
0,0.0
1,1.0


In [24]:
# Numero de clases de salida:
n_class_out = len( classes )
n_class_out

2

In [25]:
# Columnas que formaran los features:
feature_cols = [col for col in selected_cols if col != label_col]
feature_cols

['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

### Split the dataset into Train, Validation and Test

In [26]:
train, test  = data.randomSplit([0.8, 0.2])

In [27]:
train.show(5)

+--------+------+------+----+-------+--------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|
+--------+------+------+----+-------+--------+
|     0.0|   1.0|female| 2.0| 151.55|       S|
|     0.0|   1.0|female|25.0| 151.55|       S|
|     0.0|   1.0|  male|18.0|  108.9|       C|
|     0.0|   1.0|  male|19.0|  263.0|       S|
|     0.0|   1.0|  male|21.0|77.2875|       S|
+--------+------+------+----+-------+--------+
only showing top 5 rows



# Inicia creación de Pipeline

Antes de llegar a este paso debemos tener el dataset limpio y listo para trabajar.

Estaremos usando la función `StringIndexer` que asigna un valor entero a cada categoría de datos, iniciando forzosamente dede 0.

0 se asigna a la categoría más frecuente, 1 a la siguiente categoría más frecuente y así sucesivamente.


### Stage 1: Creación de columna de labels codificados

In [28]:
Stage_1 = StringIndexer(inputCol = label_col, outputCol = "labels")
Stage_1

StringIndexer_2d1ff8390d95

In [29]:
# Ver la salida del stage 
Scaler_1 = Stage_1.fit(train)
data_transform = Scaler_1.transform(train)
data_transform.show(5)

+--------+------+------+----+-------+--------+------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|labels|
+--------+------+------+----+-------+--------+------+
|     0.0|   1.0|female| 2.0| 151.55|       S|   0.0|
|     0.0|   1.0|female|25.0| 151.55|       S|   0.0|
|     0.0|   1.0|  male|18.0|  108.9|       C|   0.0|
|     0.0|   1.0|  male|19.0|  263.0|       S|   0.0|
|     0.0|   1.0|  male|21.0|77.2875|       S|   0.0|
+--------+------+------+----+-------+--------+------+
only showing top 5 rows



### Stage 2: Transformación de features categóricos a numéricos

In [30]:
# Extraemos nombres de columnas categóricas que formaran los features (ie. no incluyen el label):
categoric_cols_features = [ c for c in categoric_columns if c != label_col ]
categoric_cols_features

['Sex', 'Embarked']

In [31]:
# Creamos nombres de nuevas columnas numéricas que formaran los features:
categoric_cols_features_num = [ c+'_num' for c in categoric_cols_features]
categoric_cols_features_num

['Sex_num', 'Embarked_num']

In [32]:
# Agregamos nuevas columnas numéricas para los features:
Stage_2 = []
for i in range( len(categoric_cols_features) ):    
    st_i = StringIndexer(inputCol = categoric_cols_features[i] , outputCol = categoric_cols_features_num[i] , handleInvalid='keep')
    Stage_2.append(st_i)

Stage_2

[StringIndexer_fe775d9c46fd, StringIndexer_3470a20d7f2f]

In [33]:
# Ver la salida del stage 
for st in Stage_2:
    Scaler_i = st.fit(data_transform)
    data_transform = Scaler_i.transform(data_transform)
    
data_transform.show(5)

+--------+------+------+----+-------+--------+------+-------+------------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|labels|Sex_num|Embarked_num|
+--------+------+------+----+-------+--------+------+-------+------------+
|     0.0|   1.0|female| 2.0| 151.55|       S|   0.0|    1.0|         0.0|
|     0.0|   1.0|female|25.0| 151.55|       S|   0.0|    1.0|         0.0|
|     0.0|   1.0|  male|18.0|  108.9|       C|   0.0|    0.0|         1.0|
|     0.0|   1.0|  male|19.0|  263.0|       S|   0.0|    0.0|         0.0|
|     0.0|   1.0|  male|21.0|77.2875|       S|   0.0|    0.0|         0.0|
+--------+------+------+----+-------+--------+------+-------+------------+
only showing top 5 rows



### Stage 3: Creación de columna de vectores de features

In [34]:
# Extraemos columnas numéricas para los features:
numeric_cols_features = [col for col in numerical_columns if col != label_col]
numeric_cols_features

['Pclass', 'Age', 'Fare']

In [35]:
# Coleccionamos todas las columnas numericas que formaran a los features:
required_features = numeric_cols_features + categoric_cols_features_num
required_features

['Pclass', 'Age', 'Fare', 'Sex_num', 'Embarked_num']

In [36]:
Stage_3 = VectorAssembler(inputCols=required_features, outputCol='features')
Stage_3

VectorAssembler_2d9be194c06f

In [37]:
# Ver la salida del stage 
data_transform = Stage_3.transform(data_transform)
data_transform.show(5)

+--------+------+------+----+-------+--------+------+-------+------------+--------------------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|labels|Sex_num|Embarked_num|            features|
+--------+------+------+----+-------+--------+------+-------+------------+--------------------+
|     0.0|   1.0|female| 2.0| 151.55|       S|   0.0|    1.0|         0.0|[1.0,2.0,151.55,1...|
|     0.0|   1.0|female|25.0| 151.55|       S|   0.0|    1.0|         0.0|[1.0,25.0,151.55,...|
|     0.0|   1.0|  male|18.0|  108.9|       C|   0.0|    0.0|         1.0|[1.0,18.0,108.9,0...|
|     0.0|   1.0|  male|19.0|  263.0|       S|   0.0|    0.0|         0.0|[1.0,19.0,263.0,0...|
|     0.0|   1.0|  male|21.0|77.2875|       S|   0.0|    0.0|         0.0|[1.0,21.0,77.2875...|
+--------+------+------+----+-------+--------+------+-------+------------+--------------------+
only showing top 5 rows



### Stage 4: Creación de columna de vectores de features reescalados (entre 0 y 1)

In [38]:
### Reescalamos los features para tomen valore entre 0 y 1:
Stage_4 = MinMaxScaler(min=0.0, max=1.0, inputCol="features", outputCol="scaled_features")
Stage_4

MinMaxScaler_081ee089d2b9

In [39]:
# Ver la salida del stage:
Scaler_4 = Stage_4.fit(data_transform)
data_transform = Scaler_4.transform(data_transform)
data_transform.show(5)
data_transform.select(['features','scaled_features']).show(5,truncate=False)


+--------+------+------+----+-------+--------+------+-------+------------+--------------------+--------------------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|labels|Sex_num|Embarked_num|            features|     scaled_features|
+--------+------+------+----+-------+--------+------+-------+------------+--------------------+--------------------+
|     0.0|   1.0|female| 2.0| 151.55|       S|   0.0|    1.0|         0.0|[1.0,2.0,151.55,1...|[0.0,0.0198542347...|
|     0.0|   1.0|female|25.0| 151.55|       S|   0.0|    1.0|         0.0|[1.0,25.0,151.55,...|[0.0,0.3088715757...|
|     0.0|   1.0|  male|18.0|  108.9|       C|   0.0|    0.0|         1.0|[1.0,18.0,108.9,0...|[0.0,0.2209097763...|
|     0.0|   1.0|  male|19.0|  263.0|       S|   0.0|    0.0|         0.0|[1.0,19.0,263.0,0...|(5,[1,2],[0.23347...|
|     0.0|   1.0|  male|21.0|77.2875|       S|   0.0|    0.0|         0.0|[1.0,21.0,77.2875...|(5,[1,2],[0.25860...|
+--------+------+------+----+-------+--------+------+-------+---

In [40]:
data_transform.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- Pclass: double (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = false)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- labels: double (nullable = false)
 |-- Sex_num: double (nullable = false)
 |-- Embarked_num: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- scaled_features: vector (nullable = true)



### Stage 5: Construcción del modelo

In [41]:
# Declaramos el modelo:
Stage_5 = RandomForestClassifier(
    labelCol= label_col,
    featuresCol='scaled_features',
    maxDepth=6
    )

## Ensamble del Pipeline

In [42]:
# Coleccionamos todos los Satges en una lista:
Stages_list = [Stage_1] + Stage_2 + [ Stage_3, Stage_4, Stage_5]
Stages_list

[StringIndexer_2d1ff8390d95,
 StringIndexer_fe775d9c46fd,
 StringIndexer_3470a20d7f2f,
 VectorAssembler_2d9be194c06f,
 MinMaxScaler_081ee089d2b9,
 RandomForestClassifier_7cd2db5b36fd]

In [43]:
pipeline = Pipeline(stages = Stages_list )
pipeline

Pipeline_04f86f4000d0

## Ejecución del Pipeline

In [44]:
# Ejecutamos el pipeline y se entrena el modelo con los datos de entrenamiento:

model = pipeline.fit(train)

model # <-- Contiene el modelo entrenado

PipelineModel_e70f3e0108cf

## Predecimos datos de entrenamiento y pruebas:

In [45]:
cols_to_show = ['labels', 'scaled_features', 'rawPrediction','probability','prediction']

In [46]:
# Predecimos datos de entrenamiento:
train_prediction = model.transform(train)
train_prediction.select(cols_to_show).show(5)

+------+--------------------+--------------------+--------------------+----------+
|labels|     scaled_features|       rawPrediction|         probability|prediction|
+------+--------------------+--------------------+--------------------+----------+
|   0.0|[0.0,0.0198542347...|[11.7948841698841...|[0.58974420849420...|       0.0|
|   0.0|[0.0,0.3088715757...|[8.29735954245300...|[0.41486797712265...|       1.0|
|   0.0|[0.0,0.2209097763...|[11.9040636918369...|[0.59520318459184...|       0.0|
|   0.0|(5,[1,2],[0.23347...|[16.1695040283520...|[0.80847520141760...|       0.0|
|   0.0|(5,[1,2],[0.25860...|[14.1218849807329...|[0.70609424903664...|       0.0|
+------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [47]:
# Predecimos datos de pruebas:
test_prediction = model.transform(test)
test_prediction.select(cols_to_show).show(5)

+------+--------------------+--------------------+--------------------+----------+
|labels|     scaled_features|       rawPrediction|         probability|prediction|
+------+--------------------+--------------------+--------------------+----------+
|   0.0|[0.0,0.6230208595...|[0.83350077738753...|[0.04167503886937...|       1.0|
|   0.0|(5,[1,2],[0.23347...|[13.2196645369580...|[0.66098322684790...|       0.0|
|   0.0|[0.0,0.2963056044...|[14.8217752905830...|[0.74108876452915...|       0.0|
|   0.0|[0.0,0.2963056044...|[15.8217752905830...|[0.79108876452915...|       0.0|
|   0.0|[0.0,0.3717014325...|[12.1670512674690...|[0.60835256337345...|       0.0|
+------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Evaluate the Predictions

In [48]:
# Seleccionamos las columnas 'label' y 'prediction' de los DataFrames predichos:
train_Labels_Prediction = train_prediction.select('labels','prediction')
test_Labels_Prediction = test_prediction.select('labels','prediction')

In [49]:
# Elegimos las siguientes métricas:
metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']

# Obtenemos los valores de cada métrica, aplicadas a los datos de entrenamiento, validación y pruebas:
for metric in metrics:
    # Declaramos la métrica actual:
    evaluator = MulticlassClassificationEvaluator(labelCol='labels', predictionCol='prediction', metricName=metric)
    
    f_train = evaluator.evaluate( train_Labels_Prediction )
    f_test = evaluator.evaluate( test_Labels_Prediction )

    print('\nValores de metrica << ' + metric ,'>>' )
    print('Train = ' , f_train)
    print('Test = ' , f_test)


Valores de metrica << weightedPrecision >>
Train =  0.879854381826682
Test =  0.7893004801280341

Valores de metrica << weightedRecall >>
Train =  0.8774104683195593
Test =  0.7852760736196318

Valores de metrica << accuracy >>
Train =  0.8774104683195593
Test =  0.7852760736196319


### Guardamos el modelo entrenado:

In [50]:
import os
from os import system

path = './Models_trained/pipeline_model_Titanic_rf'

# En caso de que exista la ruta la borramos para volver a crearla:
if os.path.exists( path ):
    # Comando a ejecutar:
    comando = 'rm -r ' + path
    # Ejecutamos el comando:
    system( comando )
    print('Se ejecutó: ',comando)


# Guardamos el modelo entrenado:
model.save( path )

Se ejecutó:  rm -r ./Models_trained/pipeline_model_Titanic_rf


### Cargamos el modelo entrenado y hacemos una predicción:

In [51]:
# Cargamos el modelo:
#from pyspark.ml.classification import MultilayerPerceptronClassificationModel
from pyspark.ml import PipelineModel

modelo_NN = PipelineModel.load( path )

In [52]:
# Predecimos datos de pruebas:
predictions_test = modelo_NN.transform(test)
predictions_test.show(5)

+--------+------+------+----+--------+--------+------+-------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|    Fare|Embarked|labels|Sex_num|Embarked_num|            features|     scaled_features|       rawPrediction|         probability|prediction|
+--------+------+------+----+--------+--------+------+-------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|     0.0|   1.0|female|50.0| 28.7125|       C|   0.0|    1.0|         1.0|[1.0,50.0,28.7125...|[0.0,0.6230208595...|[0.83350077738753...|[0.04167503886937...|       1.0|
|     0.0|   1.0|  male|19.0|    53.1|       S|   0.0|    0.0|         0.0|[1.0,19.0,53.1,0....|(5,[1,2],[0.23347...|[13.2196645369580...|[0.66098322684790...|       0.0|
|     0.0|   1.0|  male|24.0|    79.2|       C|   0.0|    0.0|         1.0|[1.0,24.0,79.2,0....|[0.0,0.2963056044...|[14.8217752905830...|[0.7410

#### Calculamos la precisión manualmente:

In [53]:
wrong_test = predictions_test.filter( predictions_test['labels']!=predictions_test['prediction'] ).count()
right_test = predictions_test.filter( predictions_test['labels']==predictions_test['prediction'] ).count()

acc_test =  right_test/(right_test + wrong_test)

print( 'La precisión en las predicciones de los datos de prueba es del',round(acc_test * 100),'%' )

La precisión en las predicciones de los datos de prueba es del 79 %


### Consulta de predicciones:

#### Creamos mapeo de labels:

Crearemos el DataFrame de pandas `labels_map` que contiene la relación entre los labels del dataset original y los labels reescalados que se usaron para entrenar el modelo.

In [54]:
Scaler_labels = Stage_1.fit(train)
data_transform = Scaler_labels.transform(train)
data_transform.show(5)

+--------+------+------+----+-------+--------+------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|labels|
+--------+------+------+----+-------+--------+------+
|     0.0|   1.0|female| 2.0| 151.55|       S|   0.0|
|     0.0|   1.0|female|25.0| 151.55|       S|   0.0|
|     0.0|   1.0|  male|18.0|  108.9|       C|   0.0|
|     0.0|   1.0|  male|19.0|  263.0|       S|   0.0|
|     0.0|   1.0|  male|21.0|77.2875|       S|   0.0|
+--------+------+------+----+-------+--------+------+
only showing top 5 rows



In [55]:
# Obtenemos el mapeo entre la columna de labels reales y labels reescalados:
labels_map = data_transform.select( [label_col , 'labels'] ).groupBy( [label_col,'labels'] ).count().toPandas()
labels_map = labels_map.sort_values('labels')

# Mapeo de labels:
labels_map[['labels', label_col]]

Unnamed: 0,labels,Survived
1,0.0,0.0
0,1.0,1.0


In [56]:
# Llamamos a las componentes de features que necesitamos
feature_cols

['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

In [57]:
# Ingresamos features manualmente:
datos = [ 
    (3.0 , 'male',   25 , 25.2 , 'C'), #<-- Registro 1
    (1.0 , 'female', 75 , 135.65, 'S')  #<-- Registro 2
    ] 

sample_df = spark.createDataFrame(datos, feature_cols)

sample_df.show()

+------+------+---+------+--------+
|Pclass|   Sex|Age|  Fare|Embarked|
+------+------+---+------+--------+
|   3.0|  male| 25|  25.2|       C|
|   1.0|female| 75|135.65|       S|
+------+------+---+------+--------+



In [58]:
sample_data_test = modelo_NN.transform(sample_df)
sample_data_test.show()

+------+------+---+------+--------+-------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|Pclass|   Sex|Age|  Fare|Embarked|Sex_num|Embarked_num|            features|     scaled_features|       rawPrediction|         probability|prediction|
+------+------+---+------+--------+-------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|   3.0|  male| 25|  25.2|       C|    0.0|         1.0|[3.0,25.0,25.2,0....|[1.0,0.3088715757...|[13.5790954435640...|[0.67895477217820...|       0.0|
|   1.0|female| 75|135.65|       S|    1.0|         0.0|[1.0,75.0,135.65,...|[0.0,0.9371701432...|[0.36428238316914...|[0.01821411915845...|       1.0|
+------+------+---+------+--------+-------+------------+--------------------+--------------------+--------------------+--------------------+----------+



### Cerramos sesión Spark

In [59]:
spark.stop()