# Regresión Logística: Titanic

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('titanic').getOrCreate()

### Recuperar los datos

In [0]:
ruta = 'dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/titanic.csv'

In [0]:
datos = spark.read.csv(ruta,inferSchema=True,header=True)

In [0]:
datos.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:
datos.columns

Out[6]: ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
mis_cols = datos.select(['Survived',
                         'Pclass',
                         'Sex',
                         'Age',
                         'SibSp',
                         'Parch',
                         'Fare',
                         'Embarked'])

In [0]:
mis_datos_final = mis_cols.na.drop()

In [0]:
print(datos.count(),len(datos.columns))

891 12


In [0]:
print(mis_datos_final.count(),len(mis_datos_final.columns))

712 8


### Transformar las columnas categoricas

Importar clases para las transformaciones

In [0]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                               OneHotEncoder,StringIndexer)

In [0]:
mis_datos_final.columns

Out[15]: ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

Para genero (Sexo)

In [0]:
indexador_genero = StringIndexer(inputCol='Sex',outputCol='SexIndice')

In [0]:
codificador_genero = OneHotEncoder(inputCol='SexIndice',outputCol='SexVector')

Para embarque (Embarked)

In [0]:
indexador_embarque = StringIndexer(inputCol='Embarked',outputCol='EmbarkedIndice')
codificador_embarque = OneHotEncoder(inputCol='EmbarkedIndice',outputCol='EmbarkedVector')

Vector Ensamblador

In [0]:
mis_datos_final.columns

Out[20]: ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [0]:
ensamblador = VectorAssembler(inputCols=['Pclass',
                                         'SexVector',
                                         'Age',
                                         'SibSp',
                                         'Parch',
                                         'Fare',
                                         'EmbarkedVector'],
                              outputCol='features')

### Pipelines (Canales, Tuberías)

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
reg_log_titanic = LogisticRegression(featuresCol='features',
                                    labelCol='Survived')

In [0]:
pipeline = Pipeline(stages=[indexador_genero,indexador_embarque,
                           codificador_genero,codificador_embarque,
                           ensamblador,reg_log_titanic])

### División en conjunto de entrenamiento y prueba

In [0]:
titanic_train, titanic_test = mis_datos_final.randomSplit([0.7,0.3])

### Entrenar el modelo

In [0]:
modelo_entrenado = pipeline.fit(titanic_train)

### Evaluar el modelo

In [0]:
resultados = modelo_entrenado.transform(titanic_test)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
mi_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [0]:
resultados.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [0]:
AUC = mi_eval.evaluate(resultados)

In [0]:
AUC

Out[33]: 0.8212305986696231

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
evaluador02 = MulticlassClassificationEvaluator(predictionCol='prediction',
                                               labelCol='Survived',
                                               metricName='accuracy')

In [0]:
acc = evaluador02.evaluate(resultados)

In [0]:
acc

Out[40]: 0.8364485981308412