In [123]:
import findspark
findspark.init('/home/ubuntu/Spark/spark-3.3.0-bin-hadoop3')

In [124]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Logistic_Regression_Part2').getOrCreate()

In [125]:
df = spark.read.format("csv").option("header", "true").load("titanic.csv")

In [126]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [127]:
#Cast method helps us to change the data type for certain colums. Mostlikley the csv unpacking method changes all values to str.
df = df.selectExpr("cast(Parch as int) Parch", "cast(SibSp as int) SibSp", "cast(Sex as string) Sex",
              "cast(Name as string) Name", "cast(Pclass as int) Pclass", 
              "cast(Survived as int) Survived", "cast(PassengerId as int) PassengerId", 
              "cast(Fare as float) Fare", "cast(Age as float) Age", 
              "cast(Ticket as string) Ticket", "cast(Cabin as string) Cabin", "cast(Embarked as string) Embarked",)
df.head(2)

[Row(Parch=0, SibSp=1, Sex='male', Name='Braund, Mr. Owen Harris', Pclass=3, Survived=0, PassengerId=1, Fare=7.25, Age=22.0, Ticket='A/5 21171', Cabin=None, Embarked='S'),
 Row(Parch=0, SibSp=1, Sex='female', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Pclass=1, Survived=1, PassengerId=2, Fare=71.2833023071289, Age=38.0, Ticket='PC 17599', Cabin='C85', Embarked='C')]

In [128]:
df.columns

['Parch',
 'SibSp',
 'Sex',
 'Name',
 'Pclass',
 'Survived',
 'PassengerId',
 'Fare',
 'Age',
 'Ticket',
 'Cabin',
 'Embarked']

In [129]:
my_cols = df.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Cabin',
    'Embarked'
])

In [130]:
#Deal with missing data - keeping it simple
my_final_data = my_cols.na.drop()

In [131]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder

In [132]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
# Preforming One Hot Encoding -> Indexing through the vector form of some example category e.g category A will look like [1, 0, 0]
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [133]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [134]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkVec', 'Age','SibSp','Parch','Fare'], outputCol='features')

In [135]:
from pyspark.ml.classification import LogisticRegression
# Pipeline helps us to set the stages for complicated datasets
from pyspark.ml import Pipeline

In [136]:
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [137]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titanic])

In [138]:
train_data, test_data = my_final_data.randomSplit([0.7, 0.3])

In [139]:
fit_model = pipeline.fit(train_data)

22/09/13 10:59:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/09/13 10:59:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [140]:
results = fit_model.transform(test_data)

In [141]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [142]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [143]:
my_eval.evaluate(results)

0.7146341463414634

In [144]:
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [145]:
AUC = my_eval.evaluate(results)

In [146]:
AUC

0.7146341463414634