# Titanic dataset

Predicting what passengers survived the titanic crash based on the passengers features

In [123]:
# fiding spark to initiate
import findspark
findspark.init('/home/ubuntu/Spark/spark-3.3.0-bin-hadoop3')

In [124]:
# importing Spark as usual
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Logistic_Regression_Part2').getOrCreate()

In [125]:
# defining the df from CSV
df = spark.read.format("csv").option("header", "true").load("titanic.csv")

In [147]:
# checking colums
df.columns

['Parch',
 'SibSp',
 'Sex',
 'Name',
 'Pclass',
 'Survived',
 'PassengerId',
 'Fare',
 'Age',
 'Ticket',
 'Cabin',
 'Embarked']

In [127]:
#Cast method helps us to change the data type for certain colums. Mostlikley the csv unpacking method changes all values to str.
df = df.selectExpr("cast(Parch as int) Parch", "cast(SibSp as int) SibSp", "cast(Sex as string) Sex",
              "cast(Name as string) Name", "cast(Pclass as int) Pclass", 
              "cast(Survived as int) Survived", "cast(PassengerId as int) PassengerId", 
              "cast(Fare as float) Fare", "cast(Age as float) Age", 
              "cast(Ticket as string) Ticket", "cast(Cabin as string) Cabin", "cast(Embarked as string) Embarked",)
df.head(2)

[Row(Parch=0, SibSp=1, Sex='male', Name='Braund, Mr. Owen Harris', Pclass=3, Survived=0, PassengerId=1, Fare=7.25, Age=22.0, Ticket='A/5 21171', Cabin=None, Embarked='S'),
 Row(Parch=0, SibSp=1, Sex='female', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Pclass=1, Survived=1, PassengerId=2, Fare=71.2833023071289, Age=38.0, Ticket='PC 17599', Cabin='C85', Embarked='C')]

In [128]:
df.columns

['Parch',
 'SibSp',
 'Sex',
 'Name',
 'Pclass',
 'Survived',
 'PassengerId',
 'Fare',
 'Age',
 'Ticket',
 'Cabin',
 'Embarked']

In [129]:
my_cols = df.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Cabin',
    'Embarked'
])

In [148]:
#Deal with missing data - keeping it simple - just droping the nan values. 
# The other way is to fill in the missing data
my_final_data = my_cols.na.drop()

In [131]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder

In [149]:
# Creating the string Indexer - in order to further create the SexIndex and later use it to assign the vector value byt OneHotEncoding
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
# Preforming One Hot Encoding -> Indexing through the vector form of some example category e.g category A will look like [1, 0, 0]
# So the array that will indicate the category that it was
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [150]:
# Creating the string Indexer - in order to further create the SexIndex and later use it to assign the vector value byt OneHotEncoding
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
# Preforming One Hot Encoding -> Indexing through the vector form of some example category e.g category A will look like [1, 0, 0]
# So the array that will indicate the category that it was
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [134]:
# Assining the values to Vector 'features' column in order to enable our Logistic Regression method to use the categorycial data.
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkVec', 'Age','SibSp','Parch','Fare'], outputCol='features')

In [135]:
from pyspark.ml.classification import LogisticRegression
# Pipeline helps us to set the stages for complicated datasets
from pyspark.ml import Pipeline

In [151]:
# Now assigning var for Logistic regression to take info consideration created vector column 'features'
# and the final 'Survived' status
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [137]:
# Now creating the pipeline variable which contains a param of stages that indicates everything we want to do.
# Pipline actually exists only for the assembler method. It a kind of way to tell python what we intend to do. 
# Also at the end we are passing the LG model created ealier.
# Further usage of the pipeline invloves it's usage as a normal model.
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titanic])

In [138]:
# As ususaly we are splitting the final data 
train_data, test_data = my_final_data.randomSplit([0.7, 0.3])

In [156]:
# So now we are fitting created model. We are using our pipeline as we would normally do with the machine learning object
fit_model = pipeline.fit(train_data)

In [152]:
# Now we are transfforming our data. So to speak test our data on the test dataset
results = fit_model.transform(test_data)

In [153]:
# Importing the BinaryClassificationEvaluator allows us to get some clasyffication metrics on binary classification
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [154]:
# Creating our Evaluating object
# The rawPredictionCol='prediction' column name comes from the results. And more particualry the transform method that by 
# default names the data tested on test data 'prediction'
# Furthermore the labelCol='Survived' label col is the label of our orginal data so: 'Survived'
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [1]:
my_eval.evaluate(results)

NameError: name 'my_eval' is not defined

In [144]:
# Displaying our comparishment between who actually survived and who was calculated to survive by our model
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [155]:
# In order to evaluate our results we are using the BinaryClassificationEvaluator again
# It returns the area under the curve.
AUC = my_eval.evaluate(results)

In [146]:
AUC

0.7146341463414634