In [1]:
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as func

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import types, functions
from pyspark.sql.types import *

In [3]:
from pyspark.sql.functions import *

# SparkContext

Initialize the engine.

In [4]:
sc = SparkContext()

Define the session, specifying the engine as the parent class.

In [5]:
session = SparkSession(sc)
session

## Exploratory Data Analysis

Import the dataset for this exercise.

In [6]:
sp = session.read.option("inferSchema", True).option("header", True).csv('./res/data/titanic_dataset.csv')

Perform descriptive analysis.

In [7]:
sp.show(truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [8]:
sp.count()

891

In [9]:
sp.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



Convert all `double` types to `float`.

In [10]:
def convertColumn(df, columns, newtype):
    
    for col in columns:
        df = df.withColumn(col, df[col].cast(newtype))
        
    return df

In [11]:
# list all features with type == 'double'
convertFeatures = [col for col in sp.columns if sp.select(col).dtypes[0][1] == 'double']

# convert all features with type == 'double' to type == 'float'
sp = convertColumn(sp, convertFeatures, 'float')

In [12]:
sp.describe('Survived', 'Age', 'SibSp', 'Parch', 'Fare').show()

+-------+-------------------+------------------+------------------+-------------------+-----------------+
|summary|           Survived|               Age|             SibSp|              Parch|             Fare|
+-------+-------------------+------------------+------------------+-------------------+-----------------+
|  count|                891|               714|               891|                891|              891|
|   mean| 0.3838383838383838| 29.69911764704046|0.5230078563411896|0.38159371492704824|32.20420804114722|
| stddev|0.48659245426485753|14.526497332370992|1.1027434322934315| 0.8060572211299488|49.69342916316158|
|    min|                  0|              0.42|                 0|                  0|              0.0|
|    max|                  1|              80.0|                 8|                  6|         512.3292|
+-------+-------------------+------------------+------------------+-------------------+-----------------+



In [13]:
sp.count()

891

### Age

In [14]:
sp.groupby('Survived').avg('Age').select('*', round('avg(Age)', 2)).show()

+--------+------------------+------------------+
|Survived|          avg(Age)|round(avg(Age), 2)|
+--------+------------------+------------------+
|       1|28.343689655127196|             28.34|
|       0| 30.62617924528302|             30.63|
+--------+------------------+------------------+



In [15]:
sp.crosstab('Age', 'Survived').sort('Age_Survived').show()

+------------+---+---+
|Age_Survived|  0|  1|
+------------+---+---+
|        0.42|  0|  1|
|        0.67|  0|  1|
|        0.75|  0|  2|
|        0.83|  0|  2|
|        0.92|  0|  1|
|         1.0|  2|  5|
|        10.0|  2|  0|
|        11.0|  3|  1|
|        12.0|  0|  1|
|        13.0|  0|  2|
|        14.0|  3|  3|
|        14.5|  1|  0|
|        15.0|  1|  4|
|        16.0| 11|  6|
|        17.0|  7|  6|
|        18.0| 17|  9|
|        19.0| 16|  9|
|         2.0|  7|  3|
|        20.0| 12|  3|
|        20.5|  1|  0|
+------------+---+---+
only showing top 20 rows



In [16]:
sp.crosstab('Pclass', 'Survived').sort('Pclass_Survived').withColumn(
    'Survival_Rate', round(( col('1') / ( col('1') + col('0') ) * 100), 2)
).show()

+---------------+---+---+-------------+
|Pclass_Survived|  0|  1|Survival_Rate|
+---------------+---+---+-------------+
|              1| 80|136|        62.96|
|              2| 97| 87|        47.28|
|              3|372|119|        24.24|
+---------------+---+---+-------------+



## Data Preprocessing

### Null Values

Replace 'Age' null values with mean.

In [17]:
sp.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sp.columns]
).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [18]:
# calculate average 'Age'
avg = sp.select(func.mean(col('Age'))).collect()


sp = sp.fillna(value=np.round(avg[0][0], 2), subset=['Age'])

In [19]:
sp.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sp.columns]
).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|  0|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



Drop 'Cabin' due to excessive null values.

In [20]:
sp = sp.drop('Cabin')
sp = sp.na.drop(subset=['Embarked'])

In [21]:
sp.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sp.columns]
).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+--------+
|          0|       0|     0|   0|  0|  0|    0|    0|     0|   0|       0|
+-----------+--------+------+----+---+---+-----+-----+------+----+--------+



## Pipeline

In [22]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

### Isolate the Categories and Encode/Index

In [23]:
sp.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: float (nullable = false)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Embarked: string (nullable = true)



In [24]:
categories = ['Sex', 'Embarked']

In [34]:
stages = []

for cat in categories:
    
    stringIdx = StringIndexer(inputCol=cat,
                              outputCol=cat + 'Index')
    
    encoder = OneHotEncoder(inputCols=[stringIdx.getOutputCol()],
                            outputCols=[cat + 'classVec'])
    
    stages += [stringIdx, encoder]

In [35]:
stages

[StringIndexer_16795471719e,
 OneHotEncoder_ff49f5830de1,
 StringIndexer_0dc2dc36c270,
 OneHotEncoder_cd5378f5a59b]

### Target or Label Variable

In [36]:
targetIdx = StringIndexer(inputCol='Survived',
                          outputCol=('Survived' + 'Label'))

stages += [targetIdx]

### Define the Columns (or Vectors) using VectorAssembler

In [37]:
numerical = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [38]:
assemblerInputs = [cat + "classVec" for cat in categories] + numerical

In [39]:
assembler = VectorAssembler(inputCols=assemblerInputs,
                            outputCol="features")

stages += [assembler]

### Build the Pipeline

In [46]:
spPared = sp.select(categories + numerical + ['Survived'])

In [47]:
spPared.show()

+------+--------+------+----+-----+-----+-------+--------+
|   Sex|Embarked|Pclass| Age|SibSp|Parch|   Fare|Survived|
+------+--------+------+----+-----+-----+-------+--------+
|  male|       S|     3|22.0|    1|    0|   7.25|       0|
|female|       C|     1|38.0|    1|    0|71.2833|       1|
|female|       S|     3|26.0|    0|    0|  7.925|       1|
|female|       S|     1|35.0|    1|    0|   53.1|       1|
|  male|       S|     3|35.0|    0|    0|   8.05|       0|
|  male|       Q|     3|29.7|    0|    0| 8.4583|       0|
|  male|       S|     1|54.0|    0|    0|51.8625|       0|
|  male|       S|     3| 2.0|    3|    1| 21.075|       0|
|female|       S|     3|27.0|    0|    2|11.1333|       1|
|female|       C|     2|14.0|    1|    0|30.0708|       1|
|female|       S|     3| 4.0|    1|    1|   16.7|       1|
|female|       S|     1|58.0|    0|    0|  26.55|       1|
|  male|       S|     3|20.0|    0|    0|   8.05|       0|
|  male|       S|     3|39.0|    1|    5| 31.275|       

In [48]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(spPared)
model = pipelineModel.transform(spPared)

In [49]:
model.show()

+------+--------+------+----+-----+-----+-------+--------+--------+-------------+-------------+----------------+-------------+--------------------+
|   Sex|Embarked|Pclass| Age|SibSp|Parch|   Fare|Survived|SexIndex|  SexclassVec|EmbarkedIndex|EmbarkedclassVec|SurvivedLabel|            features|
+------+--------+------+----+-----+-----+-------+--------+--------+-------------+-------------+----------------+-------------+--------------------+
|  male|       S|     3|22.0|    1|    0|   7.25|       0|     0.0|(1,[0],[1.0])|          0.0|   (2,[0],[1.0])|          0.0|[1.0,1.0,0.0,3.0,...|
|female|       C|     1|38.0|    1|    0|71.2833|       1|     1.0|    (1,[],[])|          1.0|   (2,[1],[1.0])|          1.0|[0.0,0.0,1.0,1.0,...|
|female|       S|     3|26.0|    0|    0|  7.925|       1|     1.0|    (1,[],[])|          0.0|   (2,[0],[1.0])|          1.0|(8,[1,3,4,7],[1.0...|
|female|       S|     1|35.0|    1|    0|   53.1|       1|     1.0|    (1,[],[])|          0.0|   (2,[0],[1.0])|

### Build the Classifiers

In [52]:
from pyspark.ml.linalg import DenseVector

In [53]:
input_data = model.rdd.map(lambda x: (x["SurvivedLabel"], DenseVector(x["features"])))

In [57]:
spTrain = session.createDataFrame(input_data, ["Survived", "features"])

#### Create Train/Test Split

In [58]:
train_data, test_data = spTrain.randomSplit([.8,.2])

#### Logistic Regression

In [55]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

In [59]:
lr = LogisticRegression(labelCol="Survived",
                        featuresCol="features",
                        maxIter=10,
                        regParam=0.3)

In [60]:
linearModel = lr.fit(train_data)

#### Random Forest Classifier

In [61]:
rf = RandomForestClassifier(labelCol='Survived',
                            featuresCol='features',
                            maxDepth=5,
                            maxBins=32)

In [62]:
randomForest = rf.fit(train_data)

### Train and Evaluate the Models

In [65]:
lrPredictions = linearModel.transform(test_data)
rfPredictions = randomForest.transform(test_data)

In [66]:
lrPredictions.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [67]:
rfPredictions.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



#### Evaluate Linear Model

In [78]:
checkLR = lrPredictions.select('Survived', 'prediction')

In [79]:
checkLR.groupby('Survived').agg({'Survived': 'count'}).show()
checkLR.groupby('prediction').agg({'prediction': 'count'}).show()

+--------+---------------+
|Survived|count(Survived)|
+--------+---------------+
|     0.0|            107|
|     1.0|             69|
+--------+---------------+

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              143|
|       1.0|               33|
+----------+-----------------+



In [80]:
checkLR.filter(checkLR.Survived == checkLR.prediction).count() / checkLR.count()

0.75

#### Evaluate Random Forest

In [73]:
checkRF = rfPredictions.select('Survived', 'prediction')

In [74]:
checkRF.groupby('Survived').agg({'Survived': 'count'}).show()
checkRF.groupby('prediction').agg({'prediction': 'count'}).show()

+--------+---------------+
|Survived|count(Survived)|
+--------+---------------+
|     0.0|            107|
|     1.0|             69|
+--------+---------------+

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              128|
|       1.0|               48|
+----------+-----------------+



In [76]:
checkRF.filter(checkRF.Survived == checkRF.prediction).count() / checkRF.count()

0.8238636363636364