Build a logistic regression model and a random forest classifier to predict whether passengers survived the sinking of the Titanic.

In [1]:
import pyspark
from pyspark import SparkContext
# Reuse the running context when this cell is executed again; constructing a
# second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# `display` and `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

21/09/02 01:25:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
from pyspark.sql import Row, SQLContext

# SQL entry point built on the existing SparkContext; later cells use it to
# read the CSV file and to create DataFrames.
sqlContext = SQLContext(sc)

In [3]:
# Load the Titanic CSV: the first line is the header and Spark infers each
# column's type from the data.
df = sqlContext.read.csv("titanic_dataset.csv", header=True, inferSchema=True)

In [4]:
# Preview the first five rows to check that the header and values parsed as expected
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [5]:
# Inspect the column types Spark inferred from the CSV
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
# Cast the numeric columns (inferred as integer and double) to float

# Import all from `sql.types`
from pyspark.sql.types import *

def convertColumn(df, names, newType):
    """Cast the given columns of a DataFrame to a new data type.

    Args:
        df: DataFrame whose columns are cast.
        names: Iterable of column names to cast.
        newType: Target type handed to each column's ``cast``.

    Returns:
        The DataFrame with every named column replaced by its cast version;
        ``df`` itself when ``names`` is empty.
    """
    converted = df
    for column_name in names:
        # Re-using the existing name makes withColumn replace that column
        casted = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, casted)
    return converted
# Numeric columns to cast. Despite the name, the list also holds the
# identifier (PassengerId), the label (Survived) and the class code (Pclass).
CONTI_FEATURES = [
    'PassengerId', 'Survived', 'Pclass',
    'Age', 'SibSp', 'Parch', 'Fare',
]
# Cast every listed column to Spark's FloatType
df = convertColumn(df, names=CONTI_FEATURES, newType=FloatType())
# Confirm the new column types
df.printSchema()

root
 |-- PassengerId: float (nullable = true)
 |-- Survived: float (nullable = true)
 |-- Pclass: float (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: float (nullable = true)
 |-- SibSp: float (nullable = true)
 |-- Parch: float (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
# Move the label column `Survived` to the end; the other columns keep their order.
COLUMNS = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
    'Survived',
]
df = df.select(*COLUMNS)
df.show(2)

+-----------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|Survived|
+-----------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+--------+
|        1.0|   3.0|Braund, Mr. Owen ...|  male|22.0|  1.0|  0.0|A/5 21171|   7.25| null|       S|     0.0|
|        2.0|   1.0|Cumings, Mrs. Joh...|female|38.0|  1.0|  0.0| PC 17599|71.2833|  C85|       C|     1.0|
+-----------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+--------+
only showing top 2 rows



In [8]:
# Total number of passenger rows loaded from the CSV
df.count()

891

In [9]:
# Collect only the `Pclass` column to the driver instead of the whole DataFrame
df.select('Pclass').toPandas()['Pclass'].value_counts()

3.0    491
1.0    216
2.0    184
Name: Pclass, dtype: int64

In [10]:
# Collect only the `SibSp` column to the driver instead of the whole DataFrame
df.select('SibSp').toPandas()['SibSp'].value_counts()

0.0    608
1.0    209
2.0     28
4.0     18
3.0     16
8.0      7
5.0      5
Name: SibSp, dtype: int64

In [11]:
# Collect only the `Parch` column to the driver instead of the whole DataFrame
df.select('Parch').toPandas()['Parch'].value_counts()

0.0    678
1.0    118
2.0     80
5.0      5
3.0      5
4.0      4
6.0      1
Name: Parch, dtype: int64

In [12]:
# Collect only the `Ticket` column to the driver instead of the whole DataFrame
df.select('Ticket').toPandas()['Ticket'].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

In [13]:
# Collect only the `Cabin` column to the driver instead of the whole DataFrame
df.select('Cabin').toPandas()['Cabin'].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

In [14]:
# Collect only the `Embarked` column to the driver instead of the whole DataFrame
df.select('Embarked').toPandas()['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [15]:
# Drop the rows whose Parch equals 6 (a single passenger in the value counts above).
# NOTE(review): the reason for excluding this value is not stated in the notebook.
df_remove = df.where(df['Parch'] != 6)

In [16]:
from pyspark.sql.functions import isnan, when, count, col
# For every column, count the entries that are NaN or null
missing_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df_remove.columns
]
df_remove.select(missing_counts).show()

+-----------+------+----+---+---+-----+-----+------+----+-----+--------+--------+
|PassengerId|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|Survived|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+--------+
|          0|     0|   0|  0|177|    0|    0|     0|   0|  686|       2|       0|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+--------+



In [17]:
# Drop the identifier/free-text columns, then keep only the rows that have
# both an Age and an Embarked value.
df_remove = (
    df_remove
    .drop('PassengerId', 'Name', 'Ticket', 'Cabin')
    .where(col('Age').isNotNull())
    .where(col('Embarked').isNotNull())
)

In [18]:
# Rows left after removing the Parch == 6 row and the rows with a null Age or Embarked
df_remove.count()

711

In [19]:
# Schema of the cleaned data that feeds the feature pipeline
df_remove.printSchema()

root
 |-- Pclass: float (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: float (nullable = true)
 |-- SibSp: float (nullable = true)
 |-- Parch: float (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: float (nullable = true)



In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
CATE_FEATURES = ['Sex', 'Embarked']
stages = []  # pipeline stages, in execution order
for categoricalCol in CATE_FEATURES:
    # First map each category string to a numeric index ...
    indexedCol = categoricalCol + "Index"
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=indexedCol)
    # ... then expand that index into a one-hot vector column
    encoder = OneHotEncoder(inputCols=[indexedCol],
                            outputCols=[categoricalCol + "classVec"])
    stages.extend([stringIndexer, encoder])

In [21]:
# `Survived` is already a numeric 0.0/1.0 column and is used directly as the label,
# so the StringIndexer step for the label is left disabled.
#label_stringIdx =  StringIndexer(inputCol="Survived", outputCol="newSurvived")
#stages += [label_stringIdx]

In [22]:
# Numeric inputs that go into the feature vector unchanged
CONTI_FEATURES = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# One-hot vectors come first, followed by the numeric columns
encodedCols = [name + "classVec" for name in CATE_FEATURES]
assemblerInputs = encodedCols + CONTI_FEATURES
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [23]:
# Create a Pipeline from the indexer, encoder and assembler stages.
pipeline = Pipeline(stages=stages)
# Fit the stages on the cleaned data, then apply them to that same data.
pipelineModel = pipeline.fit(df_remove)
# NOTE: despite its name, `model` is the transformed DataFrame (the original columns
# plus the index, one-hot and `features` columns), not a fitted model.
model = pipelineModel.transform(df_remove)

In [24]:
# Peek at one transformed row, including the assembled `features` vector
model.take(1)

[Row(Pclass=3.0, Sex='male', Age=22.0, SibSp=1.0, Parch=0.0, Fare=7.25, Embarked='S', Survived=0.0, SexIndex=0.0, SexclassVec=SparseVector(1, {0: 1.0}), EmbarkedIndex=0.0, EmbarkedclassVec=SparseVector(2, {0: 1.0}), features=DenseVector([1.0, 1.0, 0.0, 3.0, 22.0, 1.0, 0.0, 7.25]))]

In [29]:
# Look at the first three transformed rows
model.take(3)

[Row(Pclass=3.0, Sex='male', Age=22.0, SibSp=1.0, Parch=0.0, Fare=7.25, Embarked='S', Survived=0.0, SexIndex=0.0, SexclassVec=SparseVector(1, {0: 1.0}), EmbarkedIndex=0.0, EmbarkedclassVec=SparseVector(2, {0: 1.0}), features=DenseVector([1.0, 1.0, 0.0, 3.0, 22.0, 1.0, 0.0, 7.25])),
 Row(Pclass=1.0, Sex='female', Age=38.0, SibSp=1.0, Parch=0.0, Fare=71.2833023071289, Embarked='C', Survived=1.0, SexIndex=1.0, SexclassVec=SparseVector(1, {}), EmbarkedIndex=1.0, EmbarkedclassVec=SparseVector(2, {1: 1.0}), features=DenseVector([0.0, 0.0, 1.0, 1.0, 38.0, 1.0, 0.0, 71.2833])),
 Row(Pclass=3.0, Sex='female', Age=26.0, SibSp=0.0, Parch=0.0, Fare=7.925000190734863, Embarked='S', Survived=1.0, SexIndex=1.0, SexclassVec=SparseVector(1, {}), EmbarkedIndex=0.0, EmbarkedclassVec=SparseVector(2, {0: 1.0}), features=SparseVector(8, {1: 1.0, 3: 3.0, 4: 26.0, 7: 7.925}))]

In [28]:
# Tabular view of the original columns next to the generated index/vector columns
model.show(5)

+------+------+----+-----+-----+-------+--------+--------+--------+-------------+-------------+----------------+--------------------+
|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|Survived|SexIndex|  SexclassVec|EmbarkedIndex|EmbarkedclassVec|            features|
+------+------+----+-----+-----+-------+--------+--------+--------+-------------+-------------+----------------+--------------------+
|   3.0|  male|22.0|  1.0|  0.0|   7.25|       S|     0.0|     0.0|(1,[0],[1.0])|          0.0|   (2,[0],[1.0])|[1.0,1.0,0.0,3.0,...|
|   1.0|female|38.0|  1.0|  0.0|71.2833|       C|     1.0|     1.0|    (1,[],[])|          1.0|   (2,[1],[1.0])|[0.0,0.0,1.0,1.0,...|
|   3.0|female|26.0|  0.0|  0.0|  7.925|       S|     1.0|     1.0|    (1,[],[])|          0.0|   (2,[0],[1.0])|(8,[1,3,4,7],[1.0...|
|   1.0|female|35.0|  1.0|  0.0|   53.1|       S|     1.0|     1.0|    (1,[],[])|          0.0|   (2,[0],[1.0])|[0.0,1.0,0.0,1.0,...|
|   3.0|  male|35.0|  0.0|  0.0|   8.05|       S|     0.0|    

In [30]:
from pyspark.ml.linalg import DenseVector
# Reduce every row to (label, features), forcing the feature vector to be dense
input_data = model.rdd.map(
    lambda row: (row["Survived"], DenseVector(row["features"]))
)

In [31]:
# Rebuild a two-column DataFrame (label, dense feature vector) from the RDD
df_train = sqlContext.createDataFrame(input_data, schema=["Survived", "features"])
df_train.show(n=2)

+--------+--------------------+
|Survived|            features|
+--------+--------------------+
|     0.0|[1.0,1.0,0.0,3.0,...|
|     1.0|[0.0,0.0,1.0,1.0,...|
+--------+--------------------+
only showing top 2 rows



In [32]:
# 80/20 train/test split with a fixed seed
train_data, test_data = df_train.randomSplit(weights=[0.8, 0.2], seed=1234)

In [33]:
# Class balance of the label in the training split
train_data.groupBy('Survived').agg(count('Survived')).show()

+--------+---------------+
|Survived|count(Survived)|
+--------+---------------+
|     0.0|            328|
|     1.0|            229|
+--------+---------------+



### LogisticRegression

In [34]:
# Import `LogisticRegression`
from pyspark.ml.classification import LogisticRegression

# Configure the logistic regression estimator
lr = LogisticRegression(
    labelCol="Survived",
    featuresCol="features",
    maxIter=10,
    regParam=0.3,
)

# Train on the training split
linearModel = lr.fit(train_data)

# Report the fitted coefficients and intercept
print(f"Coefficients: {linearModel.coefficients}")
print(f"Intercept: {linearModel.intercept}")

21/09/02 01:34:00 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/09/02 01:34:00 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Coefficients: [-0.9908522767656833,-0.1621320421717106,0.2612724694646815,-0.33985875701556506,-0.005834849713198188,-0.054528842349197915,0.05348017163209499,0.002285740351778472]
Intercept: 1.1697834319184208


In [35]:
# Score the held-out test split; transform() appends the rawPrediction,
# probability and prediction columns to the input columns.
predictions = linearModel.transform(test_data)

In [36]:
# Inspect the columns of the scored test set
predictions.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [37]:
# Put the true label next to the predicted class and the class probabilities
selected = predictions.select(["Survived", "prediction", "probability"])
selected.show(n=10)

+--------+----------+--------------------+
|Survived|prediction|         probability|
+--------+----------+--------------------+
|     0.0|       1.0|[0.48865449664423...|
|     0.0|       1.0|[0.43619450519124...|
|     0.0|       0.0|[0.51581049682227...|
|     0.0|       0.0|[0.53009808020190...|
|     0.0|       1.0|[0.49699092271870...|
|     0.0|       0.0|[0.51431922277506...|
|     0.0|       0.0|[0.52600327491420...|
|     0.0|       0.0|[0.54008710503582...|
|     0.0|       0.0|[0.54521598269143...|
|     0.0|       0.0|[0.53579731894507...|
+--------+----------+--------------------+
only showing top 10 rows



In [40]:
# Keep only the true and predicted labels for the comparison cells below
cm = predictions.select(["Survived", "prediction"])
# Class balance of the true labels in the test split
cm.groupBy('Survived').agg(count('Survived')).show()

+--------+---------------+
|Survived|count(Survived)|
+--------+---------------+
|     0.0|             95|
|     1.0|             59|
+--------+---------------+



In [41]:
# How often each class was predicted on the test split
cm.groupBy('prediction').agg(count('prediction')).show()

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              115|
|       1.0|               39|
+----------+-----------------+



In [43]:
# Accuracy: share of test rows whose prediction equals the true label
n_correct = cm.where(cm['Survived'] == cm['prediction']).count()
n_correct / cm.count()

0.7662337662337663

### RandomForestClassifier

In [48]:
# Import `RandomForestClassifier`
from pyspark.ml.classification import RandomForestClassifier

# Initialize `rf`. The seed is pinned (same value as the train/test split) so the
# randomly built forest, and therefore the accuracy below, is repeatable across
# kernel restarts.
rf = RandomForestClassifier(labelCol="Survived",
                            featuresCol="features",
                            seed=1234)

# Fit the data to the model
rfModel = rf.fit(train_data)

In [49]:
# Score the held-out test split with the fitted random forest
predictionsRF = rfModel.transform(test_data)

In [50]:
# Keep only the true label and the predicted class for the accuracy check
cmRF = predictionsRF.select("Survived", "prediction")

In [51]:
# Accuracy: share of test rows whose prediction equals the true label
n_correct_rf = cmRF.where(cmRF['Survived'] == cmRF['prediction']).count()
n_correct_rf / cmRF.count()

0.7987012987012987