In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the running session when this cell is re-executed: constructing
# SparkContext('local') directly fails if a context is already active in the
# kernel, whereas getOrCreate() returns the existing one.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

# Load the Titanic CSV; the first row supplies the column names and column
# types are inferred from the values.
df = spark.read.csv('titanic.csv', inferSchema = True, header = True)

In [2]:
# Show the column names and the types Spark inferred for the loaded CSV.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [3]:
# Preview the first 10 rows of the raw data.
df.show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [4]:
# Remove the PassengerId, Name, Ticket and Cabin columns, then discard every
# row that still contains a null value.
df = (
    df
    .drop('PassengerId', 'Name', 'Ticket', "Cabin")
    .na.drop()
)

# Remember the cleaned column names; they are reused when selecting columns later.
cols = df.columns
df.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
try:
    # Spark 2.3-2.4 name of the estimator-based one-hot encoder.
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder (same
    # inputCols/outputCols interface), so alias it to keep the code below unchanged.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# String-valued columns that are indexed and then one-hot encoded.
cat_cols = ['Sex', 'Embarked']
# Numeric columns that are passed to the assembler as they are.
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Pipeline stages in execution order. Each categorical column contributes a
# StringIndexer (writes <col>Index) followed by a one-hot encoder that reads
# that index column and writes <col>ClassVec.
stages = []
for cat_col in cat_cols:
    stringindex = StringIndexer(inputCol = cat_col, outputCol = cat_col + 'Index')
    encoder = OneHotEncoderEstimator(inputCols = [stringindex.getOutputCol()], outputCols = [cat_col + 'ClassVec'])
    stages += [stringindex, encoder]

# The assembler concatenates the encoded categorical vectors first, then the
# numeric columns, into a single 'features' vector column.
assembler_cols = [c + 'ClassVec' for c in cat_cols] + num_cols
assembler = VectorAssembler(inputCols = assembler_cols, outputCol = 'features')
stages += [assembler]


In [6]:
# Display the assembled list of pipeline stages.
stages

[StringIndexer_7b6ebabf45d5,
 OneHotEncoderEstimator_d69eda17f88a,
 StringIndexer_07a8a0fb65e6,
 OneHotEncoderEstimator_c0b663db0cd0,
 VectorAssembler_48010d403dd7]

In [7]:
pipe = Pipeline(stages = stages)

# Fit and transform starting from the cleaned input columns only. Re-selecting
# `cols` strips any Index/ClassVec/features columns left in `df` by a previous
# run of this cell; with those present the stages would fail because their
# output columns already exist.
df_input = df.select(cols)
pipelineModel = pipe.fit(df_input)
df = pipelineModel.transform(df_input)


In [8]:
# Display the transformed frame, which now also carries the Index, ClassVec
# and features columns added by the pipeline.
df.show()

+--------+------+------+----+-----+-----+-------+--------+--------+-------------+-------------+----------------+--------------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|SexIndex|  SexClassVec|EmbarkedIndex|EmbarkedClassVec|            features|
+--------+------+------+----+-----+-----+-------+--------+--------+-------------+-------------+----------------+--------------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|     0.0|(1,[0],[1.0])|          0.0|   (2,[0],[1.0])|[1.0,1.0,0.0,3.0,...|
|       1|     1|female|38.0|    1|    0|71.2833|       C|     1.0|    (1,[],[])|          1.0|   (2,[1],[1.0])|[0.0,0.0,1.0,1.0,...|
|       1|     3|female|26.0|    0|    0|  7.925|       S|     1.0|    (1,[],[])|          0.0|   (2,[0],[1.0])|(8,[1,3,4,7],[1.0...|
|       1|     1|female|35.0|    1|    0|   53.1|       S|     1.0|    (1,[],[])|          0.0|   (2,[0],[1.0])|[0.0,1.0,0.0,1.0,...|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|    

In [9]:
# Keep the assembled feature vector first, followed by the cleaned input
# columns; the intermediate Index/ClassVec columns are left out.
selectedcols = ['features', *cols]
df = df.select(*selectedcols)

In [10]:
# Show the schema after column selection.
df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



In [11]:
from pyspark.ml.classification import RandomForestClassifier

# Fix the seed so the trained forest, and the feature importances read from
# it, are reproducible across kernel restarts.
rf = RandomForestClassifier(labelCol = 'Survived', featuresCol = 'features', seed = 42)

# Keep only what the classifiers need: the feature vector and the label.
final_df = df.select('features', 'Survived')


In [12]:
# Train the random forest on all rows of final_df (no train/test split is applied here).
rf_model = rf.fit(final_df)

In [13]:
# Fetch the first row to inspect the layout of the assembled feature vector.
final_df.take(1)

[Row(features=DenseVector([1.0, 1.0, 0.0, 3.0, 22.0, 1.0, 0.0, 7.25]), Survived=0)]

In [14]:
# Importance per feature-vector slot. Slots follow the assembler input order:
# 0 = Sex (one-hot, size 1), 1-2 = Embarked (one-hot, size 2), 3 = Pclass,
# 4 = Age, 5 = SibSp, 6 = Parch, 7 = Fare.
rf_model.featureImportances

SparseVector(8, {0: 0.4749, 1: 0.0062, 2: 0.0189, 3: 0.1137, 4: 0.1253, 5: 0.0471, 6: 0.0392, 7: 0.1747})

In [17]:
from pyspark.ml.classification import LogisticRegression

log_reg = LogisticRegression(featuresCol = 'features', labelCol = 'Survived')

# Single-stage pipeline wrapping the classifier. Both `pipeline` and `pip`
# stay bound to it, as in the original chained assignment.
pipeline = Pipeline(stages = [log_reg])
pip = pipeline

# Seed the 70/30 split so the same rows land in train and test on every run;
# unseeded, the split (and any metric computed from `results`) changes run to run.
train, test = final_df.randomSplit([0.7, 0.3], seed = 42)

fit_model = pip.fit(train)
results = fit_model.transform(test)

# Compare predicted and actual labels for a few held-out rows.
results.select('prediction', 'Survived').show(3)

+----------+--------+
|prediction|Survived|
+----------+--------+
|       0.0|       0|
|       0.0|       0|
|       0.0|       0|
+----------+--------+
only showing top 3 rows



In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Named `evaluator` rather than `eval` so the Python builtin eval() is not
# shadowed for the rest of the kernel session. The metric is spelled out so
# the value stored in AUC is unambiguously the area under the ROC curve.
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'rawPrediction', labelCol = 'Survived', metricName = 'areaUnderROC')
AUC = evaluator.evaluate(results)
AUC

0.8368328958880146