# Spark-tanic

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('wine').getOrCreate()

In [143]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.metrics import classification_report

In [33]:
df = spark.read.csv('titanic.csv',header=True,inferSchema=True)
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [34]:
print('Number of rows = {}'.format(df.count()))

Number of rows = 891


In [35]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



## Check nan values

In [36]:
def null_value_calc(df):
    null_columns_counts = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull()).count()
        if(nullRows > 0):
            temp = k,nullRows,(nullRows/numRows)*100
            null_columns_counts.append(temp)
    null_columns_calc_list = (null_columns_counts)
    return spark.createDataFrame(null_columns_calc_list, ['Column_Name', 'Null_Values_Count','Null_Value_Percent']).toPandas()

null_value_calc(df)

Unnamed: 0,Column_Name,Null_Values_Count,Null_Value_Percent
0,Age,177,19.86532
1,Cabin,687,77.104377
2,Embarked,2,0.224467


## Keep only relevant columns

In [38]:
df = df.select('Pclass','Name','Sex','Age','SibSp','Parch','Fare','Survived')
df.show(8)

+------+--------------------+------+----+-----+-----+-------+--------+
|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Survived|
+------+--------------------+------+----+-----+-----+-------+--------+
|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       0|
|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       1|
|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       1|
|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       1|
|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       0|
|     3|    Moran, Mr. James|  male|null|    0|    0| 8.4583|       0|
|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|51.8625|       0|
|     3|Palsson, Master. ...|  male| 2.0|    3|    1| 21.075|       0|
+------+--------------------+------+----+-----+-----+-------+--------+
only showing top 8 rows



## Extract title feature

In [39]:
split_name = udf(lambda x: x.split(', ')[1].split()[0])
df.withColumn('Title',split_name('Name')).select('Title').groupBy('Title').count().show()

+---------+-----+
|    Title|count|
+---------+-----+
|    Miss.|  182|
|      Ms.|    1|
|     Sir.|    1|
|     Rev.|    6|
|      Mr.|  517|
|   Major.|    2|
|     Mrs.|  125|
|      Dr.|    7|
|     Col.|    2|
|  Master.|   40|
|    Mlle.|    2|
|Jonkheer.|    1|
|     Mme.|    1|
|      the|    1|
|    Lady.|    1|
|    Capt.|    1|
|     Don.|    1|
+---------+-----+



In [40]:
Normal_titles = ['Miss.','Ms.','Mr.','Mrs.','Mlle.']
process_name = udf(lambda x: 'Normal' if x.split(', ')[1].split()[0] in Normal_titles else 'High_society')
df = df.withColumn('Title',process_name('Name')).drop('Name')

## Fill age with median

In [60]:
mean_age = np.round(df.select('Age').dropna().select(mean('Age')).collect()[0][0],1)
df = df.na.fill(mean_age, 'Age')

In [63]:
df.show()

+------+------+----+-----+-----+-------+--------+------------+
|Pclass|   Sex| Age|SibSp|Parch|   Fare|Survived|       Title|
+------+------+----+-----+-----+-------+--------+------------+
|     3|  male|22.0|    1|    0|   7.25|       0|      Normal|
|     1|female|38.0|    1|    0|71.2833|       1|      Normal|
|     3|female|26.0|    0|    0|  7.925|       1|      Normal|
|     1|female|35.0|    1|    0|   53.1|       1|      Normal|
|     3|  male|35.0|    0|    0|   8.05|       0|      Normal|
|     3|  male|29.7|    0|    0| 8.4583|       0|      Normal|
|     1|  male|54.0|    0|    0|51.8625|       0|      Normal|
|     3|  male| 2.0|    3|    1| 21.075|       0|High_society|
|     3|female|27.0|    0|    2|11.1333|       1|      Normal|
|     2|female|14.0|    1|    0|30.0708|       1|      Normal|
|     3|female| 4.0|    1|    1|   16.7|       1|      Normal|
|     1|female|58.0|    0|    0|  26.55|       1|      Normal|
|     3|  male|20.0|    0|    0|   8.05|       0|      

## Check who survived

In [88]:
def who_survived(col):
  d1 = df.groupBy(col).agg(sum('Survived').alias('Alive'))
  d2 = df.groupBy(col).count()
  res = d2.join(d1, [col], how='left')
  return res.withColumn('Pct', res['Alive']/res['count']*100)

who_survived('Sex').show()

+------+-----+-----+------------------+
|   Sex|count|Alive|               Pct|
+------+-----+-----+------------------+
|female|  314|  233| 74.20382165605095|
|  male|  577|  109|18.890814558058924|
+------+-----+-----+------------------+



In [89]:
who_survived('Title').show()

+------------+-----+-----+-----------------+
|       Title|count|Alive|              Pct|
+------------+-----+-----+-----------------+
|High_society|   64|   32|             50.0|
|      Normal|  827|  310|37.48488512696493|
+------------+-----+-----+-----------------+



In [91]:
who_survived('Pclass').orderBy('Pclass').show()

+------+-----+-----+------------------+
|Pclass|count|Alive|               Pct|
+------+-----+-----+------------------+
|     1|  216|  136| 62.96296296296296|
|     2|  184|   87| 47.28260869565217|
|     3|  491|  119|24.236252545824847|
+------+-----+-----+------------------+



In [102]:
df.groupBy('Survived').agg(avg('Fare')).show()

+--------+------------------+
|Survived|         avg(Fare)|
+--------+------------------+
|       1| 48.39540760233917|
|       0|22.117886885245877|
+--------+------------------+



In [122]:
dead_med_fare = np.round(np.median([i[0] for i in df.filter(df.Survived == 0).select('Fare').collect()]),1)
alive_med_fare = np.round(np.median([i[0] for i in df.filter(df.Survived == 1).select('Fare').collect()]),1)

print("Survived median Fare = {}\nNot survived median Fare = {}".format(alive_med_fare,dead_med_fare))

Survived median Fare = 26.0
Not survived median Fare = 10.5


## Machine learning

In [135]:
df = df.withColumnRenamed('Survived','label')

Sex_enc = StringIndexer(inputCol='Sex', outputCol='Sex_enc')
Title_enc = StringIndexer(inputCol='Title', outputCol='Title_enc')

featured_data = VectorAssembler(inputCols=['Pclass','Sex_enc','Age','SibSp','Parch','Fare','Title_enc'], outputCol='features')

pre_processor = Pipeline(stages=[Sex_enc, Title_enc, featured_data])

cleaner = pre_processor.fit(df)
clean_df = cleaner.transform(df)
clean_df.show(5)

+------+------+----+-----+-----+-------+-----+------+-------+---------+--------------------+
|Pclass|   Sex| Age|SibSp|Parch|   Fare|label| Title|Sex_enc|Title_enc|            features|
+------+------+----+-----+-----+-------+-----+------+-------+---------+--------------------+
|     3|  male|22.0|    1|    0|   7.25|    0|Normal|    0.0|      0.0|[3.0,0.0,22.0,1.0...|
|     1|female|38.0|    1|    0|71.2833|    1|Normal|    1.0|      0.0|[1.0,1.0,38.0,1.0...|
|     3|female|26.0|    0|    0|  7.925|    1|Normal|    1.0|      0.0|[3.0,1.0,26.0,0.0...|
|     1|female|35.0|    1|    0|   53.1|    1|Normal|    1.0|      0.0|[1.0,1.0,35.0,1.0...|
|     3|  male|35.0|    0|    0|   8.05|    0|Normal|    0.0|      0.0|(7,[0,2,5],[3.0,3...|
+------+------+----+-----+-----+-------+-----+------+-------+---------+--------------------+
only showing top 5 rows



In [138]:
clean_data_training = clean_df.select('features','label')
clean_data_training.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[3.0,0.0,22.0,1.0...|    0|
|[1.0,1.0,38.0,1.0...|    1|
|[3.0,1.0,26.0,0.0...|    1|
|[1.0,1.0,35.0,1.0...|    1|
|(7,[0,2,5],[3.0,3...|    0|
+--------------------+-----+
only showing top 5 rows



In [147]:
train, test = clean_data_training.randomSplit([0.7,0.3])

# Evaluator precises the target column and the kind of metrics to use 
evaluator = MulticlassClassificationEvaluator(labelCol='label', metricName='f1')

# Define transformtions stages to throw in the pipeline
rfc = RandomForestClassifier()

# Definition of pipeline
pipeline_nb = Pipeline(stages=[rfc])

# Definition of the grid parameters
paramGrid = ParamGridBuilder().addGrid(rfc.numTrees, [50,100]).build()

# Definition of the cross validator
cv = CrossValidator(
  estimator=pipeline_nb,
  estimatorParamMaps=paramGrid, 
  evaluator=evaluator, 
  numFolds=8)

# Train the model
model = cv.fit(train)

# Predict classes on test part
predictions = model.transform(test)
predictions.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(7,[0,2],[1.0,29.7])|    0|[42.0690897043342...|[0.84138179408668...|       0.0|
|(7,[0,2],[1.0,39.0])|    0|[41.5281461657729...|[0.83056292331545...|       0.0|
|(7,[0,2],[2.0,29.7])|    0|[44.9790680784523...|[0.89958136156904...|       0.0|
|(7,[0,2],[3.0,19.0])|    0|[44.7272126772770...|[0.89454425354554...|       0.0|
|(7,[0,2],[3.0,25.0])|    1|[44.8835604046736...|[0.89767120809347...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [148]:
predictions_pd = predictions.toPandas()
print(classification_report(predictions_pd.prediction, predictions_pd.label))

              precision    recall  f1-score   support

         0.0       0.90      0.81      0.85       162
         1.0       0.73      0.84      0.78        95

    accuracy                           0.82       257
   macro avg       0.81      0.83      0.82       257
weighted avg       0.83      0.82      0.83       257

