## Big Data Analytics Technical Project

####Preprocessing

In [3]:
#### Importing needful libraries

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer


In [4]:
###The entry point into all functionality in Spark is the SparkSession class.To create a basic SparkSession, just use SparkSession.builder

spark = SparkSession \
    .builder \
    .appName("Spark ML on titanic data ") \
    .getOrCreate()

In [5]:
##Loading CSV data
dataset = "/FileStore/tables/train.csv"
titanic_df = spark.read.csv(dataset,header = 'True',inferSchema='True')
display(titanic_df)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### Analyzing the dataset

In [7]:
###The below is the schema of data
titanic_df.printSchema()

In [8]:
passengers_count = titanic_df.count()
print(passengers_count)

In [9]:
###Viewing few rows
titanic_df.show(5)

Summary of data

In [11]:
titanic_df.describe().show()

Checking Schema of the dataset

In [13]:
titanic_df.printSchema()

###   Exploratory data analysis (EDA)

Selecting few features

In [16]:
###Checking survival rate using feature class.

titanic_df.select("Survived","Pclass","Embarked").show()

In [17]:
groupBy_output = titanic_df.groupBy("Survived", "Pclass").count()

In [18]:
display(groupBy_output)

Survived,Pclass,count
1,2,87
1,1,136
1,3,119
0,1,80
0,2,97
0,3,372


###### Here it can be seen that the Pclass1 people were given priority to pclass3 people, even though
We can clearly see that Passenegers Of Pclass 1 were given a very high priority while rescue. Even though the the number of Passengers in Pclass 3 were a lot higher, still the number of survival from them is very low.

###### Knowing the number of Passengers Survived ?

In [21]:
titanic_df.groupBy("Survived").count().show()

Out of 891 passengers in dataset, only about 342 survived.

###### To know the particulars about survivors we have to explore more of the data.
###### The survival rate can be determined by different features of the dataset such as Sex, Port of Embarcation, Age; few to be mentioned.

In [24]:
###Checking survival rate using feature Sex.

titanic_df.groupBy("Sex","Survived").count().show()

In [25]:
grp_output = titanic_df.groupBy( "Sex", "Survived").count()

In [26]:
display(grp_output)

Sex,Survived,count
male,0,468
female,1,233
female,0,81
male,1,109


###### Although the number of males are more than females on ship, the female survivors are twice the number of males saved.

In [28]:
### Checking total number of passengers in each Pclass survived.

a = titanic_df.groupBy("Pclass").count()

In [29]:
display(a)

Pclass,count
1,216
3,491
2,184


In [30]:
### checking Age with feature pclass.
b = titanic_df.groupBy("Age", "Survived").count()

In [31]:
display(b)

Age,Survived,count
34.0,0,9
28.5,0,2
48.0,0,3
70.0,0,2
51.0,1,2
12.0,1,1
40.0,1,6
61.0,0,3
2.0,1,3
19.0,1,9


#### Handling Null values

In [33]:
# This function use to print feature with null values and null count 
def null_value_count(df):
  null_columns_counts = []
  numRows = df.count()
  for k in df.columns:
    nullRows = df.where(col(k).isNull()).count()
    if(nullRows > 0):
      temp = k,nullRows
      null_columns_counts.append(temp)
  return(null_columns_counts)

# Calling function
null_columns_count_list = null_value_count(titanic_df)
spark.createDataFrame(null_columns_count_list, ['Column_With_Null_Value', 'Null_Values_Count']).show()


Age feature has 177 null values.

In [36]:
mean_age = titanic_df.select(mean('Age')).collect()[0][0]
print(mean_age)

In [37]:
titanic_df.select("Name").show()

######  To replace these NaN values, we can assign them the mean age of the dataset.But the problem is, there were many people with many different ages. We just cant assign a 4 year kid with the mean age that is 29 years.

###### we can check the Name feature. Looking upon the feature, we can see that the names have a salutation like Mr or Mrs. Thus we can assign the mean values of Mr and Mrs to the respective groups

In [41]:
###Using the Regex ""[A-Za-z]+)." we extract the initials from the Name. It looks for strings which lie between A-Z or a-z and followed by a .(dot).

titanic_df = titanic_df.withColumn("Initial",regexp_extract(col("Name"),"([A-Za-z]+)\.",1))


In [42]:

titanic_df.show()

In [43]:
titanic_df.select("Initial").distinct().show()


In [44]:
### There are some misspelled Initials like Mlle or Mme that stand for Miss. I will replace them with Miss and same thing for other values.

titanic_df = titanic_df.replace(['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
               ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr'])


In [45]:
titanic_df.select("Initial").distinct().show()


In [46]:
###lets check the average age by Initials
titanic_df.groupby('Initial').avg('Age').collect()

In [47]:
###Let's impute missing values in age feature based on average age of Initials

titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Miss") & (titanic_df["Age"].isNull()), 22).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Other") & (titanic_df["Age"].isNull()), 46).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Master") & (titanic_df["Age"].isNull()), 5).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Mr") & (titanic_df["Age"].isNull()), 33).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Mrs") & (titanic_df["Age"].isNull()), 36).otherwise(titanic_df["Age"]))


Check the imputation

In [49]:
###Check the imputation

titanic_df.filter(titanic_df.Age==46).select("Initial").show()


In [50]:
titanic_df.select("Age").show()

In [51]:
###Embarked feature has only two missining values. Let's check values within Embarked
titanic_df.groupBy("Embarked").count().show()

In [52]:
###Majority Passengers boarded from "S". We can impute with "S"
titanic_df = titanic_df.na.fill({"Embarked" : 'S'})


In [53]:
###We can drop Cabin features as it has lots of null values
titanic_df = titanic_df.drop("Cabin")

In [54]:
titanic_df.printSchema()

We can create a new feature called "Family_size" and "Alone" and analyse it. This feature is the summation of Parch(parents/children) and SibSp(siblings/spouses). It gives us a combined data so that we can check if survival rate have anything to do with family size of the passengers

In [56]:
titanic_df = titanic_df.withColumn("Family_Size",col('SibSp')+col('Parch'))

In [57]:
titanic_df.groupBy("Family_Size").count().show()

In [58]:
ab = titanic_df.groupBy("Family_Size").count()
display(ab)

Family_Size,count
1,161
6,12
3,29
5,22
4,15
7,6
10,7
2,102
0,537


In [59]:
titanic_df = titanic_df.withColumn('Alone',lit(0))
titanic_df = titanic_df.withColumn("Alone",when(titanic_df["Family_Size"] == 0, 1).otherwise(titanic_df["Alone"]))


In [60]:
titanic_df.columns

In [61]:

#convert Sex, Embarked & Initial columns from string to number using StringIndexer.

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(titanic_df) for column in ["Sex","Embarked","Initial"]]
pipeline = Pipeline(stages=indexers)
titanic_df = pipeline.fit(titanic_df).transform(titanic_df)

In [62]:
titanic_df.show()

In [63]:
titanic_df.printSchema()

In [64]:
###Drop columns which are not required

titanic_df = titanic_df.drop("PassengerId","Name","Ticket","Cabin","Embarked","Sex","Initial")

In [65]:
titanic_df.show()

###### Let's put all features into vector

In [67]:
feature = VectorAssembler(inputCols=titanic_df.columns[1:],outputCol="features")
feature_vector= feature.transform(titanic_df)

In [68]:
feature_vector.show()

In [69]:
###Now that the data is all set, let's split it into training and test. I'll be using 80% of it.

(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 11)

### Modelling

######  Classification Algorithms used to model the dataset are shown below

LogisticRegression

DecisionTreeClassifier

RandomForestClassifier

Gradient-boosted tree classifier

NaiveBayes

Support Vector Machine

###### LogisticRegression

In [73]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="Survived", featuresCol="features")
#Training algo
lrModel = lr.fit(trainingData)
lr_prediction = lrModel.transform(testData)
lr_prediction.select("prediction", "Survived", "features").show()
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

###### Evaluating accuracy of LogisticRegression.

In [75]:
lr_accuracy = evaluator.evaluate(lr_prediction)
print("Accuracy of LogisticRegression is = %g"% (lr_accuracy))
print("Test Error of LogisticRegression = %g " % (1.0 - lr_accuracy))

In [76]:
display(lr_prediction)

Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features,rawPrediction,probability,prediction
0,1,19.0,3,2,263.0,5,0,0.0,0.0,0.0,"List(1, 10, List(), List(1.0, 19.0, 3.0, 2.0, 263.0, 5.0, 0.0, 0.0, 0.0, 0.0))","List(1, 2, List(), List(0.9761351290031279, -0.9761351290031279))","List(1, 2, List(), List(0.726340668701844, 0.273659331298156))",0.0
0,1,27.0,0,2,211.5,2,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 27.0, 0.0, 2.0, 211.5, 2.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(-0.393194583840367, 0.393194583840367))","List(1, 2, List(), List(0.4029485068014416, 0.5970514931985584))",1.0
0,1,28.0,0,0,47.1,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 28.0, 47.1, 1.0))","List(1, 2, List(), List(0.2781148455819795, -0.2781148455819795))","List(1, 2, List(), List(0.569083992754136, 0.43091600724586404))",0.0
0,1,28.0,1,0,82.1708,1,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 28.0, 1.0, 0.0, 82.1708, 1.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(-0.2707061402119497, 0.2707061402119497))","List(1, 2, List(), List(0.4327337468619107, 0.5672662531380893))",1.0
0,1,33.0,0,0,26.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 33.0, 26.0, 1.0))","List(1, 2, List(), List(0.5239058315088307, -0.5239058315088307))","List(1, 2, List(), List(0.6280606258497693, 0.3719393741502307))",0.0
0,1,33.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 33.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(0.27960675219309516, -0.27960675219309516))","List(1, 2, List(), List(0.5694498113646942, 0.43055018863530575))",0.0
0,1,38.0,0,1,153.4625,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 3, 4, 5), List(1.0, 38.0, 1.0, 153.4625, 1.0))","List(1, 2, List(), List(-0.02711080731558768, 0.02711080731558768))","List(1, 2, List(), List(0.49322271327249956, 0.5067772867275004))",1.0
0,1,40.0,0,0,0.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 6), List(1.0, 40.0, 1.0))","List(1, 2, List(), List(0.8551852223538181, -0.8551852223538181))","List(1, 2, List(), List(0.7016537287283117, 0.29834627127168833))",0.0
0,1,40.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 40.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(0.5166692713522876, -0.5166692713522876))","List(1, 2, List(), List(0.6263686013054682, 0.37363139869453177))",0.0
0,1,45.0,1,0,83.475,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 2, 4, 5), List(1.0, 45.0, 1.0, 83.475, 1.0))","List(1, 2, List(), List(0.5383544292709086, -0.5383544292709086))","List(1, 2, List(), List(0.6314295330807972, 0.3685704669192028))",0.0


###### DecisionTreeClassifier

In [78]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "Survived", "features").show()


###### Evaluating accuracy of DecisionTreeClassifier.

In [80]:
dt_accuracy = evaluator.evaluate(dt_prediction)
print("Accuracy of DecisionTreeClassifier is = %g"% (dt_accuracy))
print("Test Error of DecisionTreeClassifier = %g " % (1.0 - dt_accuracy))


In [81]:
display(dt_prediction)

Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features,rawPrediction,probability,prediction
0,1,19.0,3,2,263.0,5,0,0.0,0.0,0.0,"List(1, 10, List(), List(1.0, 19.0, 3.0, 2.0, 263.0, 5.0, 0.0, 0.0, 0.0, 0.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,27.0,0,2,211.5,2,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 27.0, 0.0, 2.0, 211.5, 2.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,28.0,0,0,47.1,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 28.0, 47.1, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,28.0,1,0,82.1708,1,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 28.0, 1.0, 0.0, 82.1708, 1.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,33.0,0,0,26.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 33.0, 26.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,33.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 33.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,38.0,0,1,153.4625,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 3, 4, 5), List(1.0, 38.0, 1.0, 153.4625, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,40.0,0,0,0.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 6), List(1.0, 40.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,40.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 40.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,45.0,1,0,83.475,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 2, 4, 5), List(1.0, 45.0, 1.0, 83.475, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0


###### RandomForestClassifier

In [83]:
from pyspark.ml.classification import RandomForestClassifier
rf = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
rf_model = rf.fit(trainingData)
rf_prediction = rf_model.transform(testData)
rf_prediction.select("prediction", "Survived", "features").show()

###### Evaluating accuracy of RandomForestClassifier.

In [85]:
rf_accuracy = evaluator.evaluate(rf_prediction)
print("Accuracy of RandomForestClassifier is = %g"% (rf_accuracy))
print("Test Error of RandomForestClassifier  = %g " % (1.0 - rf_accuracy))

In [86]:
display(rf_prediction)

Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features,rawPrediction,probability,prediction
0,1,19.0,3,2,263.0,5,0,0.0,0.0,0.0,"List(1, 10, List(), List(1.0, 19.0, 3.0, 2.0, 263.0, 5.0, 0.0, 0.0, 0.0, 0.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,27.0,0,2,211.5,2,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 27.0, 0.0, 2.0, 211.5, 2.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,28.0,0,0,47.1,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 28.0, 47.1, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,28.0,1,0,82.1708,1,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 28.0, 1.0, 0.0, 82.1708, 1.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,33.0,0,0,26.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 33.0, 26.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,33.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 33.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,38.0,0,1,153.4625,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 3, 4, 5), List(1.0, 38.0, 1.0, 153.4625, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,40.0,0,0,0.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 6), List(1.0, 40.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,40.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 40.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0
0,1,45.0,1,0,83.475,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 2, 4, 5), List(1.0, 45.0, 1.0, 83.475, 1.0))","List(1, 2, List(), List(64.0, 38.0))","List(1, 2, List(), List(0.6274509803921569, 0.37254901960784315))",0.0


###### Gradient-boosted tree classifier

In [88]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="Survived", featuresCol="features",maxIter=10)
gbt_model = gbt.fit(trainingData)
gbt_prediction = gbt_model.transform(testData)
gbt_prediction.select("prediction", "Survived", "features").show()


###### Evaluate accuracy of Gradient-boosted.

In [90]:
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Accuracy of Gradient-boosted tree classifie is = %g"% (gbt_accuracy))
print("Test Error of Gradient-boosted tree classifie %g"% (1.0 - gbt_accuracy))


In [91]:
display(gbt_prediction)

Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features,rawPrediction,probability,prediction
0,1,19.0,3,2,263.0,5,0,0.0,0.0,0.0,"List(1, 10, List(), List(1.0, 19.0, 3.0, 2.0, 263.0, 5.0, 0.0, 0.0, 0.0, 0.0))","List(1, 2, List(), List(0.11490109912042995, -0.11490109912042995))","List(1, 2, List(), List(0.5571990518653402, 0.44280094813465976))",0.0
0,1,27.0,0,2,211.5,2,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 27.0, 0.0, 2.0, 211.5, 2.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(-0.3710794384516155, 0.3710794384516155))","List(1, 2, List(), List(0.32253223732789243, 0.6774677626721075))",1.0
0,1,28.0,0,0,47.1,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 28.0, 47.1, 1.0))","List(1, 2, List(), List(0.04048429804784651, -0.04048429804784651))","List(1, 2, List(), List(0.5202310974542518, 0.4797689025457482))",0.0
0,1,28.0,1,0,82.1708,1,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 28.0, 1.0, 0.0, 82.1708, 1.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(-0.3030161439272254, 0.3030161439272254))","List(1, 2, List(), List(0.3529648170977369, 0.6470351829022631))",1.0
0,1,33.0,0,0,26.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 33.0, 26.0, 1.0))","List(1, 2, List(), List(-0.2710071905243533, 0.2710071905243533))","List(1, 2, List(), List(0.36771911043963923, 0.6322808895603608))",1.0
0,1,33.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 33.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(0.5981255774179265, -0.5981255774179265))","List(1, 2, List(), List(0.7678572135527792, 0.23214278644722075))",0.0
0,1,38.0,0,1,153.4625,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 3, 4, 5), List(1.0, 38.0, 1.0, 153.4625, 1.0))","List(1, 2, List(), List(0.13626864722362206, -0.13626864722362206))","List(1, 2, List(), List(0.5677157006977246, 0.4322842993022754))",0.0
0,1,40.0,0,0,0.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 6), List(1.0, 40.0, 1.0))","List(1, 2, List(), List(1.3334511729565355, -1.3334511729565355))","List(1, 2, List(), List(0.9350451464879072, 0.06495485351209285))",0.0
0,1,40.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 40.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(0.5981255774179265, -0.5981255774179265))","List(1, 2, List(), List(0.7678572135527792, 0.23214278644722075))",0.0
0,1,45.0,1,0,83.475,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 2, 4, 5), List(1.0, 45.0, 1.0, 83.475, 1.0))","List(1, 2, List(), List(0.056079090946451554, -0.056079090946451554))","List(1, 2, List(), List(0.5280101888785068, 0.4719898111214932))",0.0


###### NaiveBayes

In [93]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol="Survived", featuresCol="features")
nb_model = nb.fit(trainingData)
nb_prediction = nb_model.transform(testData)
nb_prediction.select("prediction", "Survived", "features").show()


###### Evaluating accuracy of NaiveBayes.

In [95]:
nb_accuracy = evaluator.evaluate(nb_prediction)
print("Accuracy of NaiveBayes is  = %g"% (nb_accuracy))
print("Test Error of NaiveBayes  = %g " % (1.0 - nb_accuracy))

In [96]:
display(nb_prediction)

Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features,rawPrediction,probability,prediction
0,1,19.0,3,2,263.0,5,0,0.0,0.0,0.0,"List(1, 10, List(), List(1.0, 19.0, 3.0, 2.0, 263.0, 5.0, 0.0, 0.0, 0.0, 0.0))","List(1, 2, List(), List(-321.8003252617946, -216.79984487784156))","List(1, 2, List(), List(2.505363650279672E-46, 1.0))",1.0
0,1,27.0,0,2,211.5,2,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 27.0, 0.0, 2.0, 211.5, 2.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(-254.30830190605852, -173.88973570251883))","List(1, 2, List(), List(1.1875733830285463E-35, 1.0))",1.0
0,1,28.0,0,0,47.1,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 28.0, 47.1, 1.0))","List(1, 2, List(), List(-72.51027732344984, -65.81252050868913))","List(1, 2, List(), List(0.0012321560837094655, 0.9987678439162906))",1.0
0,1,28.0,1,0,82.1708,1,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 28.0, 1.0, 0.0, 82.1708, 1.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(-116.94870173250123, -94.51672095117122))","List(1, 2, List(), List(1.8109836458891784E-10, 0.9999999998189015))",1.0
0,1,33.0,0,0,26.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 33.0, 26.0, 1.0))","List(1, 2, List(), List(-54.768989285858126, -59.649355474039695))","List(1, 2, List(), List(0.9924630050923153, 0.007536994907684597))",0.0
0,1,33.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 33.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(-61.67546270204022, -65.78070185849455))","List(1, 2, List(), List(0.9837813069839789, 0.016218693016021028))",0.0
0,1,38.0,0,1,153.4625,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 3, 4, 5), List(1.0, 38.0, 1.0, 153.4625, 1.0))","List(1, 2, List(), List(-189.20027935668622, -139.1942222205373))","List(1, 2, List(), List(1.9171024580616214E-22, 1.0))",1.0
0,1,40.0,0,0,0.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 6), List(1.0, 40.0, 1.0))","List(1, 2, List(), List(-33.439670575380546, -52.95838125168652))","List(1, 2, List(), List(0.9999999966647248, 3.3352750752654846E-9))",0.0
0,1,40.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 40.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(-66.11467452713362, -73.31963652881278))","List(1, 2, List(), List(0.9992576610523204, 7.423389476796005E-4))",0.0
0,1,45.0,1,0,83.475,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 2, 4, 5), List(1.0, 45.0, 1.0, 83.475, 1.0))","List(1, 2, List(), List(-123.82124073819823, -108.34981107816128))","List(1, 2, List(), List(1.9091646683954347E-7, 0.9999998090835331))",1.0


###### Support Vector Machine

In [98]:
from pyspark.ml.classification import LinearSVC
svm = LinearSVC(labelCol="Survived", featuresCol="features")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)
svm_prediction.select("prediction", "Survived", "features").show()


###### Evaluating the accuracy of Support Vector Machine.

In [100]:
svm_accuracy = evaluator.evaluate(svm_prediction)
print("Accuracy of Support Vector Machine is = %g"% (svm_accuracy))
print("Test Error of Support Vector Machine = %g " % (1.0 - svm_accuracy))

In [101]:
display(svm_prediction)

Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features,rawPrediction,prediction
0,1,19.0,3,2,263.0,5,0,0.0,0.0,0.0,"List(1, 10, List(), List(1.0, 19.0, 3.0, 2.0, 263.0, 5.0, 0.0, 0.0, 0.0, 0.0))","List(1, 2, List(), List(1.455265908308349, -1.455265908308349))",0.0
0,1,27.0,0,2,211.5,2,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 27.0, 0.0, 2.0, 211.5, 2.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(0.2339648610258473, -0.2339648610258473))",0.0
0,1,28.0,0,0,47.1,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 28.0, 47.1, 1.0))","List(1, 2, List(), List(0.7498990361269506, -0.7498990361269506))",0.0
0,1,28.0,1,0,82.1708,1,0,0.0,1.0,0.0,"List(1, 10, List(), List(1.0, 28.0, 1.0, 0.0, 82.1708, 1.0, 0.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(0.5483388263838227, -0.5483388263838227))",0.0
0,1,33.0,0,0,26.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 4, 6), List(1.0, 33.0, 26.0, 1.0))","List(1, 2, List(), List(0.9004670381472674, -0.9004670381472674))",0.0
0,1,33.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 33.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(0.8608025283653931, -0.8608025283653931))",0.0
0,1,38.0,0,1,153.4625,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 3, 4, 5), List(1.0, 38.0, 1.0, 153.4625, 1.0))","List(1, 2, List(), List(0.26559184708678313, -0.26559184708678313))",0.0
0,1,40.0,0,0,0.0,0,1,0.0,0.0,0.0,"List(0, 10, List(0, 1, 6), List(1.0, 40.0, 1.0))","List(1, 2, List(), List(1.095790069538479, -1.095790069538479))",0.0
0,1,40.0,0,0,27.7208,0,1,0.0,1.0,0.0,"List(0, 10, List(0, 1, 4, 6, 8), List(1.0, 40.0, 27.7208, 1.0, 1.0))","List(1, 2, List(), List(0.9424881424210036, -0.9424881424210036))",0.0
0,1,45.0,1,0,83.475,1,0,0.0,0.0,0.0,"List(0, 10, List(0, 1, 2, 4, 5), List(1.0, 45.0, 1.0, 83.475, 1.0))","List(1, 2, List(), List(0.7731614081464582, -0.7731614081464582))",0.0
