# **Classification using PySpark**

## **Configuration**

In [None]:
#Installation
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [None]:
# Initialise findspark so the Spark installation configured above can be imported as `pyspark`
import findspark
findspark.init()

In [None]:
# Start (or reuse) the Spark session that every later cell works with
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ClassificationwithSpark")
    .getOrCreate()
)

## **Data Load and Explore**

In [None]:
#import necessary functions
from itertools import chain
from pyspark.sql.functions import count, mean, when, lit, create_map, regexp_extract

In [None]:
# Load the train and test CSV files: the first row is the header and
# column types are inferred from the data
csv_options = dict(header=True, inferSchema=True)
df1 = spark.read.csv('/content/train.csv', **csv_options)
df2 = spark.read.csv('/content/test.csv', **csv_options)

In [None]:
# Print the name, inferred type and nullability of every training column
df1.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [None]:
# Preview the first 4 rows of the training DataFrame
df1.show(4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 4 rows



In [None]:
# Convert the first 100 rows to a pandas DataFrame to view the data in a more readable format
df1.limit(100).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.0500,,S
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C
98,99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34.0,0,1,231919,23.0000,,S


In [None]:
# Show the first 4 rows of a few selected columns
df1.select('Survived', 'Pclass', 'Age', 'Fare').show(4)

+--------+------+----+-------+
|Survived|Pclass| Age|   Fare|
+--------+------+----+-------+
|       0|     3|22.0|   7.25|
|       1|     1|38.0|71.2833|
|       1|     3|26.0|  7.925|
|       1|     1|35.0|   53.1|
+--------+------+----+-------+
only showing top 4 rows



In [None]:
# Summary statistics for the selected columns (describe() can also be used)
summary_cols = ['Survived', 'Pclass', 'Age', 'Fare']
df1.select(*summary_cols).summary().show()

+-------+-------------------+------------------+------------------+-----------------+
|summary|           Survived|            Pclass|               Age|             Fare|
+-------+-------------------+------------------+------------------+-----------------+
|  count|                891|               891|               714|              891|
|   mean| 0.3838383838383838| 2.308641975308642| 29.69911764705882| 32.2042079685746|
| stddev|0.48659245426485753|0.8360712409770491|14.526497332334035|49.69342859718089|
|    min|                  0|                 1|              0.42|              0.0|
|    25%|                  0|                 2|              20.0|           7.8958|
|    50%|                  0|                 3|              28.0|          14.4542|
|    75%|                  1|                 3|              38.0|             31.0|
|    max|                  1|                 3|              80.0|         512.3292|
+-------+-------------------+------------------+------------------+-----------------+

In [None]:
# Shape of the training DataFrame: row count and column count
n_rows = df1.count()
n_cols = len(df1.columns)
print('Number of rows: \t', n_rows)
print('Number of columns: \t', n_cols)

Number of rows: 	 891
Number of columns: 	 12


## **Exploratory Data Analysis**

In [None]:
# Number of passengers for each value of Survived
df1.groupBy('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [None]:
# Mean Fare and mean Age within each Survived group (continuous variables)
df1.groupBy('Survived').mean('Fare', 'Age').show()

+--------+------------------+------------------+
|Survived|         avg(Fare)|          avg(Age)|
+--------+------------------+------------------+
|       1| 48.39540760233917|28.343689655172415|
|       0|22.117886885245877| 30.62617924528302|
+--------+------------------+------------------+



In [None]:
# Passenger counts for each Survived x Sex combination (categorical variable)
df1.groupBy('Survived').pivot('Sex').count().show()

+--------+------+----+
|Survived|female|male|
+--------+------+----+
|       1|   233| 109|
|       0|    81| 468|
+--------+------+----+



In [None]:
# Effect of passenger class on survival: counts for each Survived x Pclass combination
df1.groupBy('Survived').pivot('Pclass').count().show()

+--------+---+---+---+
|Survived|  1|  2|  3|
+--------+---+---+---+
|       1|136| 87|119|
|       0| 80| 97|372|
+--------+---+---+---+



In [None]:
# Effect of the SibSp count on survival: counts for each Survived x SibSp combination
df1.groupBy('Survived').pivot('SibSp').count().show()

+--------+---+---+---+---+---+----+----+
|Survived|  0|  1|  2|  3|  4|   5|   8|
+--------+---+---+---+---+---+----+----+
|       1|210|112| 13|  4|  3|null|null|
|       0|398| 97| 15| 12| 15|   5|   7|
+--------+---+---+---+---+---+----+----+



In [None]:
# Print the number of null values in each column (train set)
for column_name in df1.columns:
    n_missing = df1.filter(df1[column_name].isNull()).count()
    print(column_name.ljust(15), n_missing)

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             177
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin           687
Embarked        2


In [None]:
# Print the number of null values in each column (test set)
for column_name in df2.columns:
    n_missing = df2.filter(df2[column_name].isNull()).count()
    print(column_name.ljust(15), n_missing)

PassengerId     0
Pclass          0
Name            0
Sex             0
Age             86
SibSp           0
Parch           0
Ticket          0
Fare            1
Cabin           327
Embarked        0


In [None]:
# Only 2 Embarked values are missing, so one option is to fill them with the most frequent value.
# NOTE(review): 'max' on the string column Embarked is the largest value in sort order,
# not necessarily the most frequent one - confirm with df1.groupBy('Embarked').count().
# The 50% row gives the Fare median, which is used as the fill value for Fare.
df1.select('Fare', 'Embarked').summary('max', "50%", "mean").show()

+-------+----------------+--------+
|summary|            Fare|Embarked|
+-------+----------------+--------+
|    max|        512.3292|       S|
|    50%|         14.4542|    null|
|   mean|32.2042079685746|    null|
+-------+----------------+--------+



In [None]:
# Fill missing Embarked with 'S' and missing Fare with 14.45 (the rounded Fare median above).
# The train set has no missing Fare; the Fare fill matters for the test set,
# where the same fill values are applied later.
df1 = df1.fillna({'Embarked': 'S', 'Fare':14.45})

In [None]:
# Extract each passenger's title (the run of letters right before a period in Name).
# Per-title age statistics are used later to fill the missing ages.
# The pattern is a raw string so that `\.` is not treated as a (deprecated,
# invalid) Python string escape.
df1 = df1.withColumn('Title', regexp_extract(df1['Name'], r'([A-Za-z]+)\.', 1))

df1.groupBy('Title').agg(count('Age'), mean('Age')).sort('count(Age)').show()

+--------+----------+------------------+
|   Title|count(Age)|          avg(Age)|
+--------+----------+------------------+
|Countess|         1|              33.0|
|     Don|         1|              40.0|
|    Lady|         1|              48.0|
|     Sir|         1|              49.0|
|Jonkheer|         1|              38.0|
|     Mme|         1|              24.0|
|    Capt|         1|              70.0|
|      Ms|         1|              28.0|
|     Col|         2|              58.0|
|    Mlle|         2|              24.0|
|   Major|         2|              48.5|
|     Rev|         6|43.166666666666664|
|      Dr|         6|              42.0|
|  Master|        36| 4.574166666666667|
|     Mrs|       108|35.898148148148145|
|    Miss|       146|21.773972602739725|
|      Mr|       398|32.368090452261306|
+--------+----------+------------------+



In [None]:
# Collapse the rare titles into four groups: Mr, Mrs, Miss and Master
# NOTE(review): 'Mme' is mapped to 'Miss' here - confirm this grouping is intended.
title_dic = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr',
    'Don': 'Mr', 'Mme': 'Miss', 'Jonkheer': 'Mr', 'Lady': 'Mrs',
    'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs',
    'Dr': 'Mr', 'Rev': 'Mr',
}

# Build a Spark map column from the alternating key, value, key, value, ... literals
map_entries = []
for raw_title, grouped_title in title_dic.items():
    map_entries.append(lit(raw_title))
    map_entries.append(lit(grouped_title))
mapping = create_map(map_entries)

# Replace each raw title with its group and show the mean age per group
df1 = df1.withColumn('Title', mapping[df1['Title']])
df1.groupBy('Title').mean('Age').show()

+------+------------------+
| Title|          avg(Age)|
+------+------------------+
|  Miss|             21.86|
|Master| 4.574166666666667|
|    Mr| 33.02272727272727|
|   Mrs|35.981818181818184|
+------+------------------+



In [None]:
# Preview the training data with the new Title column
df1.show(4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Title|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|   Mr|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|  Mrs|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S| Miss|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|  Mrs|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+
only showing top 4 rows



In [None]:
# Helper that fills null ages for one title group
def age_imputer(df, title, age):
    """Return `df` with null Age values of one title group replaced.

    Args:
        df: DataFrame with 'Age' and 'Title' columns.
        title: Title value whose rows should be filled.
        age: Value written to 'Age' where it is null and 'Title' equals `title`.

    Returns:
        A new DataFrame; every other row keeps its existing 'Age'.
    """
    needs_fill = (df['Age'].isNull()) & (df['Title'] == title)
    filled_age = when(needs_fill, age).otherwise(df['Age'])
    return df.withColumn('Age', filled_age)

In [None]:
# Fill missing ages with the mean age of each title group, rounded to two
# decimals from the groupBy('Title').mean('Age') output
df1 = age_imputer(df1, 'Mr', 33.02)
df1 = age_imputer(df1, 'Mrs', 35.98)
df1 = age_imputer(df1, 'Miss', 21.86)
# The Master group mean is 4.574..., i.e. 4.57 (was mistyped as 4.75)
df1 = age_imputer(df1, 'Master', 4.57)

## **Feature Engineering**

In [None]:
# Combine Parch and SibSp into a single FamilySize column, then drop the two originals
df1 = (
    df1
    .withColumn('FamilySize', df1['Parch'] + df1['SibSp'])
    .drop('Parch', 'SibSp')
)

In [None]:
# Drop the columns that are not used as model features.
# 'PassengerId' is spelled exactly as in the schema so the drop does not
# depend on case-insensitive column resolution; if it were silently skipped,
# the later "all columns but the first" feature selection would break.
df1 = df1.drop('PassengerId', 'Cabin', 'Name', 'Ticket', 'Title')

In [None]:
# Preview the prepared training data
df1.show(5)

+--------+------+------+----+-------+--------+----------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|FamilySize|
+--------+------+------+----+-------+--------+----------+
|       0|     3|  male|22.0|   7.25|       S|         1|
|       1|     1|female|38.0|71.2833|       C|         1|
|       1|     3|female|26.0|  7.925|       S|         0|
|       1|     1|female|35.0|   53.1|       S|         1|
|       0|     3|  male|35.0|   8.05|       S|         0|
+--------+------+------+----+-------+--------+----------+
only showing top 5 rows



In [None]:
# Check that no null values remain in the training set
for column_name in df1.columns:
    n_missing = df1.filter(df1[column_name].isNull()).count()
    print(column_name.ljust(15), n_missing)

Survived        0
Pclass          0
Sex             0
Age             0
Fare            0
Embarked        0
FamilySize      0


## **Model Build**

In [None]:
# importing required libraries
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression,\
                    RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Encode the categorical string columns as numeric indices
categorical_cols = ['Sex', 'Embarked']
stringIndex = StringIndexer(inputCols=categorical_cols,
                            outputCols=['SexNum', 'EmbNum'])

# Fit the indexer on the training data, then swap the string columns for their indices
stringIndex_model = stringIndex.fit(df1)
indexed = stringIndex_model.transform(df1)
df1_ = indexed.drop(*categorical_cols)
df1_.show(4)

+--------+------+----+-------+----------+------+------+
|Survived|Pclass| Age|   Fare|FamilySize|SexNum|EmbNum|
+--------+------+----+-------+----------+------+------+
|       0|     3|22.0|   7.25|         1|   0.0|   0.0|
|       1|     1|38.0|71.2833|         1|   1.0|   1.0|
|       1|     3|26.0|  7.925|         0|   1.0|   0.0|
|       1|     1|35.0|   53.1|         1|   1.0|   0.0|
+--------+------+----+-------+----------+------+------+
only showing top 4 rows



In [None]:
# Convert the dataset for modelling: assemble every column except the label
# into a single 'features' vector.
# The label is excluded by name rather than by position, so the feature list
# stays correct even if 'Survived' is not the first column of df1_.
feature_cols = [c for c in df1_.columns if c != 'Survived']
vec_asmbl = VectorAssembler(inputCols=feature_cols,
                            outputCol='features')

df1_ = vec_asmbl.transform(df1_).select('features', 'Survived')
df1_.show(4, truncate=False)

+------------------------------+--------+
|features                      |Survived|
+------------------------------+--------+
|[3.0,22.0,7.25,1.0,0.0,0.0]   |0       |
|[1.0,38.0,71.2833,1.0,1.0,1.0]|1       |
|[3.0,26.0,7.925,0.0,1.0,0.0]  |1       |
|[1.0,35.0,53.1,1.0,1.0,0.0]   |1       |
+------------------------------+--------+
only showing top 4 rows



In [None]:
# Create the train (80%) and validation (20%) sets.
# A fixed seed keeps the split, and so the validation scores below, the same across runs.
train_df, valid_df = df1_.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Accuracy evaluator shared by every model below; the label column is 'Survived'
evaluator = MulticlassClassificationEvaluator(labelCol='Survived',
                                          metricName='accuracy')

In [None]:
# Logistic regression with a ridge-style penalty (elasticNetParam=0, regParam=0.03)
ridge = LogisticRegression(labelCol='Survived',
                        maxIter=100,
                        elasticNetParam=0,
                        regParam=0.03)

# Fit on the training split and report accuracy on the validation split
model = ridge.fit(train_df)
pred = model.transform(valid_df)
evaluator.evaluate(pred)

0.7734806629834254

In [None]:
# Logistic regression with a lasso-style penalty (elasticNetParam=1, regParam=0.0003)
lasso = LogisticRegression(labelCol='Survived',
                           maxIter=100,
                           elasticNetParam=1,
                           regParam=0.0003)

# Fit on the training split and report accuracy on the validation split
model = lasso.fit(train_df)
pred = model.transform(valid_df)
evaluator.evaluate(pred)

0.7955801104972375

In [None]:
# Random forest classifier: 100 trees, maximum depth 3
# (this `rf` estimator is reused later as the last stage of the pipeline)
rf = RandomForestClassifier(labelCol='Survived',
                           numTrees=100, maxDepth=3)

# Fit on the training split and report accuracy on the validation split
model = rf.fit(train_df)
pred = model.transform(valid_df)
evaluator.evaluate(pred)

0.7845303867403315

In [None]:
# Gradient-boosted trees classifier: 100 iterations, maximum depth 3
gb = GBTClassifier(labelCol='Survived', maxIter=100, maxDepth=3)

# Fit on the training split and report accuracy on the validation split
model = gb.fit(train_df)
pred = model.transform(valid_df)
evaluator.evaluate(pred)

0.8066298342541437

## **Test Data Preparation**

In [None]:
# Re-read the test CSV so the preparation below starts from the raw test data
df2 = spark.read.csv('/content/test.csv',
                     header=True, inferSchema=True)

In [None]:
# Prepare the test dataset with the same fill values and FamilySize feature as the train set
df2 = df2.fillna({'Embarked': 'S', 'Fare':14.45})
df2 = (
    df2
    .withColumn('FamilySize', df2['Parch'] + df2['SibSp'])
    .drop('Parch', 'SibSp')
)

In [None]:
# Same title extraction as for the train set: take the run of letters right
# before a period in Name. The pattern is a raw string so that `\.` is not
# treated as a (deprecated, invalid) Python string escape.
df2 = df2.withColumn('Title', regexp_extract(df2['Name'], r'([A-Za-z]+)\.', 1))

# Group the raw titles with the same mapping used for the train set
df2 = df2.withColumn('Title', mapping[df2['Title']])

df2.groupBy('Title').agg(count('Age'), mean('Age')).sort('count(Age)').show()

+------+----------+------------------+
| Title|count(Age)|          avg(Age)|
+------+----------+------------------+
|Master|        17| 7.406470588235294|
|   Mrs|        63|38.904761904761905|
|  Miss|        64|21.774843750000002|
|    Mr|       188|32.340425531914896|
+------+----------+------------------+



In [None]:
# Fill missing test-set ages with the per-title mean ages shown above (rounded to two decimals)
test_title_ages = {'Mr': 32.34, 'Mrs': 38.90, 'Miss': 21.77, 'Master': 7.41}
for title, fill_age in test_title_ages.items():
    df2 = age_imputer(df2, title, fill_age)

# Drop the columns that are not model features; PassengerId is kept for the final predictions
unused_cols = ['Cabin', 'Name', 'Ticket', 'Title']
df2 = df2.drop(*unused_cols)
df2.show(4)

+-----------+------+------+----+------+--------+----------+
|PassengerId|Pclass|   Sex| Age|  Fare|Embarked|FamilySize|
+-----------+------+------+----+------+--------+----------+
|        892|     3|  male|34.5|7.8292|       Q|         0|
|        893|     3|female|47.0|   7.0|       S|         1|
|        894|     2|  male|62.0|9.6875|       Q|         0|
|        895|     3|  male|27.0|8.6625|       S|         0|
+-----------+------+------+----+------+--------+----------+
only showing top 4 rows



In [None]:
# Check that no null values remain in the test set
for column_name in df2.columns:
    n_missing = df2.filter(df2[column_name].isNull()).count()
    print(column_name.ljust(15), n_missing)

PassengerId     0
Pclass          0
Sex             0
Age             0
Fare            0
Embarked        0
FamilySize      0


## **Introduction of Pipeline**

In [None]:
# Pipeline: index the categorical columns, assemble the feature vector, fit a random forest
pipeline_rf = Pipeline(stages=[stringIndex, vec_asmbl, rf])

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate settings
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3, 4, 5])
    .addGrid(rf.minInfoGain, [0.001, 0.01, 0.1])
    .addGrid(rf.numTrees, [100, 500, 1000])
    .build()
)

# 5-fold cross-validation over the grid; the fixed seed makes the fold
# assignment reproducible across runs
selected_model = CrossValidator(estimator=pipeline_rf,
                                estimatorParamMaps=paramGrid,
                                evaluator=evaluator,
                                numFolds=5,
                                seed=42)

model_final = selected_model.fit(df1)

# NOTE: this accuracy is measured on the data the model was fitted on,
# so it is a training score, not a held-out estimate
pred_train = model_final.transform(df1)
evaluator.evaluate(pred_train)

0.8484848484848485

In [None]:
# Predict on the prepared test data
pred_test = model_final.transform(df2)

# Keep only the passenger id and the prediction, cast to integer and named 'Survived'
survived_col = pred_test['prediction'].cast('integer').alias('Survived')
predictions = pred_test.select('PassengerId', survived_col)
predictions.show(5)

+-----------+--------+
|PassengerId|Survived|
+-----------+--------+
|        892|       0|
|        893|       0|
|        894|       0|
|        895|       0|
|        896|       1|
+-----------+--------+
only showing top 5 rows



In [None]:
# Save the fitted cross-validated model. overwrite() lets this cell be re-run
# without failing because the output path already exists.
model_final.write().overwrite().save('titanic_classification.model')

In [None]:
# Load the saved model back from disk.
# model_final was produced by CrossValidator.fit, so it is read back with CrossValidatorModel.
from pyspark.ml.tuning import CrossValidatorModel

loaded_model = CrossValidatorModel.load('titanic_classification.model')