In [1]:
import findspark
findspark.init()
findspark.find()

from IPython.display import *
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import pyspark
print(pyspark.__version__)

3.5.2


<b>Dataset location: </b>https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

The data is in the format value1, value2... <br />
The leading whitespace for each value needs to be removed

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('Predicting whether a person\'s income is greater than $50K') \
    .getOrCreate()

rawData = spark.read\
            .format('csv')\
            .option('header', 'false')\
            .option('ignoreLeadingWhiteSpace', 'true')\
            .load('datasets/adult.csv')

#### Specify column headers for data set

In [2]:
dataset = rawData.toDF('Age',
               'WorkClass',
               'FnlWgt',
               'Education',
               'EducationNum',
               'MaritalStatus',
               'Occupation',
               'Relationship',
               'Race',
               'Gender',
               'CapitalGain',
               'CapitalLoss',
               'HoursPerWeek',
               'NativeCountry',
               'Label'
                )

In [3]:
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Drop FnlWgt column which does not appear meaningful

In [5]:
dataset = dataset.drop('FnlWgt')

#### Examine the dataset
* The FnlWgt column has been dropped
* There are missing values in the data represented by '?' (e.g. line 32541 for column WorkClass)

In [6]:
dataset.toPandas()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### Count rows in dataset

In [7]:
dataset.count()

32561

#### Convert missing values to null
Missing values in this dataset are represented by ?

In [8]:
dataset = dataset.replace('?', None)

#### Drop all rows which contain even a single missing value
The value 'any' for parameter how specifies that even a single missing value in a row should result in it being dropped (as opposed to 'all' where all values need to be missing)

In [9]:
dataset = dataset.dropna(how='any')

#### Number of rows has reduced now

In [10]:
dataset.count()

30162

#### Confirm missing value rows are not there
Row 32541 for example

In [11]:
dataset.toPandas()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30158,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30159,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30160,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### View the data types for all the columns
Since they have all been loaded as Strings, we need to convert the numeric fields to Float

In [12]:
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, Education: string, EducationNum: string, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [13]:
# Check the updated schema
dataset.printSchema()


root
 |-- Age: string (nullable = true)
 |-- WorkClass: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationNum: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- CapitalGain: string (nullable = true)
 |-- CapitalLoss: string (nullable = true)
 |-- HoursPerWeek: string (nullable = true)
 |-- NativeCountry: string (nullable = true)
 |-- Label: string (nullable = true)



In [14]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

dataset = dataset.withColumn('Age', 
                             dataset['Age'].cast(FloatType()))
dataset = dataset.withColumn('EducationNum', 
                             dataset['EducationNum'].cast(FloatType()))
dataset = dataset.withColumn('CapitalGain', 
                             dataset['CapitalGain'].cast(FloatType()))
dataset = dataset.withColumn('CapitalLoss', 
                             dataset['CapitalLoss'].cast(FloatType()))
dataset = dataset.withColumn('HoursPerWeek', 
                             dataset['HoursPerWeek'].cast(FloatType()))

dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [15]:
# Check the updated schema
dataset.printSchema()


root
 |-- Age: float (nullable = true)
 |-- WorkClass: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationNum: float (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- CapitalGain: float (nullable = true)
 |-- CapitalLoss: float (nullable = true)
 |-- HoursPerWeek: float (nullable = true)
 |-- NativeCountry: string (nullable = true)
 |-- Label: string (nullable = true)



#### Transform categorical fields
First use StringIndexer to convert categorical values to indices

In [16]:
from pyspark.ml.feature import StringIndexer

indexedDF = StringIndexer(
    inputCol='WorkClass', 
    outputCol='WorkClass_index').fit(dataset).transform(dataset)

In [17]:
indexedDF.show(5)


+----+----------------+---------+------------+------------------+-----------------+-------------+-----+------+-----------+-----------+------------+-------------+-----+---------------+
| Age|       WorkClass|Education|EducationNum|     MaritalStatus|       Occupation| Relationship| Race|Gender|CapitalGain|CapitalLoss|HoursPerWeek|NativeCountry|Label|WorkClass_index|
+----+----------------+---------+------------+------------------+-----------------+-------------+-----+------+-----------+-----------+------------+-------------+-----+---------------+
|39.0|       State-gov|Bachelors|        13.0|     Never-married|     Adm-clerical|Not-in-family|White|  Male|     2174.0|        0.0|        40.0|United-States|<=50K|            3.0|
|50.0|Self-emp-not-inc|Bachelors|        13.0|Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|        0.0|        0.0|        13.0|United-States|<=50K|            1.0|
|38.0|         Private|  HS-grad|         9.0|          Divorced|Handlers-cleane

#### A new column called WorkClass_index is created
This stores the indexed values of WorkClass

In [18]:
indexedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0


#### OneHotEncoding
Use the new indexed field to obtain a one-hot-encoded field

In [19]:
from pyspark.ml.linalg import Vectors

# Dense vector with mostly zeros
dense_vec = Vectors.dense([0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 7.0])



sparse_vec = Vectors.sparse(9, [1, 4, 8], [3.0, 5.0, 7.0])

print(dense_vec)
print(sparse_vec)

[0.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,7.0]
(9,[1,4,8],[3.0,5.0,7.0])


In [20]:
from pyspark.ml.feature import OneHotEncoder


In [21]:

encodedDF = OneHotEncoder(
    inputCol="WorkClass_index", 
    outputCol="WorkClass_encoded").fit(indexedDF).transform(indexedDF)

In [22]:
encodedDF.show(5)

+----+----------------+---------+------------+------------------+-----------------+-------------+-----+------+-----------+-----------+------------+-------------+-----+---------------+-----------------+
| Age|       WorkClass|Education|EducationNum|     MaritalStatus|       Occupation| Relationship| Race|Gender|CapitalGain|CapitalLoss|HoursPerWeek|NativeCountry|Label|WorkClass_index|WorkClass_encoded|
+----+----------------+---------+------------+------------------+-----------------+-------------+-----+------+-----------+-----------+------------+-------------+-----+---------------+-----------------+
|39.0|       State-gov|Bachelors|        13.0|     Never-married|     Adm-clerical|Not-in-family|White|  Male|     2174.0|        0.0|        40.0|United-States|<=50K|            3.0|    (6,[3],[1.0])|
|50.0|Self-emp-not-inc|Bachelors|        13.0|Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|        0.0|        0.0|        13.0|United-States|<=50K|            1.0|    (6,[1

In [102]:
encodedDF.select('WorkClass_index', 'WorkClass_encoded').distinct().sort('WorkClass_index').show()

+---------------+-----------------+
|WorkClass_index|WorkClass_encoded|
+---------------+-----------------+
|            0.0|    (6,[0],[1.0])|
|            1.0|    (6,[1],[1.0])|
|            2.0|    (6,[2],[1.0])|
|            3.0|    (6,[3],[1.0])|
|            4.0|    (6,[4],[1.0])|
|            5.0|    (6,[5],[1.0])|
|            6.0|        (6,[],[])|
+---------------+-----------------+



#### A WorkClass_encoded field is created 
* This contains the one-hot-encoding for WorkClass
* This cannot operate directly on a column with string values - values need to be numeric. Hence we use the WorkClass_index as input

In [23]:
encodedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index,WorkClass_encoded
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


#### View the original and transformed fields together

In [24]:
encodedDF.select('WorkClass', 'WorkClass_index', 'WorkClass_encoded')\
         .toPandas()\
         .head()

Unnamed: 0,WorkClass,WorkClass_index,WorkClass_encoded
0,State-gov,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


### Transform the entire dataset
* So far we have only transformed a single column
* We need to perform this transformation for every categorical and non-numeric column
* This will be simplified by using a Pipeline (a feature of Spark ML)

####  First, split the data into training and test sets

In [25]:
(trainingData, testData) = dataset.randomSplit([0.8,0.2])

#### Encode all the categorical fields in the dataset
We begin by listing all the categorical fields

In [26]:
categoricalFeatures = [
               'WorkClass',
               'Education',
               'MaritalStatus',
               'Occupation',
               'Relationship',
               'Race',
               'Gender',
               'NativeCountry'
]

#### Create an array of StringIndexers to convert the categorical values to indices

In [27]:
indexers = [StringIndexer(
    inputCol=column, 
    outputCol=column + '_index', 
    handleInvalid='keep') for column in categoricalFeatures]

#### Apply on Dataset

In [28]:
from pyspark.ml.feature import StringIndexer

# categorical features
categorical_features = ['WorkClass', 'Education', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'NativeCountry']

indexed_dataset = dataset

for column in categorical_features:
    indexer = StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    indexed_dataset = indexer.fit(indexed_dataset).transform(indexed_dataset)

# Show the transformed dataset
indexed_dataset.show(5)


+----+----------------+---------+------------+------------------+-----------------+-------------+-----+------+-----------+-----------+------------+-------------+-----+---------------+---------------+-------------------+----------------+------------------+----------+------------+-------------------+
| Age|       WorkClass|Education|EducationNum|     MaritalStatus|       Occupation| Relationship| Race|Gender|CapitalGain|CapitalLoss|HoursPerWeek|NativeCountry|Label|WorkClass_index|Education_index|MaritalStatus_index|Occupation_index|Relationship_index|Race_index|Gender_index|NativeCountry_index|
+----+----------------+---------+------------+------------------+-----------------+-------------+-----+------+-----------+-----------+------------+-------------+-----+---------------+---------------+-------------------+----------------+------------------+----------+------------+-------------------+
|39.0|       State-gov|Bachelors|        13.0|     Never-married|     Adm-clerical|Not-in-family|Whi

<!--  -->

#### Create an array of OneHotEncoders to encode the categorical values

In [29]:
encoders = [OneHotEncoder(
    inputCol=column + '_index', 
    outputCol= column + '_encoded') for column in categoricalFeatures]

#### Index the Label field

In [30]:
labelIndexer = [StringIndexer(
    inputCol='Label', outputCol='Label_index')]

In [111]:
label_indexer = labelIndexer[0].fit(dataset)

transformed_dataset = label_indexer.transform(dataset)

transformed_dataset.select('Label', 'Label_index').show(10)


+-----+-----------+
|Label|Label_index|
+-----+-----------+
|<=50K|        0.0|
|<=50K|        0.0|
|<=50K|        0.0|
|<=50K|        0.0|
|<=50K|        0.0|
|<=50K|        0.0|
|<=50K|        0.0|
| >50K|        1.0|
| >50K|        1.0|
| >50K|        1.0|
+-----+-----------+
only showing top 10 rows



#### Create a pipeline
The pipeline contains the array of StringIndexers and OneHotEncoders

In [32]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers + encoders + labelIndexer)

#### View the result of the transformations performed by this pipeline
This pipeline can transform our dataset into a format which can be used by our model

In [33]:
transformedDF = pipeline.fit(trainingData).transform(trainingData)
transformedDF.toPandas().tail()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,NativeCountry_index,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index
24180,90.0,Private,Some-college,10.0,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0.0,...,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24181,90.0,Private,Some-college,10.0,Separated,Adm-clerical,Own-child,White,Female,0.0,...,6.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0
24182,90.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,10566.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24183,90.0,Self-emp-not-inc,HS-grad,9.0,Never-married,Exec-managerial,Not-in-family,White,Male,2964.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24184,90.0,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


#### Select the required features
At this point the dataset contains a lot of additional columns. We select the features needed by our model

In [34]:
requiredFeatures = [
    'Age',
    'EducationNum',
    'CapitalGain',
    'CapitalLoss',
    'HoursPerWeek',
    'WorkClass_encoded',
    'Education_encoded',
    'MaritalStatus_encoded',
    'Occupation_encoded',
    'Relationship_encoded',
    'Race_encoded',
    'Gender_encoded',
    'NativeCountry_encoded'
]

#### VectorAssembler
VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector
* We had previously written our own function to create such a vector

In [35]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=requiredFeatures, outputCol='features')

In [36]:
transformedDF = assembler.transform(transformedDF)
transformedDF.toPandas().tail()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features
24180,90.0,Private,Some-college,10.0,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 35.0, 1.0, 0.0, 0.0, 0...."
24181,90.0,Private,Some-college,10.0,Separated,Adm-clerical,Own-child,White,Female,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0...."
24182,90.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,10566.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 13.0, 10566.0, 0.0, 50.0, 0.0, 1.0, 0.0..."
24183,90.0,Self-emp-not-inc,HS-grad,9.0,Never-married,Exec-managerial,Not-in-family,White,Male,2964.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 9.0, 2964.0, 0.0, 12.0, 0.0, 1.0, 0.0, ..."
24184,90.0,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0...."


In [37]:
transformedDF.select('features').toPandas().tail()

Unnamed: 0,features
24180,"(90.0, 10.0, 0.0, 0.0, 35.0, 1.0, 0.0, 0.0, 0...."
24181,"(90.0, 10.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0...."
24182,"(90.0, 13.0, 10566.0, 0.0, 50.0, 0.0, 1.0, 0.0..."
24183,"(90.0, 9.0, 2964.0, 0.0, 12.0, 0.0, 1.0, 0.0, ..."
24184,"(90.0, 10.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0...."


#### Specify our estimator

In [53]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Label_index', 
                            featuresCol='features',
                            maxDepth=25)

#### Final Pipeline
* The pipeline we built previously only transformed the feature columns
* We re-create the pipeline to include the VectorAssembler and the estimator

The pipeline to be used to build the model contains all the transformers and ends with the estimator

In [54]:
pipeline = Pipeline(
    stages=indexers + encoders + labelIndexer + [assembler, rf]
)

#### Train the model

In [55]:
model = pipeline.fit(trainingData)

#### Use the test data for predictions

In [56]:
predictions = model.transform(testData)
predictionsDF = predictions.toPandas()
predictionsDF.head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
0,17.0,Local-gov,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 25.0, 0.0, 0.0, 1.0, 0.0...","[19.98840335019393, 0.01159664980607026]","[0.9994201675096965, 0.000579832490303513]",0.0
1,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,Black,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0...","[19.867924180593015, 0.1320758194069858]","[0.9933962090296508, 0.00660379097034929]",0.0
2,17.0,Local-gov,11th,7.0,Never-married,Sales,Own-child,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 20.0, 0.0, 0.0, 1.0, 0.0...","[19.987580170273723, 0.012419829726277316]","[0.9993790085136862, 0.0006209914863138658]",0.0
3,17.0,Local-gov,9th,5.0,Never-married,Other-service,Own-child,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 5.0, 0.0, 0.0, 45.0, 0.0, 0.0, 1.0, 0.0...","[19.948228559806296, 0.051771440193701744]","[0.997411427990315, 0.0025885720096850877]",0.0
4,17.0,Private,10th,6.0,Never-married,Handlers-cleaners,Own-child,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[19.982723037941653, 0.017276962058347195]","[0.9991361518970827, 0.0008638481029173597]",0.0


#### Select the correct label and predictions to evaluate the model

In [57]:
predictions = predictions.select(
    'Label_index',
    'prediction'
)

#### Create an evaluator for our model

In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol='Label_index', 
    predictionCol='prediction', 
    metricName='accuracy')

#### Check the accuracy

In [59]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.8500920194077296


#### Show distinct classes in the Label_index column


In [60]:
distinct_classes = predictions.select('Label_index').distinct().collect()
classes = [row['Label_index'] for row in distinct_classes]

print(f"Distinct classes in Label_index: {classes}")
print(f"Number of classes: {len(classes)}")


Distinct classes in Label_index: [0.0, 1.0]
Number of classes: 2


In [61]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize the evaluator for precision
evaluator = MulticlassClassificationEvaluator(
    labelCol='Label_index', 
    predictionCol='prediction', 
    metricName='precisionByLabel'
)

# Evaluate precision for class 0
precision_class_0 = evaluator.evaluate(predictions, {evaluator.metricLabel: 0.0})
print(f"Precision for class 0: {precision_class_0:.4f}")

# Evaluate precision for class 1
precision_class_1 = evaluator.evaluate(predictions, {evaluator.metricLabel: 1.0})
print(f"Precision for class 1: {precision_class_1:.4f}")


Precision for class 0: 0.8821
Precision for class 1: 0.7313


In [62]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize the evaluator for recall
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol='Label_index', 
    predictionCol='prediction', 
    metricName='recallByLabel'
)

# Evaluate recall for class 0
recall_class_0 = evaluator_recall.evaluate(predictions, {evaluator_recall.metricLabel: 0.0})
print(f"Recall for class 0: {recall_class_0:.4f}")

# Evaluate recall for class 1
recall_class_1 = evaluator_recall.evaluate(predictions, {evaluator_recall.metricLabel: 1.0})
print(f"Recall for class 1: {recall_class_1:.4f}")


Recall for class 0: 0.9241
Recall for class 1: 0.6258


#### Examine incorrect predictions

In [128]:
predictionsDF.loc[
    predictionsDF['Label_index'] != predictionsDF['prediction']
]

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
279,20.0,Private,HS-grad,9.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,3781.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(20.0, 9.0, 3781.0, 0.0, 50.0, 1.0, 0.0, 0.0, ...","[9.029516716843807, 10.970483283156195]","[0.4514758358421903, 0.5485241641578098]",1.0
469,21.0,Private,Some-college,10.0,Never-married,Protective-serv,Not-in-family,Black,Female,99999.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(21.0, 10.0, 99999.0, 0.0, 40.0, 1.0, 0.0, 0.0...","[12.028002534272472, 7.971997465727529]","[0.6014001267136236, 0.39859987328637647]",0.0
495,22.0,Local-gov,5th-6th,3.0,Never-married,Handlers-cleaners,Other-relative,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 3.0, 0.0, 0.0, 40.0, 0.0, 0.0, 1.0, 0.0...","[18.80034366774818, 1.1996563322518188]","[0.940017183387409, 0.05998281661259094]",0.0
508,22.0,Private,11th,7.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 7.0, 0.0, 0.0, 70.0, 1.0, 0.0, 0.0, 0.0...","[13.96573353379545, 6.0342664662045475]","[0.6982866766897725, 0.3017133233102274]",0.0
526,22.0,Private,Bachelors,13.0,Never-married,Craft-repair,Not-in-family,White,Male,13550.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 13.0, 13550.0, 0.0, 55.0, 1.0, 0.0, 0.0...","[11.253600543431887, 8.746399456568115]","[0.5626800271715944, 0.43731997282840573]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6074,77.0,Private,Prof-school,15.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(77.0, 15.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0....","[9.623088712883456, 10.376911287116544]","[0.4811544356441728, 0.5188455643558272]",1.0
6084,79.0,Self-emp-inc,Bachelors,13.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(79.0, 13.0, 0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 0....","[10.477865705833935, 9.522134294166065]","[0.5238932852916968, 0.47610671470830324]",0.0
6086,80.0,Private,Doctorate,16.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(80.0, 16.0, 0.0, 0.0, 30.0, 1.0, 0.0, 0.0, 0....","[8.852424931152314, 11.147575068847685]","[0.4426212465576157, 0.5573787534423842]",1.0
6097,88.0,Self-emp-not-inc,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(88.0, 15.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0....","[8.46250580541118, 11.537494194588819]","[0.423125290270559, 0.5768747097294409]",1.0
