In [211]:
from pyspark.sql import SparkSession

In [213]:
spark = SparkSession.builder.appName("Random_Forest_Customer_Banking").getOrCreate()
customer_banking = spark.read.csv("data-resources/customer_banking/", header = True, inferSchema = True)

In [214]:
customer_banking.show(5)

+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|       job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 59|    admin.|married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|
| 56|    admin.|married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41|technician|married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
| 55|  services|married|secondary|     no|   2476|    yes|  no|unknown|  5|  may|     579|       1|   -1|       0| unknown|    yes|
| 54|    admin.|married| tertiary|     no|    184|     no|  no|unknown|  5| 

In [215]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator, OneHotEncoder

In [23]:
df = customer_banking.select('age', 'job', 'marital', 
                             'education', 'default', 
                             'balance', 'housing', 
                             'loan', 'contact', 
                             'duration', 'campaign', 
                             'pdays', 'previous', 
                             'poutcome', 'deposit')
cols = df.columns
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [119]:
categoricalColumns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
stages = []

In [141]:
indexers = [StringIndexer(inputCol=column, outputCol="{0}_index".format(column)) for column in categoricalColumns]
pipeline = Pipeline(stages=indexers)
string_index_df = pipeline.fit(df).transform(df)

string_index_df.limit(10).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,...,poutcome,deposit,job_index,marital_index,education_index,default_index,housing_index,loan_index,contact_index,poutcome_index
0,59,admin.,married,secondary,no,2343,yes,no,unknown,1042,...,unknown,yes,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,56,admin.,married,secondary,no,45,no,no,unknown,1467,...,unknown,yes,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,41,technician,married,secondary,no,1270,yes,no,unknown,1389,...,unknown,yes,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,55,services,married,secondary,no,2476,yes,no,unknown,579,...,unknown,yes,4.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,admin.,married,tertiary,no,184,no,no,unknown,673,...,unknown,yes,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,42,management,single,tertiary,no,0,yes,yes,unknown,562,...,unknown,yes,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
6,56,management,married,tertiary,no,830,yes,yes,unknown,1201,...,unknown,yes,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
7,60,retired,divorced,secondary,no,545,yes,no,unknown,1030,...,unknown,yes,5.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0
8,37,technician,married,secondary,no,1,yes,no,unknown,608,...,unknown,yes,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,28,services,single,secondary,no,5090,yes,no,unknown,1297,...,unknown,yes,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [148]:
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

In [151]:
encode_vector = encoder.fit(string_index_df)
encode_vector_df = encode_vector.transform(string_index_df)

In [153]:
encode_vector_df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,...,contact_index,poutcome_index,contact_index_encoded,poutcome_index_encoded,marital_index_encoded,loan_index_encoded,default_index_encoded,housing_index_encoded,education_index_encoded,job_index_encoded
0,59,admin.,married,secondary,no,2343,yes,no,unknown,1042,...,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,56,admin.,married,secondary,no,45,no,no,unknown,1467,...,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(1.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,41,technician,married,secondary,no,1270,yes,no,unknown,1389,...,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,55,services,married,secondary,no,2476,yes,no,unknown,579,...,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,54,admin.,married,tertiary,no,184,no,no,unknown,673,...,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(1.0),"(0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [154]:
labeling_deposit = StringIndexer(inputCol = 'deposit', outputCol = 'label')

In [155]:
labeling_string = labeling_deposit.fit(encode_vector_df)
label_df = labeling_string.transform(encode_vector_df)

In [157]:
label_df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,...,poutcome_index,contact_index_encoded,poutcome_index_encoded,marital_index_encoded,loan_index_encoded,default_index_encoded,housing_index_encoded,education_index_encoded,job_index_encoded,label
0,59,admin.,married,secondary,no,2343,yes,no,unknown,1042,...,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
1,56,admin.,married,secondary,no,45,no,no,unknown,1467,...,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(1.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
2,41,technician,married,secondary,no,1270,yes,no,unknown,1389,...,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
3,55,services,married,secondary,no,2476,yes,no,unknown,579,...,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
4,54,admin.,married,tertiary,no,184,no,no,unknown,673,...,0.0,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(1.0),"(0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0


In [171]:
string_indexer_cols = []
for c in range(len(indexers)):
    string_indexer_cols.append(indexers[c].getOutputCol())

In [173]:
numericCols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
assemblerInputs = string_indexer_cols + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [178]:
assemble_model = assembler.transform(label_df)
assemble_model.limit(3).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,...,contact_index_encoded,poutcome_index_encoded,marital_index_encoded,loan_index_encoded,default_index_encoded,housing_index_encoded,education_index_encoded,job_index_encoded,label,features
0,59,admin.,married,secondary,no,2343,yes,no,unknown,1042,...,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 59.0,..."
1,56,admin.,married,secondary,no,45,no,no,unknown,1467,...,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(1.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 56.0,..."
2,41,technician,married,secondary,no,1270,yes,no,unknown,1389,...,"(0.0, 1.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),(1.0),(0.0),"(1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 41.0,..."


In [183]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(customer_banking)
customer_banking = pipelineModel.transform(customer_banking)
selectedCols = ['label', 'features'] + cols
customer_banking = customer_banking.select(selectedCols)
customer_banking.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [184]:
customer_banking.show()

+-----+--------------------+---+-----------+--------+---------+-------+-------+-------+----+-------+--------+--------+-----+--------+--------+-------+
|label|            features|age|        job| marital|education|default|balance|housing|loan|contact|duration|campaign|pdays|previous|poutcome|deposit|
+-----+--------------------+---+-----------+--------+---------+-------+-------+-------+----+-------+--------+--------+-----+--------+--------+-------+
|  1.0|(30,[3,11,13,16,1...| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|    1042|       1|   -1|       0| unknown|    yes|
|  1.0|(30,[3,11,13,16,1...| 56|     admin.| married|secondary|     no|     45|     no|  no|unknown|    1467|       1|   -1|       0| unknown|    yes|
|  1.0|(30,[2,11,13,16,1...| 41| technician| married|secondary|     no|   1270|    yes|  no|unknown|    1389|       1|   -1|       0| unknown|    yes|
|  1.0|(30,[4,11,13,16,1...| 55|   services| married|secondary|     no|   2476|    yes|  no|un

In [185]:
train, test = customer_banking.randomSplit([0.7, 0.3], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 7764
Test Dataset Count: 3398


## Random Forest Classifier

In [204]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

+---+----------+-----+--------------------+----------+--------------------+
|age|       job|label|       rawPrediction|prediction|         probability|
+---+----------+-----+--------------------+----------+--------------------+
| 37|management|  0.0|[14.8334308004391...|       0.0|[0.74167154002195...|
| 40|management|  0.0|[16.7031168414533...|       0.0|[0.83515584207266...|
| 53|management|  0.0|[12.6883343038168...|       0.0|[0.63441671519084...|
| 32|management|  0.0|[15.5558390485192...|       0.0|[0.77779195242596...|
| 54|management|  0.0|[15.8386547209387...|       0.0|[0.79193273604693...|
| 40|management|  0.0|[14.7552487613006...|       0.0|[0.73776243806503...|
| 56|management|  0.0|[18.1672267887941...|       0.0|[0.90836133943970...|
| 50|management|  0.0|[5.87152084001478...|       1.0|[0.29357604200073...|
| 47|management|  0.0|[11.4103373287095...|       0.0|[0.57051686643547...|
| 44|management|  0.0|[11.8632831071246...|       0.0|[0.59316415535623...|
+---+-------

In [205]:
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.8850781161105441
