<a href="https://colab.research.google.com/github/Melvinmcrn/DataScience/blob/master/SparkML_Bank_Marketing/SparkML_Bank_Marketing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bank Marketing

### Import and create spark

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
# import module
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel, LogisticRegression, LogisticRegressionModel, RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import isnan, when, count, col

In [0]:
# Create spark context
sc = SparkContext.getOrCreate()

In [5]:
sc

In [6]:
# Setup SparkSession(SparkSQL)
spark = (SparkSession
         .builder
         .appName("SparkML_Bank_Marketing")
         .getOrCreate())
print (spark)

<pyspark.sql.session.SparkSession object at 0x7f3e82456b38>


### Read file and prepare data

In [7]:
# Read file to spark DataFrame
data = (spark
        .read
        .option("header","true")
        .option("inferSchema", "true")
        .option("sep",";")
        .csv("bank-full.csv"))
data.cache()
print ("finish caching data")

finish caching data


In [8]:
data.describe().toPandas()

Unnamed: 0,summary,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
1,mean,40.93621021432837,,,,,1362.2720576850766,,,,15.80641879188693,,258.1630797814691,2.763840658246887,40.19782796222158,0.5803233726305546,,
2,stddev,10.618762040975408,,,,,3044.7658291685243,,,,8.322476153044596,,257.5278122651709,3.098020883279184,100.12874599059812,2.3034410449312204,,
3,min,18.0,admin.,divorced,primary,no,-8019.0,no,no,cellular,1.0,apr,0.0,1.0,-1.0,0.0,failure,no
4,max,95.0,unknown,single,unknown,yes,102127.0,yes,yes,unknown,31.0,sep,4918.0,63.0,871.0,275.0,unknown,yes


In [9]:
data.toPandas().head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [11]:
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|  0|  0|      0|        0|      0|      0|      0|   0|      0|  0|    0|       0|       0|    0|       0|       0|  0|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+



In [12]:
# split Train and Test data
label = 'y'

data = data.sort(label)
(trainData, testData) = data.randomSplit([0.7, 0.3],seed = 101)

print ("data count : " + str(data.count()))
print ("trainData count : " + str(trainData.count()))
print ("testData count : " + str(testData.count()))

data.groupBy(label).count().show()
trainData.groupBy(label).count().show()
testData.groupBy(label).count().show()

data count : 45211
trainData count : 31768
testData count : 13443
+---+-----+
|  y|count|
+---+-----+
| no|39922|
|yes| 5289|
+---+-----+

+---+-----+
|  y|count|
+---+-----+
| no|28029|
|yes| 3739|
+---+-----+

+---+-----+
|  y|count|
+---+-----+
| no|11893|
|yes| 1550|
+---+-----+



In [37]:
# String indexer
category = ['job','marital','education','default','housing','loan','contact','month','poutcome']
continuous = ['age','balance','day','duration','pdays','campaign','previous']
featureidx_list = [StringIndexer(inputCol = label, outputCol='label')]
featureidx_list += [StringIndexer(inputCol = c, outputCol=c + 'idx') for c in category]

print (featureidx_list)

[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4]


In [38]:
# Create Vector
features = continuous + [c + 'idx' for c in category]
assem =  VectorAssembler(inputCols = features ,outputCol="features")

print (type(assem))

<class 'pyspark.ml.feature.VectorAssembler'>


### Create Model

In [39]:
# Create model - Decision Tree
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

print (dt)

DecisionTreeClassifier_d82be66ed6e4


In [40]:
# Create model - Logistic Regression
lr = LogisticRegression(labelCol="label", featuresCol="features")

print (lr)

LogisticRegression_8bfc433dda76


In [41]:
# Create model - Random Forest
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

print(rf)

RandomForestClassifier_72e54821bfc9


### Set ML pipeline

In [42]:
# Set ML pipeline - Decision Tree
print (featureidx_list)
print (assem)
print (dt)
print ("\n")

all_process_list = featureidx_list + [assem,dt]
print (all_process_list)

dt_pipeline = Pipeline(stages=all_process_list)
print ("\n")
print (dt_pipeline)

[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4]
VectorAssembler_e20a93599b61
DecisionTreeClassifier_d82be66ed6e4


[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4, VectorAssembler_e20a93599b61, DecisionTreeClassifier_d82be66ed6e4]


Pipeline_0e3890f6eec6


In [43]:
# Set ML pipeline - Logistic Regression
print (featureidx_list)
print (assem)
print (lr)
print ("\n")

all_process_list = featureidx_list + [assem,lr]
print (all_process_list)

lr_pipeline = Pipeline(stages=all_process_list)
print ("\n")
print (lr_pipeline)

[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4]
VectorAssembler_e20a93599b61
LogisticRegression_8bfc433dda76


[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4, VectorAssembler_e20a93599b61, LogisticRegression_8bfc433dda76]


Pipeline_38bc4b410a68


In [44]:
# Set ML pipeline - Random Forest
print (featureidx_list)
print (assem)
print (rf)
print ("\n")

all_process_list = featureidx_list + [assem,rf]
print (all_process_list)

rf_pipeline = Pipeline(stages=all_process_list)
print ("\n")
print (dt_pipeline)

[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4]
VectorAssembler_e20a93599b61
RandomForestClassifier_72e54821bfc9


[StringIndexer_a0acd001857b, StringIndexer_25b08f022fa3, StringIndexer_f6a6d89e1352, StringIndexer_77b1597d6a6b, StringIndexer_4fa7330bfe93, StringIndexer_e37296ee33b6, StringIndexer_973742fd0dc6, StringIndexer_cac63be14727, StringIndexer_c210fc8325b3, StringIndexer_026ac0f651f4, VectorAssembler_e20a93599b61, RandomForestClassifier_72e54821bfc9]


Pipeline_0e3890f6eec6


### Param Grid

In [0]:
# Param Grid - Decision Tree
dt_paramGrid = (ParamGridBuilder()
    .addGrid(dt.maxDepth, [5,10,30])
     .addGrid(dt.minInstancesPerNode, [1,10])
     .addGrid(dt.impurity, ["gini","entropy"])        
    .build())

In [0]:
# Param Grid - Logistic Regression
lr_paramGrid = (ParamGridBuilder()
    .addGrid(lr.family, ['multinomial' ,'binomial' ])
    .addGrid(lr.maxIter, [5,10,50])
    .addGrid(lr.regParam, [0.1,0.5,0.8])       
    .build())

In [0]:
# Param Grid - Random Forest
rf_paramGrid = (ParamGridBuilder()
    .addGrid(rf.numTrees, [5,10,50])
     .addGrid(rf.maxDepth, [5,10])
     .addGrid(rf.impurity , ["gini","entropy"])        
    .build())

### Cross Validator

In [0]:
# Cross Validator - Decision Tree
crossval = CrossValidator(estimator=dt_pipeline,
                      estimatorParamMaps=dt_paramGrid,
                      evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
                      numFolds=3)
dt_cvModel = crossval.fit(trainData)
dt_model = dt_cvModel.bestModel

In [60]:
print (dt_model)
print(max(dt_cvModel.avgMetrics))

PipelineModel_ff7869a7001c
0.8879057472963947


In [0]:
# Cross Validator - Logistic Regression
crossval = CrossValidator(estimator=lr_pipeline,
                      estimatorParamMaps=lr_paramGrid,
                      evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
                      numFolds=3)
lr_cvModel = crossval.fit(trainData)
lr_model = lr_cvModel.bestModel

In [58]:
print (lr_model)
print(max(lr_cvModel.avgMetrics))

PipelineModel_0c0acc50597c
0.8620602392909142


In [0]:
# Cross Validator - Random Forest
crossval = CrossValidator(estimator=rf_pipeline,
                      estimatorParamMaps=rf_paramGrid,
                      evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
                      numFolds=3)
rf_cvModel = crossval.fit(trainData)
rf_model = rf_cvModel.bestModel

In [64]:
print (rf_model)
print(max(rf_cvModel.avgMetrics))

PipelineModel_26b02b9d4445
0.8869278328034349


### Choose best model and make prediction

In [65]:
model = dt_model

if max(lr_cvModel.avgMetrics) > max(rf_cvModel.avgMetrics) and max(lr_cvModel.avgMetrics) > max(dt_cvModel.avgMetrics):
  model = lr_model
  print('best model = Logistic Regression')
elif max(rf_cvModel.avgMetrics) > max(lr_cvModel.avgMetrics) and max(rf_cvModel.avgMetrics) > max(dt_cvModel.avgMetrics):
  model = rf_model
  print('best model = Random Forest')
else:
  model = dt_model
  print('best model = Decision Tree')

best model = Decision Tree


In [0]:
prediction = model.transform(testData)

In [67]:
prediction.toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,label,jobidx,maritalidx,educationidx,defaultidx,housingidx,loanidx,contactidx,monthidx,poutcomeidx,features,rawPrediction,probability,prediction
0,18,student,single,unknown,no,35,no,no,telephone,21,aug,104,2,-1,0,unknown,no,0.0,10.0,1.0,3.0,0.0,1.0,0.0,2.0,2.0,0.0,"[18.0, 35.0, 21.0, 104.0, -1.0, 2.0, 0.0, 10.0...","[0.0, 2.0]","[0.0, 1.0]",1.0
1,18,student,single,unknown,no,108,no,no,cellular,9,feb,92,1,183,1,success,yes,1.0,10.0,1.0,3.0,0.0,1.0,0.0,0.0,6.0,3.0,"[18.0, 108.0, 9.0, 92.0, 183.0, 1.0, 1.0, 10.0...","[69.0, 0.0]","[1.0, 0.0]",0.0
2,18,student,single,unknown,no,108,no,no,cellular,10,aug,167,1,-1,0,unknown,yes,1.0,10.0,1.0,3.0,0.0,1.0,0.0,0.0,2.0,0.0,"[18.0, 108.0, 10.0, 167.0, -1.0, 1.0, 0.0, 10....","[12.0, 27.0]","[0.3076923076923077, 0.6923076923076923]",1.0
3,19,student,single,primary,no,103,no,no,cellular,10,jul,104,2,-1,0,unknown,yes,1.0,10.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,"[19.0, 103.0, 10.0, 104.0, -1.0, 2.0, 0.0, 10....","[74.0, 0.0]","[1.0, 0.0]",0.0
4,19,student,single,primary,no,1247,no,no,cellular,23,apr,94,1,-1,0,unknown,no,0.0,10.0,1.0,2.0,0.0,1.0,0.0,0.0,5.0,0.0,"[19.0, 1247.0, 23.0, 94.0, -1.0, 1.0, 0.0, 10....","[112.0, 22.0]","[0.835820895522388, 0.16417910447761194]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13438,84,retired,divorced,primary,no,639,no,no,telephone,18,may,353,3,-1,0,unknown,yes,1.0,5.0,2.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,"[84.0, 639.0, 18.0, 353.0, -1.0, 3.0, 0.0, 5.0...","[39.0, 55.0]","[0.4148936170212766, 0.5851063829787234]",1.0
13439,86,retired,married,primary,no,1255,no,no,cellular,14,oct,192,2,92,4,success,no,0.0,5.0,0.0,2.0,0.0,1.0,0.0,0.0,8.0,3.0,"[86.0, 1255.0, 14.0, 192.0, 92.0, 2.0, 4.0, 5....","[57.0, 234.0]","[0.1958762886597938, 0.8041237113402062]",1.0
13440,86,retired,married,primary,no,5236,no,no,telephone,1,apr,558,2,-1,0,unknown,yes,1.0,5.0,0.0,2.0,0.0,1.0,0.0,2.0,5.0,0.0,"[86.0, 5236.0, 1.0, 558.0, -1.0, 2.0, 0.0, 5.0...","[6.0, 1.0]","[0.8571428571428571, 0.14285714285714285]",0.0
13441,89,retired,married,primary,no,0,no,no,telephone,27,sep,157,5,-1,0,unknown,no,0.0,5.0,0.0,2.0,0.0,1.0,0.0,2.0,9.0,0.0,"[89.0, 0.0, 27.0, 157.0, -1.0, 5.0, 0.0, 5.0, ...","[48.0, 6.0]","[0.8888888888888888, 0.1111111111111111]",0.0


In [68]:
prediction.select("prediction", "rawPrediction", "probability", "label", "features").toPandas()

Unnamed: 0,prediction,rawPrediction,probability,label,features
0,1.0,"[0.0, 2.0]","[0.0, 1.0]",0.0,"[18.0, 35.0, 21.0, 104.0, -1.0, 2.0, 0.0, 10.0..."
1,0.0,"[69.0, 0.0]","[1.0, 0.0]",1.0,"[18.0, 108.0, 9.0, 92.0, 183.0, 1.0, 1.0, 10.0..."
2,1.0,"[12.0, 27.0]","[0.3076923076923077, 0.6923076923076923]",1.0,"[18.0, 108.0, 10.0, 167.0, -1.0, 1.0, 0.0, 10...."
3,0.0,"[74.0, 0.0]","[1.0, 0.0]",1.0,"[19.0, 103.0, 10.0, 104.0, -1.0, 2.0, 0.0, 10...."
4,0.0,"[112.0, 22.0]","[0.835820895522388, 0.16417910447761194]",0.0,"[19.0, 1247.0, 23.0, 94.0, -1.0, 1.0, 0.0, 10...."
...,...,...,...,...,...
13438,1.0,"[39.0, 55.0]","[0.4148936170212766, 0.5851063829787234]",1.0,"[84.0, 639.0, 18.0, 353.0, -1.0, 3.0, 0.0, 5.0..."
13439,1.0,"[57.0, 234.0]","[0.1958762886597938, 0.8041237113402062]",0.0,"[86.0, 1255.0, 14.0, 192.0, 92.0, 2.0, 4.0, 5...."
13440,0.0,"[6.0, 1.0]","[0.8571428571428571, 0.14285714285714285]",1.0,"[86.0, 5236.0, 1.0, 558.0, -1.0, 2.0, 0.0, 5.0..."
13441,0.0,"[48.0, 6.0]","[0.8888888888888888, 0.1111111111111111]",0.0,"[89.0, 0.0, 27.0, 157.0, -1.0, 5.0, 0.0, 5.0, ..."


### Evaluate model

In [72]:
for metricName in ['f1','weightedPrecision','weightedRecall']:
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName=metricName)
    result = evaluator.evaluate(prediction)
    print ('%s = %g' % (metricName,result))

f1 = 0.888259
weightedPrecision = 0.884237
weightedRecall = 0.895633
