<a href="https://colab.research.google.com/github/Melvinmcrn/DataScience/blob/master/SparkML_Bank_Marketing/SparkML_Bank_Marketing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# Tell the tooling where the JDK and the unpacked Spark distribution live.
# These must be set before findspark.init() and before pyspark is imported.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

import findspark

findspark.init()

from pyspark.sql import SparkSession

# Local-mode session; "local[*]" is the master URL passed to the builder.
local_builder = SparkSession.builder.master("local[*]")
spark = local_builder.getOrCreate()

In [0]:
#1 - import module
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import isnan, when, count, col

In [0]:
#2 - Create spark context
# getOrCreate() is called with no arguments, so no configuration is passed here.
sc = SparkContext.getOrCreate()

In [5]:
# Evaluate the SparkContext as the cell's last expression so the notebook renders it.
sc

In [6]:
#3 - Setup SparkSession (SparkSQL)
# Configure the builder first, then obtain the session from it.
session_builder = SparkSession.builder.appName("SparkML_Bank_Marketing")
spark = session_builder.getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f1b2a5d7b38>


In [7]:
#4 - Read file to spark DataFrame
# bank-full.csv is read as ';'-separated with a header row; column types are inferred.
data = (spark
        .read
        .option("header","true")
        .option("inferSchema", "true")
        .option("sep",";")
        .csv("bank-full.csv"))
# cache() is lazy: it only marks the DataFrame for caching. Run an action so the
# rows are actually materialized before reporting that caching has finished.
data.cache()
data.count()
print ("finish caching data")

finish caching data


In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# converted to pandas so the notebook renders them as a table.
data.describe().toPandas()

Unnamed: 0,summary,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
1,mean,40.93621021432837,,,,,1362.2720576850766,,,,15.80641879188693,,258.1630797814691,2.763840658246887,40.19782796222158,0.5803233726305546,,
2,stddev,10.618762040975408,,,,,3044.7658291685243,,,,8.322476153044596,,257.5278122651709,3.098020883279184,100.12874599059812,2.3034410449312204,,
3,min,18.0,admin.,divorced,primary,no,-8019.0,no,no,cellular,1.0,apr,0.0,1.0,-1.0,0.0,failure,no
4,max,95.0,unknown,single,unknown,yes,102127.0,yes,yes,unknown,31.0,sep,4918.0,63.0,871.0,275.0,unknown,yes


In [9]:
# Preview five rows. Limit inside Spark before converting, so only those rows
# are collected to the driver instead of the whole DataFrame.
data.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
# Print each column's name, inferred type and nullability.
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [11]:
# Per-column count of rows whose value is NaN or null, shown as a one-row table.
missing_value_counts = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in data.columns
]
data.select(missing_value_counts).show()

+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|  0|  0|      0|        0|      0|      0|      0|   0|      0|  0|    0|       0|       0|    0|       0|       0|  0|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+



In [12]:
# Split the data 70/30 into train and test sets, then report sizes and class balance.
label = 'y'

data = data.sort(label)
trainData, testData = data.randomSplit([0.7, 0.3], seed=101)

# Row count of the full set and of each split.
for _caption, _frame in [("data count : ", data),
                         ("trainData count : ", trainData),
                         ("testData count : ", testData)]:
    print(_caption + str(_frame.count()))

# Distribution of the target column in the full set and in each split.
for _frame in (data, trainData, testData):
    _frame.groupBy(label).count().show()

data count : 45211
trainData count : 31768
testData count : 13443
+---+-----+
|  y|count|
+---+-----+
| no|39922|
|yes| 5289|
+---+-----+

+---+-----+
|  y|count|
+---+-----+
| no|28029|
|yes| 3739|
+---+-----+

+---+-----+
|  y|count|
+---+-----+
| no|11893|
|yes| 1550|
+---+-----+



In [14]:
# String indexer
# Column names must match the DataFrame schema exactly. Constructing a
# StringIndexer does not validate the column name, so a misspelling would only
# surface later; check all feature columns against the data up front instead.
category = ['job','marital','education','default','housing','loan','contact','month','poutcome']
continuous = ['age','balance','day','duration','pdays','campaign','previous']
missing_cols = [c for c in category + continuous if c not in data.columns]
assert not missing_cols, "columns not found in data: " + str(missing_cols)

# The first indexer encodes the target column as 'label'; the remaining ones
# encode each categorical feature into a '<name>idx' column.
featureidx_list = [StringIndexer(inputCol = label, outputCol='label')]
featureidx_list += [StringIndexer(inputCol = c, outputCol=c + 'idx') for c in category]

print (featureidx_list)

[StringIndexer_19c9f084eb2c, StringIndexer_a3c3c3272f75, StringIndexer_d28e3a11ab68, StringIndexer_cd8042875e93, StringIndexer_ad9cd79c16ed, StringIndexer_f9f8c9d4933a, StringIndexer_682798ace7bc, StringIndexer_c824689d6d99, StringIndexer_6f056073f615, StringIndexer_42de2e57ac2b]


In [15]:
# Create Vector
# Feature vector = continuous columns followed by the indexed categorical columns.
indexed_category_cols = [name + 'idx' for name in category]
features = continuous + indexed_category_cols
assem = VectorAssembler(inputCols=features, outputCol="features")

print(type(assem))

<class 'pyspark.ml.feature.VectorAssembler'>
