In [1]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("V2Maestros").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [2]:
bankData = sc.textFile(r'C:\Users\jeffnerd\Desktop\spark\bank.csv')

In [3]:
bankData.cache()

C:\Users\jeffnerd\Desktop\spark\bank.csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [4]:
bankData.count()

542

In [7]:
#removing he first line
first = bankData.first()
dataLines = bankData.filter(lambda x: x!=first)
dataLines.count()

541

In [21]:
#convert rdds into a dense vector .Labesl are converterd to numeric ones
import math
from pyspark.ml.linalg import Vectors

In [11]:
from pyspark.mllib.linalg import Vectors

In [22]:
def transformToNumeric(inputStr):
    
    attList = inputStr.replace("\"","").split(";")
    
    age = float(attList[0])
    #convert oucome to float
    outcome = 0.0 if attList[16] == "no" else 1.0
    
    #create indicator variables for single/married. This creates three columns
    single = 1.0 if attList[2] == "single" else 0.0
    married = 1.0 if attList[2] == "married" else 0.0
    divorced = 1.0 if attList[2] == "divorced" else 0.0
    
    #creating indicator variables for education
    primary = 1.0 if attList[3] == "primary" else 0.0
    secondary = 1.0 if attList[3] == "secondary" else 0.0
    tertiary = 1.0 if attList[3] == "tertiary" else 0.0
    
    #converting dafault to float
    default = 0.0 if attList[4] == "no" else 1.0
    #convert balance amount yo float
    balance = float(attList[5])
    #convert loan to float
    loan = 0.0 if attList[7] == "no" else 1.0
    
    #Filter out columns not wanted at this stage
    values = Vectors.dense([outcome,age,single,married,divorced,primary,secondary,tertiary,\
                           default,balance,loan])
    return values
    
    
                                             

In [23]:
#change to a vector
bankVectors = dataLines.map(transformToNumeric)
bankVectors.collect() [:15]

[DenseVector([0.0, 30.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1787.0, 0.0]),
 DenseVector([1.0, 33.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 4789.0, 1.0]),
 DenseVector([1.0, 35.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1350.0, 0.0]),
 DenseVector([1.0, 30.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1476.0, 1.0]),
 DenseVector([0.0, 59.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]),
 DenseVector([1.0, 35.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 747.0, 0.0]),
 DenseVector([1.0, 36.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 307.0, 0.0]),
 DenseVector([0.0, 39.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 147.0, 0.0]),
 DenseVector([0.0, 41.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 221.0, 0.0]),
 DenseVector([1.0, 43.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, -88.0, 1.0]),
 DenseVector([0.0, 39.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 9374.0, 0.0]),
 DenseVector([0.0, 43.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 264.0, 0.0]),
 DenseVector([0.0, 36.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1109.0, 0.0]),
 DenseVector([1.0, 20.0, 1.0, 0.0,

In [14]:
#perform statistical analysis
from pyspark.mllib.stat import Statistics

In [15]:
bankStats = Statistics.colStats(bankVectors)

In [16]:
bankStats.mean()

array([3.97412200e-01, 4.12698706e+01, 2.75415896e-01, 6.15526802e-01,
       1.09057301e-01, 1.53419593e-01, 4.95378928e-01, 3.14232902e-01,
       2.21811460e-02, 1.44478189e+03, 1.62661738e-01])

In [17]:
bankStats.variance()

array([2.39919217e-01, 1.11415924e+02, 1.99931540e-01, 2.37091805e-01,
       9.73437393e-02, 1.30122544e-01, 2.50441569e-01, 2.15889642e-01,
       2.17293079e-02, 5.87224851e+06, 1.36455124e-01])

In [18]:
bankStats.min()

array([    0.,    19.,     0.,     0.,     0.,     0.,     0.,     0.,
           0., -1206.,     0.])

In [19]:
bankStats.max()

array([1.0000e+00, 7.8000e+01, 1.0000e+00, 1.0000e+00, 1.0000e+00,
       1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.6873e+04,
       1.0000e+00])

In [20]:
Statistics.corr(bankVectors)

array([[ 1.        , -0.18232104,  0.46323285, -0.37532413, -0.0781266 ,
        -0.12561549,  0.02639277,  0.08494841, -0.04536965,  0.03657487,
        -0.03042059],
       [-0.18232104,  1.        , -0.40971334,  0.24253548,  0.208662  ,
         0.18705376, -0.1049356 , -0.08566612,  0.02589999,  0.14746211,
        -0.0108042 ],
       [ 0.46323285, -0.40971334,  1.        , -0.78008253, -0.21570121,
        -0.10171839,  0.02638786,  0.06399288, -0.03666486,  0.00224317,
         0.01977069],
       [-0.37532413,  0.24253548, -0.78008253,  1.        , -0.44268309,
         0.06232365,  0.00789467, -0.0625317 , -0.06156785, -0.00746014,
         0.02917413],
       [-0.0781266 ,  0.208662  , -0.21570121, -0.44268309,  1.        ,
         0.04851091, -0.05013811,  0.00587947,  0.14863123,  0.00842788,
        -0.07386451],
       [-0.12561549,  0.18705376, -0.10171839,  0.06232365,  0.04851091,
         1.        , -0.42178621, -0.28816671,  0.04036243, -0.01358146,
         0.048

In [None]:
#transform a dataframe for input to machine learning.Drop columns that are not required.(Low correlation.) Nb the above
#data has low correlation spread ou so we use all features


In [24]:
def transformTolabeledPoint(inStr):
    lp = (float(inStr[0]),\
         Vectors.dense([inStr[1],inStr[2],inStr[3],inStr[4],inStr[5],inStr[6],inStr[7],\
                       inStr[8],inStr[9],inStr[10]]))
    return lp

In [25]:
from pyspark.sql import DataFrame,SparkSession,SQLContext

In [29]:
spark = SparkSession.builder.master("local").appName("randomforest").config(conf=conf).getOrCreate()

In [None]:
bankLp = bankVectors.map(transformTolabeledPoint)
bankLp.collect()

In [32]:
#create dataframe
bankDF = spark.createDataFrame(bankLp, ["label", "features"])
bankDF.select("label", "features").show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[30.0,0.0,1.0,0.0...|
|  1.0|[33.0,0.0,1.0,0.0...|
|  1.0|[35.0,1.0,0.0,0.0...|
|  1.0|[30.0,0.0,1.0,0.0...|
|  0.0|[59.0,0.0,1.0,0.0...|
|  1.0|[35.0,1.0,0.0,0.0...|
|  1.0|[36.0,0.0,1.0,0.0...|
|  0.0|[39.0,0.0,1.0,0.0...|
|  0.0|[41.0,0.0,1.0,0.0...|
|  1.0|[43.0,0.0,1.0,0.0...|
+-----+--------------------+
only showing top 10 rows



In [33]:
#transformation PCA
from pyspark.ml.feature import PCA

In [34]:
bankPCA = PCA(k=3, inputCol="features", outputCol= "pcafeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label", "pcafeatures")
pcaResult.show(truncate = False)

+-----+-------------------------------------------------------------+
|label|pcafeatures                                                  |
+-----+-------------------------------------------------------------+
|0.0  |[-1787.018897197381,28.86209683775469,-0.06459982604832748]  |
|1.0  |[-4789.020177138491,29.922562636340352,-0.9830243513099493]  |
|1.0  |[-1350.022213163262,34.10110809796642,0.8951427168281559]    |
|1.0  |[-1476.0189517184556,29.05133399359621,0.39527238680255483]  |
|0.0  |[-0.037889185366455545,58.98971820001769,-0.7290792383674499]|
|1.0  |[-747.0223377634923,34.48829198181747,0.9045654956949856]    |
|1.0  |[-307.0230691022592,35.799850539655125,0.5170631523787519]   |
|0.0  |[-147.02501216176339,38.901078566503244,-0.8069627548805192] |
|0.0  |[-221.0262985348787,40.853633675694866,0.5373036365803051]   |
|1.0  |[87.9723868768871,43.06265944115108,-0.067016428711774]      |
|0.0  |[-9374.023105550941,32.97645883798975,-0.9511484606918585]   |
|0.0  |[-264.0275573

In [35]:
#indexing pre-req for decision trees
from pyspark.ml.feature import StringIndexer 

In [36]:
stringIndexer = StringIndexer(inputCol="label",outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
td.collect()

[Row(label=0.0, pcafeatures=DenseVector([-1787.0189, 28.8621, -0.0646]), indexed=0.0),
 Row(label=1.0, pcafeatures=DenseVector([-4789.0202, 29.9226, -0.983]), indexed=1.0),
 Row(label=1.0, pcafeatures=DenseVector([-1350.0222, 34.1011, 0.8951]), indexed=1.0),
 Row(label=1.0, pcafeatures=DenseVector([-1476.019, 29.0513, 0.3953]), indexed=1.0),
 Row(label=0.0, pcafeatures=DenseVector([-0.0379, 58.9897, -0.7291]), indexed=0.0),
 Row(label=1.0, pcafeatures=DenseVector([-747.0223, 34.4883, 0.9046]), indexed=1.0),
 Row(label=1.0, pcafeatures=DenseVector([-307.0231, 35.7999, 0.5171]), indexed=1.0),
 Row(label=0.0, pcafeatures=DenseVector([-147.025, 38.9011, -0.807]), indexed=0.0),
 Row(label=0.0, pcafeatures=DenseVector([-221.0263, 40.8536, 0.5373]), indexed=0.0),
 Row(label=1.0, pcafeatures=DenseVector([87.9724, 43.0627, -0.067]), indexed=1.0),
 Row(label=0.0, pcafeatures=DenseVector([-9374.0231, 32.9765, -0.9511]), indexed=0.0),
 Row(label=0.0, pcafeatures=DenseVector([-264.0276, 42.8248, -0

In [37]:
#splitting the data intotraining and test data sets
(trainingData, testData)= td.randomSplit([0.7,0.3])

In [38]:
trainingData.count()

353

In [39]:
testData.count()

188

In [40]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [42]:
#create a model for this data
rmClassifier = RandomForestClassifier(labelCol="indexed",featuresCol="pcafeatures")

In [44]:
rmClassifier.getImpurity()

'gini'

In [45]:
rmModel = rmClassifier.fit(trainingData)

In [46]:
#prediction on the test data set
prediction = rmModel.transform(testData)

In [47]:
prediction.select("prediction","indexed","label","pcafeatures").show(10)

+----------+-------+-----+--------------------+
|prediction|indexed|label|         pcafeatures|
+----------+-------+-----+--------------------+
|       0.0|    0.0|  0.0|[-11494.034229470...|
|       0.0|    0.0|  0.0|[-9374.0231055509...|
|       0.0|    0.0|  0.0|[-8104.0336452947...|
|       0.0|    0.0|  0.0|[-7082.0351460463...|
|       0.0|    0.0|  0.0|[-6313.0372339667...|
|       0.0|    0.0|  0.0|[-5883.0251285738...|
|       0.0|    0.0|  0.0|[-5426.0252230055...|
|       0.0|    0.0|  0.0|[-4012.0312961776...|
|       0.0|    0.0|  0.0|[-3762.0274938415...|
|       1.0|    0.0|  0.0|[-3096.0186237192...|
+----------+-------+-----+--------------------+
only showing top 10 rows



In [48]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="indexed",metricName="f1")
evaluator.evaluate(prediction)

0.7095500558768324

In [49]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="indexed",metricName="accuracy")
evaluator.evaluate(prediction)

0.7287234042553191

In [50]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="indexed",metricName="weightedPrecision")
evaluator.evaluate(prediction)

0.7260534204938169

In [51]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="indexed",metricName="weightedRecall")
evaluator.evaluate(prediction)

0.7287234042553192

In [52]:
#drawing a confusion matrix
prediction.groupby("prediction","indexed").count().show()

+----------+-------+-----+
|prediction|indexed|count|
+----------+-------+-----+
|       1.0|    1.0|   30|
|       0.0|    1.0|   39|
|       1.0|    0.0|   12|
|       0.0|    0.0|  107|
+----------+-------+-----+



In [53]:
import pandas as pd