# Introduction to Spark Random Forest Algorithm

#### Tabnine Extension for intellisense

In [3]:
# !pip install jupyter-tabnine
# !jupyter nbextension install --py jupyter_tabnine
!jupyter nbextension enable --py jupyter_tabnine
!jupyter serverextension enable --py jupyter_tabnine

Enabling notebook extension jupyter_tabnine/main...
      - Validating: [32mOK[0m
Enabling: jupyter_tabnine
- Writing config: /home/ateeb/.jupyter
    - Validating...
      jupyter_tabnine  [32mOK[0m


### Spark Initiallization Script

In [2]:
import os
import sys

# getting the directory where Spark was installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = '/opt/spark'
    
# python variable to store the root path for later reference
SPARK_HOME = os.environ['SPARK_HOME']

# adding pyspark and py4j packages paths to python path env variable
sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib"))
sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib", 'py4j-0.10.9-src.zip'))
sys.path.insert(0, os.path.join(SPARK_HOME, 'python', 'lib', 'pyspark.zip'))

from pyspark import SparkContext
from pyspark import SparkConf

# creating custom config for spark 1gb ram and 2 cpu cores
conf = SparkConf()
conf.set('spark.executer.memory', '1g')
conf.set('spark.cores.max', '2')

# give name to your spark application
conf.setAppName("RandomForestApp")

# create a spark context object 
# note: Execute only once otherwise results in Context Errors
# create the spark context with 2 threads for streaming purposes
sc = SparkContext('local',conf=conf)

#### Once the above script is executed you can view the Spark instance info here http://localhost:4040

#### reading csv data into rdd

In [4]:
# read csv file into an rdd
bankData = sc.textFile('data/bank.csv')
bankData.cache()
bankData.take(3)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"']

#### removing header line

In [6]:
firstLine = bankData.first()
dataLines = bankData.filter(lambda x: x!=firstLine)
dataLines.count()

541

#### Converting RDD to dense vectors and changing labels to numeric

In [8]:
import math
from pyspark.ml.linalg import Vectors

def transformToNumeric(inputStr):
    # remove quotation marks from data lines and split 
    attList = inputStr.replace("\"", "").split(";")
    
    age = float(attList[0])
    
    # convert outcome to float
    outcome = 0.0 if attList[16]=='no' else 1.0
    
    # create indicator for single/married
    single = 1.0 if attList[2] == "single" else 0.0
    married = 1.0 if attList[2] == 'married' else 0.0
    divorced = 1.0 if attList[2] == 'divorced' else 0.0
    
    # create indicator variable for education
    primary = 1.0 if attList[3] == 'primary' else 0.0
    secondry = 1.0 if attList[3] == 'secondry' else 0.0
    tertiary = 1.0 if attList[3] == 'tertiary' else 0.0
    
    # convert default to float
    default = 0.0 if attList[4] == 'no' else 1.0
    
    #convert balance amount to float
    balance = float(attList[5])
    
    #convert loan to float
    loan = 0.0 if attList[7]=='no' else 1.0
    
    #Filter out unwanted columns and create dense vector
    values = Vectors.dense([outcome, age, single, married, divorced, primary, secondry, tertiary, default, balance, loan])
    
    return values

#Change to vector
bankVectors = dataLines.map(transformToNumeric)
bankVectors.take(2)

[DenseVector([0.0, 30.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1787.0, 0.0]),
 DenseVector([1.0, 33.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4789.0, 1.0])]

#### Convert data into Dataframe with label and features for analysis

In [9]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

def transformToLabelPoint(inStr):
    return (float(inStr[0]), 
            Vectors.dense([
                inStr[1],inStr[2],inStr[3],inStr[4],
                inStr[5],inStr[6], inStr[7],inStr[8],
                inStr[9], inStr[10]
            ])
           )
bankLp = bankVectors.map(transformToLabelPoint)
bankLp.take(2)          

[(0.0, DenseVector([30.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1787.0, 0.0])),
 (1.0, DenseVector([33.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4789.0, 1.0]))]

In [11]:
bankDF = sqlContext.createDataFrame(bankLp,['label','features'])
bankDF.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[30.0,0.0,1.0,0.0...|
|  1.0|[33.0,0.0,1.0,0.0...|
|  1.0|[35.0,1.0,0.0,0.0...|
|  1.0|[30.0,0.0,1.0,0.0...|
|  0.0|[59.0,0.0,1.0,0.0...|
|  1.0|[35.0,1.0,0.0,0.0...|
|  1.0|[36.0,0.0,1.0,0.0...|
|  0.0|[39.0,0.0,1.0,0.0...|
|  0.0|[41.0,0.0,1.0,0.0...|
|  1.0|[43.0,0.0,1.0,0.0...|
+-----+--------------------+
only showing top 10 rows



#### Perform Principal Component Analysis

In [15]:
from pyspark.ml.feature import PCA

bankPCA = PCA(k=3,inputCol='features', outputCol='pcaFeatures')

pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select ('label','pcaFeatures')
pcaResult.show(5,truncate=False)

+-----+-------------------------------------------------------------+
|label|pcaFeatures                                                  |
+-----+-------------------------------------------------------------+
|0.0  |[-1787.0188972402566,28.862493618682592,-0.27105149625042446]|
|1.0  |[-4789.020184182034,29.927929724171516,-0.2442456641141761]  |
|1.0  |[-1350.022213195645,34.10154745848408,1.3242759453483874]    |
|1.0  |[-1476.018951753866,29.051719386627397,-0.16259922035163182] |
|0.0  |[-0.03789611394222689,58.9952650770054,0.34110916402266667]  |
+-----+-------------------------------------------------------------+
only showing top 5 rows



#### Indexing data as per-requirement

In [17]:
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol ="label", outputCol='indexed')
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
td.show(5, truncate=False)

+-----+-------------------------------------------------------------+-------+
|label|pcaFeatures                                                  |indexed|
+-----+-------------------------------------------------------------+-------+
|0.0  |[-1787.0188972402566,28.862493618682592,-0.27105149625042446]|0.0    |
|1.0  |[-4789.020184182034,29.927929724171516,-0.2442456641141761]  |1.0    |
|1.0  |[-1350.022213195645,34.10154745848408,1.3242759453483874]    |1.0    |
|1.0  |[-1476.018951753866,29.051719386627397,-0.16259922035163182] |1.0    |
|0.0  |[-0.03789611394222689,58.9952650770054,0.34110916402266667]  |0.0    |
+-----+-------------------------------------------------------------+-------+
only showing top 5 rows



#### Splitting data into training and subsets

In [19]:
trainingData, testData = td.randomSplit([0.7,0.3])

print("trianing Row Count: ", trainingData.count(),"test Row Count: ", testData.count())

trianing Row Count:  375 test Row Count:  166


#### Creating Random Forest Classifier

In [20]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the model
rmClassifier = RandomForestClassifier(labelCol = 'indexed', featuresCol='pcaFeatures')
rmModel = rmClassifier.fit(trainingData)

# prediction on test data
predictions = rmModel.transform(testData)

predictions.select("prediction", "indexed", "label", "pcaFeatures").show(10, truncate=False)

+----------+-------+-----+-------------------------------------------------------------+
|prediction|indexed|label|pcaFeatures                                                  |
+----------+-------+-----+-------------------------------------------------------------+
|0.0       |0.0    |0.0  |[-14093.033699696094,47.947126111914436,0.0903606443655938]  |
|0.0       |0.0    |0.0  |[-12186.027028044504,38.174585634959755,-0.08941010167841718]|
|0.0       |0.0    |0.0  |[-9374.023112704537,32.98201569967229,-0.17967045236880375]  |
|0.0       |0.0    |0.0  |[-8969.023848784527,34.23756359220832,-0.06609599928765825]  |
|0.0       |0.0    |0.0  |[-7190.025503592384,37.373990895859336,0.9613194767414225]   |
|0.0       |0.0    |0.0  |[-5799.035411296781,53.27906641078873,0.18030419337684683]   |
|1.0       |0.0    |0.0  |[-4943.020796609097,30.79945070089049,1.1425067501080945]    |
|0.0       |0.0    |0.0  |[-4380.033137168154,50.1846531638563,0.16119899669617513]    |
|0.0       |0.0    |0

In [21]:
#### evaluating the model performance

In [25]:
evaluator = MulticlassClassificationEvaluator(labelCol='indexed', 
                                              predictionCol='prediction',
                                              metricName='weightedPrecision'
                                             )
evaluator.evaluate(predictions)

0.6866648178391315

#### Confusion Matrix

In [27]:
predictions.groupBy("indexed","prediction").count().show()

+-------+----------+-----+
|indexed|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   35|
|    0.0|       1.0|   16|
|    1.0|       0.0|   36|
|    0.0|       0.0|   79|
+-------+----------+-----+



In [28]:
import pandas as pd
labelList = predictions.select('indexed','prediction').distinct().toPandas()
labelList.head()

Unnamed: 0,indexed,prediction
0,1.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,0.0
