# Introduction to Spark Decision Tree Algorithm

#### Tabnine Extension for intellisense

In [1]:
# !pip install jupyter-tabnine
# !jupyter nbextension install --py jupyter_tabnine
!jupyter nbextension enable --py jupyter_tabnine
!jupyter serverextension enable --py jupyter_tabnine

Enabling notebook extension jupyter_tabnine/main...
      - Validating: [32mOK[0m
Enabling: jupyter_tabnine
- Writing config: /home/ateeb/.jupyter
    - Validating...
      jupyter_tabnine  [32mOK[0m


### Spark Initiallization Script

In [2]:
#!/usr/bin/env python3
import sys
import os

# getting the directory where Spark was installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = '/opt/spark'

# python variable to store the root path for later reference
SPARK_HOME = os.environ['SPARK_HOME']

# adding pyspark and py4j packages paths to python path env variable
sys.path.insert(0,os.path.join(SPARK_HOME, "python"))
sys.path.insert(0,os.path.join(SPARK_HOME, "python", "lib"))
sys.path.insert(0,os.path.join(SPARK_HOME, "python", "lib", 'py4j-0.10.9-src.zip'))
sys.path.insert(0, os.path.join(SPARK_HOME, 'python', 'lib', 'pyspark.zip'))

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set('spark.executer.memory','1g')
conf.set('spark.cores.max','2')

# give name to your spark application
conf.setAppName("SparkDTreeApp")

# create a spark context object 
# note: Execute only once otherwise results in Context Errors
# create the spark context with 2 threads for streaming
sc = SparkContext('local',conf=conf)

#### Once the above script is executed you can view the Spark instance info here http://localhost:4040

## Problem Statement
*****************
The input data is the iris dataset. It contains recordings of 
information about flower samples. For each sample, the petal and 
sepal length and width are recorded along with the type of the 
flower. We need to use this dataset to build a decision tree 
model that can predict the type of flower based on the petal 
and sepal information.

#### Techniques Used

1. Decision Trees 

### Loading Data

In [3]:
# reading csv file into an RDD
irisData = sc.textFile("data/iris.csv")
irisData.persist()
irisData.take(3)

['Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species',
 '5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa']

In [4]:
#Remove the first line (contains headers)
dataLines = irisData.filter(lambda line: "Sepal" not in line)
dataLines.count()

150

### Transforming Data to labelpoint

In [5]:
import math
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext

# creating sqlContext to work with dataframes
sqlContext = SQLContext(sc)

def transformToLabeledPoint(inputStr):
    attList = inputStr.split(",")
    lp  = (attList[4], Vectors.dense([attList[0],attList[2],attList[3]]))
    return lp
# from analysis it was visible the the Sepal.Width has week Correlation
# with the target so we will be dropping this column from the dataset

irisLp = dataLines.map(transformToLabeledPoint)
irisDF = sqlContext.createDataFrame(irisLp,["label","features"])
irisDF.select("label","features").show(10)

+------+-------------+
| label|     features|
+------+-------------+
|setosa|[5.1,1.4,0.2]|
|setosa|[4.9,1.4,0.2]|
|setosa|[4.7,1.3,0.2]|
|setosa|[4.6,1.5,0.2]|
|setosa|[5.0,1.4,0.2]|
|setosa|[5.4,1.7,0.4]|
|setosa|[4.6,1.4,0.3]|
|setosa|[5.0,1.5,0.2]|
|setosa|[4.4,1.4,0.2]|
|setosa|[4.9,1.5,0.1]|
+------+-------------+
only showing top 10 rows



### Indexing labels to integers (DTree Requirement)

In [6]:
# indexing nedded as per-req for Decision trees
from pyspark.ml.feature import StringIndexer
# create a stringindexer object
stringIndexer = StringIndexer(inputCol="label",outputCol="index")

# fit the indexer model to the dataset and learn the mapping
si_model = stringIndexer.fit(irisDF)

# transform the dataset, adding a index columns
td = si_model.transform(irisDF)

td.take(5)

[Row(label='setosa', features=DenseVector([5.1, 1.4, 0.2]), index=0.0),
 Row(label='setosa', features=DenseVector([4.9, 1.4, 0.2]), index=0.0),
 Row(label='setosa', features=DenseVector([4.7, 1.3, 0.2]), index=0.0),
 Row(label='setosa', features=DenseVector([4.6, 1.5, 0.2]), index=0.0),
 Row(label='setosa', features=DenseVector([5.0, 1.4, 0.2]), index=0.0)]

### Correlation of predictors with target variables

In [7]:
# finding correlations
from pyspark.mllib.stat import Statistics

td_corr  = td.select("features","index")
numFeatures = td_corr.take(1)[0].features.size
labelRDD  = td_corr.rdd.values()

for i in range(numFeatures):
    featuresRDD = td_corr.rdd.map(lambda lp: lp.features[i])
    corr = Statistics.corr(labelRDD,featuresRDD,'pearson')
    print("%d\t%g" % (i, corr))

0	0.782561
1	0.949035
2	0.956547


### Splitting Data into train and test sets

In [10]:
(trainingData, testData) = td.randomSplit([0.9,0.1])
print( trainingData.count(), testData.count())

140 10


### Creating the Decision Tree model

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the model
dtClassifier = DecisionTreeClassifier(maxDepth=2,labelCol='index')
dtModel = dtClassifier.fit(trainingData)

In [14]:
dtModel.numNodes

5

In [13]:
dtModel.depth

2

### Making Predictions on test data

In [17]:
predictions = dtModel.transform(testData)
predictions.select("prediction","index","label","features").show(10)

+----------+-----+----------+-------------+
|prediction|index|     label|     features|
+----------+-----+----------+-------------+
|       0.0|  0.0|    setosa|[5.0,1.3,0.3]|
|       0.0|  0.0|    setosa|[5.2,1.4,0.2]|
|       1.0|  1.0|versicolor|[5.8,4.1,1.0]|
|       2.0|  1.0|versicolor|[5.9,4.8,1.8]|
|       1.0|  1.0|versicolor|[6.0,4.5,1.6]|
|       1.0|  1.0|versicolor|[6.3,4.9,1.5]|
|       1.0|  1.0|versicolor|[6.5,4.6,1.5]|
|       1.0|  1.0|versicolor|[6.6,4.4,1.4]|
|       2.0|  2.0| virginica|[6.0,4.8,1.8]|
|       2.0|  2.0| virginica|[6.4,5.3,1.9]|
+----------+-----+----------+-------------+



### Evaluating the accuracy of the model

In [24]:
evaluator = MulticlassClassificationEvaluator(labelCol="index",predictionCol="prediction",metricName="weightedPrecision")
evaluator.evaluate(predictions)

0.9333333333333333

### Making Confusion Matrix

In [25]:
import pandas as pd
labelList = predictions.select("index","label").distinct().toPandas()
predictions.groupBy("index","prediction").count().show()

+-----+----------+-----+
|index|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    5|
|  2.0|       2.0|    2|
|  1.0|       2.0|    1|
|  0.0|       0.0|    2|
+-----+----------+-----+



In [26]:
labelList

Unnamed: 0,index,label
0,2.0,virginica
1,1.0,versicolor
2,0.0,setosa
