In [1]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("V2Maestros").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [5]:
#loading data into an RDD
irisData = sc.textFile(r'C:\Users\jeffnerd\Desktop\spark\iris.csv')

In [6]:
irisData.persist()

C:\Users\jeffnerd\Desktop\spark\iris.csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [7]:
#removibg the first row as it is not wanted
first = irisData.first()

In [8]:
dataLines = irisData.filter(lambda x: x!=first)

In [9]:
dataLines.count()

150

In [13]:
#convert the rddd into a dense vector. aas part of this example we change labels to numeric ones
from pyspark.mllib.linalg import Vectors

In [14]:
def transformToNumeric(inputStr):
    attList = inputStr.split(",")
    
    #set the default Setosa
    irisvalue = 1.0
    if attList[4] == "versicolor":
        irisvalue = 2.0
    if attList[4] == "virginica":
        irisvalue = 3.0
    #Filter out columns not wanted ou at this stage
    values = Vectors.dense([irisvalue,\
                           float(attList[0]),\
                           float(attList[1]),\
                           float(attList[2]),\
                           float(attList[3])\
                           ])
    return values
        
        
            
        

In [15]:
#chanhe to a vector
irisVectors = dataLines.map(transformToNumeric)
irisVectors.collect()

[DenseVector([1.0, 5.1, 3.5, 1.4, 0.2]),
 DenseVector([1.0, 4.9, 3.0, 1.4, 0.2]),
 DenseVector([1.0, 4.7, 3.2, 1.3, 0.2]),
 DenseVector([1.0, 4.6, 3.1, 1.5, 0.2]),
 DenseVector([1.0, 5.0, 3.6, 1.4, 0.2]),
 DenseVector([1.0, 5.4, 3.9, 1.7, 0.4]),
 DenseVector([1.0, 4.6, 3.4, 1.4, 0.3]),
 DenseVector([1.0, 5.0, 3.4, 1.5, 0.2]),
 DenseVector([1.0, 4.4, 2.9, 1.4, 0.2]),
 DenseVector([1.0, 4.9, 3.1, 1.5, 0.1]),
 DenseVector([1.0, 5.4, 3.7, 1.5, 0.2]),
 DenseVector([1.0, 4.8, 3.4, 1.6, 0.2]),
 DenseVector([1.0, 4.8, 3.0, 1.4, 0.1]),
 DenseVector([1.0, 4.3, 3.0, 1.1, 0.1]),
 DenseVector([1.0, 5.8, 4.0, 1.2, 0.2]),
 DenseVector([1.0, 5.7, 4.4, 1.5, 0.4]),
 DenseVector([1.0, 5.4, 3.9, 1.3, 0.4]),
 DenseVector([1.0, 5.1, 3.5, 1.4, 0.3]),
 DenseVector([1.0, 5.7, 3.8, 1.7, 0.3]),
 DenseVector([1.0, 5.1, 3.8, 1.5, 0.3]),
 DenseVector([1.0, 5.4, 3.4, 1.7, 0.2]),
 DenseVector([1.0, 5.1, 3.7, 1.5, 0.4]),
 DenseVector([1.0, 4.6, 3.6, 1.0, 0.2]),
 DenseVector([1.0, 5.1, 3.3, 1.7, 0.5]),
 DenseVector([1.

In [11]:
#perfoming statistical analysis
from pyspark.mllib.stat import Statistics

In [16]:
irisstats = Statistics.colStats(irisVectors)

In [17]:
irisstats.mean()

array([2.        , 5.84333333, 3.05733333, 3.758     , 1.19933333])

In [18]:
irisstats.variance()

array([0.67114094, 0.68569351, 0.18997942, 3.11627785, 0.58100626])

In [19]:
irisstats.min()

array([1. , 4.3, 2. , 1. , 0.1])

In [20]:
irisstats.max()

array([3. , 7.9, 4.4, 6.9, 2.5])

In [21]:
Statistics.corr(irisVectors)

array([[ 1.        ,  0.78256123, -0.42665756,  0.9490347 ,  0.95654733],
       [ 0.78256123,  1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.42665756, -0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.9490347 ,  0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.95654733,  0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [10]:
from pyspark.ml.linalg import Vectors

In [11]:
def transformLabeledPoint(inStr):
    attList = inStr.split(",")
    lp = (attList[4],Vectors.dense([attList[0], attList[2], attList[3]]))
    return lp

In [12]:
irisLp = dataLines.map(transformLabeledPoint)

In [13]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame

In [14]:
spark = SQLContext(sc)

In [15]:
irisDF = spark.createDataFrame(irisLp, ["label", "features"])

In [16]:
irisDF.select("label","features").show(10)

+------+-------------+
| label|     features|
+------+-------------+
|setosa|[5.1,1.4,0.2]|
|setosa|[4.9,1.4,0.2]|
|setosa|[4.7,1.3,0.2]|
|setosa|[4.6,1.5,0.2]|
|setosa|[5.0,1.4,0.2]|
|setosa|[5.4,1.7,0.4]|
|setosa|[4.6,1.4,0.3]|
|setosa|[5.0,1.5,0.2]|
|setosa|[4.4,1.4,0.2]|
|setosa|[4.9,1.5,0.1]|
+------+-------------+
only showing top 10 rows



In [17]:
#indexing needed to for pre-req for decision trees
from pyspark.ml.feature import StringIndexer

In [18]:
stringindexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringindexer.fit(irisDF)
td = si_model.transform(irisDF)
td.collect()

[Row(label='setosa', features=DenseVector([5.1, 1.4, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.9, 1.4, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.7, 1.3, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.6, 1.5, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.0, 1.4, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.4, 1.7, 0.4]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.6, 1.4, 0.3]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.0, 1.5, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.4, 1.4, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.9, 1.5, 0.1]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.4, 1.5, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.8, 1.6, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.8, 1.4, 0.1]), indexed=2.0),
 Row(label='setosa', features=DenseVec

In [19]:
#spliting the data into training and test data
(trainingData, testData) = td.randomSplit([0.75, 0.25])

In [20]:
trainingData.count()

121

In [38]:
testData.count()

37

In [21]:
testData.collect()

[Row(label='setosa', features=DenseVector([4.5, 1.3, 0.3]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.7, 1.6, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.8, 1.6, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([4.9, 1.5, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.0, 1.2, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.0, 1.6, 0.6]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.1, 1.4, 0.3]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.1, 1.6, 0.2]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.1, 1.7, 0.5]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.2, 1.5, 0.1]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.4, 1.5, 0.4]), indexed=2.0),
 Row(label='setosa', features=DenseVector([5.7, 1.7, 0.3]), indexed=2.0),
 Row(label='versicolor', features=DenseVector([6.2, 4.5, 1.5]), indexed=0.0),
 Row(label='versicolor', features=

In [22]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [23]:
#create the model
dtClassifier = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")

In [43]:
dtClassifier.getImpurity()

'gini'

In [24]:
dtModel = dtClassifier.fit(trainingData)

In [26]:
dtModel.numNodes


5

In [27]:
dtModel.depth

2

In [29]:
#predict the test data
prediction = dtModel.transform(testData)
prediction.select("prediction", "indexed","label", "features").collect()

[Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([4.5, 1.3, 0.3])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([4.7, 1.6, 0.2])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([4.8, 1.6, 0.2])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([4.9, 1.5, 0.2])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.0, 1.2, 0.2])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.0, 1.6, 0.6])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.1, 1.4, 0.3])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.1, 1.6, 0.2])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.1, 1.7, 0.5])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.2, 1.5, 0.1])),
 Row(prediction=2.0, indexed=2.0, label='setosa', features=DenseVector([5.4, 1.5, 0.4])),
 Row(predi

In [34]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="indexed", metricName="accuracy")
evaluator.evaluate(prediction)

0.896551724137931

In [None]:
#draw a confusion matrix
prediction.groupBy("indexed","prediction").count().show()