In [1]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext('local[*]')
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

ImportError: No module named findspark

In [2]:
basedir = '/Users/linamiao/playground/spark/'

In [16]:
irisTwoFeatures = sqlContext.read.parquet(basedir + 'irisTwoFeatures.parquet')
irisTwoFeatures.take(2)

[Row(features=DenseVector([-0.5556, 0.5]), label=0.0),
 Row(features=DenseVector([-0.8333, 0.0]), label=0.0)]

In [17]:
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType

getElement = udf(lambda v, i: float(v[i]), DoubleType())
irisSeparateFeatures = (irisTwoFeatures
                           .withColumn('sepalLength', getElement('features',lit(0)))
                           .withColumn('sepalWidth', getElement('features',lit(1))))
irisSeparateFeatures.take(5)

[Row(features=DenseVector([-0.5556, 0.5]), label=0.0, sepalLength=-0.555556, sepalWidth=0.5),
 Row(features=DenseVector([-0.8333, 0.0]), label=0.0, sepalLength=-0.833333, sepalWidth=0.0),
 Row(features=DenseVector([-0.4444, 0.4167]), label=0.0, sepalLength=-0.444444, sepalWidth=0.416667),
 Row(features=DenseVector([-0.6111, 0.0833]), label=0.0, sepalLength=-0.611111, sepalWidth=0.0833333),
 Row(features=DenseVector([0.5, 0.0]), label=1.0, sepalLength=0.5, sepalWidth=0.0)]

In [18]:
from pyspark.sql.functions import col
from pyspark.sql.utils import AnalysisException

try:
    irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0)).take(5)
except AnalysisException as e:
    print e

u"Can't extract value from features#252;"


In [19]:
from pyspark.sql import Row

arrayDF = sqlContext.createDataFrame([Row(anArray=[1,2,3]),Row(anArray=[4,5])])
arrayDF.take(2)
arrayDF.show()

arrayDF.select(col('anArray').getItem(0)).show()
arrayDF.select(col('anArray')[0]).show()

+---------+
|  anArray|
+---------+
|[1, 2, 3]|
|   [4, 5]|
+---------+

+----------+
|anArray[0]|
+----------+
|         1|
|         4|
+----------+

+----------+
|anArray[0]|
+----------+
|         1|
|         4|
+----------+



In [20]:
sqlContext.udf.register('getElement',getElement.func,getElement.returnType)
irisTwoFeatures.registerTempTable('irisTwo')

In [21]:
sqlContext.sql('select getElement(features, 0) as sepalLength from irisTwo').show()

+-----------+
|sepalLength|
+-----------+
|  -0.555556|
|  -0.833333|
|  -0.444444|
|  -0.611111|
|        0.5|
|   0.166667|
|   0.444444|
|  -0.333333|
|  -0.555556|
|  -0.666667|
|  -0.777778|
|  -0.833333|
|  -0.611111|
|  -0.388889|
|  -0.833333|
|  -0.611111|
|  0.0555554|
|  -0.555556|
|  -0.222222|
|   0.111111|
+-----------+
only showing top 20 rows



## EDA and feature enginerring

In [22]:
irisSeparateFeatures.describe('label','sepalLength','sepalWidth').show()

+-------+------------------+--------------------+--------------------+
|summary|             label|         sepalLength|          sepalWidth|
+-------+------------------+--------------------+--------------------+
|  count|               150|                 150|                 150|
|   mean|               1.0|-0.14259263863153332|-0.12166668000000001|
| stddev|0.8192319205190404|  0.4600366943839747|  0.3613285840513614|
|    min|               0.0|                -1.0|                -1.0|
|    max|               2.0|                 1.0|                 1.0|
+-------+------------------+--------------------+--------------------+



In [23]:
from pyspark.ml.feature import StandardScaler
standardScaler = (StandardScaler()
                      .setInputCol('features')
                      .setOutputCol('standardized')
                      .setWithMean(True))
print standardScaler.explainParams()

inputCol: input column name. (current: features)
outputCol: output column name. (default: StandardScaler_4a869ac3b8bd6b81d740__output, current: standardized)
withMean: Center data with mean (default: False, current: True)
withStd: Scale to unit standard deviation (default: True)


In [24]:
irisStandardizedLength = (standardScaler
                              .fit(irisSeparateFeatures)
                              .transform(irisSeparateFeatures)
                              .withColumn('standadizedLength', getElement('standardized',lit(0))))
irisStandardizedLength.show()                        
                              

+--------------------+-----+-----------+----------+--------------------+--------------------+
|            features|label|sepalLength|sepalWidth|        standardized|   standadizedLength|
+--------------------+-----+-----------+----------+--------------------+--------------------+
|     [-0.555556,0.5]|  0.0|  -0.555556|       0.5|[-0.8976748298773...| -0.8976748298773366|
|     [-0.833333,0.0]|  0.0|  -0.833333|       0.0|[-1.5014897068013...| -1.5014897068013724|
|[-0.444444,0.416667]|  0.0|  -0.444444|  0.416667|[-0.6561462706201...| -0.6561462706201501|
|[-0.611111,0.0833...|  0.0|  -0.611111| 0.0833333|[-1.0184369357662...| -1.0184369357662864|
|           [0.5,0.0]|  1.0|        0.5|       0.0|[1.39682909314878...|  1.3968290931487877|
|      [0.166667,0.0]|  1.0|   0.166667|       0.0|[0.67224993659615...|  0.6722499365961586|
|[0.444444,-0.0833...|  1.0|   0.444444|-0.0833334|[1.27606481352019...|  1.2760648135201946|
|   [-0.333333,-0.75]|  1.0|  -0.333333|     -0.75|[-0.41461

In [28]:
irisStandardizedLength.describe('sepalLength','standadizedLength').show()

+-------+--------------------+--------------------+
|summary|         sepalLength|   standadizedLength|
+-------+--------------------+--------------------+
|  count|                 150|                 150|
|   mean|-0.14259263863153332|-4.73695157173400...|
| stddev|  0.4600366943839747|                 1.0|
|    min|                -1.0|  -1.863780371947509|
|    max|                 1.0|  2.4836989148475532|
+-------+--------------------+--------------------+



In [34]:
from pyspark.ml.feature import Normalizer
normalizer = (Normalizer()
                 .setInputCol('features')
                 .setOutputCol('featuresNorm')
                 .setP(2.0))
irisNormalized = normalizer.transform(irisTwoFeatures)
irisNormalized.show(20,False)

+----------------------+-----+------------------------------------------+
|features              |label|featuresNorm                              |
+----------------------+-----+------------------------------------------+
|[-0.555556,0.5]       |0.0  |[-0.7432944123545073,0.6689644359475079]  |
|[-0.833333,0.0]       |0.0  |[-1.0,0.0]                                |
|[-0.444444,0.416667]  |0.0  |[-0.7295365898724013,0.6839417840996028]  |
|[-0.611111,0.0833333] |0.0  |[-0.9908301719908108,0.13511317579222407] |
|[0.5,0.0]             |1.0  |[1.0,0.0]                                 |
|[0.166667,0.0]        |1.0  |[1.0,0.0]                                 |
|[0.444444,-0.0833334] |1.0  |[0.9828721268491966,-0.1842888555038989]  |
|[-0.333333,-0.75]     |1.0  |[-0.40613812690680956,-0.9138116993520209]|
|[-0.555556,0.25]      |0.0  |[-0.9119216280284423,0.4103644043212756]  |
|[-0.666667,-0.166667] |0.0  |[-0.970142414544513,-0.24253596743935182] |
|[-0.777778,0.0]       |0.0  |[-1.0,0.

In [39]:
firstVector = irisNormalized.select('featuresNorm').first()[0]
print firstVector
firstVector.norm(2.0)

[-0.743294412355,0.668964435948]


0.99999999999999989

In [46]:
l2Norm = udf(lambda v: float(v.norm(2.0)), DoubleType())
featureLengths = irisNormalized.select(l2Norm('features').alias('featuresLength'),
                                       l2Norm('featuresNorm').alias('featureNormLength'))
featureLengths.show()

+-------------------+------------------+
|     featuresLength| featureNormLength|
+-------------------+------------------+
| 0.7474238885237748|0.9999999999999999|
|           0.833333|               1.0|
| 0.6092141315046787|               1.0|
| 0.6167666440477224|               1.0|
|                0.5|               1.0|
|           0.166667|               1.0|
| 0.4521890364566129|               1.0|
| 0.8207380147702432|0.9999999999999999|
| 0.6092146330612882|               1.0|
| 0.6871846751623613|               1.0|
|           0.777778|               1.0|
| 0.8374893100479313|               1.0|
|  0.696108858735471|0.9999999999999999|
| 0.7010791989568653|               1.0|
| 0.8498363241107078|               1.0|
| 0.6334307722316622|               1.0|
| 0.2560984234023318|0.9999999999999999|
| 0.8055556206898442|0.9999999999999999|
| 0.4006164077680793|               1.0|
|0.13888878000000576|0.9999999999999999|
+-------------------+------------------+
only showing top

In [47]:
from pyspark.ml.feature import Bucketizer
splits = [-float('inf'), -.5, 0.0, .5, float('inf')]
lengthBucketizer = (Bucketizer()
                       .setInputCol('sepalLength')
                       .setOutputCol('LengthFeatures')
                       .setSplits(splits))
irisBucketizdLength = lengthBucketizer.transform(irisSeparateFeatures)
irisBucketizdLength.show()

+--------------------+-----+-----------+----------+--------------+
|            features|label|sepalLength|sepalWidth|LengthFeatures|
+--------------------+-----+-----------+----------+--------------+
|     [-0.555556,0.5]|  0.0|  -0.555556|       0.5|           0.0|
|     [-0.833333,0.0]|  0.0|  -0.833333|       0.0|           0.0|
|[-0.444444,0.416667]|  0.0|  -0.444444|  0.416667|           1.0|
|[-0.611111,0.0833...|  0.0|  -0.611111| 0.0833333|           0.0|
|           [0.5,0.0]|  1.0|        0.5|       0.0|           3.0|
|      [0.166667,0.0]|  1.0|   0.166667|       0.0|           2.0|
|[0.444444,-0.0833...|  1.0|   0.444444|-0.0833334|           2.0|
|   [-0.333333,-0.75]|  1.0|  -0.333333|     -0.75|           1.0|
|    [-0.555556,0.25]|  0.0|  -0.555556|      0.25|           0.0|
|[-0.666667,-0.166...|  0.0|  -0.666667| -0.166667|           0.0|
|     [-0.777778,0.0]|  0.0|  -0.777778|       0.0|           0.0|
|[-0.833333,-0.083...|  0.0|  -0.833333|-0.0833334|           

In [49]:
widthBucketizer = (Bucketizer()
                   .setInputCol("sepalWidth")
                   .setOutputCol("widthFeatures")
                   .setSplits(splits))

irisBucketizedWidth = widthBucketizer.transform(irisBucketizdLength)
irisBucketizedWidth.show()

+--------------------+-----+-----------+----------+--------------+-------------+
|            features|label|sepalLength|sepalWidth|LengthFeatures|widthFeatures|
+--------------------+-----+-----------+----------+--------------+-------------+
|     [-0.555556,0.5]|  0.0|  -0.555556|       0.5|           0.0|          3.0|
|     [-0.833333,0.0]|  0.0|  -0.833333|       0.0|           0.0|          2.0|
|[-0.444444,0.416667]|  0.0|  -0.444444|  0.416667|           1.0|          2.0|
|[-0.611111,0.0833...|  0.0|  -0.611111| 0.0833333|           0.0|          2.0|
|           [0.5,0.0]|  1.0|        0.5|       0.0|           3.0|          2.0|
|      [0.166667,0.0]|  1.0|   0.166667|       0.0|           2.0|          2.0|
|[0.444444,-0.0833...|  1.0|   0.444444|-0.0833334|           2.0|          1.0|
|   [-0.333333,-0.75]|  1.0|  -0.333333|     -0.75|           1.0|          0.0|
|    [-0.555556,0.25]|  0.0|  -0.555556|      0.25|           0.0|          2.0|
|[-0.666667,-0.166...|  0.0|

In [50]:
from pyspark.ml.pipeline import Pipeline
pipelineBucketizer = Pipeline().setStages([lengthBucketizer,widthBucketizer])
pipelineModelBucketizer = pipelineBucketizer.fit(irisSeparateFeatures)
irisBucketized = pipelineModelBucketizer.transform(irisSeparateFeatures)

irisBucketized.show()

+--------------------+-----+-----------+----------+--------------+-------------+
|            features|label|sepalLength|sepalWidth|LengthFeatures|widthFeatures|
+--------------------+-----+-----------+----------+--------------+-------------+
|     [-0.555556,0.5]|  0.0|  -0.555556|       0.5|           0.0|          3.0|
|     [-0.833333,0.0]|  0.0|  -0.833333|       0.0|           0.0|          2.0|
|[-0.444444,0.416667]|  0.0|  -0.444444|  0.416667|           1.0|          2.0|
|[-0.611111,0.0833...|  0.0|  -0.611111| 0.0833333|           0.0|          2.0|
|           [0.5,0.0]|  1.0|        0.5|       0.0|           3.0|          2.0|
|      [0.166667,0.0]|  1.0|   0.166667|       0.0|           2.0|          2.0|
|[0.444444,-0.0833...|  1.0|   0.444444|-0.0833334|           2.0|          1.0|
|   [-0.333333,-0.75]|  1.0|  -0.333333|     -0.75|           1.0|          0.0|
|    [-0.555556,0.25]|  0.0|  -0.555556|      0.25|           0.0|          2.0|
|[-0.666667,-0.166...|  0.0|

In [51]:
from pyspark.ml.feature import VectorAssembler
pipeline = Pipeline()
assembler = VectorAssembler()

print assembler.explainParams()

inputCols: input column names. (undefined)
outputCol: output column name. (default: VectorAssembler_42788cc46693f91c1eea__output)


In [53]:
(assembler
     .setInputCols(['LengthFeatures','widthFeatures'])
     .setOutputCol('featuresBucketized'))
pipeline.setStages([lengthBucketizer,widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
irisAssembled.show()

+--------------------+-----+-----------+----------+--------------+-------------+------------------+
|            features|label|sepalLength|sepalWidth|LengthFeatures|widthFeatures|featuresBucketized|
+--------------------+-----+-----------+----------+--------------+-------------+------------------+
|     [-0.555556,0.5]|  0.0|  -0.555556|       0.5|           0.0|          3.0|         [0.0,3.0]|
|     [-0.833333,0.0]|  0.0|  -0.833333|       0.0|           0.0|          2.0|         [0.0,2.0]|
|[-0.444444,0.416667]|  0.0|  -0.444444|  0.416667|           1.0|          2.0|         [1.0,2.0]|
|[-0.611111,0.0833...|  0.0|  -0.611111| 0.0833333|           0.0|          2.0|         [0.0,2.0]|
|           [0.5,0.0]|  1.0|        0.5|       0.0|           3.0|          2.0|         [3.0,2.0]|
|      [0.166667,0.0]|  1.0|   0.166667|       0.0|           2.0|          2.0|         [2.0,2.0]|
|[0.444444,-0.0833...|  1.0|   0.444444|-0.0833334|           2.0|          1.0|         [2.0,1.0]|


## logistic regression

In [60]:
irisSeparateFeatures.groupby('label').count().orderBy('label').show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   50|
|  1.0|   50|
|  2.0|   50|
+-----+-----+



In [62]:
from pyspark.sql.functions import col
irisTwoClass = irisSeparateFeatures.filter(col('label')<2)
irisTwoClass.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   50|
|  1.0|   50|
+-----+-----+



In [64]:
irisTest, irisTrain = irisTwoClass.randomSplit([.25, .75], seed=0)
irisTest.cache()
irisTrain.cache()

print 'Item in train dataset:{0}'.format(irisTrain.count())
print 'Item in test dataset:{0}'.format(irisTest.count())

Item in train dataset:70
Item in test dataset:30


In [69]:
from pyspark.ml.classification import LogisticRegression
lr = (LogisticRegression()
         .setFeaturesCol('featuresBucketized')
         .setRegParam(0.0)
         .setLabelCol('label')
         .setMaxIter(1000))
pipeline.setStages([lengthBucketizer, widthBucketizer, assembler, lr])

pipelineModelLR = pipeline.fit(irisTrain)

irisTestPredictions = (pipelineModelLR
                          .transform(irisTest)
                          .cache())
irisTestPredictions.select('label','probability','prediction').show(10,False)

+-----+------------------------------------------+----------+
|label|probability                               |prediction|
+-----+------------------------------------------+----------+
|0.0  |[0.9999999999914739,8.526040450948622E-12]|0.0       |
|1.0  |[2.453432149811593E-22,1.0]               |1.0       |
|1.0  |[1.918375289718597E-11,0.9999999999808162]|1.0       |
|1.0  |[1.2659797057529866E-27,1.0]              |1.0       |
|1.0  |[1.6190759585416815E-38,1.0]              |1.0       |
|1.0  |[3.71774215018083E-6,0.9999962822578499]  |1.0       |
|0.0  |[0.999996560002663,3.4399973370201116E-6] |0.0       |
|1.0  |[3.71774215018083E-6,0.9999962822578499]  |1.0       |
|0.0  |[0.9999999999914739,8.526040450948622E-12]|0.0       |
|0.0  |[0.6000010126279576,0.39999898737204237]  |0.0       |
+-----+------------------------------------------+----------+
only showing top 10 rows



In [71]:
from pyspark.ml.feature import OneHotEncoder
oneHotLength = (OneHotEncoder()
                   .setInputCol('LengthFeatures')
                   .setOutputCol('lengthOneHot'))
pipeline.setStages([lengthBucketizer, widthBucketizer, oneHotLength])
irisWithOneHotLength = pipeline.fit(irisTrain).transform(irisTrain)

In [72]:
irisWithOneHotLength.show()

+--------------------+-----+-----------+----------+--------------+-------------+-------------+
|            features|label|sepalLength|sepalWidth|LengthFeatures|widthFeatures| lengthOneHot|
+--------------------+-----+-----------+----------+--------------+-------------+-------------+
|[-0.833333,-0.083...|  0.0|  -0.833333|-0.0833334|           0.0|          1.0|(3,[0],[1.0])|
|     [-0.833333,0.0]|  0.0|  -0.833333|       0.0|           0.0|          2.0|(3,[0],[1.0])|
|[-0.833333,0.166667]|  0.0|  -0.833333|  0.166667|           0.0|          2.0|(3,[0],[1.0])|
|     [-0.777778,0.0]|  0.0|  -0.777778|       0.0|           0.0|          2.0|(3,[0],[1.0])|
|[-0.666667,-0.166...|  0.0|  -0.666667| -0.166667|           0.0|          1.0|(3,[0],[1.0])|
|[-0.611111,0.0833...|  0.0|  -0.611111| 0.0833333|           0.0|          2.0|(3,[0],[1.0])|
|[-0.611111,0.166667]|  0.0|  -0.611111|  0.166667|           0.0|          2.0|(3,[0],[1.0])|
|[-0.611111,0.333333]|  0.0|  -0.611111|  0.333333

In [75]:
oneHotWidth = (OneHotEncoder()
               .setInputCol('widthFeatures')
               .setOutputCol('widthOneHot'))

assembleOneHot = (VectorAssembler()
                  .setInputCols(['lengthOneHot', 'widthOneHot'])
                  .setOutputCol('featuresBucketized'))

pipeline.setStages([lengthBucketizer, widthBucketizer, oneHotLength, oneHotWidth, assembleOneHot])
irisWithOneHot = pipeline.fit(irisTrain).transform(irisTrain)

In [78]:
irisWithOneHot.select('lengthOneHot', 'widthOneHot','featuresBucketized').show()

+-------------+-------------+-------------------+
| lengthOneHot|  widthOneHot| featuresBucketized|
+-------------+-------------+-------------------+
|(3,[0],[1.0])|(3,[1],[1.0])|(6,[0,4],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|(3,[1],[1.0])|(6,[0,4],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|(3,[0],[1.0])|(6,[0,3],[1.0,1.0])|
|(3,[0],[1.0])|(3,[2],[1.0])|(6,[0,5],[1.0,1.0])|
|(3,[0],[1.0])|    (3,[],[])|      (6,[0],[1.0])|
|(3,[1],[1.0])|(3,[2],[1.0])|(6,[1,5],[1.0,1.0])|
|(3,[1],[1.0])|(3,[1],[1.0])|(6,[1,4],[1.0,1.0])|
|(3,[1],[1.0])|(3,[0],[1.0])|(6,[1,3],[1.0,1.0])|
|(3,[1],[1.0])|(3,[1],[1.0])|(6,[1,4],[1.0,1.0])|
|(3,[1],[1.0])|(3,[1],[1.0])|(6,[1,4],[1.0,1.0])|
|(3,[1],[1.0])|(3,[2],[1.0])|(6,[1,5],[1.0,1.0])|


In [79]:
pipeline.setStages([lengthBucketizer, widthBucketizer, oneHotLength, oneHotWidth, assembleOneHot, lr])
pipelineModelLR2 = pipeline.fit(irisTrain)

irisTestPredictions2 = (pipelineModelLR2
                        .transform(irisTest)
                        .cache())
irisTestPredictions2.show()

+--------------------+-----+-----------+----------+--------------+-------------+-------------+-------------+-------------------+--------------------+--------------------+----------+
|            features|label|sepalLength|sepalWidth|LengthFeatures|widthFeatures| lengthOneHot|  widthOneHot| featuresBucketized|       rawPrediction|         probability|prediction|
+--------------------+-----+-----------+----------+--------------+-------------+-------------+-------------+-------------------+--------------------+--------------------+----------+
|[-0.388889,0.583333]|  0.0|  -0.388889|  0.583333|           1.0|          3.0|(3,[1],[1.0])|    (3,[],[])|      (6,[1],[1.0])|[19.4146216417010...|[0.99999999629884...|       0.0|
|[-0.333333,-0.583...|  1.0|  -0.333333| -0.583333|           1.0|          0.0|(3,[1],[1.0])|(3,[0],[1.0])|(6,[1,3],[1.0,1.0])|[-56.974613634311...|[1.80401311314184...|       1.0|
|[-0.277778,-0.166...|  1.0|  -0.277778| -0.166667|           1.0|          1.0|(3,[1],[1.

In [85]:
logisticModel = pipelineModelLR2.stages[-1]
print logisticModel.intercept
print logisticModel.coefficients

-3.83727483566
[-51.9027200324,-15.577346806,28.8000479171,76.389235276,38.6564323872,19.0091573305]


In [88]:
from pyspark.sql.functions import col

def modelAccuracy(df):
    return (df
               .select((col('prediction') == col('label')).cast('int').alias('correct'))
               .groupby()
               .avg('correct')
               .first()[0])

modelOneAccuracy = modelAccuracy(irisTestPredictions)
modelTwoAccuracy = modelAccuracy(irisTestPredictions2)

print 'modelOneAccuracy:{0:.3f}'.format(modelOneAccuracy)
print 'modelTwoAccuracy:{0:.3f}'.format(modelTwoAccuracy)

modelOneAccuracy:0.967
modelTwoAccuracy:0.967


In [90]:
irisTestPredictions.registerTempTable('modelOnePredictions')
sqlResult = sqlContext.sql('select avg(int(prediction == label)) from modelOnePredictions')
sqlResult.show()

+--------------------------------------+
|avg(CAST((prediction = label) AS INT))|
+--------------------------------------+
|                    0.9666666666666667|
+--------------------------------------+



In [91]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
binaryEvaluator = (BinaryClassificationEvaluator()
                      .setRawPredictionCol('rawPrediction')
                      .setMetricName('areaUnderROC'))
firstModelTestAUC = binaryEvaluator.evaluate(irisTestPredictions)
secondModelTestAUC = binaryEvaluator.evaluate(irisTestPredictions2)

print 'First model AUC: {0:.3f}'.format(firstModelTestAUC)
print 'Second model AUC: {0:.3f}'.format(secondModelTestAUC)

First model AUC: 0.996
Second model AUC: 0.973


In [92]:
irisTrainPredictions = pipelineModelLR.transform(irisTrain)
irisTrainPredictions2 = pipelineModelLR2.transform(irisTrain)

firstModelTrainAUC = binaryEvaluator.evaluate(irisTrainPredictions)
secondModelTrainAUC = binaryEvaluator.evaluate(irisTrainPredictions2)

print '\nFirst model training AUC: {0:.3f}'.format(firstModelTrainAUC)
print 'Second model training AUC: {0:.3f}'.format(secondModelTrainAUC)


First model training AUC: 0.998
Second model training AUC: 0.998


In [103]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

metric = 'precision'

multiclassEval = MulticlassClassificationEvaluator()

#multiclassEval.setMetricName(metric)
print 'Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions))
print 'Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2))

Model one precision: 0.967
Model two precision: 0.967



In [100]:
multiclassEval.explainParams()

'labelCol: label column name. (default: label)\nmetricName: metric name in evaluation (f1|weightedPrecision|weightedRecall|accuracy) (default: f1, current: precision)\npredictionCol: prediction column name. (default: prediction)'