In [3]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext('local[*]')

In [5]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [6]:
from pyspark.mllib.util import MLUtils

In [7]:
baseDir = '/Users/linamiao/playground/spark/'
irisPath = baseDir + 'iris.scale'
irisRDD = MLUtils.loadLibSVMFile(sc,irisPath, minPartitions=20).cache()

irisRDD.take(5)

[LabeledPoint(1.0, (4,[0,1,2,3],[-0.555556,0.25,-0.864407,-0.916667])),
 LabeledPoint(1.0, (4,[0,1,2,3],[-0.666667,-0.166667,-0.864407,-0.916667])),
 LabeledPoint(1.0, (4,[0,2,3],[-0.777778,-0.898305,-0.916667])),
 LabeledPoint(1.0, (4,[0,1,2,3],[-0.833333,-0.0833334,-0.830508,-0.916667])),
 LabeledPoint(1.0, (4,[0,1,2,3],[-0.611111,0.333333,-0.864407,-0.916667]))]

In [8]:
sc.textFile(irisPath).take(5)

[u'1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667 ',
 u'1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667 ',
 u'1 1:-0.777778 3:-0.898305 4:-0.916667 ',
 u'1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667 ',
 u'1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667 ']

In [9]:
print 'number of partitions:{0}'.format(irisRDD.getNumPartitions())
elementsPerPart = (irisRDD
                    .mapPartitionsWithIndex(lambda i,x: [(i, len(list(x)))])
                    .collect()
                    )
print 'elements per partition: {0}\n'.format(elementsPerPart)
irisRDD.glom().take(1)

number of partitions:20
elements per partition: [(0, 8), (1, 7), (2, 8), (3, 8), (4, 7), (5, 8), (6, 8), (7, 7), (8, 7), (9, 7), (10, 8), (11, 7), (12, 7), (13, 7), (14, 8), (15, 8), (16, 8), (17, 7), (18, 7), (19, 8)]



[[LabeledPoint(1.0, (4,[0,1,2,3],[-0.555556,0.25,-0.864407,-0.916667])),
  LabeledPoint(1.0, (4,[0,1,2,3],[-0.666667,-0.166667,-0.864407,-0.916667])),
  LabeledPoint(1.0, (4,[0,2,3],[-0.777778,-0.898305,-0.916667])),
  LabeledPoint(1.0, (4,[0,1,2,3],[-0.833333,-0.0833334,-0.830508,-0.916667])),
  LabeledPoint(1.0, (4,[0,1,2,3],[-0.611111,0.333333,-0.864407,-0.916667])),
  LabeledPoint(1.0, (4,[0,1,2,3],[-0.388889,0.583333,-0.762712,-0.75])),
  LabeledPoint(1.0, (4,[0,1,2,3],[-0.833333,0.166667,-0.864407,-0.833333])),
  LabeledPoint(1.0, (4,[0,1,2,3],[-0.611111,0.166667,-0.830508,-0.916667]))]]

In [11]:
irisDF = irisRDD.toDF()
irisDF.take(5)

[Row(features=SparseVector(4, {0: -0.5556, 1: 0.25, 2: -0.8644, 3: -0.9167}), label=1.0),
 Row(features=SparseVector(4, {0: -0.6667, 1: -0.1667, 2: -0.8644, 3: -0.9167}), label=1.0),
 Row(features=SparseVector(4, {0: -0.7778, 2: -0.8983, 3: -0.9167}), label=1.0),
 Row(features=SparseVector(4, {0: -0.8333, 1: -0.0833, 2: -0.8305, 3: -0.9167}), label=1.0),
 Row(features=SparseVector(4, {0: -0.6111, 1: 0.3333, 2: -0.8644, 3: -0.9167}), label=1.0)]

In [13]:
irisDF.show(5, False)

+--------------------------------------------------------+-----+
|features                                                |label|
+--------------------------------------------------------+-----+
|(4,[0,1,2,3],[-0.555556,0.25,-0.864407,-0.916667])      |1.0  |
|(4,[0,1,2,3],[-0.666667,-0.166667,-0.864407,-0.916667]) |1.0  |
|(4,[0,2,3],[-0.777778,-0.898305,-0.916667])             |1.0  |
|(4,[0,1,2,3],[-0.833333,-0.0833334,-0.830508,-0.916667])|1.0  |
|(4,[0,1,2,3],[-0.611111,0.333333,-0.864407,-0.916667])  |1.0  |
+--------------------------------------------------------+-----+
only showing top 5 rows



In [19]:
print irisDF.schema
irisDF.printSchema()

StructType(List(StructField(features,VectorUDT,true),StructField(label,DoubleType,true)))
root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)



In [21]:
from pyspark.sql.functions import col
irisDFZeroIndex = irisDF.select('features',(col('label') -1).alias('label'))
irisDFZeroIndex.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(4,[0,1,2,3],[-0....|  0.0|
|(4,[0,1,2,3],[-0....|  0.0|
|(4,[0,2,3],[-0.77...|  0.0|
|(4,[0,1,2,3],[-0....|  0.0|
|(4,[0,1,2,3],[-0....|  0.0|
+--------------------+-----+
only showing top 5 rows



In [38]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())
irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label').cache()
irisTwoFeatures.take(5)

[Row(features=DenseVector([-0.5556, 0.25]), label=0.0),
 Row(features=DenseVector([-0.6667, -0.1667]), label=0.0),
 Row(features=DenseVector([-0.7778, 0.0]), label=0.0),
 Row(features=DenseVector([-0.8333, -0.0833]), label=0.0),
 Row(features=DenseVector([-0.6111, 0.3333]), label=0.0)]

In [39]:
repr(irisTwoFeatures.first()[0])
irisTwoFeatures.take(5)

[Row(features=DenseVector([-0.5556, 0.25]), label=0.0),
 Row(features=DenseVector([-0.6667, -0.1667]), label=0.0),
 Row(features=DenseVector([-0.7778, 0.0]), label=0.0),
 Row(features=DenseVector([-0.8333, -0.0833]), label=0.0),
 Row(features=DenseVector([-0.6111, 0.3333]), label=0.0)]

In [40]:
import uuid
if 'parqUUID' not in locals():
    parqUUID = uuid.uuid1()
irisTwoFeatures.write.mode('overwrite').parquet('/tmp/{0}/irisTwoFeatures.parquet'.format(parqUUID))

In [31]:
dbutils.fs.ls('/tmp/{0}/irisTwoFeatures.parquet'.format(parqUUID)).take(5)

NameError: name 'dbutils' is not defined

In [41]:
irisDFZeroIndex.write.mode('overwrite').parquet('/tmp/{0}/irisFourFeatures.parquet'.format(parqUUID))

In [42]:
from pyspark.ml.clustering import KMeans
kmeans = (KMeans()
         .setK(3)
         .setSeed(5)
         .setMaxIter(20)
         .setInitSteps(1))
model = kmeans.fit(irisTwoFeatures)
centers = model.clusterCenters()
transformed = model.transform(irisTwoFeatures)

NameError: name 'clusters' is not defined

In [43]:
print centers

[array([ 0.35115296, -0.10691828]), array([-0.60204082,  0.20068028]), array([-0.21875006, -0.4670139 ])]


In [44]:
transformed.show(20)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|    [-0.555556,0.25]|  0.0|         1|
|[-0.666667,-0.166...|  0.0|         1|
|     [-0.777778,0.0]|  0.0|         1|
|[-0.833333,-0.083...|  0.0|         1|
|[-0.611111,0.333333]|  0.0|         1|
|[-0.388889,0.583333]|  0.0|         1|
|[-0.833333,0.166667]|  0.0|         1|
|[-0.611111,0.166667]|  0.0|         1|
|   [-0.944444,-0.25]|  0.0|         1|
|[-0.666667,-0.083...|  0.0|         1|
|[-0.388889,0.416667]|  0.0|         1|
|[-0.722222,0.166667]|  0.0|         1|
|[-0.722222,-0.166...|  0.0|         1|
|    [-1.0,-0.166667]|  0.0|         1|
|[-0.166667,0.666667]|  0.0|         1|
|     [-0.222222,1.0]|  0.0|         1|
|[-0.388889,0.583333]|  0.0|         1|
|    [-0.555556,0.25]|  0.0|         1|
|     [-0.222222,0.5]|  0.0|         1|
|     [-0.555556,0.5]|  0.0|         1|
+--------------------+-----+----------+
only showing top 20 rows



In [45]:
modelCenters = []
iterations = [0,2,4,7,10,20]
for i in iterations:
    kmeans = KMeans(k=3,seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    modelCenters.append(model.clusterCenters())

In [46]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np