In [2]:
from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf().setAppName("kmeans2")
sc = SparkContext(conf=conf)

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [57]:
data = sc.textFile("data.txt")
centroid = sc.textFile("centroid.txt")

In [5]:
# clean data
data = data.map(lambda p: np.array([ float(i) for i in p.split('\t') ]) )

In [6]:
data.take(5)

[array([0.  , 0.64, 0.64, 0.  , 0.32, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.64, 0.  , 0.  , 0.  , 0.32, 0.  , 1.29, 1.93, 0.  ]),
 array([0.21, 0.28, 0.5 , 0.  , 0.14, 0.28, 0.21, 0.07, 0.  , 0.94, 0.21,
        0.79, 0.65, 0.21, 0.14, 0.14, 0.07, 0.28, 3.47, 0.  ]),
 array([0.06, 0.  , 0.71, 0.  , 1.23, 0.19, 0.19, 0.12, 0.64, 0.25, 0.38,
        0.45, 0.12, 0.  , 1.75, 0.06, 0.06, 1.03, 1.36, 0.32]),
 array([0.  , 0.  , 0.  , 0.  , 0.63, 0.  , 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.  , 0.  , 0.31, 0.  , 0.  , 3.18, 0.  ]),
 array([0.  , 0.  , 0.  , 0.  , 0.63, 0.  , 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.  , 0.  , 0.31, 0.  , 0.  , 3.18, 0.  ])]

# 1) Pick k initial centroids

In [58]:
init_c = centroid.map(lambda x: [float(i) for i in x.split('\t')])

In [59]:
init_c = np.array(init_c.collect())

In [60]:
init_c

array([[0.  , 0.64, 0.64, 0.  , 0.32, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.64, 0.  , 0.  , 0.  , 0.32, 0.  , 1.29, 1.93, 0.  ],
       [0.21, 0.28, 0.5 , 0.  , 0.14, 0.28, 0.21, 0.07, 0.  , 0.94, 0.21,
        0.79, 0.65, 0.21, 0.14, 0.14, 0.07, 0.28, 3.47, 0.  ],
       [0.06, 0.  , 0.71, 0.  , 1.23, 0.19, 0.19, 0.12, 0.64, 0.25, 0.38,
        0.45, 0.12, 0.  , 1.75, 0.06, 0.06, 1.03, 1.36, 0.32],
       [0.  , 0.  , 0.  , 0.  , 0.63, 0.  , 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.  , 0.  , 0.31, 0.  , 0.  , 3.18, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.63, 0.  , 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.  , 0.  , 0.31, 0.  , 0.  , 3.18, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.85, 0.  , 0.  , 1.85, 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.92, 0.  , 0.  , 0.  , 0.  , 0.64, 0.96,
        1.28, 0.  , 0.  , 0.  , 0.96, 0.  , 0.32, 3.85, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.88, 0. 

# 2) Assigan all points to the closest centroid cluster

In [70]:
def euclidean_dist(X,Y):
    print("euclidean_dist",X)
    print("y",Y)
    return np.sqrt(np.dot(X, X) - 2 * np.dot(X, Y) + np.dot(Y, Y))  # from sklearn
#     return np.sqrt(np.sum(np.square(np.array(X)-np.array(Y) )))
        

In [35]:
def assign_cluster(point, c):
    dist = [ euclidean_dist(point, ci) for ci in c]
    cluster = np.argmin(dist)
    return (cluster, (point,1), dist[cluster] )

In [75]:
init_c

array([[0.  , 0.64, 0.64, 0.  , 0.32, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.64, 0.  , 0.  , 0.  , 0.32, 0.  , 1.29, 1.93, 0.  ],
       [0.21, 0.28, 0.5 , 0.  , 0.14, 0.28, 0.21, 0.07, 0.  , 0.94, 0.21,
        0.79, 0.65, 0.21, 0.14, 0.14, 0.07, 0.28, 3.47, 0.  ],
       [0.06, 0.  , 0.71, 0.  , 1.23, 0.19, 0.19, 0.12, 0.64, 0.25, 0.38,
        0.45, 0.12, 0.  , 1.75, 0.06, 0.06, 1.03, 1.36, 0.32],
       [0.  , 0.  , 0.  , 0.  , 0.63, 0.  , 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.  , 0.  , 0.31, 0.  , 0.  , 3.18, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.63, 0.  , 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.  , 0.  , 0.31, 0.  , 0.  , 3.18, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.85, 0.  , 0.  , 1.85, 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.92, 0.  , 0.  , 0.  , 0.  , 0.64, 0.96,
        1.28, 0.  , 0.  , 0.  , 0.96, 0.  , 0.32, 3.85, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.88, 0. 

In [74]:
for p in data.take(10):
    print(assign_cluster(p,init_c))

euclidean_dist 0	0.64	0.64	0	0.32	0	0	0	0	0	0	0.64	0	0	0	0.32	0	1.29	1.93	0
y [0.   0.64 0.64 0.   0.32 0.   0.   0.   0.   0.   0.   0.64 0.   0.
 0.   0.32 0.   1.29 1.93 0.  ]


UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U60'), dtype('<U60')) -> dtype('<U60')

# 3) update centroid

In [66]:
def cumulate(pair):
    cumulate_point = np.zeros(20)
    cumulate_count = 0
    for point, count in pair[1]:
        cumulate_point += point
        cumulate_count += count
    return (pair[0], cumulate_point, cumulate_count)

def update_c(point_cluster, c):
    """
    point_cluster: list, point assigned with cluster
    c: list, centroid

    return: list, updated centroid
    """
    new_c = point_cluster.map(lambda x: (x[0], x[1][0])).aggregateByKey([], lambda seq, elem: seq + [elem], lambda a, b: a + b) \
                .map(lambda x: (x[0], np.average(x[1], axis=0)) ).collect()
    

    
#     new_c = point_cluster.map(lambda x: (x[0], [x[1]])) \
#             .reduceByKey(lambda x, y: x + y) \
#             .map(cumulate) \
#             .map(lambda x: (x[0],x[1] / x[2])).collect()
    print(new_c)
    for (i, ci) in new_c:
        c[i] = ci
    return c



In [37]:
# data_cluster.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y).map(cumulate).map(lambda x: (x[0], x[1] / x[2])).collect()

[(0, array([0.03886598, 0.36319588, 0.53216495, 0.0043299 , 0.65639175,
         0.09536082, 0.42412371, 0.15680412, 0.14453608, 0.24257732,
         0.11824742, 0.50556701, 0.13793814, 0.00453608, 0.10793814,
         0.75391753, 0.21020619, 1.48917526, 2.04164948, 0.07113402])),
 (4, array([0.30559748, 0.26867925, 0.44584906, 0.        , 0.24679245,
         0.13402516, 0.27584906, 0.24119497, 0.21125786, 0.73163522,
         0.20396226, 1.06540881, 0.23075472, 0.2663522 , 0.04509434,
         0.22628931, 0.38534591, 0.18465409, 3.94886792, 0.10509434])),
 (6, array([0.0044186 , 0.01883721, 0.07302326, 0.01418605, 0.76930233,
         0.00372093, 0.3427907 , 1.0027907 , 0.11209302, 0.2172093 ,
         0.0972093 , 0.04790698, 0.02139535, 0.06232558, 0.        ,
         0.87255814, 0.20139535, 0.25139535, 0.18465116, 0.09906977])),
 (2, array([0.18142857, 0.19464286, 0.44946429, 0.        , 1.34910714,
         0.11517857, 0.53107143, 0.16214286, 0.04482143, 0.28357143,
         0.16

# 4) Repeat step 2,3    

In [14]:
def calculate_cost(point_cluster):
    pc = point_cluster.map(lambda p:np.square(p[2])) \
                .reduce(lambda x,y: x + y)                                                                                                                                                                                                                      
    return pc
    

In [15]:
# def assign_cluster(point, c):
#     dist = [ euclidean_dist(point, ci) for ci in c]
#     cluster = np.argmin(dist)
#     return (cluster, (point,1), dist[cluster] )
# data.map(lambda x: assign_cluster(x))

PythonRDD[6] at RDD at PythonRDD.scala:53

In [71]:
k = 1
curr_c = init_c
c_list = []
cost = 10000
cost_list = []
while True:
    if k > 20 or cost <= 0.01:
        break
    data_cluster = data.map(lambda p: assign_cluster(p, curr_c))
    cost = calculate_cost(data_cluster) 
    cost_list.append(cost)
    
    curr_c = update_c(data_cluster, curr_c)
    print("k=%s cost_func=%s"%(k,cost))
    k += 1
      

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 395.0 failed 1 times, most recent failure: Lost task 0.0 in stage 395.0 (TID 784, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/rdd.py", line 839, in func
    initial = next(iterator)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-71-0a1d4f62f7e3>", line 9, in <lambda>
  File "<ipython-input-35-5ae06081ad65>", line 2, in assign_cluster
  File "<ipython-input-35-5ae06081ad65>", line 2, in <listcomp>
  File "<ipython-input-70-dd5d96c46c35>", line 4, in euclidean_dist
  File "<__array_function__ internals>", line 6, in dot
numpy.core._exceptions.UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U60'), dtype('<U60')) -> dtype('<U60')

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor51.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/rdd.py", line 839, in func
    initial = next(iterator)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-71-0a1d4f62f7e3>", line 9, in <lambda>
  File "<ipython-input-35-5ae06081ad65>", line 2, in assign_cluster
  File "<ipython-input-35-5ae06081ad65>", line 2, in <listcomp>
  File "<ipython-input-70-dd5d96c46c35>", line 4, in euclidean_dist
  File "<__array_function__ internals>", line 6, in dot
numpy.core._exceptions.UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U60'), dtype('<U60')) -> dtype('<U60')

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [54]:
cost_list

[6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198,
 6005.63514361198]

In [69]:
data_cluster.take(5)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 394.0 failed 1 times, most recent failure: Lost task 0.0 in stage 394.0 (TID 783, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/rdd.py", line 1354, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-67-0a1d4f62f7e3>", line 9, in <lambda>
  File "<ipython-input-35-5ae06081ad65>", line 2, in assign_cluster
  File "<ipython-input-35-5ae06081ad65>", line 2, in <listcomp>
  File "<ipython-input-10-291cce8ad0fb>", line 2, in euclidean_dist
  File "<__array_function__ internals>", line 6, in dot
numpy.core._exceptions.UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U60'), dtype('<U60')) -> dtype('<U60')

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/rdd.py", line 1354, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/anaconda3/envs/env6665/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-67-0a1d4f62f7e3>", line 9, in <lambda>
  File "<ipython-input-35-5ae06081ad65>", line 2, in assign_cluster
  File "<ipython-input-35-5ae06081ad65>", line 2, in <listcomp>
  File "<ipython-input-10-291cce8ad0fb>", line 2, in euclidean_dist
  File "<__array_function__ internals>", line 6, in dot
numpy.core._exceptions.UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U60'), dtype('<U60')) -> dtype('<U60')

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


# 5)Plot the graph of cost vs. iteration.

In [None]:
# klist = [i for i in range(1,21)]
# plt.plot(klist, cost_list)
# plt.xticks(klist)
# plt.xlabel("k")
# plt.ylabel("cost")
# plt.title("Cost vs Iteration")
# plt.savefig("k_cost.png")

In [43]:
from pyspark import SparkContext
import numpy as np
import math
import matplotlib.pyplot as plt


def read_centroids(centroids_path):
    centroids = []
    with open(centroids_path, "r") as f:
        for line in f.readlines():
            line = line.split("\t")
            line = [float(x) for x in line]
            centroids.append(np.array(line))

    return centroids

def l2_distance(p1, p2):
    return math.sqrt(np.sum((p1 - p2) ** 2))

def data_clean(point):
    point = point.split("\t")

    return np.array(point, dtype=float)

def kmeans_map(point):
    min_index = None
    distance = float("+inf")
    for i in range(len(centroids)):
        temp = l2_distance(point, centroids[i])
        if temp < distance:
            min_index = i
            distance = temp

    return (min_index, (point, 1), distance)

def cumulate(pair):
    cumulate_point = np.zeros(20)
    cumulate_count = 0
    for point, count in pair[1]:
        cumulate_point += point
        cumulate_count += count

    return (pair[0], cumulate_point, cumulate_count)

def plotting(output_path, costs):
    fig = plt.figure()

    plt.plot(np.arange(1, len(costs) + 1), costs, "r", linewidth=2)

    plt.xticks(np.arange(1, len(costs) + 1))
    plt.title(r"Error $\Phi$ with iterations.")
    plt.xlabel("Iteration")
    plt.ylabel(r"Error $\Phi$")

    plt.savefig(output_path, dpi=200)

if __name__ == '__main__':   
    data_path = "data.txt"
    centroids_path = "centroid.txt"
#     output_path = "/Users/yimingzhao/Desktop/Postgraduate/EachSemester/2020_SPRING/CS6665/CS6665_Assignments/Assignment_3/result2a.png"
    
    centroids = read_centroids(centroids_path)

    max_iteration = 1
    
#     sc = SparkContext("local", "kmeans app")

    data = sc.textFile(data_path).cache()
    data = data.map(data_clean)

    costs = []
    cen_list = []
    for i in range(max_iteration):
        kmeans_map_result = data.map(kmeans_map)

        cost = kmeans_map_result.map(lambda x: x[2] ** 2).reduce(lambda a, b: a + b)
#         cost = kmeans_map_result.map(lambda x: x[2]).reduce(lambda a, b: a + b)

        costs.append(cost)

        kmeans_reduce = kmeans_map_result.map(lambda x: (x[0], [x[1]])) \
            .reduceByKey(lambda x, y: x + y) \
                .map(cumulate) \
                    .map(lambda x: (x[0], x[1] / x[2]))
        
        for index, centroid in kmeans_reduce.collect():
            centroids[index] = centroid  

    print(costs)
#     plotting(output_path, costs)
    



[7901.148300000001]


In [44]:
centroids

[array([0.03371429, 0.36295238, 0.49419048, 0.01142857, 0.54533333,
        0.08314286, 0.30828571, 0.16104762, 0.15657143, 0.26209524,
        0.1167619 , 0.46161905, 0.12190476, 0.004     , 0.08866667,
        0.72980952, 0.17714286, 1.44790476, 1.86552381, 0.0632381 ]),
 array([0.30341176, 0.29064706, 0.46164706, 0.        , 0.22023529,
        0.16688235, 0.30811765, 0.14964706, 0.20158824, 0.72758824,
        0.19611765, 0.979     , 0.24235294, 0.25035294, 0.09035294,
        0.31041176, 0.38517647, 0.20711765, 3.90270588, 0.12717647]),
 array([0.12076923, 0.20512821, 0.73051282, 0.02435897, 0.64692308,
        0.46435897, 0.19846154, 0.24051282, 0.44487179, 0.38410256,
        0.12435897, 0.80897436, 0.11948718, 0.        , 1.81487179,
        0.10615385, 0.25794872, 0.95512821, 1.82897436, 0.10641026]),
 array([0.07036232, 0.10724638, 0.23507246, 0.01673913, 0.57492754,
        0.17934783, 0.39108696, 0.37282609, 0.13173913, 0.29434783,
        0.1442029 , 0.28753623, 0.13572464