In [1]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import os
import pandas as pd
import time

from functools import singledispatch

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
from src.utils import kddSetup, sparkSetup
from src.kmeans import compute_centroidDistances, get_clusterId, get_minDistance, compute_cost, \
    kMeansParallel_init, kMeansPlusPlus_init, kMeansRandom_init, \
    lloydKMeans, miniBatchKMeans

Starting the cluster

In [2]:
# pack the virtual environment with venv-pack, unless the archive is already present
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# rebuild the src archive from scratch: drop any stale copy first, then tar the src directory again
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [3]:
# launch the Spark master and its workers through the start-all.sh script under $SPARK_HOME/sbin
! $SPARK_HOME/sbin/start-all.sh

starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.master.Master-1-mapd-b-14-1.out


worker3: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-4.out
master: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-1.out
worker1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-2.out
worker2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-3.out


In [4]:
# tell Spark which Python binary to use
# NOTE(review): the relative path presumably resolves inside the unpacked environment.tar.gz — confirm in sparkSetup
os.environ.update({"PYSPARK_PYTHON": "./environment/bin/python"})

In [5]:
# create the SparkSession through the project helper, passing "kMeans" as its argument
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# register the archived src module with the SparkContext
# NOTE(review): Spark logs an "Untarring behavior will be deprecated" warning for this call —
# consider shipping src as a .zip archive instead; confirm against the addPyFile documentation
sc.addPyFile("src.tar.gz") 

25/09/12 08:06:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/12 08:06:25 WARN Utils: Untarring behavior will be deprecated at spark.files and SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive instead.


Loading the dataset

In [6]:
# load the dataset through the project helper, asking for standardized features
# NOTE(review): presumably the KDD Cup 99 data; the types of the three returned objects
# are defined in src.utils — confirm there
kdd_data, kdd_labels, entries_dict = kddSetup(standardize=True)

In [10]:
# number of clusters: one per distinct label value
k = len(np.unique(kdd_labels))
# third argument handed to kMeansParallel_init, fixed at half of k
# NOTE(review): presumably the k-means|| oversampling factor — confirm in src.kmeans
l = k / 2

In [11]:
# distribute the rows of the dataset and keep them cached while the initial centroids are computed
data_rdd = sc.parallelize(list(kdd_data)).persist()
try:
    centroids = kMeansParallel_init(data_rdd, k, l)
finally:
    # release the cached partitions even when the initialisation raises or is interrupted,
    # so that they do not stay in executor memory for the rest of the session
    data_rdd.unpersist()

25/09/12 08:07:36 WARN TaskSetManager: Stage 0 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:07:39 WARN TaskSetManager: Stage 1 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:07:40 WARN TaskSetManager: Stage 2 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:07:41 WARN TaskSetManager: Stage 3 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:07:42 WARN TaskSetManager: Stage 4 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:07:44 WARN TaskSetManager: Stage 5 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:07:45 WARN TaskSetManager: Stage 6 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

In [12]:
# timings collected by the benchmark cells: {number of partitions: {algorithm name: elapsed seconds}}
time_dict = dict()

Lloyd's kMeans serial

In [13]:
# time the serial run, which works directly on kdd_data instead of an RDD;
# perf_counter is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments as it could with time.time()
start_time = time.perf_counter()
final_centroids = lloydKMeans(kdd_data, centroids, epochs = 10)
delta_time = time.perf_counter() - start_time
# key 1 holds the serial baseline next to the per-partition-count entries
time_dict[1] = {"lloyd": delta_time}

Parallel

In [11]:
# partition counts to benchmark: the powers of two from 2 up to 16
partitions = [2 ** exponent for exponent in range(1, 5)]

In [14]:
# display the timings collected so far
time_dict

{1: {'lloyd': 42.844629526138306}}

In [16]:
# single benchmark run of both distributed algorithms with 16 partitions
numSlices = 16
time_dict[numSlices] = {}
data_rdd = sc.parallelize(list(kdd_data), numSlices=numSlices).persist()

# NOTE(review): persist() is lazy, so the first timed run below also pays for
# distributing and caching the data, while the second one reads from the cache
try:
    start_time = time.perf_counter()
    final_centroids = lloydKMeans(data_rdd, centroids, epochs = 10)
    delta_time = time.perf_counter() - start_time
    time_dict[numSlices]["lloyd"] = delta_time

    start_time = time.perf_counter()
    final_centroids = miniBatchKMeans(data_rdd, centroids, epochs = 10)
    delta_time = time.perf_counter() - start_time
    time_dict[numSlices]["miniBatch"] = delta_time
finally:
    # always release the cached partitions: without this, an interrupted or failed
    # run leaves the RDD cached on the executors and the next cell caches another copy
    data_rdd.unpersist()

25/09/12 08:14:20 WARN TaskSetManager: Stage 42 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 08:14:21 ERROR Inbox: An error happened while processing message in the inbox for CoarseGrainedScheduler
java.lang.OutOfMemoryError: Java heap space
Exception in thread "dispatcher-CoarseGrainedScheduler" java.lang.OutOfMemoryError: Java heap space
ERROR:root:KeyboardInterrupt while sending command.===>           (4 + 11) / 16]
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 707, in readinto
    re

KeyboardInterrupt: 

[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

In [12]:
# benchmark both distributed algorithms once for every partition count in `partitions`
for numSlices in partitions:
    time_dict[numSlices] = {}
    data_rdd = sc.parallelize(list(kdd_data), numSlices=numSlices).persist()

    # NOTE(review): persist() is lazy, so the first timed run below also pays for
    # distributing and caching the data, while the second one reads from the cache
    try:
        start_time = time.perf_counter()
        final_centroids = lloydKMeans(data_rdd, centroids, epochs = 10)
        delta_time = time.perf_counter() - start_time
        time_dict[numSlices]["lloyd"] = delta_time

        start_time = time.perf_counter()
        final_centroids = miniBatchKMeans(data_rdd, centroids, epochs = 10)
        delta_time = time.perf_counter() - start_time
        time_dict[numSlices]["miniBatch"] = delta_time
    finally:
        # always release the cached partitions before the next partition count is tried,
        # even when a run raises or is interrupted; otherwise the cached copies pile up
        data_rdd.unpersist()

25/09/11 14:58:09 WARN TaskSetManager: Stage 38 contains a task of very large size (85120 KiB). The maximum recommended task size is 1000 KiB.
25/09/11 14:58:10 ERROR Inbox: An error happened while processing message in the inbox for CoarseGrainedScheduler
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3537)
	at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:100)
	at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:130)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:112)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1862)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:714)
	at java.base/java.nio.channels.Channels$WritableByteChannelImpl.write(Channels.java:463)
	at org.apache.spark.util.SerializableBuffer.

KeyboardInterrupt: 

Plotting

In [None]:
# create an empty figure with a single axes
# NOTE(review): nothing is drawn on `ax` in this cell — the plot itself is still missing
fig, ax = plt.subplots()


### Cost vs iterations


In [35]:
# epoch counts to evaluate: 10, 20, 30, 40 and 50
iterations = np.linspace(start=10, stop=50, num=5, dtype=int)
# one zero-initialised cost array per algorithm, with one slot per epoch count
results_dict = {
    algorithm: np.zeros(len(iterations))
    for algorithm in ("lloyd_serial", "lloyd_parallel", "minibatch")
}

In [None]:
# fill results_dict with the final cost reached after each epoch count
for key in results_dict:
    print(key)
    if key != 'lloyd_serial':
        # only the serial variant is computed here; the other cost arrays stay at zero.
        # Checking before the progress message avoids announcing a run that never happens.
        continue
    for i, n_epochs in enumerate(iterations):
        print(f"running for {n_epochs} iterations...")
        # every epoch count is a fresh run starting from the same initial centroids
        final_centroids = lloydKMeans(kdd_data, centroids = centroids, epochs = n_epochs)
        algo_cost = compute_cost(kdd_data, final_centroids)
        results_dict[key][i] = algo_cost
        

lloyd_serial
running for 10 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 20 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 30 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 40 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 50 iterations...


KeyboardInterrupt: 

[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

Stopping the cluster

In [14]:
# stop the SparkContext first, then the SparkSession it was taken from
sc.stop()
spark.stop()

In [37]:
# shut down the Spark workers and the master through the stop-all.sh script under $SPARK_HOME/sbin
! $SPARK_HOME/sbin/stop-all.sh

worker3: stopping org.apache.spark.deploy.worker.Worker
worker1: stopping org.apache.spark.deploy.worker.Worker
worker2: stopping org.apache.spark.deploy.worker.Worker


25/09/12 08:53:39 ERROR TaskSchedulerImpl: Lost executor 2 on 10.67.22.208: Worker shutting down
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_15 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_7 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_3 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_68_14 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_11 !
[Stage 40:=>              (1 + 14) / 16][Stage 42:====>            (4 + 8) / 16]

master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master


25/09/12 08:53:39 ERROR TaskSchedulerImpl: Lost executor 1 on 10.67.22.170: Command exited with code 143
25/09/12 08:53:39 ERROR TaskSchedulerImpl: Lost executor 0 on 10.67.22.202: Command exited with code 143
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_5 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_9 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_13 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_68_12 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_1 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_8 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_57_0 !
25/09/12 08:53:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_68_15 !
25/09/12 08:53:39 WARN BlockManagerMasterEn

25/09/12 08:53:40 WARN StandaloneAppClient$ClientEndpoint: Connection to 10.67.22.224:7077 failed; waiting for master to reconnect...
25/09/12 08:53:40 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
25/09/12 08:53:44 ERROR TaskSchedulerImpl: Lost executor 3 on 10.67.22.224: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/09/12 08:53:44 WARN TaskSetManager: Lost task 15.1 in stage 40.0 (TID 672) (10.67.22.224 executor 3): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/09/12 08:53:44 WARN TaskSetManager: Lost task 7.1 in stage 40.0 (TID 673) (10.67.22.224 executor 3): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Remote RPC client disassociated