In [8]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import os
import pandas as pd
import time
import warnings, logging

from functools import singledispatch

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
from src.utils import kddSetup, sparkSetup
from src.kmeans.base import compute_cost
from src.kmeans.initialization import kMeansParallel_init, kMeansPlusPlus_init, kMeansRandom_init
from src.kmeans.update import lloydKMeans, miniBatchKMeans

Starting the cluster

In [9]:
# creating the zipped environment if it doesn't already exist
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# creating the zipped module src
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [10]:
# starting the cluster
! $SPARK_HOME/sbin/start-all.sh

org.apache.spark.deploy.master.Master running as process 526108.  Stop it first.
worker3: org.apache.spark.deploy.worker.Worker running as process 232336.  Stop it first.
worker1: org.apache.spark.deploy.worker.Worker running as process 225764.  Stop it first.
worker2: org.apache.spark.deploy.worker.Worker running as process 224249.  Stop it first.
master: org.apache.spark.deploy.worker.Worker running as process 526314.  Stop it first.


In [11]:
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"

In [12]:
# creating a sparkSession
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# exporting the src module
sc.addPyFile("src.tar.gz")
sc.setLogLevel("ERROR")
sc.addPyFile("kmeans.py")
sc.addPyFile("utils.py") 
# Setup the spark warnings
log4j_conf_path = "./Settings//log4j.properties"
warnings.filterwarnings("ignore") 
logging.getLogger('py4j').setLevel(logging.ERROR) 
logging.getLogger('pyspark').setLevel(logging.ERROR) 

25/09/12 16:14:49 WARN SparkContext: The path src.tar.gz has been added already. Overwriting of added paths is not supported in the current version.


Py4JJavaError: An error occurred while calling o722.addFile.
: java.io.FileNotFoundException: File file:/home/ubuntu/Distributed-K-means/kmeans.py does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.SparkContext.addFile(SparkContext.scala:1750)
	at org.apache.spark.SparkContext.addFile(SparkContext.scala:1728)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


Loading the dataset

In [6]:
kdd_data, kdd_labels, entries_dict = kddSetup(standardize=True)

In [7]:
k = np.unique(kdd_labels).shape[0]
l = 0.5 * k

In [8]:
data_rdd = sc.parallelize([row for row in kdd_data]).persist()
centroids = kMeansParallel_init(data_rdd, k, l)
data_rdd.unpersist()

25/09/12 14:02:33 WARN TaskSetManager: Stage 0 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:02:36 WARN TaskSetManager: Stage 1 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:02:36 WARN TaskSetManager: Stage 2 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:02:38 WARN TaskSetManager: Stage 3 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:02:38 WARN TaskSetManager: Stage 4 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:02:40 WARN TaskSetManager: Stage 5 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:02:40 WARN TaskSetManager: Stage 6 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:0

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

In [9]:
time_dict = {}

Lloyd's kMeans serial

In [10]:
start_time = time.time()
final_centroids = lloydKMeans(kdd_data, centroids, epochs = 10)
delta_time = time.time() - start_time
time_dict[1] = {"lloyd": delta_time}

Parallel

In [11]:
partitions = [2, 4, 8, 16, 32, 64, 128]
for numSlices in partitions:
    time_dict[numSlices] = {}
    data_rdd = sc.parallelize([row for row in kdd_data], numSlices=numSlices).persist()
    
    start_time = time.time()
    final_centroids = lloydKMeans(data_rdd, centroids, epochs = 10)
    delta_time = time.time() - start_time
    time_dict[numSlices]["lloyd"] = delta_time

    start_time = time.time()
    final_centroids = miniBatchKMeans(data_rdd, centroids, epochs = 10)
    delta_time = time.time() - start_time
    time_dict[numSlices]["miniBatch"] = delta_time
    
    data_rdd.unpersist()

25/09/12 14:03:54 WARN TaskSetManager: Stage 38 contains a task of very large size (45590 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:03:57 WARN TaskSetManager: Stage 40 contains a task of very large size (45590 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:04:00 WARN TaskSetManager: Stage 42 contains a task of very large size (45590 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:04:03 WARN TaskSetManager: Stage 44 contains a task of very large size (45673 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:04:06 WARN TaskSetManager: Stage 46 contains a task of very large size (45590 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:04:09 WARN TaskSetManager: Stage 48 contains a task of very large size (45673 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 14:04:12 WARN TaskSetManager: Stage 50 contains a task of very large size (45590 KiB). The maximum recommended task size is 1000 KiB.

In [12]:
time_dict

{1: {'lloyd': 22.632549047470093},
 2: {'lloyd': 30.550265550613403, 'miniBatch': 11.384488105773926},
 4: {'lloyd': 19.4558002948761, 'miniBatch': 8.661375284194946},
 8: {'lloyd': 11.948050737380981, 'miniBatch': 6.0729289054870605},
 16: {'lloyd': 8.140554904937744, 'miniBatch': 5.756534099578857},
 32: {'lloyd': 9.793381929397583, 'miniBatch': 6.788220167160034},
 64: {'lloyd': 11.758658647537231, 'miniBatch': 9.278021097183228},
 128: {'lloyd': 16.789658069610596, 'miniBatch': 13.773572206497192}}

### Cost vs iterations


In [35]:
iterations = np.linspace(10,50,5, dtype=int)
results_dict = {"lloyd_serial":np.zeros(len(iterations)), "lloyd_parallel":np.zeros(len(iterations)), "minibatch":np.zeros(len(iterations))}

In [None]:
for key in results_dict.keys():
    print(key)
    for i in range(len(iterations)):
        print(f"running for {iterations[i]} iterations...")
        if key == 'lloyd_serial':
            final_centroids = lloydKMeans(kdd_data,centroids = centroids, epochs = iterations[i])
            algo_cost = compute_cost(kdd_data,final_centroids)
            results_dict[key][i] = algo_cost
        else:
            break
        

lloyd_serial
running for 10 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 20 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 30 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 40 iterations...


[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

running for 50 iterations...


KeyboardInterrupt: 

[Stage 40:=>              (1 + 15) / 16][Stage 42:====>           (4 + 11) / 16]

Stopping the cluster

In [21]:
sc.stop()
spark.stop()

In [None]:
# stopping the cluster
! $SPARK_HOME/sbin/stop-all.sh
# clearing the `$SPARK_HOME/work` directory in all the nodes
# this avoids cluttering of storage among nodes
! rm -rf $SPARK_HOME/work/*
! ssh worker1 "rm -rf $SPARK_HOME/work/*"
! ssh worker2 "rm -rf $SPARK_HOME/work/*"
! ssh worker3 "rm -rf $SPARK_HOME/work/*"

worker3: stopping org.apache.spark.deploy.worker.Worker
worker1: stopping org.apache.spark.deploy.worker.Worker
worker2: stopping org.apache.spark.deploy.worker.Worker
master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master


In [None]:
kdd_data, kdd_labels, entries_dict = kddSetup(standardize=True)

# get the number of clusters from kdd_labels
k = np.unique(kdd_labels).shape[0]

#parallelize
kdd_rdd = sc.parallelize([row for row in kdd_data])
l = k * 10
iterations = 1_000_000
init_centroids = kMeansParallel_init(kdd_rdd, k, l)
phi_init = compute_cost(kdd_rdd, init_centroids)
final_centroids = miniBatchKMeans(kdd_rdd, init_centroids, iterations, 0.3)
phi_final = compute_cost(kdd_rdd, final_centroids)
print(f"{phi_init:.6g}\t{phi_final:.6g}")

25/09/12 16:10:59 WARN TaskSetManager: Stage 0 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:11:02 WARN TaskSetManager: Stage 1 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:11:02 WARN TaskSetManager: Stage 2 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:11:03 WARN TaskSetManager: Stage 3 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:11:05 WARN TaskSetManager: Stage 4 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:11:05 WARN TaskSetManager: Stage 5 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:11:07 WARN TaskSetManager: Stage 6 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:1

CONVERGED!


25/09/12 16:12:24 WARN TaskSetManager: Stage 47 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.
25/09/12 16:12:25 WARN TaskSetManager: Stage 48 contains a task of very large size (5682 KiB). The maximum recommended task size is 1000 KiB.


38.6195	38.5942
