In [None]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import os
import pandas as pd

from functools import singledispatch

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
from src.utils import kddSetup, sparkSetup
from src.kmeans import compute_centroidDistances, get_clusterId, get_minDistance, \
    kMeansParallel_init, kMeansPlusPlus_init, kMeansRandom_init, \
    miniBatchKMeans, naiveKMeans

ModuleNotFoundError: No module named 'pyspark'

In [3]:
# creating the zipped environment if it doesn't already exist
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# creating the zipped module src
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [4]:
# starting the cluster
# (runs Spark's start-all.sh from $SPARK_HOME; per the recorded output it launches the master and the workers)
! $SPARK_HOME/sbin/start-all.sh

starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.master.Master-1-mapd-b-14-1.out
master: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-1.out
worker1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-2.out
worker3: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-4.out
worker2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-3.out


In [5]:
# telling spark where to find the python binary
# NOTE(review): the path is relative; presumably it points inside the unpacked
# `environment` archive in each executor's working directory — confirm against sparkSetup
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"

In [2]:
# creating a sparkSession
# NOTE(review): the recorded run of this cell failed with NoSuchFileException for
# environment.tar.gz while the SparkContext was being initialized; the archive is
# created by the packing cell, so that cell must be executed before this one.
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# exporting the src module
# NOTE(review): addPyFile is documented for .py / .zip dependencies — confirm that a
# .tar.gz archive is actually importable on the executors.
sc.addPyFile("src.tar.gz")

25/09/10 10:12:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 10:12:50 ERROR SparkContext: Error initializing SparkContext.
java.nio.file.NoSuchFileException: /mapd-workspace/environment.tar.gz
	at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:106)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
	at java.base/sun.nio.fs.UnixCopyFile.copy(UnixCopyFile.java:548)
	at java.base/sun.nio.fs.UnixFileSystemProvider.copy(UnixFileSystemProvider.java:257)
	at java.base/java.nio.file.Files.copy(Files.java:1305)
	at org.apache.spark.util.Utils$.copyRecursive(Utils.scala:681)
	at org.apache.spark.util.Utils$.copyFile(Utils.scala:652)
	a

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.nio.file.NoSuchFileException: /mapd-workspace/environment.tar.gz
	at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:106)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
	at java.base/sun.nio.fs.UnixCopyFile.copy(UnixCopyFile.java:548)
	at java.base/sun.nio.fs.UnixFileSystemProvider.copy(UnixFileSystemProvider.java:257)
	at java.base/java.nio.file.Files.copy(Files.java:1305)
	at org.apache.spark.util.Utils$.copyRecursive(Utils.scala:681)
	at org.apache.spark.util.Utils$.copyFile(Utils.scala:652)
	at org.apache.spark.util.Utils$.doFetchFile(Utils.scala:725)
	at org.apache.spark.util.Utils$.fetchFile(Utils.scala:467)
	at org.apache.spark.SparkContext.addFile(SparkContext.scala:1804)
	at org.apache.spark.SparkContext.$anonfun$new$17(SparkContext.scala:535)
	at org.apache.spark.SparkContext.$anonfun$new$17$adapted(SparkContext.scala:535)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:535)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


### Naming conventions

A single datum is named `datumName`, while an RDD that collects one or more such data is named `datumName_rdd`.

Example: `compute_centroidDistances` returns `centroidDistances` (i.e. a numpy array of squared distances between a point `x` and each of the `centroids`).
The RDD that collects all the `centroidDistances` is called `centroidDistances_rdd`; here is a sample implementation:
```python
def compute_centroidDistances(x, centroids):
    return np.sum((centroids - x)**2, axis = 1)

# `data_rdd` is an RDD
# `centroids` is a numpy array
centroidDistances_rdd = data_rdd \
    .map(lambda x: compute_centroidDistances(x, centroids))
```

### Load and preprocess the dataset

In [None]:
# fetch the KDD data, its labels and the entries dictionary (standardization requested)
kdd_data, kdd_labels, entries_dict = kddSetup(standardize=True)

# the number of clusters is the number of distinct values in kdd_labels
k = len(np.unique(kdd_labels))

# distribute the rows of kdd_data as an RDD and mark it as persisted
data_rdd = sc.parallelize(list(kdd_data)).persist()

In [None]:
# number of clusters used for the initialization
# NOTE(review): this overrides the `k` derived from the labels in the loading cell
k = 15
# passed as the third argument of kMeansParallel_init; presumably the k-means||
# oversampling factor — TODO confirm against src.kmeans
l = k * 10
centroids = kMeansParallel_init(data_rdd, k, l)

25/09/09 08:28:44 WARN TaskSetManager: Stage 24 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/09 08:28:45 WARN TaskSetManager: Stage 25 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [23]:
# run miniBatchKMeans on data_rdd with the centroids computed by the initialization cell
# NOTE(review): 10 and 0.1 are passed positionally; presumably the number of iterations
# and the batch fraction — confirm against src.kmeans.miniBatchKMeans
final_centroids = miniBatchKMeans(data_rdd, centroids, 10, 0.1)

25/09/08 08:28:23 WARN TaskSetManager: Stage 40 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/08 08:28:24 WARN TaskSetManager: Stage 41 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/08 08:28:24 WARN TaskSetManager: Stage 42 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/08 08:28:25 WARN TaskSetManager: Stage 43 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/08 08:28:26 WARN TaskSetManager: Stage 44 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/08 08:28:27 WARN TaskSetManager: Stage 45 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.
25/09/08 08:28:28 WARN TaskSetManager: Stage 46 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.

In [11]:
# stopping the SparkSession also stops its underlying SparkContext (`sc`),
# so a separate sc.stop() call is not needed
spark.stop()

In [24]:
# stopping the cluster
# NOTE(review): the log recorded below this cell shows a still-connected Spark
# application losing its master and executors; stop the SparkSession before running this.
! $SPARK_HOME/sbin/stop-all.sh

worker1: stopping org.apache.spark.deploy.worker.Worker
worker3: stopping org.apache.spark.deploy.worker.Worker
worker2: stopping org.apache.spark.deploy.worker.Worker
master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master


25/09/10 09:18:57 WARN StandaloneAppClient$ClientEndpoint: Connection to 10.67.22.224:7077 failed; waiting for master to reconnect...
25/09/10 09:18:57 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
25/09/10 09:19:02 ERROR TaskSchedulerImpl: Lost executor 0 on 10.67.22.170: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/09/10 09:19:02 ERROR TaskSchedulerImpl: Lost executor 1 on 10.67.22.208: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/09/10 09:19:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_11 !
25/09/10 09:19:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_15 !
25/09/10 09:19:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_7 !
25/09/10 09:19:02 WARN BlockManagerMasterEndpoint: N