In [2]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import os
import pandas as pd

from functools import singledispatch

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
from src.utils import kddSetup, sparkSetup
from src.kmeans import compute_centroidDistances, get_clusterId, get_minDistance, \
    kMeansParallel_init, kMeansPlusPlus_init, kMeansRandom_init, \
    miniBatchKMeans, naiveKMeans

In [3]:
# pack the active virtual environment once; an existing environment.tar.gz is
# reused as-is, so delete it by hand after changing the installed packages
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# (re)build the src archive on every run: tar -c overwrites an existing
# src.tar.gz, and --exclude skips bytecode caches at any depth without
# deleting them from the working tree
! tar --exclude="__pycache__" -czf src.tar.gz src

In [4]:
# starting the cluster (standalone master and workers);
# SPARK_HOME is quoted so a path containing spaces is not word-split
! "$SPARK_HOME"/sbin/start-all.sh

starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.master.Master-1-mapd-b-14-1.out
worker1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-2.out
worker2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-3.out
worker3: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-4.out
master: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-1.out


In [5]:
# interpreter path Spark should use for its Python workers; the relative path
# presumably resolves inside the unpacked environment archive — TODO confirm
os.environ.update(PYSPARK_PYTHON="./environment/bin/python")

In [6]:
# archive holding the packed src module
src_archive = "src.tar.gz"

# build the session used by the rest of the notebook and grab its context
spark = sparkSetup("kMeans")
sc = spark.sparkContext

# ship the src module alongside the job
# NOTE(review): when this runs Spark logs that untarring through addFile is
# deprecated; a .zip passed to addPyFile may avoid relying on it — confirm
sc.addPyFile(src_archive)

25/09/10 14:55:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 14:55:10 WARN Utils: Untarring behavior will be deprecated at spark.files and SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive instead.


### Naming conventions

A single datum is named `datumName`, while the RDD holding a collection of one or more such data is named `datumName_rdd`.

Example: `compute_centroidDistances` returns `centroidDistances` (i.e. a numpy array of squared Euclidean distances between a point `x` and each of the `centroids`).
The RDD that collects all the `centroidDistances` is called `centroidDistances_rdd`; here is a sample implementation:
```python
def compute_centroidDistances(x, centroids):
    return np.sum((centroids - x)**2, axis = 1)

# `data_rdd` is an RDD
# `centroids` is a numpy array
centroidDistances_rdd = data_rdd \
    .map(lambda x: compute_centroidDistances(x, centroids))
```

### Load and preprocess the dataset

In [7]:
# load the dataset through the project helper, asking for standardized features
kdd_data, kdd_labels, entries_dict = kddSetup(standardize=True)

# number of distinct labels found in kdd_labels
# NOTE(review): the next cell reassigns k = 15, so this value is not used there
k = len(np.unique(kdd_labels))

# distribute the rows one element per row and keep the RDD cached for reuse
# NOTE(review): Spark warns about very large tasks (~10 MiB) for this RDD;
# passing numSlices to parallelize may help — confirm before changing
data_rdd = sc.parallelize(list(kdd_data)).persist()

In [8]:
# cluster count used for this run
k = 15
# third argument of the initializer, set to ten times k
# (presumably the k-means|| oversampling factor — TODO confirm in src.kmeans)
l = 10 * k
# initial centroids computed on the cached RDD
centroids = kMeansParallel_init(data_rdd, k, l)

25/09/10 14:55:28 WARN TaskSetManager: Stage 0 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 14:55:32 WARN TaskSetManager: Stage 1 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 14:55:32 WARN TaskSetManager: Stage 2 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 14:55:34 WARN TaskSetManager: Stage 3 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 14:55:35 WARN TaskSetManager: Stage 4 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 14:55:37 WARN TaskSetManager: Stage 5 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 14:55:38 WARN TaskSetManager: Stage 6 contains a task of very large size (10603 KiB). The maximum recommended task size is 1000 KiB.
ERROR:

KeyboardInterrupt: 

                                                                                

In [None]:
# refine the initial centroids with the mini-batch routine
# NOTE(review): 10 and 0.1 are presumably the iteration count and the batch
# fraction — confirm against src.kmeans.miniBatchKMeans
final_centroids = miniBatchKMeans(
    data_rdd,
    centroids,
    10,
    0.1,
)

In [9]:
# shut the session down; assuming sparkSetup returns a SparkSession, its
# stop() also stops the underlying SparkContext (`sc`), so a separate
# sc.stop() call is not needed
spark.stop()

In [10]:
# stopping the cluster (workers and master);
# SPARK_HOME is quoted so a path containing spaces is not word-split
! "$SPARK_HOME"/sbin/stop-all.sh

worker3: stopping org.apache.spark.deploy.worker.Worker
worker1: stopping org.apache.spark.deploy.worker.Worker
worker2: stopping org.apache.spark.deploy.worker.Worker
master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master
