In [1]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import os
import pandas as pd
import time
import warnings, logging

from functools import singledispatch

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
from src.utils import kddSetup, sparkSetup
from src.kmeans.base import compute_cost
from src.kmeans.initialization import kMeansParallel_init, kMeansPlusPlus_init, kMeansRandom_init
from src.kmeans.update import lloydKMeans, miniBatchKMeans

Starting the cluster

In [2]:
# creating the zipped environment if it doesn't already exist
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# creating the zipped module src
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [3]:
# starting the cluster
! $SPARK_HOME/sbin/start-all.sh

starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.master.Master-1-mapd-b-14-1.out
master: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-1.out
worker1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-2.out
worker2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-3.out
worker3: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-4.out


In [4]:
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"

In [5]:
# creating a sparkSession
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# exporting the src module
sc.addPyFile("src.tar.gz")
sc.setLogLevel("ERROR")
# Setup the spark warnings
log4j_conf_path = "./Settings//log4j.properties"
warnings.filterwarnings("ignore") 
logging.getLogger('py4j').setLevel(logging.ERROR) 
logging.getLogger('pyspark').setLevel(logging.ERROR) 

25/09/12 19:55:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/12 19:55:53 WARN Utils: Untarring behavior will be deprecated at spark.files and SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive instead.


Loading the dataset

In [6]:
kdd_data, kdd_labels, entries_dict = kddSetup(standardize=True)

In [7]:
k = np.unique(kdd_labels).shape[0]
l = 0.5 * k

In [8]:
data_rdd = sc.parallelize([row for row in kdd_data]).persist()
centroids = kMeansParallel_init(data_rdd, k, l)
data_rdd.unpersist()

                                                                                

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

In [9]:
time_dict = {}

Lloyd's kMeans serial

In [10]:
start_time = time.time()
final_centroids = lloydKMeans(kdd_data, centroids, epochs = 10)
delta_time = time.time() - start_time
time_dict[1] = {"lloyd": delta_time}

Parallel

In [11]:
partitions = [2, 4, 8, 16, 32, 64, 128]
for numSlices in partitions:
    time_dict[numSlices] = {}
    data_rdd = sc.parallelize([row for row in kdd_data], numSlices=numSlices).persist()
    
    start_time = time.time()
    final_centroids = lloydKMeans(data_rdd, centroids, epochs = 10)
    delta_time = time.time() - start_time
    time_dict[numSlices]["lloyd"] = delta_time

    start_time = time.time()
    final_centroids = miniBatchKMeans(data_rdd, centroids, epochs = 10)
    delta_time = time.time() - start_time
    time_dict[numSlices]["miniBatch"] = delta_time
    
    data_rdd.unpersist()

                                                                                

CONVERGED! in 4 iterations


                                                                                

CONVERGED! in 5 iterations


                                                                                

CONVERGED! in 5 iterations


                                                                                

CONVERGED! in 7 iterations


                                                                                

CONVERGED! in 4 iterations


                                                                                

CONVERGED! in 4 iterations


                                                                                

In [12]:
time_dict

{1: {'lloyd': 26.646967887878418},
 2: {'lloyd': 111.75289154052734, 'miniBatch': 14.334075689315796},
 4: {'lloyd': 63.27473545074463, 'miniBatch': 14.519975185394287},
 8: {'lloyd': 39.25138187408447, 'miniBatch': 25.16306710243225},
 16: {'lloyd': 31.23866581916809, 'miniBatch': 8.106281757354736},
 32: {'lloyd': 32.67539715766907, 'miniBatch': 15.810601234436035},
 64: {'lloyd': 38.274736642837524, 'miniBatch': 7.483483076095581},
 128: {'lloyd': 52.767820835113525, 'miniBatch': 10.829732656478882}}

### Cost vs iterations


In [None]:
max_epochs = 200

# final results dictionary
results_dict = {
    "lloyd_serial": [],
    "lloyd_parallel": [],
    "minibatch_01": [],
    "minibatch_02": [],
    "minibatch_05": [],
    "minibatch_07": []
}

# partial_results
patience = 3       # number of previous costs to average for stopping
threshold = 1e-3    # convergence threshold

# Parameters
n_runs = 3  # number of independent runs
all_histories_runs = {key: [] for key in results_dict.keys()}

In [None]:
for key in results_dict.keys():
    print(f"Running {key}")

    for run in range(n_runs):
        print(f"  Run {run+1}/{n_runs}")
        history = []

        current_centroids = centroids.copy()  # initialize centroids

        # compute initial cost
        if key.startswith("minibatch") or key == "lloyd_parallel":
            initial_cost = compute_cost(data_rdd, current_centroids)
        else:  # lloyd_serial
            initial_cost = compute_cost(kdd_data, current_centroids)
        history.append(initial_cost)

        # compute costs for each epoch
        for epoch in range(max_epochs):
            if key == 'lloyd_serial':
                current_centroids = lloydKMeans(kdd_data, centroids=current_centroids, epochs=1)
                cost = compute_cost(kdd_data, current_centroids)
            elif key == "lloyd_parallel":
                current_centroids = lloydKMeans(data_rdd, centroids=current_centroids, epochs=1)
                cost = compute_cost(data_rdd, current_centroids)
            elif key.startswith("minibatch"):
                fraction = {
                    "minibatch_01": 0.1,
                    "minibatch_02": 0.2,
                    "minibatch_05": 0.5,
                    "minibatch_07": 0.7,
                }[key]

                current_centroids = miniBatchKMeans(
                    data_rdd, centroids=current_centroids, epochs=1, batch_fraction=fraction
                )
                cost = compute_cost(data_rdd, current_centroids)

            history.append(cost)

            # early stopping: compare current cost to average of last `patience` costs
            if len(history) > patience:
                recent_avg = np.mean(history[-patience:])
                if abs(history[-1] - recent_avg) < threshold:
                    break

        # pad history to max_epochs
        if len(history) < max_epochs + 1:  # +1 because of initial cost
            history += [history[-1]] * ((max_epochs + 1) - len(history))

        all_histories_runs[key].append(history)

In [None]:
# Compute mean and std for each algorithm
results_mean_std = {}
for key, histories in all_histories_runs.items():
    arr = np.array(histories)  # shape: (n_runs, max_epochs+1)
    mean_cost = arr.mean(axis=0)
    std_cost = arr.std(axis=0)
    results_mean_std[key] = {"mean": mean_cost, "std": std_cost}



In [None]:
plt.figure(figsize=(12, 6))

# different markers
markers = ['o', 's', '^', 'D', 'x', '*']

offsets = [0,1,0,0,1,-1]
for i, (key, stats) in enumerate(results_mean_std.items()):
    mean = stats["mean"]
    std = stats["std"]
    epochs = np.arange(len(mean))

    # style
    if "lloyd" in key:
        line_style = "--"   
    else:
        line_style = "-"    

    # cycle markers
    marker = markers[i]

    #offset to avoid errorbar overlaps
    
    plt.errorbar(
        epochs[::10+offsets[i]], mean[::10+offsets[i]], yerr=std[::10+offsets[i]],
        fmt=line_style+marker, capsize=3, label=key
    )
    offset += 1

plt.xlabel("Epoch", fontsize=14)
plt.ylabel("Cost", fontsize=14)
plt.title("K-Means Convergence: Cost vs Epochs", fontsize=16)
plt.grid(alpha=0.6)
plt.legend(title="Algorithm:", loc="best")
plt.show()


Stopping the cluster

In [13]:
sc.stop()
spark.stop()

In [14]:
# stopping the cluster
! $SPARK_HOME/sbin/stop-all.sh
# clearing the `$SPARK_HOME/work` directory in all the nodes
# this avoids cluttering of storage among nodes
! rm -rf $SPARK_HOME/work/*
! ssh worker1 "rm -rf $SPARK_HOME/work/*"
! ssh worker2 "rm -rf $SPARK_HOME/work/*"
! ssh worker3 "rm -rf $SPARK_HOME/work/*"

worker1: stopping org.apache.spark.deploy.worker.Worker
worker3: stopping org.apache.spark.deploy.worker.Worker
worker2: stopping org.apache.spark.deploy.worker.Worker
master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master
