In [None]:
# general imports
import os
import time
from functools import singledispatch

import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
# NOTE(review): the notebook also calls `kMeansParallel_init` and `kMeansNaive`, which are not
# imported here or defined in any visible cell — confirm where they live and import them.
from src.utils import kddSetup, sparkSetup
from src.kmeans import compute_centroidDistances, get_clusterId, get_minDistance, kMeansPlusPlus_init, kMeansRandom_init, miniBatchKMeans, naiveKMeans

## Start a Spark session

In [None]:
# Pack the current virtual environment with venv-pack, unless the archive already exists
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# Rebuild the archive of the local `src` package from scratch (a stale archive is removed first)
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [None]:
# Start the cluster by running the start-all.sh script found under $SPARK_HOME/sbin
! $SPARK_HOME/sbin/start-all.sh

In [None]:
# Python interpreter PySpark should use.
# NOTE(review): the relative path presumably points into environment.tar.gz once it is unpacked
# as ./environment on each node — confirm that sparkSetup ships the archive under that name.
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"

In [None]:
# Create the Spark session through the project helper and keep a handle on its SparkContext
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# Ship the packaged `src` module to the cluster.
# NOTE(review): SparkContext.addPyFile is documented for .py / .zip dependencies; confirm that
# a .tar.gz archive is actually importable on the executors (otherwise package `src` as a zip).
sc.addPyFile("src.tar.gz")

## Load and preprocess the dataset

The first dataset we would like to test is a synthetic Gaussian mixture (GaussMixture). To generate it, we sample $k$ centers from a 15-dimensional spherical Gaussian distribution with mean at the origin and variance $R \in \{1, 10, 100\}$. We then add points drawn from Gaussian distributions of unit variance around each center. Given the $k$ centers, this is a mixture of $k$ spherical Gaussians with equal weights.

Reference: Bahmani et al., *Scalable K-Means++* (the k-means|| paper), PVLDB 2012.

In [None]:
def gauss_mixture(
    num_points_per_cluster = 50,
    num_clusters = 10, # k centers
    dim = 15,
    R = 10, # center variances
    seed = 42
):
    """
    Generate a synthetic Gaussian-mixture dataset and return it as a persisted RDD.

    Cluster centers are drawn from N(0, R*I); then `num_points_per_cluster` points
    are drawn from N(center, I) around every center.

    Parameters
    ----------
    num_points_per_cluster : int
        Number of points sampled around each center.
    num_clusters : int
        Number of centers (k).
    dim : int
        Dimensionality of every point.
    R : float
        Variance of the distribution the centers are sampled from.
    seed : int or None
        Value handed to `np.random.seed`. The default (42) reproduces the
        previously hard-coded behaviour.

    Returns
    -------
    RDD
        Persisted RDD with `num_clusters * num_points_per_cluster` elements,
        each a 1-D numpy array of length `dim`.

    Notes
    -----
    - This reseeds NumPy's *global* generator, so every later `np.random.*`
      call in the notebook is affected by it.
    - With a fixed seed, calls that differ only in `R` reuse the same underlying
      random stream.
    - Relies on the notebook-level SparkContext `sc`.
    """
    np.random.seed(seed)
    # Centers generation N(0, R*I)
    centers = np.random.normal(loc=0, scale=np.sqrt(R), size=(num_clusters, dim))
    # Point generation N(center, I) for each cluster
    data = np.concatenate(
        [center + np.random.randn(num_points_per_cluster, dim) for center in centers],
        axis=0
    )
    # One RDD element per row; persisted because callers traverse it repeatedly
    return sc.parallelize(list(data)).persist()

In [None]:
# Generate a small mixture with the default arguments as a quick check
gm10_data_rdd = gauss_mixture()  # R=10, k=10, shape=(500, 15)

In [None]:
# Sanity check: number of RDD elements and length of the first element
num_elements = gm10_data_rdd.count()
element_dim = len(gm10_data_rdd.first())
print(f"Shape of RDD object: ({num_elements}, {element_dim})")

In [None]:
# Collect the whole RDD to the driver and stack its rows into a numpy array
gm10_data_list = gm10_data_rdd.collect()
gm10_data_array = np.array(gm10_data_list)

In [None]:
# Per-column summary statistics of the generated points
gm10_df = pd.DataFrame(gm10_data_array)
gm10_df.describe()

In [None]:
# Seed k centroids on the small mixture with l = 2k (l is presumably the k-means|| oversampling factor)
# NOTE(review): kMeansParallel_init is neither imported nor defined in the visible cells, so this
# cell fails on a fresh kernel unless it is brought into scope — confirm where it is defined.
k = 10
l = 2*k
centroids = kMeansParallel_init(gm10_data_rdd, k, l)

print(centroids.shape)

In [None]:
# Refine the seed centroids with mini-batch k-means; all remaining arguments are left at their defaults
final_centroids = miniBatchKMeans(gm10_data_rdd, centroids)

print(final_centroids.shape)

In [None]:
# KDD Cup 99 dataset
# Set percent10 to 'False' to fetch the full dataset (4M rows)
kdd = fetch_kddcup99(percent10=True, shuffle=True)
kdd_data = kdd.data

# Drop the string features (columns 1 to 3), then standardize the remaining columns
data_kdd = np.delete(kdd_data, [1, 2, 3], axis=1)
scaler_kdd = StandardScaler()
data_kdd = scaler_kdd.fit_transform(data_kdd)

# Distribute the rows (one RDD element per row) and keep the RDD persisted
kdd_data_rdd = sc.parallelize(list(data_kdd)).persist()

In [None]:
# Sanity check: number of RDD elements and length of the first element
num_elements = kdd_data_rdd.count()
element_dim = len(kdd_data_rdd.first())
print(f"Shape of RDD object: ({num_elements}, {element_dim})")

## Benchmark testing

In [None]:
def cost_function(data: RDD, centroids: npt.ArrayLike) -> float:
    """
    Clustering cost: the sum, over every element of `data`, of the value
    `get_minDistance(compute_centroidDistances(x, centroids))`.

    Parameters
    ----------
    data : RDD
        Points, one per RDD element (N elements of dimension d).
    centroids : array-like
        Centroids, shape (k, d).

    Returns
    -------
    float
        Total cost. Presumably this is the sum of squared distances to the nearest
        centroid — that depends on the `src.kmeans` helpers; confirm there.

    Notes
    -----
    The per-point distances are consumed exactly once, so nothing is persisted here:
    caching an intermediate RDD that is never unpersisted would only occupy executor
    storage on every call.
    """
    return data \
        .map(lambda x: get_minDistance(compute_centroidDistances(x, centroids))) \
        .sum()

In [None]:
def analysis_init(
        data_array: npt.ArrayLike, 
        data_rdd: RDD, 
        r: int, 
        k: int, 
        iterations: int=10, 
        batch_fraction: float=0.1
        ) -> pd.DataFrame:
    """
    Compare centroid-initialization strategies, refining each seed with `kMeansNaive`.

    For random sampling, k-means++ and two k-means|| settings (l = k/2 and l = 2k)
    this records the initialization time, the *seed cost* (cost right after
    initialization) and the *final cost* (cost of the centroids returned by
    `kMeansNaive(data_array, seed_centroids, iterations)`).

    Parameters
    ----------
    data_array : numpy array
        Points as rows; used by the random and k-means++ initializers and by `kMeansNaive`.
    data_rdd : RDD
        RDD used by the k-means|| initializer and by `cost_function`.
        Expected to hold the same points as `data_array`.
    r : int
        Forwarded to `kMeansParallel_init` as `r` and stored in the results.
    k : int
        Number of centroids.
    iterations : int
        Forwarded to `kMeansNaive`.
    batch_fraction : float
        Unused here; kept so the signature matches `analysis_fin`.

    Returns
    -------
    pd.DataFrame
        One row per strategy with columns
        method, k, l, r, initialization_time, seed, final.

    Notes
    -----
    NOTE(review): `kMeansNaive` and `kMeansParallel_init` are not imported or defined
    in the visible notebook; they must be in scope before this function is called.
    """
    # (method label, value stored in column "l", zero-argument initializer)
    strategies = [
        ("random", np.nan,
         lambda: data_array[np.random.choice(data_array.shape[0], size=k, replace=False)]),
        ("kmeans++", np.nan,
         lambda: kMeansPlusPlus_init(data_array, k)),
        ("kmeans|| (l=k/2)", 0.5*k,
         lambda: kMeansParallel_init(data_rdd, k=k, l=0.5*k, r=r)),
        ("kmeans|| (l=2k)", 2*k,
         lambda: kMeansParallel_init(data_rdd, k=k, l=2*k, r=r)),
    ]

    results = []
    for method, oversampling, initialize in strategies:
        # Time the initialization alone
        start = time.perf_counter()
        seed_centroids = initialize()
        init_time = time.perf_counter() - start

        seed_cost = cost_function(data_rdd, seed_centroids)

        # Refine starting from THIS strategy's seed, so that "final" reflects
        # the initialization being measured
        final_centroids = kMeansNaive(data_array, seed_centroids, iterations)
        final_cost = cost_function(data_rdd, final_centroids)

        results.append({
            "method": method,
            "k": k,
            "l": oversampling,
            "r": r,
            "initialization_time": init_time,
            "seed": seed_cost,
            "final": final_cost
        })

    return pd.DataFrame(results)

In [None]:
def analysis_fin(
        data_array: npt.ArrayLike, 
        data_rdd: RDD, 
        r: int, 
        k: int, 
        iterations: int=10, 
        batch_fraction: float=0.1
        ) -> pd.DataFrame:
    """
    Compare centroid-initialization strategies, refining each seed with `miniBatchKMeans`.

    For random sampling, k-means++ and two k-means|| settings (l = k/2 and l = 2k)
    this records the initialization time, the *seed cost* (cost right after
    initialization) and the *final cost* (cost of the centroids returned by
    `miniBatchKMeans(data_rdd, seed_centroids, iterations, batch_fraction)`).

    Parameters
    ----------
    data_array : numpy array
        Points as rows; used by the random and k-means++ initializers.
    data_rdd : RDD
        RDD used by the k-means|| initializer, by `miniBatchKMeans` and by
        `cost_function`. Expected to hold the same points as `data_array`.
    r : int
        Forwarded to `kMeansParallel_init` as `r` and stored in the results.
    k : int
        Number of centroids.
    iterations : int
        Forwarded to `miniBatchKMeans`.
    batch_fraction : float
        Forwarded to `miniBatchKMeans`.

    Returns
    -------
    pd.DataFrame
        One row per strategy with columns
        method, k, l, r, initialization_time, seed, final.

    Notes
    -----
    - Costs are evaluated on `data_rdd`: `cost_function` uses RDD operations, so a
      numpy array cannot be passed to it.
    - NOTE(review): `kMeansParallel_init` is not imported or defined in the visible
      notebook; it must be in scope before this function is called.
    """
    # (method label, value stored in column "l", zero-argument initializer)
    strategies = [
        ("random", np.nan,
         lambda: data_array[np.random.choice(data_array.shape[0], size=k, replace=False)]),
        ("kmeans++", np.nan,
         lambda: kMeansPlusPlus_init(data_array, k)),
        # r is forwarded so that the "r" stored in the results is the value actually used
        ("kmeans|| (l=k/2)", 0.5*k,
         lambda: kMeansParallel_init(data_rdd, k=k, l=0.5*k, r=r)),
        ("kmeans|| (l=2k)", 2*k,
         lambda: kMeansParallel_init(data_rdd, k=k, l=2*k, r=r)),
    ]

    results = []
    for method, oversampling, initialize in strategies:
        # Time the initialization alone
        start = time.perf_counter()
        seed_centroids = initialize()
        init_time = time.perf_counter() - start

        seed_cost = cost_function(data_rdd, seed_centroids)

        final_centroids = miniBatchKMeans(data_rdd, seed_centroids, iterations, batch_fraction)
        final_cost = cost_function(data_rdd, final_centroids)

        results.append({
            "method": method,
            "k": k,
            "l": oversampling,
            "r": r,
            "initialization_time": init_time,
            "seed": seed_cost,
            "final": final_cost
        })

    return pd.DataFrame(results)

### Paper analysis on the Gaussian mixture

In [None]:
R = [1, 10, 100]

# Run the initialization benchmark once per center variance R
for RR in R:
    data_rdd = gauss_mixture(num_points_per_cluster=5000, num_clusters=50, dim=15, R=RR)
    try:
        # Driver-side numpy copy of the same points
        data_list = data_rdd.collect()
        data_array = np.array(data_list)

        df = analysis_init(data_array, data_rdd, r=5, k=50)
        print("R =", RR, "\n")
        print(df)
    finally:
        # gauss_mixture returns a persisted RDD; release it so the cached
        # copies do not accumulate across iterations
        data_rdd.unpersist()

In [None]:
# Stop the cluster by running the stop-all.sh script found under $SPARK_HOME/sbin
! $SPARK_HOME/sbin/stop-all.sh