In [1]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd
import time
import os

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark import SparkContext, SparkConf
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, col

# src module
from src.utils import sparkSetup
from src.kmeans import compute_centroidDistances, get_clusterId, get_minDistance

## Import algorithms functions

In [2]:
def kMeansPlusPlus_init(
    data: npt.NDArray,
    k: int,
    weights: npt.NDArray = np.array([])
) -> npt.NDArray:
    
    #Standard kMeans++ initialization method:
    #given `data` (eventually weighted), returns `k` cluster centroids
    
    if weights.shape[0] == 0:
        weights = np.ones(shape=(data.shape[0],))
    
    centroids = data[np.random.randint(0, data.shape[0]),:].reshape(1, -1) # reshaping for easier stacking
    
    while (centroids.shape[0] < k):
        # since the original functions are made for map
        # we need to loop over the data
        minDistance_array = np.array(
            [get_minDistance(compute_centroidDistances(datum, centroids)) for datum in data]
        ) * weights # multiplyling by the weight simulates multiple copies of the same datum
        total_minDistance = np.sum(minDistance_array)
        # sampling probability proportional to minDistance
        new_centroid_idx = np.random.choice(minDistance_array.shape[0], size = 1, p = minDistance_array / total_minDistance)
        new_centroid = data[new_centroid_idx,:].reshape(1, -1)

        # edge case in which the same centroid is selected twice:
        # redo the iteration without saving the centroid
        if any(np.array_equal(new_centroid, row) for row in centroids): continue
        centroids = np.concatenate((centroids, new_centroid), axis = 0)

    return centroids

In [3]:
def kMeansNaive(
    data: npt.NDArray,
    centroids: npt.NDArray,
    epochs: int = 5
) -> npt.NDArray:
    """
    Standard kMeans algorithm:
    given `data`, updates the (k) `centroids` for `epochs` times,
    improving the clustering each time
    """
    k = centroids.shape[0]
    for _ in range(epochs):
        assignments = np.array(
            [get_clusterId(compute_centroidDistances(x, centroids)) for x in data]
        )
        centroids = np.array(
            [np.mean(data[assignments==i,:], axis = 0) for i in range(k)]
        )
    return centroids

In [35]:
def kMeansParallel_init(
    data_rdd: RDD,
    k: int,
    l: float,
    r: int,
) -> npt.NDArray:
    """
    kMeans|| initialization method:
    returns `k` good `centroids`.
    `l` controls the probability of each point
    in `data_rdd` of being sampled as a pre-processed centroid.
    """

    centroids = np.array(
        data_rdd.takeSample(num=1, withReplacement=False)
    )
    # centroid = data_rdd.takeSample(num=1, withReplacement=False, seed=42)[0]
    # centroids = np.array([centroid])

    
    minDistance_rdd = data_rdd \
        .map(lambda x: (x, get_minDistance(compute_centroidDistances(x, centroids)))) \
        .persist()

    cost = minDistance_rdd \
        .map(lambda x: x[1]) \
        .sum()
    
    if r!=0: iterations = int(np.ceil(np.log(cost))) if (cost > 1) else 1
    else: iterations = r
    
    for _ in range(iterations):
        new_centroids = np.array(
            minDistance_rdd \
                .filter(lambda x: np.random.rand() < np.min((l * x[1] / cost, 1))) \
                .map(lambda x: x[0]) \
                .collect()
        )
        # edge case in which no new centroid is sampled:
        # this avoids the following `np.concatenate` to fail
        if len(new_centroids.shape) < 2:
            continue

        minDistance_rdd.unpersist()
        centroids = np.unique(
            np.concatenate((centroids, new_centroids), axis = 0), 
            axis = 0
        )

        minDistance_rdd = data_rdd \
            .map(lambda x: (x, get_minDistance(compute_centroidDistances(x, centroids)))) \
            .persist()
        cost = minDistance_rdd \
            .map(lambda x: x[1]) \
            .sum()
    
    minDistance_rdd.unpersist()
    clusterCounts = data_rdd \
        .map(lambda x: (get_clusterId(compute_centroidDistances(x, centroids)), 1)) \
        .countByKey()
    
    clusterCounts = np.array([w[1] for w in clusterCounts.items()])
    centroids = kMeansNaive(
        centroids, 
        kMeansPlusPlus_init(centroids, k, clusterCounts)
    )
    
    return centroids

In [5]:
def miniBatchKMeans(
    data_rdd: RDD,
    centroids: npt.NDArray,
    iterations: int = 10,
    batch_fraction: float = 0.1
) -> npt.NDArray:
    k = centroids.shape[0]
    clusterCounters = np.zeros((k,)) # 1 / learning_rate
    for iter in range(iterations):
        miniBatch_rdd = data_rdd \
            .sample(withReplacement=False, fraction=batch_fraction)
        miniBatch_rdd = miniBatch_rdd \
            .map(lambda x: (get_clusterId(compute_centroidDistances(x, centroids)), 1, x)) \
            .persist()
        
        # counting how many assigments per cluster
        clusterCounts_dict = miniBatch_rdd \
            .map(lambda x: (x[0], x[1])) \
            .countByKey()
        clusterCounts = np.array(
            [clusterCounts_dict[i] if i in clusterCounts_dict.keys() else 0 for i in range(k)]
        )
        clusterCounters += clusterCounts
        
        # edge case in which a cluster has no assignments:
        # if also its counter is zero the whole iteration is repeated
        if any(np.isclose(v, 0) for v in clusterCounters): 
            iter -= 1
            miniBatch_rdd.unpersist()
            continue
        # otherwise its count will be set to 1 to avoid division by 0 in the update step
        clusterCounts = np.where(clusterCounts >= 1, clusterCounts, 1)

        # summing all points assigned to the same cluster
        # (in the update step this will be divided by the counts 
        # in order to get the mean for every cluster).
        # A dict is used for convenience and consistency with clusterCounts
        """clusterSums_dict = dict(miniBatch_rdd \
            .map(lambda x: (x[0], x[2])) \
            .reduceByKey(lambda x, y: x + y) \
            .collect()
        )
        # edge case in which a cluster has no assignments:
        # the centroid is returned instead of 0 
        # (which would have been the sum of its assigned points) 
        # in order to not update its position 
        # (note how the terms cancel out in the update step)
        clusterSums = np.array([
            np.array(clusterSums_dict[i]) if i in clusterSums_dict else centroids[i, :]
            for i in range(k)
        ])"""
        # summing all points assigned to the same cluster
        clusterSums_dict = dict(
            miniBatch_rdd
                .map(lambda x: (x[0], np.array(x[2])))  # FORZA conversione a numpy
                .reduceByKey(lambda x, y: x + y)
                .collect()
        )
        
        clusterSums = np.stack([
            clusterSums_dict[i] if i in clusterSums_dict else centroids[i, :]
            for i in range(k)
        ])



        # update step: c <- (1 - eta) * c + eta * x_mean
        # (note x_mean = x_sums / c_count)
        centroids = (1 - 1 / clusterCounters).reshape(-1, 1) * centroids + \
                    (1 / (clusterCounters * clusterCounts)).reshape(-1, 1) * clusterSums
        
        miniBatch_rdd.unpersist()
        
    return centroids

## Start a Spark session

In [6]:
# creating the zipped environment if it doesn't already exist
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# creating the zipped module src
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [7]:
# starting the cluster
! $SPARK_HOME/sbin/start-all.sh

org.apache.spark.deploy.master.Master running as process 168659.  Stop it first.
worker3: org.apache.spark.deploy.worker.Worker running as process 61429.  Stop it first.
worker1: org.apache.spark.deploy.worker.Worker running as process 61790.  Stop it first.
worker2: org.apache.spark.deploy.worker.Worker running as process 60686.  Stop it first.
master: org.apache.spark.deploy.worker.Worker running as process 168868.  Stop it first.


In [8]:
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"

In [9]:
# creating a sparkSession
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# exporting the src module
sc.addPyFile("src.tar.gz") # telling spark where to find the python binary

25/09/10 12:35:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 12:35:42 WARN Utils: Untarring behavior will be deprecated at spark.files and SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive instead.


## Load and preprocess the dataset

In [10]:
"""# Generated dataset

# Parameters for toy dataset
num_points_per_cluster = 50
num_clusters = 10
dim = 3  # 2D points for easy visualization
spread = 0.5
seed = 42
np.random.seed(seed)

centers = np.random.uniform(-10, 10, (num_clusters, dim))
data = np.concatenate(
    [center + spread * np.random.randn(num_points_per_cluster, dim) for center in centers],
    axis = 0
)
data_rdd = sc.parallelize([row for row in data])

data_rdd = data_rdd.persist()"""

'# Generated dataset\n\n# Parameters for toy dataset\nnum_points_per_cluster = 50\nnum_clusters = 10\ndim = 3  # 2D points for easy visualization\nspread = 0.5\nseed = 42\nnp.random.seed(seed)\n\ncenters = np.random.uniform(-10, 10, (num_clusters, dim))\ndata = np.concatenate(\n    [center + spread * np.random.randn(num_points_per_cluster, dim) for center in centers],\n    axis = 0\n)\ndata_rdd = sc.parallelize([row for row in data])\n\ndata_rdd = data_rdd.persist()'

The first dataset we would like to test is a synthetic GaussMixture. To generate it, we sampled kcenters from a 15-dimensional spherical Gaussian distribution with mean at the origin and variance R∈{1,10,100}. We then added points from Gaussian distributions of unit variance around each center. Given the k centers, this is a mixture of k spherical Gaussians with equal weights.

ref{paper kmeans||}

In [11]:
def gauss_mixture(
    num_points_per_cluster = 50,  
    num_clusters = 10, # k centers
    dim = 15,                  
    R = 10 # center variances
):
    seed = 42
    np.random.seed(seed)
    # Centers generation N(0, R*I)
    centers = np.random.normal(loc=0, scale=np.sqrt(R), size=(num_clusters, dim))
    # Point generation N(center, I) for each cluster
    data = np.concatenate(
        [center + np.random.randn(num_points_per_cluster, dim) for center in centers],
        axis=0
    )
    # Distributes and stores in memory data across worker nodes
    gm_data_rdd = sc.parallelize([row for row in data])
    gm_data_rdd = gm_data_rdd.persist()
    
    return gm_data_rdd

In [12]:
# try to generate data
gm10_data_rdd = gauss_mixture()  # R=10, k=10, shape=(500, 15)

In [13]:
num_elements = gm10_data_rdd.count()
element_dim = len(gm10_data_rdd.first())
print(f"Shape of RDD object: ({num_elements}, {element_dim})")

[Stage 0:>                                                          (0 + 2) / 2]

Shape of RDD object: (500, 15)


                                                                                

In [14]:
# Convert to numpy array
gm10_data_list = gm10_data_rdd.collect()
gm10_data_array = np.array(gm10_data_list)

In [15]:
gm10_df = pd.DataFrame(gm10_data_array)
gm10_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.50638,0.340572,0.384488,-0.602726,-1.591257,0.164334,0.606168,-0.411214,-0.051488,-0.250236,-0.174647,-0.392226,-1.078956,-0.076183,-1.373043
std,2.504021,3.423244,2.5164,3.05926,3.315899,4.208496,2.503849,3.507559,3.428951,2.097866,2.818569,2.818404,2.199171,3.244448,3.654475
min,-4.764675,-6.124204,-6.405527,-7.140586,-9.117571,-8.621237,-5.096887,-8.200883,-6.804307,-4.943611,-6.953143,-6.488304,-6.595198,-7.921115,-10.124439
25%,-1.575652,-2.218023,-1.464426,-3.104237,-4.642731,-3.497547,-1.018609,-2.927902,-2.151891,-1.879498,-2.138697,-2.428322,-2.83948,-2.298371,-4.217135
50%,0.553398,-0.633207,0.276181,-0.636885,-1.216215,0.060562,0.31585,0.124552,-0.124692,-0.478456,0.472355,-0.662115,-0.884332,-0.2083,-0.476261
75%,2.204466,3.051157,2.131623,1.454351,1.305417,3.789406,1.912839,2.198466,1.418633,1.146245,1.96724,1.209987,0.642208,2.240366,1.633263
max,6.950031,8.302471,6.801433,7.006046,4.889473,9.706793,7.06931,7.825568,9.887677,5.8811,5.445155,8.106792,3.854257,7.459394,5.173387


In [16]:
k = 10
l = 2*k
centroids = kMeansParallel_init(gm10_data_rdd, k, l)

print(centroids.shape)

(10, 15)


In [19]:
final_centroids = miniBatchKMeans(gm10_data_rdd, centroids)

print(final_centroids.shape)

(10, 15)


In [17]:
# KDD dataset
# Change percent10 to 'False' to fetch the full dataset (4M rows)
kdd = fetch_kddcup99(shuffle=True, percent10=True) 
kdd_data = kdd.data

# Remove string features and standardize them
data_kdd = np.delete(kdd_data,np.arange(1,4,1),axis = 1) 
scaler_kdd = StandardScaler()
data_kdd = scaler_kdd.fit_transform(data_kdd)

# Parallelize
kdd_data_rdd = sc.parallelize([row for row in data_kdd])
# data_rdd = sc.parallelize(np.array(data).tolist(), numSlices=16)
kdd_data_rdd = kdd_data_rdd.persist()

In [18]:
num_elements = kdd_data_rdd.count()
element_dim = len(kdd_data_rdd.first())
print(f"Shape of RDD object: ({num_elements}, {element_dim})")

25/09/10 12:36:29 WARN TaskSetManager: Stage 31 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.

Shape of RDD object: (494021, 38)


25/09/10 12:36:32 WARN TaskSetManager: Stage 32 contains a task of very large size (10124 KiB). The maximum recommended task size is 1000 KiB.


## Benchmark testing

In [22]:
def cost_function(data: RDD, centroids: npt.ArrayLike) -> float:
    """
    Compute cost function: square distance between data and centroids
    data: (N, d)
    centroids: (k, d)
    """
    minDistance_rdd = data \
        .map(lambda x: (x, get_minDistance(compute_centroidDistances(x, centroids)))) \
        .persist()
    cost = minDistance_rdd \
        .map(lambda x: x[1]) \
        .sum()
    return cost

In [None]:
def analysis_init(
        data_array: npt.ArrayLike, 
        data_rdd: RDD, 
        r: int, 
        k: int, 
        iterations: int=10, 
        batch_fraction: float=0.1
        ) -> npt.DTypeLike:
    results = []
    """"
    Computes performance tracking metrics for different initialization algorithms with *seed cost* (after initialization) and *final cost* (after miniBatchKmean iterations)
    """


    # 1 - Random initialization
    start = time.time()
    centroidsRandom = data_array[np.random.choice(data_array.shape[0], size=k, replace=False)]
    init_time = time.time() - start

    seed_cost = cost_function(data_rdd, centroidsRandom)

    #final_centroids = miniBatchKMeans(data_rdd, centroidsRandom, iterations, batch_fraction)
    final_centroids = kMeansNaive(data_array, centroidsRandom, iterations)

    final_cost = cost_function(data_rdd, final_centroids)
    
    results.append({
        "method": "random",
        "k": k, 
        "l": np.nan,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })


    # 2 - k-means++
    start = time.time()
    centroidsPlusPlus = kMeansPlusPlus_init(data_array, k)
    init_time = time.time() - start

    seed_cost = cost_function(data_rdd, centroidsPlusPlus)
    #final_centroids = miniBatchKMeans(data_rdd, centroidsPlusPlus, iterations, batch_fraction)
    final_centroids = kMeansNaive(data_array, centroidsRandom, iterations)

    final_cost = cost_function(data_rdd, final_centroids)

    results.append({
        "method": "kmeans++",
        "k": k,
        "l": np.nan,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })


    # 3 - Parallel l=0.5k
    start = time.time()
    centroidsParallel1 = kMeansParallel_init(data_rdd, k=k, l=0.5*k, r=r)
    init_time = time.time() - start

    seed_cost = cost_function(data_rdd, centroidsParallel1)

    #final_centroids = miniBatchKMeans(data_rdd, centroidsParallel1, iterations, batch_fraction)
    final_centroids = kMeansNaive(data_array, centroidsRandom, iterations)

    final_cost = cost_function(data_rdd, final_centroids)

    results.append({
        "method": "kmeans|| (l=k/2)",
        "k": k,
        "l": 0.5*k,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })


    # 4 - Parallel l=2k
    start = time.time()
    centroidsParallel2 = kMeansParallel_init(data_rdd, k=k, l=2*k, r=r)
    init_time = time.time() - start
    
    seed_cost = cost_function(data_rdd, centroidsParallel2)

    #final_centroids = miniBatchKMeans(data_rdd, centroidsParallel2, iterations, batch_fraction)
    final_centroids = kMeansNaive(data_array, centroidsRandom, iterations)

    final_cost = cost_function(data_rdd, final_centroids)
    
    results.append({
        "method": "kmeans|| (l=2k)",
        "k": k,
        "l": 2*k,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })

    df = pd.DataFrame(results)
    return df

In [None]:
def analysis_fin(
        data_array: npt.ArrayLike, 
        data_rdd: RDD, 
        r: int, 
        k: int, 
        iterations: int=10, 
        batch_fraction: float=0.1
        ) -> npt.DTypeLike:
    results = []
    """"
    Computes performance tracking metrics for different initialization algorithms with *seed cost* (after initialization) and *final cost* (after miniBatchKmean iterations)
    """
    # 1 - Random initialization
    start = time.time()
    centroidsRandom = data_array[np.random.choice(data_array.shape[0], size=k, replace=False)]
    init_time = time.time() - start
    seed_cost = cost_function(data_array, centroidsRandom)
    
    final_centroids = miniBatchKMeans(data_rdd, centroidsRandom, iterations, batch_fraction)
    final_cost = cost_function(data_array, final_centroids)
    
    results.append({
        "method": "random",
        "k": k, 
        "l": np.nan,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })

    # 2 - k-means++
    start = time.time()
    centroidsPlusPlus = kMeansPlusPlus_init(data_array, k)
    init_time = time.time() - start
    seed_cost = cost_function(data_array, centroidsPlusPlus)
    final_centroids = miniBatchKMeans(data_rdd, centroidsPlusPlus, iterations, batch_fraction)
    final_cost = cost_function(data_array, final_centroids)
    results.append({
        "method": "kmeans++",
        "k": k,
        "l": np.nan,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })

    # 3 - Parallel l=0.5k
    start = time.time()
    centroidsParallel1 = kMeansParallel_init(data_rdd, k=k, l=0.5*k)
    init_time = time.time() - start
    seed_cost = cost_function(data_array, centroidsParallel1)
    final_centroids = miniBatchKMeans(data_rdd, centroidsParallel1, iterations, batch_fraction)
    final_cost = cost_function(data_array, final_centroids)
    results.append({
        "method": "kmeans|| (l=k/2)",
        "k": k,
        "l": 0.5*k,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })

    # 4 - Parallel l=2k
    start = time.time()
    centroidsParallel2 = kMeansParallel_init(data_rdd, k=k, l=2*k)
    init_time = time.time() - start
    seed_cost = cost_function(data_array, centroidsParallel2)
    final_centroids = miniBatchKMeans(data_rdd, centroidsParallel2, iterations, batch_fraction)
    final_cost = cost_function(data_array, final_centroids)
    results.append({
        "method": "kmeans|| (l=2k)",
        "k": k,
        "l": 2*k,
        "r": r, 
        "initialization_time": init_time,
        "seed": seed_cost,
        "final": final_cost
    })

    df = pd.DataFrame(results)
    return df

Paper analysis on Gaussian mixture

In [None]:
R = [1, 10, 100]

# generate data with Gaussian mixture changing R
for RR in R:
    data_rdd = gauss_mixture(num_points_per_cluster=5000, num_clusters=50, dim=15, R=RR)
    # Convert to numpy array
    data_list = data_rdd.collect()
    data_array = np.array(data_list)

    df = analysis_init(data_array, data_rdd, r=5, k=50)
    print("R =", RR, "\n")
    print(df)

R = 1 

             method   k      l  r  initialization_time          seed  \
0            random  50    NaN  5             0.000880  1.163835e+06   
1          kmeans++  50    NaN  5            21.054334  1.220724e+06   
2  kmeans|| (l=k/2)  50   25.0  5             7.992525  1.018898e+06   
3   kmeans|| (l=2k)  50  100.0  5            10.445904  9.134255e+05   

           final  
0  838773.149056  
1  838773.149056  
2  838773.149056  
3  838773.149056  
R = 10 

             method   k      l  r  initialization_time          seed  \
0            random  50    NaN  5             0.000839  4.811139e+06   
1          kmeans++  50    NaN  5            20.531098  4.252064e+06   
2  kmeans|| (l=k/2)  50   25.0  5             8.088294  2.967986e+06   
3   kmeans|| (l=2k)  50  100.0  5            11.255208  2.637538e+06   

          final  
0  2.505666e+06  
1  2.505666e+06  
2  2.505666e+06  
3  2.505666e+06  


KeyboardInterrupt: 