In [3]:
# Import library
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_kddcup99
from pyspark.sql import SparkSession
from time import time, sleep
import subprocess
import pprint
import sys
import logging
import warnings

In [4]:
# Silence Python-side warnings and keep the Spark bridge loggers at ERROR level
warnings.filterwarnings("ignore")
for _noisy_logger in ('py4j', 'pyspark'):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)
del _noisy_logger

# Location of the log4j configuration handed to the Spark driver JVM
log4j_conf_path = "file:///home/quivigorelli/Distributed-K-Means-Clustering/spark/DistributedKmeans/log4j.properties"

Global hyperparameters and data paths.

In [5]:
# Seeds for reproducibility: NumPy global generator and Spark-side sampling
spark_seed = 54321
np.random.seed(12345)

# Pickle files where the log dictionaries of the runs are read from and written to
pickle_fileR = 'dataR/log1U_C11.pkl' # Random initialization data
pickle_fileP = 'dataP/log1P_C11.pkl' # Parallel initialization data

# Partition counts to run in this execution
nSlices = [256] # done 2, 4, 8, 16, 32, 64, 128, 256

# Number of samples kept from the (shuffled) dataset
subLen = 300_000

# Upper bound on the iterations of the distributed Lloyds algorithm
lloydsMaxIterations = 20

In [6]:
# Functions
def labelToInt(label, labels=None):
    '''
    Map a label of the original dataset (`strings`) into a natural number (`int`) for easier manipulation of rdd.
    Arguments:
    `label`: the label to convert;
    `labels`: optional, array-like with all the labels of the dataset. When omitted, the
              notebook-level array `y` is used, which must already be defined at call time.
    Return:
    Position of `label` in the sorted array of unique labels.
    Raises:
    `ValueError` if `label` is not among the labels.
    '''
    if labels is None:
        # Backward-compatible default: fall back on the global target array
        labels = y
    uniqueLabels = list(np.unique(labels))
    return uniqueLabels.index(label)


def deleteBytes(datum):
    '''
    Clean dataset from categorical attributes, leaving numerical ones.
    Attributes whose type is exactly `bytes` are dropped from the feature array.
    Arguments:
    `datum`: one datum of the rdd, `(key, {"x": array, "y": ..., "d2": ...})`.
    Return:
    New datum with the filtered `"x"`; the input datum is left untouched.
    '''
    features = datum[1]["x"]
    # Boolean mask: True for every attribute that is not a bytes object
    keep = np.array([type(value) is not bytes for value in features], dtype=bool)
    # Copy the record instead of mutating the caller's dictionary in place
    cleaned = dict(datum[1])
    cleaned["x"] = np.asarray(features[keep])
    # No per-record printing here: this function runs once per point on the executors
    return (datum[0], cleaned)


def localPlusPlusInit(points, k):
    '''
    KMeans++ initialization.
    Arguments:
    `points`: array (n, dim) of points to be clustered;
    `k`: desired number of centroids (k>=1).
    Returns:
    Initial array (k, dim) of centroids, k<=n.
    '''
    nPoints = points.shape[0]

    # Sample the first centroid uniformly from points array
    C = points[np.random.choice(nPoints)][np.newaxis, :]

    # One centroid is already chosen, so only k-1 further draws are needed
    for _ in range(k - 1):
        # Squared distance of each point from its closest centroid chosen so far, shape (n,)
        d2 = np.min(np.sum((points[:, :, np.newaxis] - C.T[np.newaxis, :, :])**2, axis=1), axis=1).flatten()
        total = np.sum(d2)

        if total > 0:
            # Normalize into a probability distribution
            probs = d2 / total
        else:
            # Every point coincides with a chosen centroid: fall back on uniform sampling
            probs = None

        # Draw one new centroid according to distribution and add it to the array
        nextCentroid = points[np.random.choice(nPoints, p=probs)][np.newaxis, :]
        C = np.vstack((C, nextCentroid))
    return C


def weightedAverage(group):
    """
    Weighted mean of the coordinate columns of one group of a pd.DataFrame.
    The frame is expected to carry a 'weights' column and (possibly) a 'clusterId' column;
    every other column is treated as a coordinate.
    Used by the local (non-distributed) Lloyds algorithm, which K-Means// also relies on.
    Return:
    pd.Series with one weighted average per coordinate column.
    """
    pointWeights = group['weights']
    # Everything that is neither the weight nor the grouping column is a coordinate
    coordinateColumns = group.columns.difference(['weights', 'clusterId'])
    weightedSum = group[coordinateColumns].multiply(pointWeights, axis=0).sum()
    return weightedSum / pointWeights.sum()


def localLloyds(points, k, C_init=None, weights=None, n_iterations=100, logDict=None):
    """
    Local (non-distributed) Lloyds algorithm
    Arguments:
    `points`: array (n, dim) of points to cluster;
    `k`: number of desired clusters;
    `C_init`: optional, array (k, dim) of initial centroids
    `weights`: optional, weights for weighted average in centroid re-computing;
    `n_iterations`: optional, number of iteration in lloyds algorithm;
    `logDict`: optional, dictionary {'CostsKmeans', 'tIterations', 'tTotal'} to store cost and time info.
    Return:
    Array of expected centroids. With `n_iterations=0` the initial centroids are returned unchanged.
    Centroids are recomputed through a groupby on the assigned cluster, so a cluster that
    receives no point produces no row in the result.
    """
    t0 = time()

    # Storing cost and time info
    my_kMeansCosts = []
    tIterations = []

    df = pd.DataFrame(points)

    # If weights not given, assume uniform weights for points
    if weights is None:
        weights = np.ones(shape=len(points))
    df['weights'] = weights

    # If no C_init, default to K-Means++ initialization
    if C_init is None:
        C = localPlusPlusInit(points, k)
    else:
        C = C_init

    # Bound before the loop so that the function also returns something with zero iterations
    C_array = np.asarray(C)

    # Initial assignment of every point to its nearest centroid
    clusterId = np.argmin(np.sum((points[:, :, np.newaxis] - C_array.T[np.newaxis, :, :])**2, axis=1), axis=1)
    for _ in range(n_iterations):
        t1 = time()

        # Compute centroid given cluster
        df['clusterId'] = clusterId
        C_df = df.groupby('clusterId')\
            .apply(weightedAverage)\
            .reset_index()

        # Compute cluster given centroid
        C_array = C_df[C_df.columns.difference(['weights', 'clusterId'])].reset_index(drop=True).to_numpy()
        squared_distances = np.sum((points[:, :, np.newaxis] - C_array.T[np.newaxis, :, :])**2, axis=1)
        clusterId = np.argmin(squared_distances, axis=1)
        # Cost: sum of squared distances of each point from its assigned centroid
        my_cost = np.sum(squared_distances[np.arange(len(squared_distances)), clusterId])

        my_kMeansCosts.append(my_cost)
        t2 = time()

        tIterations.append(t2 - t1)

    tEnd = time()
    tTotal = tEnd - t0

    # Store cost and time info
    if logDict is not None:
        logDict["CostsKmeans"] = my_kMeansCosts
        logDict["tIterations"] = tIterations
        logDict["tTotal"] = tTotal

    return C_array


def minmaxRescale(datum, minS, maxS):
    """
    Min-max rescaling of a datum into the [0,1] interval, for better clusterization.
    Attributes whose minimum is not strictly below their maximum are dropped.
    Arguments:
    `datum`: see rdd format;
    `minS`: array with the minimum value of each attribute among the points;
    `maxS`: array with the maximum value of each attribute among the points.
    Return:
    New datum with rescaled `"x"` and unchanged key, `"y"` and `"d2"`.
    """
    record = datum[1]
    # Keep only the attributes that actually vary (avoids a zero denominator)
    varying = np.array(minS < maxS).astype(bool)
    low = minS[varying]
    span = maxS[varying] - low
    rescaled = (record["x"][varying] - low)/span
    return (datum[0], {"x": rescaled, "y": record["y"], "d2": record["d2"]})


def selectCluster(datum, C, updateDistances=True):
    """
    Associate datum to its centroid and optionally updates squared distance between them.
    Arguments:
    `datum`: see rdd format;
    `C`: array (k, len(datum[1]["x"]));
    `updateDistances`: if true, updates `datum[1]["d2"]` with squared distance between datum point and closest centroid in C.
    Return:
    New datum `(clusterId, values)`, where `clusterId` is the index in `C` of the closest centroid.
    """
    point = datum[1]["x"]
    # Squared distance of the point from each centroid, shape (k,)
    distances = np.sum((point - C)**2, axis=1)
    clusterId = np.argmin(distances)
    # No printing here: this function runs for every point in every distributed pass
    if updateDistances:
        return (clusterId, {'x': point, 'y': datum[1]['y'], 'd2': distances[clusterId]})
    return (clusterId, datum[1])


def updateCentroids(Rdd):
    """
    Recompute centroids as the spatial average of the points of each cluster.
    Argument:
    `Rdd`: see rdd format, keyed by cluster id;
    Return:
    Array of updated centroids, one row per key present in `Rdd`, in the order
    in which the reduced values are collected.
    """
    def toSumAndCount(record):
        # Each point contributes its coordinates and a count of one
        return (record['x'], 1)

    def mergePairs(a, b):
        # Accumulate coordinate sums and point counts
        return (a[0] + b[0], a[1] + b[1])

    def toMean(pair):
        return pair[0]/pair[1]

    perCluster = Rdd.mapValues(toSumAndCount).reduceByKey(mergePairs)
    means = perCluster.mapValues(toMean).values().collect()
    return np.array(means)


def updateDistances(Rdd, C):
    """
    Refresh the squared distance of every point from its assigned centroid.
    The cluster of each point must already be stored as the key of its datum.
    Arguments:
    `Rdd`: see rdd format;
    `C`: array of cluster centroids, indexed by the datum key.
    Return:
    New rdd with `"d2"` updated.
    """
    def withDistance(datum):
        '''
        Rebuild one datum with the squared distance from its assigned centroid
        '''
        clusterId = datum[0]
        record = datum[1]
        offset = record['x'] - C[clusterId]
        return (clusterId, {"x": record["x"], "y": record["y"], "d2": np.sum(offset**2)})

    return Rdd.map(withDistance)


def cost(Rdd):
    """
    Global cost of the clusterization: sum of the `"d2"` values stored in the Rdd,
    which must already hold the squared distances from the centroids.
    """
    def storedDistance(datum):
        return datum[1]['d2']

    def add(a, b):
        return a + b

    return Rdd.map(storedDistance).reduce(add)


def kMeans(Rdd, C_init, maxIterations, logParallelKmeans=None):
    """
    Distributed (parallel) Lloyds algorithm
    Arguments:
    `Rdd`: see rdd format;
    `C_init`: array (k, dim) of initial centroids;
    `maxIterations`: max number of iterations;
    `logParallelKmeans`: optional, dictionary {'CostsKmeans', 'tIterations', 'tTotal'} to store cost and time info.
    Return:
    Array of expected centroids (`C_init` itself when `maxIterations` is 0).
    The cost logged at each iteration is the one of the assignment made with the
    centroids available at the beginning of that iteration.
    """

    t0 = time()

    # Storing cost and time info
    my_kMeansCosts = []
    tIterations = []
    C = C_init

    for _ in range(maxIterations):
        t1 = time()
        # Bind the current centroids explicitly so the closure cannot see a later reassignment of C
        RddCached = Rdd.map(lambda datum, C=C: selectCluster(datum, C)).persist()

        # Now we compute the new centroids by calculating the averages of points belonging to the same cluster.
        C = updateCentroids(RddCached)
        my_cost = cost(RddCached)

        my_kMeansCosts.append(my_cost)
        t2 = time()

        tIterations.append(t2 - t1)

        # Both actions on this assignment are done: release it instead of
        # accumulating one cached copy of the dataset per iteration
        RddCached.unpersist()

        # Break loop if convergence of cost is reached
        if (len(my_kMeansCosts) > 1) and (my_kMeansCosts[-1] > 0.999*my_kMeansCosts[-2]):
            break

    tEnd = time()
    tTotal = tEnd - t0

    # Store cost and time info in argument dictionary
    if logParallelKmeans is not None:
        logParallelKmeans["CostsKmeans"] = my_kMeansCosts
        logParallelKmeans["tIterations"] = tIterations
        logParallelKmeans["tTotal"] = tTotal

    return C


def naiveInitFromSet(Rdd, k, logNaiveInit=None, spark_seed=54321):
    """
    Pick k points of the Rdd uniformly at random as initial centroids.
    Arguments:
    `Rdd`: see rdd structure;
    `k`: desired number of clusters;
    `logNaiveInit`: optional, dictionary {'tTotal'} to store time info;
    `spark_seed`: optional, seed for spark random sampling.
    Return:
    Initial array (k, dim) of centroids.
    """
    tStart = time()

    # Sampling without replacement avoids picking the same datum twice, BUT gives no
    # guarantee of distinct centroids if the dataset itself contains coinciding points!!!
    sampled = Rdd.takeSample(False, k, seed=spark_seed)
    coordinates = []
    for datum in sampled:
        coordinates.append(datum[1]['x'])
    C_init = np.array(coordinates)

    tStop = time()

    if logNaiveInit is not None:
        logNaiveInit["tTotal"] = tStop - tStart

    return C_init


def naiveInitFromSpace(k, dim):
    """
    Draw k points uniformly from the unit hypercube, assuming the Rdd has been mapped into a [0,1]^dim space.
    Arguments:
    `k`: desired number of clusters;
    `dim`: dimensionality of points space.
    Return:
    Initial array (k, dim) of centroids.
    """
    shape = (k, dim)
    return np.random.uniform(size=shape)


def parallelInit(Rdd, k, l, logParallelInit=None):
    """
    Parallel initialization
    Arguments:
    `Rdd`: see rdd structure;
    `k`: desired number of clusters;
    `l`: coefficient to adjust sampling probability in order to obtain at least k centroids;
    `logParallelInit`: optional, dictionary {'tSamples', 'tCentroids', 'CostInit', 'tTotal'} to store cost and time info.
    Return:
    Initial array of centroids: the candidates themselves if they are at most k,
    otherwise the result of the local Lloyds algorithm run on the weighted candidates.
    """
    t0 = time()

    # initialize C as a point in the dataset
    C = naiveInitFromSet(Rdd, 1)

    # associate each datum to the only centroid (computed before) and computed distances and cost
    # (from here on `Rdd` is an rdd owned by this function, never the caller's one)
    Rdd = Rdd.map(lambda datum: (0, datum[1]))
    Rdd = updateDistances(Rdd, C).persist()

    my_cost = cost(Rdd)

    # number of iterations (log(cost)), at least one; with zero cost every point
    # coincides with the first centroid and there is nothing left to sample
    if my_cost > 0:
        n_iterations = max(1, int(np.log(my_cost)))
    else:
        n_iterations = 0

    tSamples = []
    tCentroids = []
    CostInits = [my_cost]

    # iterative sampling of the centroids
    for _ in range(n_iterations):
        if my_cost <= 0:
            # sampling probability l*d2/cost is undefined once the cost vanishes
            break

        t1 = time()
        # sample C' according to the probability
        # NOTE(review): np.random is evaluated on the executors; presumably the workers do not
        # share the driver seed, so this step may not be reproducible - TODO confirm
        C_prime = Rdd.filter(lambda datum: np.random.uniform() < l*datum[1]['d2']/my_cost)\
                     .map(lambda datum: datum[1]['x'])\
                     .collect()
        C_prime = np.array(C_prime)
        t2 = time()

        # stack C and C', update distances, centroids, and cost
        if (C_prime.shape[0] > 0):
            C = np.vstack((C, C_prime))

            previousRdd = Rdd
            Rdd = previousRdd.map(lambda datum: selectCluster(datum, C)).persist()

            my_cost = cost(Rdd)
            # the new assignment has been evaluated by the action above: release the old one
            previousRdd.unpersist()
        t3 = time()

        tSamples.append(t2 - t1)
        tCentroids.append(t3 - t2)
        CostInits.append(my_cost)

    # erase centroids sampled more than once (np.unique also sorts the rows, hence the re-assignment below)
    C = C.astype(float)
    C = np.unique(C, axis=0)
    assignedRdd = Rdd.map(lambda datum: selectCluster(datum, C))

    # compute weights of centroids (sizes of each cluster) and put them in an array whose index is same centroid index as C
    wx = assignedRdd.countByKey()
    Rdd.unpersist()
    weights = np.zeros(len(C))
    clusterIds = np.array(list(wx.keys()), dtype=int)
    weights[clusterIds] = np.array(list(wx.values()), dtype=float)

    # subselection of k centroids from C, using local Lloyds algorithm with k-means++ initialization
    if C.shape[0] <= k:
        C_init = C
    else:
        C_init = localLloyds(C, k, weights=weights, n_iterations=100) #can be set to lloydsMaxIterations for consistency TODO

    tEnd = time()

    if logParallelInit is not None:
        logParallelInit["tSamples"] = tSamples
        logParallelInit["tCentroids"] = tCentroids
        logParallelInit["CostInit"] = CostInits
        logParallelInit["tTotal"] = tEnd - t0

    return C_init

def predictedCentroidsLabeler(C_expected, C_predicted):
    """
    Match every expected centroid with its closest predicted centroid.
    Parameters:
    `C_expected`: array (k, dim) of expected centroids;
    `C_predicted`: array (k,dim) of predicted centroids;
    Return:
    Array of labels, one for each expected centroid and pointing to its nearest predicted centroid;
    Array of the corresponding (euclidean) distances.
    """
    # Matrix of squared distances: entry (i, j) compares C_expected[i] with C_predicted[j]
    deltas = C_expected[:, :, np.newaxis] - C_predicted.T[np.newaxis, :, :]
    distMatrix = np.sum(deltas**2, axis=1)

    # labeler[i] = j means that the i-th expected centroid is closest to the j-th predicted one
    labeler = np.argmin(distMatrix, axis=1)

    # Distance of each expected centroid from its nearest predicted centroid
    rows = np.arange(len(distMatrix))
    nearestSquared = np.array(distMatrix[rows, labeler]).astype(float)
    return labeler, np.sqrt(nearestSquared)


def nearestCentroidDistances(C):
    """
    Associate each centroid to the distance of the nearest one
    Parameters:
    `C`:  array (k, dim) of centroids;
    Return:
    Array of labels, one for each centroid and pointing to its nearest other centroid;
    Array of the corresponding (euclidean) distances (`inf` when there is a single centroid).
    """
    # Matrix of squared distances between centroids; cast to float so that the
    # diagonal can hold inf whatever the dtype of C is (e.g. integer coordinates)
    distMatrix = np.sum((C[:, :, np.newaxis] - C.T[np.newaxis, :, :])**2, axis=1).astype(float)
    # A centroid must not be matched with itself
    np.fill_diagonal(distMatrix, np.inf)

    # labeler[i] = j means that the j-th centroid is the closest one to the i-th centroid
    labeler = np.argmin(distMatrix, axis=1)

    # Distance of each centroid from its nearest neighbour
    distances = np.sqrt(distMatrix[np.arange(distMatrix.shape[0]), labeler])
    return labeler, distances

In [7]:
%%time
### SPARK SETUP ###

# log4j expects the configuration location as a single URI: `log4j_conf_path` may already
# carry the "file:" scheme, so add it only when missing (avoids "file:file:///...")
log4j_conf_uri = log4j_conf_path if log4j_conf_path.startswith("file:") else f"file:{log4j_conf_path}"

# Build a spark session
spark = SparkSession.builder \
    .master("spark://spark-master:7077")\
    .appName("Clustering")\
    .config("spark.executor.memory", "7g")\
    .config("spark.driver.extraJavaOptions", f"-Dlog4j.configuration={log4j_conf_uri}")\
    .getOrCreate()

# Create a spark context
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Eventually clear old data (if re-running)
spark.catalog.clearCache()

#### IMPORT THE DATA SET ####
data = fetch_kddcup99(return_X_y = True, percent10 = True) # default percent10=True

# collect samples (x) and targets (y)
x = data[0]
y = data[1]

# Shuffle samples and targets with the same permutation
shuffled_indices = np.random.permutation(len(x))
x = x[shuffled_indices]
y = y[shuffled_indices]

# cut the data for memory reasons
x = x[:subLen]
y = y[:subLen]

def _loadRunLogs(pickleFile, logKeys):
    """Return the log dictionaries stored in `pickleFile` under `logKeys` (empty ones if the file is absent)."""
    if os.path.isfile(pickleFile):
        # NOTE: pickle.load can execute arbitrary code; only load log files written by this notebook
        with open(pickleFile, "rb") as f:
            stored = pickle.load(f)
        # Look the logs up by name instead of relying on the insertion order of the dictionary
        return [stored[key] for key in logKeys]
    return [{} for _ in logKeys]


def _buildScaledRdd(nSlice):
    """Parallelize the dataset over `nSlice` partitions, cut the categorical attributes and rescale; return (raw rdd, rescaled rdd, number of distinct labels)."""
    # Parallelize over nSlice partitions
    rawRdd = sc.parallelize([(None, {"x": x[i], "y": y[i], "d2": None}) for i in range(len(y))], numSlices = nSlice)

    # Cut the categorical attributes
    rawRdd = rawRdd.map(deleteBytes)\
                   .persist()

    # Setting the theoretical number of clusters
    kTrue = rawRdd.map(lambda datum: datum[1]["y"])\
                  .distinct()\
                  .count()

    # Rescale the RDD over the max
    maxS = rawRdd.map(lambda datum: datum[1]["x"])\
                 .reduce(lambda a, b: np.maximum(a, b))
    minS = rawRdd.map(lambda datum: datum[1]["x"])\
                 .reduce(lambda a, b: np.minimum(a, b))

    Rdd = rawRdd.map(lambda datum: minmaxRescale(datum, minS, maxS))\
                .persist()
    return rawRdd, Rdd, kTrue


def _cleanWorkerDirs(hosts=("slave2", "slave3")):
    """Empty the Spark work directory on the given worker hosts through ssh."""
    for host in hosts:
        subprocess.run(f"ssh {host} 'cd /usr/local/spark/work/ && [ \"$(ls -A .)\" ] && rm -r ./*'", shell=True)


def _runBenchmark(nSlice, pickleFile, tag, initialize):
    """
    Run initialization + distributed K-Means over `nSlice` partitions, then add
    timings and costs to the log stored in `pickleFile`.
    Arguments:
    `nSlice`: number of partitions of the rdd;
    `pickleFile`: path of the pickle file holding the accumulated log;
    `tag`: name used in the log keys (e.g. 'Parallel' or 'Naive');
    `initialize`: callable `(Rdd, k, l, logInit)` returning the initial centroids.
    Return:
    The updated log dictionary, as written to `pickleFile`.
    """
    logKeys = [f"totalLog{tag}Init", f"totalLog{tag}Kmeans", f"tDurations{tag}", f"tPreOperations{tag}"]

    # Open file if exists
    sleep(1)
    totalLogInit, totalLogKmeans, tDurations, tPreOperations = _loadRunLogs(pickleFile, logKeys)

    # Start the algorithm
    tInit = time() # compute the time of the beginning of the iteration over the number of partitions
    print(f"The iteration with {nSlice} number of partition started at time {tInit}")

    rawRdd, Rdd, kTrue = _buildScaledRdd(nSlice)

    # Setting up the input and output information for the algorithm
    logInit = {}
    logKmeans = {}

    # Setup k and l
    k = kTrue
    l = k*2

    tInitI = time()

    tPreOperation = tInitI - tInit
    print(f"Finished the pre-steps after {tPreOperation} seconds")

    # Initialization of the centroids
    C_init = initialize(Rdd, k, l, logInit)

    tInitialization = time() - tInitI
    print(f"Finished the initialization after {tInitialization} seconds")

    # Run the k-means algorithm
    kMeans(Rdd, C_init, lloydsMaxIterations, logKmeans)

    # Time information
    tEnd = time() # compute the time of the end of the iteration over the number of partitions
    tDuration = tEnd - tInit

    print(f"The iteration with {nSlice} number of partition ended at time {tEnd} after {tDuration} seconds")

    # Store the results of this run under the key of its number of partitions
    runKey = "Number of partition" + str(nSlice)
    totalLogInit[runKey] = logInit
    totalLogKmeans[runKey] = logKmeans
    tDurations[runKey] = tDuration
    tPreOperations[runKey] = tPreOperation

    # Release the rdds persisted for this run
    Rdd.unpersist()
    rawRdd.unpersist()
    spark.catalog.clearCache()

    # Compute the total log
    runLog = dict(zip(logKeys, (totalLogInit, totalLogKmeans, tDurations, tPreOperations)))

    # Save the log file, creating its directory if it doesnt exist
    outDir = os.path.dirname(pickleFile)
    if outDir:
        os.makedirs(outDir, exist_ok=True)

    with open(pickleFile, "wb") as f:
        pickle.dump(runLog, f)

    # Clear the space
    _cleanWorkerDirs()

    return runLog


for nSlice in nSlices:
    ### PARALLEL ###
    logParallel = _runBenchmark(nSlice, pickle_fileP, "Parallel", parallelInit)

    ### NAIVE INIT ###
    logNaive = _runBenchmark(nSlice, pickle_fileR, "Naive",
                             lambda Rdd, k, l, logInit: naiveInitFromSet(Rdd, k, logInit))

24/07/08 18:07:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


The iteration with 256 number of partition started at time 1720462046.8569777


                                                                                

Finished the pre-steps after 25.964742422103882 seconds


                                                                                

Finished the initialization after 1078.2120974063873 seconds


                                                                                

The iteration with 256 number of partition ended at time 1720463233.6977935 after 1186.840815782547 seconds
The iteration with 256 number of partition started at time 1720463235.0972173


                                                                                

Finished the pre-steps after 20.938359260559082 seconds


                                                                                

Finished the initialization after 9.937275171279907 seconds


                                                                                

The iteration with 256 number of partition ended at time 1720463487.6936898 after 252.59647250175476 seconds
CPU times: user 9.72 s, sys: 1.59 s, total: 11.3 s
Wall time: 24min 4s


In [8]:
# Display the log dictionary of the parallel-initialization runs (the one dumped to pickle_fileP)
logParallel

{'totalLogParallelInit': {'Number of partition2': {'tSamples': [0.9625167846679688,
    1.102644920349121,
    1.056347370147705,
    1.0816810131072998,
    1.0999093055725098,
    1.1076600551605225,
    1.1391963958740234,
    1.1393795013427734,
    1.145113229751587,
    1.0341508388519287,
    0.992769718170166,
    1.0650136470794678,
    1.0378704071044922],
   'tCentroids': [34.30888915061951,
    58.26218247413635,
    85.10184264183044,
    108.61568975448608,
    133.30386662483215,
    161.54800367355347,
    186.69220972061157,
    214.4812912940979,
    238.2676498889923,
    269.3384690284729,
    296.69858288764954,
    327.43224453926086,
    345.6207664012909],
   'CostInit': [891555.5554243359,
    20459.66555650932,
    9781.530894359516,
    5471.4048364394,
    4303.815164680175,
    3489.5419803761797,
    2790.4851080146254,
    2501.6291871887997,
    2191.7318527853213,
    1964.2360937496173,
    1781.205574382117,
    1635.3199696277575,
    1508.1708678204

In [9]:
# Display the log dictionary of the naive-initialization runs (the one dumped to pickle_fileR)
logNaive

{'totalLogNaiveInit': {'Number of partition2': {'tTotal': 3.5391743183135986},
  'Number of partition4': {'tTotal': 2.372647285461426},
  'Number of partition8': {'tTotal': 1.561168909072876},
  'Number of partition16': {'tTotal': 1.9718973636627197},
  'Number of partition32': {'tTotal': 2.4401488304138184},
  'Number of partition64': {'tTotal': 3.2234396934509277},
  'Number of partition128': {'tTotal': 5.893186569213867},
  'Number of partition256': {'tTotal': 9.937079906463623}},
 'totalLogNaiveKmeans': {'Number of partition2': {'CostsKmeans': [42516.03346356709,
    30489.899569214176,
    28870.557007268813,
    27552.014438794737,
    23278.24060503552,
    18984.3147713026,
    18760.240816989506,
    18693.13101953215,
    18670.316806382605,
    18655.67316314641],
   'tIterations': [23.019476652145386,
    20.80378556251526,
    23.17156481742859,
    21.14793062210083,
    21.443459510803223,
    20.854639530181885,
    21.03282380104065,
    20.922189474105835,
    20.9952

In [None]:
# Stop the SparkContext once the experiments are over
# (the alternative call on the session, spark.stop(), is left disabled below)
sc.stop()
#spark.stop()