In [0]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [0]:
from cuml.datasets import make_blobs

# Shape of the synthetic dataset.
n_rows = 1000000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0

# float64 is used because our transform code doesn't yet work with np.float32;
# switch this to 'float32' once it does.
dtype = 'float64'

# make_blobs creates a random dataset of isotropic gaussian blobs. The second
# return value is discarded because only the feature matrix is used below.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
    dtype=dtype,
)

# NOTE(review): `.get()` presumably copies the device (GPU) array to a host
# NumPy array -- confirm. The name `data` is rebound on purpose so no second
# reference to the original array is kept alive.
data = data.get()

In [0]:
# One row per sample: each element of `feature_rows` is a single row of `data`,
# stored in one "features" column.
feature_rows = list(data)
pd_data = pd.DataFrame({"features": feature_rows})

# `spark` is the session object supplied by the notebook environment.
df = spark.createDataFrame(pd_data)


### Run Cuml-on-Spark KMeans.fit

In [0]:
from sparkcuml.cluster import SparkCumlKMeans

In [0]:
# KMeans hyper-parameters reused by the estimators built later in the notebook.
n_clusters = 200   # number of clusters (k) to fit
max_iter = 30      # iteration cap for every fit

# Number of workers requested for the Cuml-on-Spark estimator.
num_workers = 2

In [0]:
# Build the Cuml-on-Spark KMeans estimator. tol=0.0 is passed at construction;
# k, the input column and the iteration cap are applied through setters.
gpu_kmeans = (
    SparkCumlKMeans(num_workers=num_workers, tol=0.0)
    .setK(n_clusters)
    .setFeaturesCol("features")
    .setMaxIter(max_iter)
)

The KMeans estimator can be persisted and reloaded before fitting.

In [0]:
# Round-trip the unfitted estimator through storage: save it (replacing any
# earlier save at the same location) and load it back from the same path.
estimator_path = "dbfs:/tmp/sparkcuml-kmeans-estimator"
gpu_kmeans.write().overwrite().save(estimator_path)
gpu_kmeans_loaded = SparkCumlKMeans.load(estimator_path)


In [0]:
# Fit with the reloaded estimator and report the elapsed wall-clock time.
start_time = time.time()
gpu_model = gpu_kmeans_loaded.fit(df)
fit_seconds = time.time() - start_time
print(f"Fit took: {fit_seconds} sec")

In [0]:
# Display the reloaded estimator's value for its "n_clusters" param.
# NOTE(review): presumably this reflects the value passed to setK() before the
# estimator was saved -- confirm against the SparkCumlKMeans param mapping.
gpu_kmeans_loaded.getOrDefault("n_clusters")

In [0]:
# Copy the fitted centers into a new list and sort it, so the ordering can be
# compared with the centers printed in the later sections.
sorted_clusters = list(gpu_model.cluster_centers_)
sorted_clusters.sort()

In [0]:
# Preview: first 10 coordinates of the first two sorted centers.
[center[:10] for center in sorted_clusters[:2]]

### Cuml-on-Spark KMeans model save, load, and transform

In [0]:
# Persist the fitted model; overwrite() is requested so a previous save at this
# path does not block the write.
gpu_model.write().overwrite().save("dbfs:/tmp/sparkcuml-kmeans-model")

In [0]:
# Reload the model from the path it was saved to, using the reader obtained
# from the fitted model object.
gpu_model_loaded = gpu_model.read().load("dbfs:/tmp/sparkcuml-kmeans-model")

In [0]:
# Same preview as for the in-memory model, but taken from the reloaded model,
# to check that the centers survived the save/load round trip.
[center[:10] for center in sorted(gpu_model_loaded.cluster_centers_)[:2]]

In [0]:
# Name the output column "transformed" and apply the reloaded model to the
# original DataFrame.
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [0]:
# Print the schema of the transform output.
transformed_df.printSchema()

In [0]:
# Count the output rows.
# NOTE(review): as a Spark action this presumably forces the transform to
# actually execute -- confirm.
transformed_df.count()

In [0]:
# Show the first 10 rows of the transform output.
transformed_df.show(10)

### Run PySpark ML KMeans.fit

Timed with a parallelism of 6 tasks per node on 2 nodes.

In [0]:
from pyspark.ml.clustering import KMeans

In [0]:
# Spark ML baseline, configured with the same k, tolerance and iteration cap
# as the GPU estimator.
spark_ml_kmeans = (
    KMeans()
    .setFeaturesCol("features")
    .setK(n_clusters)
    .setTol(0.0)
    .setMaxIter(max_iter)
)

Convert the SQL array column to the VectorUDT type expected by Spark ML algorithms.

In [0]:
from pyspark.ml.functions import array_to_vector

In [0]:
# Convert the "features" column to a vector column, keeping the same name.
features_as_vector = array_to_vector(df.features).alias("features")
vector_df = df.select(features_as_vector)

In [0]:
# Fit the Spark ML estimator on the vector column and report the elapsed
# wall-clock time.
start_time = time.time()
spark_ml_kmeans_model = spark_ml_kmeans.fit(vector_df)
fit_seconds = time.time() - start_time
print(f"Fit took: {fit_seconds} sec")

In [0]:
# Inspect the Python type of a single cluster center returned by Spark ML.
type(spark_ml_kmeans_model.clusterCenters()[0])

In [0]:
# Convert every Spark ML center to a plain Python list so the centers can be
# sorted and compared with the centers from the other implementations.
sorted_spark_ml_cluster_centers = sorted([vec.tolist() for vec in spark_ml_kmeans_model.clusterCenters()])

# Preview the first 10 coordinates of the first two sorted centers only. This
# matches the previews shown for the other implementations and avoids printing
# every center.
[vec[0:10] for vec in sorted_spark_ml_cluster_centers[0:2]]

In [0]:
# Name the output column "transformed" and apply the Spark ML model to the
# vector-typed DataFrame.
spark_transformed = spark_ml_kmeans_model.setPredictionCol("transformed").transform(vector_df)

In [0]:
# Count the rows whose "transformed" value is non-negative.
spark_transformed.filter(spark_transformed.transformed >= 0).count()

In [0]:
# Show the first 10 rows of the Spark ML transform output.
spark_transformed.show(10)

### Running cuml.KMeans.fit single-node on driver

In [0]:
from cuml import KMeans as KMeansCuml

In [0]:
# Single-node cuML baseline. The cluster count comes from the shared
# `n_clusters` setting (as `max_iter` already does), so all three KMeans
# variants in this notebook stay configured identically when it is changed.
# NOTE(review): verbose=6 presumably selects cuML's most detailed logging
# level -- confirm.
cuml_kmeans = KMeansCuml(n_clusters=n_clusters, max_iter=max_iter, tol=0.0, verbose=6)

In [0]:
# Fit directly on the in-memory `data` array, without going through Spark.
cuml_model = cuml_kmeans.fit(data)

In [0]:
# Display the fitted model's `n_iter_` attribute.
# NOTE(review): presumably the number of iterations actually run, to compare
# against max_iter -- confirm.
cuml_model.n_iter_

In [0]:
# Turn every cuML center into a plain Python list and sort them, so the
# ordering can be compared with the centers printed in the earlier sections.
cuml_centers_sorted = sorted(center.tolist() for center in cuml_model.cluster_centers_)

# Preview: first 10 coordinates of the first two sorted centers.
[center[:10] for center in cuml_centers_sorted[:2]]