# KMeans

In [None]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [None]:
from cuml.datasets import make_blobs

# Size and spread of the synthetic dataset.
n_rows = 1000000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0

# float64 is used because our transform code doesn't yet work with
# np.float32; 'float32' is the alternative once it does.
dtype = 'float64'

# make_blobs creates a random dataset of isotropic gaussian blobs.
# Its second return value is bound to `_` and not used in this notebook.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
    dtype=dtype,
)

# Rebind `data` to the result of .get()
# (presumably a device-to-host copy yielding a NumPy array -- confirm).
data = data.get()

### Convert dataset to Spark DataFrame

In [None]:
# Build a DataFrame with a single "features" column: every row of `data`
# becomes one record holding that row as a plain Python list.
# NOTE(review): `sc` is not defined in this notebook; presumably the
# SparkContext supplied by the notebook environment -- confirm.
#
# Alternative route through pandas, left disabled:
# pd_data = pd.DataFrame({"features": list(data)})
# df = spark.createDataFrame(pd_data)
df = (
    sc.parallelize(list(data))
    .map(lambda row: [row.tolist()])
    .toDF(["features"])
)

## Spark-RAPIDS ML

In [None]:
from spark_rapids_ml.clustering import KMeans

In [None]:
# Settings shared by the estimators built later in this notebook.
# num_workers is passed to the Spark-RAPIDS ML KMeans constructor.
num_workers = 2
# Number of clusters to fit (passed to setK).
n_clusters = 200
# Iteration cap (passed to setMaxIter / max_iter).
max_iter = 30

In [None]:
# Configure the Spark-RAPIDS ML KMeans estimator on the "features" column.
# tol=0.0 matches the tolerance given to the other estimators in this notebook.
gpu_kmeans = (
    KMeans(num_workers=num_workers, tol=0.0)
    .setK(n_clusters)
    .setFeaturesCol("features")
    .setMaxIter(max_iter)
)

The estimator can be persisted and reloaded.

In [None]:
# Path the estimator is saved to and reloaded from.
estimator_path = "dbfs:/tmp/sparkcuml-kmeans-estimator"

In [None]:
# Save the configured (not yet fitted) estimator, overwriting any previous
# copy at estimator_path, then load it back from the same path.
gpu_kmeans.write().overwrite().save(estimator_path)
gpu_kmeans_loaded = KMeans.load(estimator_path)

### Fit

In [None]:
# Time fit() of the reloaded estimator on `df`.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; unlike time.time() it cannot be skewed by system
# wall-clock adjustments while the fit is running.
start_time = time.perf_counter()
gpu_model = gpu_kmeans_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Show the value of the "n_clusters" param on the reloaded estimator
# (presumably this reflects the K set via setK before saving -- confirm).
gpu_kmeans_loaded.getOrDefault("n_clusters")

In [None]:
# Sort the fitted cluster centers into ascending order.
sorted_clusters = sorted(gpu_model.cluster_centers_)

In [None]:
# Show the first 10 coordinates of the first two sorted centers.
[vec[0:10] for vec in sorted_clusters[0:2]]

### Transform

In [None]:
# Path the fitted model is saved to and reloaded from.
model_path = "dbfs:/tmp/sparkcuml-kmeans-model"

In [None]:
# Persist the fitted model, overwriting anything already at model_path.
gpu_model.write().overwrite().save(model_path)

In [None]:
# Reload the persisted model using the reader obtained from the fitted model.
gpu_model_loaded = gpu_model.read().load(model_path)

In [None]:
# Show the first 10 coordinates of the first two sorted centers of the
# reloaded model.
[vec[0:10] for vec in sorted(gpu_model_loaded.cluster_centers_)[0:2]]

In [None]:
# Name the prediction column "transformed" and apply the reloaded model to df.
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [None]:
# Print the schema of the transformed DataFrame.
transformed_df.printSchema()

In [None]:
# Count the rows of the transformed DataFrame.
transformed_df.count()

In [None]:
# Display up to 10 rows of the transformed DataFrame.
transformed_df.show(10)

## PySpark ML

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
# Configure the PySpark ML KMeans estimator with the same cluster count,
# tolerance and iteration cap as the Spark-RAPIDS ML estimator.
spark_ml_kmeans = (
    KMeans()
    .setFeaturesCol("features")
    .setK(n_clusters)
    .setTol(0.0)
    .setMaxIter(max_iter)
)

Convert the array SQL type of the `features` column to the `VectorUDT` type expected by Spark ML algorithms.

In [None]:
from pyspark.ml.functions import array_to_vector

In [None]:
# Re-express the "features" column through array_to_vector, keeping the
# same column name, for use with the Spark ML estimator.
vector_df = df.select(
    array_to_vector(df["features"]).alias("features")
)

### Fit

In [None]:
# Time fit() of the PySpark ML estimator on the vector-typed DataFrame.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; unlike time.time() it cannot be skewed by system
# wall-clock adjustments while the fit is running.
start_time = time.perf_counter()
spark_ml_kmeans_model = spark_ml_kmeans.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Check the Python type of a single Spark ML cluster center.
type(spark_ml_kmeans_model.clusterCenters()[0])

In [None]:
# Convert each Spark ML center to a plain list and sort the centers.
sorted_spark_ml_cluster_centers = sorted([vec.tolist() for vec in spark_ml_kmeans_model.clusterCenters()])
# Show the first 10 coordinates of the first two sorted centers only, matching
# the previews of the other models' centers in this notebook (without the
# [0:2] slice every center would be printed).
[vec[0:10] for vec in sorted_spark_ml_cluster_centers[0:2]]

### Transform

In [None]:
# Name the prediction column "transformed" and apply the Spark ML model to
# the vector-typed DataFrame.
spark_transformed = spark_ml_kmeans_model.setPredictionCol("transformed").transform(vector_df)

In [None]:
# Count the rows whose "transformed" value is non-negative.
spark_transformed.filter(spark_transformed.transformed >= 0).count()

In [None]:
# Display up to 10 rows of the Spark ML output.
spark_transformed.show(10)

## cuML (single-node on driver)

In [None]:
from cuml import KMeans

In [None]:
# Single-node cuML estimator configured like the two Spark-based estimators:
# the shared n_clusters is reused instead of a hard-coded 200, so changing the
# cluster count in one place keeps all three runs in agreement.
# NOTE(review): verbose=6 is kept as-is; presumably cuML's most detailed
# logging level -- confirm.
cuml_kmeans = KMeans(n_clusters=n_clusters, max_iter=max_iter, tol=0.0, verbose=6)

### Fit

In [None]:
# Time fit() of the cuML estimator on the in-memory `data` array.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; unlike time.time() it cannot be skewed by system
# wall-clock adjustments while the fit is running.
start_time = time.perf_counter()
cuml_model = cuml_kmeans.fit(data)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Inspect the fitted model's n_iter_ attribute
# (presumably the number of iterations that were run -- confirm).
cuml_model.n_iter_

In [None]:
# Convert each cuML center to a plain list, sort the centers, and show the
# first 10 coordinates of the first two.
cuml_centers_sorted = sorted(
    center.tolist() for center in cuml_model.cluster_centers_
)
[center[:10] for center in cuml_centers_sorted[:2]]