# KMeans - no import change

In [None]:
# The import below redirects all subsequent pyspark.ml imports (e.g. pyspark.ml.clustering.KMeans)
# to their GPU-accelerated spark_rapids_ml counterparts.
# Comment it out and restart the kernel to revert to CPU mode.
import spark_rapids_ml.install

In [0]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [None]:
# Generate the synthetic input as an in-memory NumPy array.
from sklearn.datasets import make_blobs

# Dataset dimensions and blob parameters.
n_rows = 1_000_000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0
dtype = "float32"

# make_blobs draws a random dataset of isotropic Gaussian blobs; random_state
# is pinned so the generated samples are repeatable. The second return value
# is not needed here and is discarded.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
)

# Cast the generated samples to the dtype chosen above.
data = data.astype(dtype)

### Convert dataset to Spark DataFrame with Vector type

In [0]:
# Store every sample (one row of `data`) as a single array-valued cell in a
# one-column pandas frame, then hand that frame to Spark.
pd_data = pd.DataFrame({"feature_array": [sample for sample in data]})
df = spark.createDataFrame(pd_data)

In [0]:
from pyspark.ml.functions import array_to_vector

In [0]:
# Convert the array column into an ML Vector column named "features".
# select() already projects only that one column, so "feature_array" is not
# part of the result and no follow-up drop() is needed.
vector_df = df.select(array_to_vector(df.feature_array).alias("features"))

In [0]:
# Echo the DataFrame as the cell output for a quick look at what was built.
vector_df

## Spark RAPIDS ML (GPU)

If `import spark_rapids_ml.install` was executed in the first cell, the `KMeans` import below is redirected to the GPU-accelerated `KMeans` in `spark_rapids_ml`.

In [0]:
from pyspark.ml.clustering import KMeans

# Configure the estimator. Every setter returns the estimator, so the calls
# are chained; their relative order does not matter.
# NOTE(review): a tolerance this small presumably keeps the solver running for
# the full maxIter iterations instead of stopping early — confirm.
kmeans = (
    KMeans(initMode="random")
    .setFeaturesCol("features")
    .setK(200)
    .setMaxIter(15)
    .setTol(1.0e-20)
)

In [0]:
# Show the concrete class of the estimator: this reveals whether the
# pyspark.ml import was redirected to the spark_rapids_ml implementation.
type(kmeans)

The estimator can be persisted and reloaded.

In [0]:
# Location the (unfitted) estimator is saved to and reloaded from.
# NOTE(review): whether this resolves to local disk or to the cluster's default
# filesystem depends on the Spark configuration — confirm it is writable.
estimator_path = "/tmp/kmeans-estimator"

In [0]:
# Persist the configured estimator, replacing anything already stored at
# estimator_path, then load it back into a separate object.
kmeans.write().overwrite().save(estimator_path)
kmeans_loaded = KMeans.load(estimator_path)

### Fit

In [0]:
# Time the fit of the reloaded estimator on the vectorized dataset.
# perf_counter() is a monotonic clock intended for measuring intervals, so the
# reported duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
model = kmeans_loaded.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [0]:
# Display the number of clusters configured on the reloaded estimator, to
# confirm the setting survived the save/load round trip.
kmeans_loaded.getK()

In [0]:
# Turn each fitted center into a plain Python list and sort the lists, which
# gives the centers a stable order for display and comparison.
sorted_clusters = sorted(center.tolist() for center in model.clusterCenters())

In [0]:
# Preview: the first 10 coordinates of the first two sorted centers.
[center[:10] for center in sorted_clusters[:2]]

### Transform

In [0]:
# Location the fitted model is saved to and reloaded from.
# NOTE(review): whether this resolves to local disk or to the cluster's default
# filesystem depends on the Spark configuration — confirm it is writable.
model_path = "/tmp/kmeans-model"

In [0]:
# Persist the fitted model, replacing anything already stored at model_path.
model.write().overwrite().save(model_path)

In [0]:
# Load the persisted model back from model_path into a separate object.
# The reader is obtained through the existing `model` instance.
model_loaded = model.read().load(model_path)

In [0]:
# Preview the first 10 coordinates of the first two sorted centers of the
# *reloaded* model, so they can be compared with the values shown for the
# fitted model before it was saved. Inspecting `model` here would not check
# the save/load round trip at all.
[center[0:10] for center in sorted(vec.tolist() for vec in model_loaded.clusterCenters())[0:2]]

In [0]:
# Assign each row to a cluster with the reloaded model, writing the result to
# a column named "transformed".
# NOTE(review): Spark ML setters normally modify the instance they are called
# on, so model_loaded presumably keeps this prediction column name afterwards
# — confirm.
transformed_df = model_loaded.setPredictionCol("transformed").transform(vector_df)

In [0]:
# Print the schema of the transformed DataFrame to check the output columns.
transformed_df.printSchema()

In [0]:
# Count the rows of the transformed DataFrame as a sanity check.
transformed_df.count()

In [0]:
# Display the first 25 rows of the transformed DataFrame.
transformed_df.show(25)