In [0]:
import time

import numpy as np
import pandas as pd
from cuml import PCA as CumlPCA
from pyspark.ml.feature import PCA as SparkMLPCA
from sparkcuml.decomposition import SparkCumlPCA


### Create synthetic dataset

In [0]:
# Synthetic rank-1 dataset: every row is a scalar multiple of one shared
# direction `vec`, so the first principal component should be parallel to it.
num_vecs = 100000
dim = 3000

# Seeded generator so that a fresh "Run All" regenerates the same dataset.
rng = np.random.default_rng(0)
vec = rng.standard_normal((1, dim))       # shared direction, shape (1, dim)
arr = rng.standard_normal((num_vecs, 1))  # per-row scale, shape (num_vecs, 1)
data = arr * vec                          # broadcasts to (num_vecs, dim)

In [0]:
# Each row of `data` becomes one array-valued entry of a single "features" column.
feature_rows = list(data)
pd_data = pd.DataFrame({"features": feature_rows})

In [0]:
# Turn the pandas frame into a Spark DataFrame.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session injected by the notebook runtime - confirm.
df = spark.createDataFrame(data=pd_data)

### Run Cuml-on-Spark PCA.fit

In [0]:
# `num_workers` is passed to SparkCumlPCA; `topk` is the number of principal
# components requested from every PCA fit in this notebook.
num_workers, topk = 2, 2

In [0]:
# Configure the estimator step by step: worker count and verbosity go to the
# constructor, the input column and component count through the setters.
gpu_pca = SparkCumlPCA(num_workers=num_workers, verbose=6)
gpu_pca = gpu_pca.setInputCol("features")
gpu_pca = gpu_pca.setK(topk)

The PCA estimator can be persisted and reloaded.

In [0]:
# Round-trip the unfitted estimator through storage; the reloaded copy is the
# one that gets fitted in the next cell.
estimator_path = "/tmp/sparkcuml-pca-estimator"
estimator_writer = gpu_pca.write().overwrite()
estimator_writer.save(estimator_path)
gpu_pca_loaded = SparkCumlPCA.load(estimator_path)


In [0]:
# Time the SparkCumlPCA fit. perf_counter() is a monotonic, high-resolution
# clock, so the elapsed value cannot be skewed by wall-clock adjustments the
# way a difference of time.time() readings can.
start_time = time.perf_counter()
gpu_model = gpu_pca_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [0]:
# Display the `explained_variance` attribute of the fitted model.
gpu_model.explained_variance

In [0]:
# First ten entries of the fitted model's `mean`.
gpu_model.mean[:10]

In [0]:
# First ten entries of each entry in the fitted model's `pc`.
[pc_row[:10] for pc_row in gpu_model.pc]

In [0]:
# Element-wise ratio of the generating direction to the first component; a
# constant ratio across entries means the two vectors are parallel.
ratio_to_first_pc = np.divide(vec, gpu_model.pc[0])
ratio_to_first_pc[0, :10]

### Cuml-on-Spark PCA model save, load, and transform

In [0]:
# Persist the fitted model, using the writer's overwrite mode.
model_writer = gpu_model.write().overwrite()
model_writer.save("dbfs:/tmp/sparkcuml-pca-model")

In [0]:
# Read the persisted model back from storage.
saved_model_path = "dbfs:/tmp/sparkcuml-pca-model"
gpu_model_loaded = gpu_model.load(saved_model_path)

In [0]:
# First ten entries of the reloaded model's `mean`.
gpu_model_loaded.mean[:10]

In [0]:
# Display the reloaded model's `explained_variance`.
gpu_model_loaded.explained_variance

In [0]:
# First ten entries of each entry in the reloaded model's `pc`.
[pc_row[:10] for pc_row in gpu_model_loaded.pc]

In [0]:
# Name the output column, then apply the fitted model to `df`.
# NOTE(review): this uses the in-memory `gpu_model`, not `gpu_model_loaded` -
# confirm that is intended.
gpu_model_with_output = gpu_model.setOutputCol("transformed")
transformed_df = gpu_model_with_output.transform(df)

In [0]:
# Print the schema of the transformed DataFrame.
transformed_df.printSchema()

In [0]:
# Count the rows of the transformed DataFrame.
transformed_df.count()

In [0]:
# Preview the first ten rows of the transformed DataFrame.
transformed_df.show(n=10)

### Run Spark ML PCA.fit

In [0]:
from pyspark.ml.feature import PCA

In [0]:
# Build the Spark ML estimator through the unambiguous `SparkMLPCA` alias: the
# bare name `PCA` is rebound to cuml's class by a later import in this
# notebook, so this cell would otherwise break when re-run after that import.
spark_ml_pca = SparkMLPCA().setInputCol("features").setK(topk)

Convert the SQL array column to the VectorUDT type expected by Spark ML algorithms.

In [0]:
from pyspark.ml.functions import array_to_vector

In [0]:
# Convert the array-valued "features" column with array_to_vector, keeping the
# column name unchanged.
features_as_vector = array_to_vector(df["features"]).alias("features")
vector_df = df.select(features_as_vector)

In [0]:
# Print the schema of the converted DataFrame to check the new column type.
vector_df.printSchema()

In [0]:
# Time the Spark ML fit. perf_counter() is a monotonic, high-resolution clock,
# so the elapsed value cannot be skewed by wall-clock adjustments the way a
# difference of time.time() readings can.
start_time = time.perf_counter()
spark_ml_pca_model = spark_ml_pca.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [0]:
# Display the `explainedVariance` attribute of the fitted Spark ML model.
spark_ml_pca_model.explainedVariance

In [0]:
# Keep a handle on the Spark ML model's `pc`; the next cell converts it with
# `toArray()` and reads its first column.
pc = spark_ml_pca_model.pc

In [0]:
# Element-wise ratio of the generating direction to the first column of the
# Spark ML components; a constant ratio means the two vectors are parallel.
spark_first_pc = pc.toArray()[:, 0]
ratio_to_spark_pc = np.divide(vec, spark_first_pc)
ratio_to_spark_pc[0, :10]

### Run cuml.PCA.fit single-node on the driver

In [0]:
from cuml import PCA

In [0]:
# Build the single-node cuml estimator through the unambiguous `CumlPCA` alias.
# The bare name `PCA` is also imported from pyspark.ml.feature earlier in this
# notebook, so which class it refers to depends on cell execution order.
cuml_pca = CumlPCA(n_components=topk)

In [0]:
# Fit directly on the in-memory NumPy array. The row bound follows `num_vecs`
# instead of repeating the literal 100000, so resizing the synthetic dataset
# keeps this fit on the same rows that were given to the Spark estimators.
cuml_model = cuml_pca.fit(data[:num_vecs, :])

In [0]:
# Display the `explained_variance_` attribute of the single-node cuml model.
cuml_model.explained_variance_

In [0]:
# Display the `explained_variance_ratio_` attribute of the single-node cuml model.
cuml_model.explained_variance_ratio_