# Principal Component Analysis (PCA)

In [None]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [None]:
# Rank-1 synthetic data: every row is a random scalar multiple of one shared
# random direction `vec`. The RNG is not seeded, so values differ per run.
num_vecs = 100000  # number of rows (samples)
dim = 3000  # number of columns (features per sample)
vec = np.random.randn(1, dim)  # shared direction, shape (1, dim)
arr = np.random.randn(num_vecs, 1)  # per-row scale factor, shape (num_vecs, 1)
data = arr * vec  # broadcasts to shape (num_vecs, dim)

### Convert dataset to Spark DataFrame

In [None]:
# Build the Spark DataFrame: one row per sample, with a single column
# "features" holding that sample's values as a Python list.
# A pandas-based route is left disabled here for reference:
#   pd_data = pd.DataFrame({"features": list(data)})
#   df = spark.createDataFrame(pd_data)
df = (
    sc.parallelize(list(data))
    .map(lambda row: [row.tolist()])
    .toDF(["features"])
)

## Spark RAPIDS ML

In [None]:
from spark_rapids_ml.feature import PCA

In [None]:
# Shared settings for the PCA runs in this notebook.
topk = 2  # number of principal components requested from each PCA implementation
num_workers = 2  # worker count handed to the Spark RAPIDS ML PCA estimator

In [None]:
# Configure the Spark RAPIDS ML PCA estimator: read the "features" column
# and keep `topk` components.
gpu_pca = (
    PCA(num_workers=num_workers, verbose=6)
    .setInputCol("features")
    .setK(topk)
)

The PCA estimator can be persisted and reloaded.

In [None]:
# Location where the (unfitted) PCA estimator is saved and later reloaded from.
estimator_path = "dbfs:/tmp/sparkcuml-pca-estimator"

In [None]:
# Save the estimator to estimator_path, replacing anything already stored there.
gpu_pca.write().overwrite().save(estimator_path)
# Read it back; the reloaded estimator is the one used for fitting below.
gpu_pca_loaded = PCA.load(estimator_path)

### Fit

In [None]:
# Time the GPU fit. perf_counter() is a monotonic clock, so the measured
# interval cannot be skewed by system clock adjustments during the fit.
start_time = time.perf_counter()
gpu_model = gpu_pca_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Display the fitted GPU model's explained_variance attribute.
gpu_model.explained_variance

In [None]:
# Display the first 10 entries of the fitted GPU model's mean attribute.
gpu_model.mean[0:10]

In [None]:
# Preview the leading 10 entries of every component in the GPU model's `pc`.
[pc_row[:10] for pc_row in gpu_model.pc]

In [None]:
# Element-wise ratio of the generating vector `vec` to the first GPU component,
# showing the first 10 entries. A (near-)constant ratio means the component is
# parallel to `vec`, i.e. the direction used to build the data was recovered.
np.divide(vec,gpu_model.pc[0])[0][0:10]

### Transform

In [None]:
# Location where the fitted PCA model is saved and later reloaded from.
model_path = "dbfs:/tmp/sparkcuml-pca-model"

In [None]:
# Save the fitted model to model_path, replacing anything already stored there.
gpu_model.write().overwrite().save(model_path)

In [None]:
# Reload the persisted model from model_path.
# NOTE(review): load() is called through the fitted instance rather than its
# class; presumably it is a classmethod, so the instance's own state is not
# used -- confirm.
gpu_model_loaded = gpu_model.load(model_path)

In [None]:
# First 10 entries of the reloaded model's mean, for comparison with the
# same slice of the original gpu_model shown earlier.
gpu_model_loaded.mean[0:10]

In [None]:
# explained_variance of the reloaded model, for comparison with the original gpu_model.
gpu_model_loaded.explained_variance

In [None]:
# Preview the leading 10 entries of every component in the reloaded model's `pc`.
[pc_row[:10] for pc_row in gpu_model_loaded.pc]

In [None]:
# Name the output column "transformed" and apply the fitted model to `df`.
# Note that this uses the in-memory gpu_model, not gpu_model_loaded.
transformed_df = (
    gpu_model
    .setOutputCol("transformed")
    .transform(df)
)

In [None]:
# Print the schema of the transformed DataFrame.
transformed_df.printSchema()

In [None]:
# Count the rows of the transformed DataFrame. count() is an action, so
# presumably this is where the lazily-defined transform actually executes.
transformed_df.count()

In [None]:
# Display the first 10 rows of the transformed DataFrame.
transformed_df.show(10)

## Spark ML

In [None]:
from pyspark.ml.feature import PCA

In [None]:
# Spark ML PCA estimator (PCA now refers to pyspark.ml.feature.PCA):
# read the "features" column and keep `topk` components.
spark_ml_pca = (
    PCA()
    .setInputCol("features")
    .setK(topk)
)

Convert the array SQL type to the VectorUDT type expected by Spark ML algorithms.

In [None]:
from pyspark.ml.functions import array_to_vector

In [None]:
# Re-express the "features" column as an ML vector, keeping the column name.
vector_df = df.select(
    array_to_vector(df["features"]).alias("features")
)

In [None]:
# Print the schema of the converted DataFrame to check the "features" column type.
vector_df.printSchema()

### Fit

In [None]:
# Time the Spark ML fit. perf_counter() is a monotonic clock, so the measured
# interval cannot be skewed by system clock adjustments during the fit.
start_time = time.perf_counter()
spark_ml_pca_model = spark_ml_pca.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Display the Spark ML model's explainedVariance attribute.
spark_ml_pca_model.explainedVariance

In [None]:
# Keep the Spark ML model's principal-components matrix for the ratio check
# in the next cell.
pc = spark_ml_pca_model.pc

In [None]:
# Element-wise ratio of the generating vector `vec` to column 0 of the Spark ML
# components matrix, showing the first 10 entries. A (near-)constant ratio
# means that column is parallel to `vec`.
np.divide(vec,pc.toArray()[:,0])[0][0:10]

## cuML (single-node on driver)

In [None]:
from cuml import PCA

In [None]:
# cuML PCA estimator (PCA now refers to cuml.PCA), keeping `topk` components.
cuml_pca = PCA(n_components=topk)

### Fit

In [None]:
# Fit cuML PCA directly on the in-memory `data` array (no Spark DataFrame).
# NOTE(review): the row slice is hard-coded to 100000, which equals num_vecs
# as defined at the top of the notebook, so the whole array is used today; if
# num_vecs is raised this would fit on a subset only -- confirm whether the
# cap is intentional.
cuml_model = cuml_pca.fit(data[:100000,:])

In [None]:
# Display the cuML model's explained_variance_ attribute.
cuml_model.explained_variance_

In [None]:
# Display the cuML model's explained_variance_ratio_ attribute.
cuml_model.explained_variance_ratio_