# Principal Component Analysis (PCA)

In [None]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

Here we create a rank 1 matrix.  All rows are a multiple of a single vector.  PCA should identify this vector direction as capturing all the variance in the data.

In [None]:
# Problem size: `num_vecs` rows of dimension `dim`, stored as `dtype`.
num_vecs = 100000
dim = 2000
dtype = 'float32'
# Fixed seed so that Restart & Run All reproduces the same dataset.
seed = 0
rng = np.random.default_rng(seed)
# Cast the two small factors to the target dtype *before* multiplying, so the large
# (num_vecs, dim) product is built directly in `dtype` instead of first materializing
# a float64 array and then casting a second full-size copy.
vec = rng.standard_normal(dim).astype(dtype).reshape([1,dim])
arr = rng.standard_normal(num_vecs).astype(dtype).reshape([num_vecs,1])
# Broadcasting (num_vecs, 1) * (1, dim): every row of `data` is a scalar multiple of `vec`.
data = arr * vec

### Convert dataset to Spark DataFrame

In [None]:
# Each row of `data` becomes one entry of a single array-valued "features" column.
# `spark` is not defined in this notebook -- presumably the SparkSession supplied by
# the kernel/environment (TODO confirm).
pd_data = pd.DataFrame({"features": list(data)})
df = spark.createDataFrame(pd_data)

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) PCA estimator objects, demonstrating the common API

In [None]:
def build_pca_estimator(estimator_class, k=2, input_col="features"):
    """Instantiate and configure a PCA estimator.

    Works for any estimator class exposing the chainable ``setK`` and
    ``setInputCol`` setters, which lets the same call configure both the
    Spark RAPIDS ML (GPU) and Spark ML (CPU) implementations.

    Args:
        estimator_class: Estimator class to instantiate with no arguments.
        k: Number of principal components to request (default 2).
        input_col: Name of the input column holding the feature vectors
            (default "features").

    Returns:
        The result of the chained setter calls on the new instance
        (presumably the configured estimator itself, since the setters chain).
    """
    return estimator_class().setK(k).setInputCol(input_col)

## Spark RAPIDS ML (GPU)

In [None]:
# GPU implementation of PCA from Spark RAPIDS ML, configured through the shared helper.
from spark_rapids_ml.feature import PCA
gpu_pca = build_pca_estimator(PCA)

The PCA estimator can be persisted and reloaded.

In [None]:
# Location where the unfitted estimator is saved and then reloaded from.
estimator_path = "/tmp/pca-estimator"

In [None]:
# Round-trip the unfitted estimator through storage: save it (overwrite() replaces any
# earlier copy at this path), then load it back as a separate object.
gpu_pca.write().overwrite().save(estimator_path)
gpu_pca_loaded = PCA.load(estimator_path)

### Fit

In [None]:
# Fit using the reloaded estimator. perf_counter() is a monotonic, high-resolution
# clock, so the reported duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
gpu_model = gpu_pca_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

Verify that all variance is captured by the first component.

In [None]:
# Explained variance per component; for the rank 1 data the first entry should carry all of it.
gpu_model.explainedVariance

In [None]:
# First 10 entries of the fitted model's `mean` attribute.
gpu_model.mean[0:10]

In [None]:
# Principal components matrix of the fitted model.
gpu_model.pc

In [None]:
# First 10 rows of the principal components as an array (one column per component).
gpu_model.pc.toArray()[0:10,:]

Verify that the computed largest principal component vector is a multiple of the original vector used to compute the rows of the rank 1 data.

In [None]:
# Element-wise ratio of the generating vector to the first principal component; a
# (near-)constant ratio means the two are multiples of each other. `vec` has shape
# (1, dim), so the result is 2-D: select row 0 first, then its first 10 entries
# (slicing [0:10] directly would return the whole single row).
np.divide(vec, gpu_model.pc.toArray()[:,0])[0][0:10]

### Transform

In [None]:
# Location where the fitted model is saved and then reloaded from.
model_path = "/tmp/pca-model"

In [None]:
# Persist the fitted GPU model; overwrite() replaces any earlier copy at this path.
gpu_model.write().overwrite().save(model_path)

In [None]:
# Load the saved model back as a separate object.
# NOTE(review): load() is invoked through the fitted instance here; presumably it is a
# class-level loader, so calling it on the model class would be clearer -- confirm.
gpu_model_loaded = gpu_model.load(model_path)

In [None]:
# Same view as for the original model; the reloaded values are expected to match.
gpu_model_loaded.mean[0:10]

In [None]:
# Explained variance of the reloaded model; expected to match the original model's.
gpu_model_loaded.explainedVariance

In [None]:
# First 10 rows of the reloaded model's principal components, for comparison with the original.
gpu_model_loaded.pc.toArray()[0:10,:]

In [None]:
# Apply the fitted GPU model, writing its output to a "transformed" column. The input
# is first repartitioned into as many partitions as the model's `num_workers`.
transformed_df = (
    gpu_model
    .setOutputCol("transformed")
    .transform(df.repartition(gpu_model.num_workers))
)

In [None]:
# Show the columns of the result, including the new "transformed" output column.
transformed_df.printSchema()

In [None]:
# Number of rows in the transformed DataFrame.
transformed_df.count()

In [None]:
# Display up to 10 rows of the transformed DataFrame.
transformed_df.show(10)

## Spark ML (CPU)

In [None]:
# CPU implementation of PCA from Spark ML, configured through the same helper.
# Imported under an alias so it does not rebind `PCA`, which earlier cells use for the
# Spark RAPIDS ML class; re-running those cells after this one keeps using the GPU class.
from pyspark.ml.feature import PCA as CpuPCA
cpu_pca = build_pca_estimator(CpuPCA)

Convert the array SQL type to the VectorUDT type expected by Spark ML algorithms. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames in addition to the array-type DataFrame used above, along with a scalar column format - see the docs.)

In [None]:
from pyspark.ml.functions import array_to_vector

In [None]:
# Rebuild the DataFrame with "features" converted from an array column to a vector column.
vector_df = df.select(
    array_to_vector(df["features"]).alias("features")
)

In [None]:
# Confirm the type of the "features" column after the conversion.
vector_df.printSchema()

### Fit

In [None]:
# Fit the CPU estimator on the vector-typed data. perf_counter() is a monotonic,
# high-resolution clock, so the reported duration cannot be skewed by system clock
# adjustments.
start_time = time.perf_counter()
cpu_pca_model = cpu_pca.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [None]:
# Explained variance from the CPU model, for comparison with the GPU result.
cpu_pca_model.explainedVariance

In [None]:
# Keep the CPU model's principal components matrix for the ratio check that follows.
pc = cpu_pca_model.pc

In [None]:
# Element-wise ratio of the generating vector to the CPU model's first principal
# component. `vec` has shape (1, dim), so row 0 is selected before taking the first 10
# entries; a (near-)constant ratio means the two vectors are multiples of each other.
np.divide(vec,pc.toArray()[:,0])[0][0:10]