In [1]:
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.functions._

### Generate dummy data

In [2]:
val dim = 2048
val rows = 1000
val r = new scala.util.Random(0)

dim = 2048
rows = 1000
r = scala.util.Random@69597699


scala.util.Random@69597699

In [3]:
val dataDf = spark.createDataFrame(
  (0 until rows).map(_ => Tuple1(List.fill(dim)(r.nextDouble)))).withColumnRenamed("_1", "feature")

Waiting for a Spark session to start...

dataDf = [feature: array<double>]


[feature: array<double>]

### Convert array content to Vector to fit Spark ML requirement 

In [4]:
val convertToVector = udf((array: Seq[Float]) => {
  Vectors.dense(array.map(_.toDouble).toArray)
})

convertToVector = SparkUserDefinedFunction($Lambda$3074/2079913946@44fcc164,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,List(Some(class[value[0]: array<float>])),Some(class[value[0]: vector]),None,true,true)


SparkUserDefinedFunction($Lambda$3074/2079913946@44fcc164,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,List(Some(class[value[0]: array<float>])),Some(class[value[0]: vector]),None,true,true)

In [5]:
val vectorDf = dataDf.withColumn("feature", convertToVector(col("feature")))

vectorDf = [feature: vector]


[feature: vector]

### Use Spark RAPIDS accelerated PCA

Comparing to the original PCA training API:

```scala
val pca = new org.apache.spark.ml.feature.PCA()
  .setInputCol("feature")
  .setOutputCol("feature_value_3d")
  .setK(3)
  .fit(vectorDf)
```

We used a customized class and user will need to do `no code change` to enjoy the GPU acceleration:

```scala
val pca = new com.nvidia.spark.ml.feature.PCA()
...
```

Besides, we provide some switch APIs to allow users to highly customize their training process:

```scala
  .useGemm(true) // or false, default: true. Switch to use original BLAS bsr or cuBLAS gemm to compute covariance matrix
  .useCuSolverSVD(true) // or false, default: true. Switch to use original LAPack solver or cuSolver to compute SVD
  .meanCentering(true) // or false, default: true. Wwitch to do mean centering or not before computing covariance matrix
```


In [6]:
val pcaGpu = new com.nvidia.spark.ml.feature.PCA().setInputCol("feature").setOutputCol("pca_features").setK(3)

pcaGpu = pca_f193f5312c9c


pca_f193f5312c9c

In [7]:
val pcaModelGpu = spark.time(pcaGpu.fit(vectorDf))

pcaModelGpu = PCAModel: uid=pca_f193f5312c9c, k=3


Time taken: 6050 ms


PCAModel: uid=pca_f193f5312c9c, k=3

### Note, only training part is accelerated

In [8]:
pcaModelGpu.transform(vectorDf).select("pca_features").show(false)

+--------------------------------------------------------------+
|pca_features                                                  |
+--------------------------------------------------------------+
|[0.15717805548068634,-0.2318699999040465,-0.461671909593393]  |
|[0.7158627381372129,0.13153154407688541,-0.4442954525503737]  |
|[-1.4731836722589815,1.0180008471121302,-1.0826642183281767]  |
|[-0.7029890598027423,1.333891775489114,0.39238250798616214]   |
|[0.2014369637145508,-0.8703550218640328,-0.24322806294266627] |
|[0.275433430680443,1.3346319204901427,-0.4688782254621992]    |
|[-0.28874651925511247,1.6028033903235694,-0.6746507493746138] |
|[0.5316489646267731,-0.41619400253580746,0.37249197734141415] |
|[0.006169252894359221,0.29303167279789405,-0.2506269491180383]|
|[0.8957070727996569,-0.06592993750041293,-1.1845527678574415] |
|[-0.2522528217443958,1.3056536527087803,-0.5279938298955382]  |
|[0.7076659611970204,0.9454528437411879,-1.0475312455277548]   |
|[0.09759587288737456,0.0

### Use original Spark PCA

In [9]:
val pcaCpu = new org.apache.spark.ml.feature.PCA().setInputCol("feature").setOutputCol("pca_features").setK(3)

pcaCpu = pca_f7331970a638


pca_f7331970a638

In [10]:
val pcaModelCpu = spark.time(pcaCpu.fit(vectorDf))

pcaModelCpu = PCAModel: uid=pca_f7331970a638, k=3


Time taken: 26637 ms


PCAModel: uid=pca_f7331970a638, k=3

In [11]:
pcaModelCpu.transform(vectorDf).select("pca_features").show(false)

+--------------------------------------------------------------+
|pca_features                                                  |
+--------------------------------------------------------------+
|[0.1571780554806858,-0.23186999990402402,0.46167190959341087] |
|[0.7158627381372131,0.13153154407689774,0.4442954525503689]   |
|[-1.4731836722589833,1.018000847112193,1.0826642183281039]    |
|[-0.7029890598027354,1.3338917754890973,-0.3923825079861781]  |
|[0.20143696371454542,-0.8703550218640007,0.24322806294262717] |
|[0.27543343068044335,1.334631920490186,0.46887822546212093]   |
|[-0.28874651925511097,1.6028033903236105,0.674650749374526]   |
|[0.5316489646267726,-0.41619400253582495,-0.3724919773413851] |
|[0.006169252894362159,0.29303167279789005,0.25062694911808425]|
|[0.8957070727996495,-0.06592993750035629,1.184552767857375]   |
|[-0.25225282174438984,1.3056536527088072,0.5279938298954825]  |
|[0.7076659611970186,0.9454528437412306,1.0475312455277326]    |
|[0.09759587288736693,0.0

### Note

Some columns in GPU output have different sign from that in CPU output, this is due to the calculation nature of SVD algorithm which doesn't impact the effectiveness of SVD results. More details could be found in the [wiki](https://en.wikipedia.org/wiki/Singular_value_decomposition#Relation_to_eigenvalue_decomposition)