In [1]:
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.functions._

### Generate dummy data

In [2]:
// Shape of the synthetic dataset: `rows` samples, each with `dim` features.
val dim: Int = 2048
val rows: Int = 1000
// Seeded with 0 so the generated values are the same on every run.
val r: scala.util.Random = new scala.util.Random(0)

dim = 2048
rows = 1000
r = scala.util.Random@49299a38


scala.util.Random@49299a38

In [3]:
// Build a single-column DataFrame ("feature": array<double>) holding `rows` random
// arrays of length `dim`, drawn sequentially from the seeded generator `r`.
val dataDf = {
  val featureRows = Seq.fill(rows)(Tuple1(List.fill(dim)(r.nextDouble)))
  // Tuple1's only field is auto-named "_1"; give it a meaningful column name.
  spark.createDataFrame(featureRows).withColumnRenamed("_1", "feature")
}

Waiting for a Spark session to start...

dataDf = [feature: array<double>]


[feature: array<double>]

### Convert array content to Vector to fit Spark ML requirement 

In [4]:
// UDF that converts an array<double> column value into a dense Spark ML Vector.
// The elements are already Doubles, so they are copied straight into an Array
// without an extra per-element conversion pass.
val convertToVector = udf((array: Seq[Double]) => Vectors.dense(array.toArray))

convertToVector = SparkUserDefinedFunction($Lambda$3007/784907921@5cc35775,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,List(Some(class[value[0]: array<double>])),None,true,true)


SparkUserDefinedFunction($Lambda$3007/784907921@5cc35775,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,List(Some(class[value[0]: array<double>])),None,true,true)

In [5]:
// Keep the original array column and add "feature_vec", its Vector-typed counterpart.
val vectorDf = {
  val featureAsVector = convertToVector(col("feature"))
  dataDf.withColumn("feature_vec", featureAsVector)
}

vectorDf = [feature: array<double>, feature_vec: vector]


[feature: array<double>, feature_vec: vector]

### Use Spark RAPIDS accelerated PCA

Compare with the original PCA training API:

```scala
val pca = new org.apache.spark.ml.feature.PCA()
  .setInputCol("feature")
  .setOutputCol("feature_value_3d")
  .setK(3)
  .fit(vectorDf)
```

We provide a customized drop-in class; apart from swapping the class name, users need `no code change` to enjoy the GPU acceleration:

```scala
val pca = new com.nvidia.spark.ml.feature.PCA()
...
```

In addition, we provide some switch APIs that let users customize the training process:

```scala
  .useGemm(true) // or false, default: true. Switch to use original BLAS spr or cuBLAS gemm to compute covariance matrix
  .useCuSolverSVD(true) // or false, default: true. Switch to use original LAPACK solver or cuSolver to compute SVD
  .meanCentering(true) // or false, default: true. Switch to do mean centering or not before computing covariance matrix
```


In [6]:
// GPU-accelerated PCA estimator, configured one setter per line.
val pcaGpu = new com.nvidia.spark.ml.feature.PCA()
  .setInputCol("feature_vec")       // Vector column passed as the estimator's input
  .setOutputCol("pca_features")     // column that will hold the projected values
  .setK(3)                          // number of principal components to keep
  .setTransformInputCol("feature")  // transform-time input: the raw array<double> column

pcaGpu = pca_df61aecb5cd7


pca_df61aecb5cd7

In [7]:
// Fit the GPU estimator; spark.time prints the elapsed wall-clock time and returns the model.
val pcaModelGpu = spark.time {
  pcaGpu.fit(vectorDf)
}

pcaModelGpu = PCAModel: uid=pca_df61aecb5cd7, k=3


Time taken: 7693 ms


PCAModel: uid=pca_df61aecb5cd7, k=3

### Note: only the training part is accelerated

In [8]:
// Project the data with the GPU-trained model and print the result without truncating cells.
pcaModelGpu
  .transform(vectorDf)
  .select("pca_features")
  .show(truncate = false)

+----------------------------------------------------------------+
|pca_features                                                    |
+----------------------------------------------------------------+
|[0.1571780693548623, -0.23187006467064833, -0.46167188675884646]|
|[0.7158627282461565, 0.13153153200956821, -0.4442953928398168]  |
|[-1.4731837213812808, 1.0180010259859054, -1.0826638289743056]  |
|[-0.7029891385777162, 1.3338918673605578, 0.3923829953456968]   |
|[0.20143689500692416, -0.8703548818323712, -0.2432277880072864] |
|[0.2754334513628796, 1.33463187048047, -0.4688778170587651]     |
|[-0.28874653644571857, 1.6028036414967264, -0.6746504395378097] |
|[0.5316489767411673, -0.41619420672435353, 0.37249197134194695] |
|[0.006169260868807792, 0.2930315481196259, -0.25062686448625626]|
|[0.8957069993141488, -0.0659297400345417, -1.1845525664838452]  |
|[-0.25225280829647395, 1.3056536989607161, -0.5279936240446955] |
|[0.7076658917620148, 0.9454529189449887, -1.0475310703805158]

### Use original Spark PCA

In [9]:
// Stock Spark ML PCA estimator (CPU), configured with the same columns and k as the GPU one.
val pcaCpu = new org.apache.spark.ml.feature.PCA()
  .setInputCol("feature_vec")    // Vector column passed as the estimator's input
  .setOutputCol("pca_features")  // column that will hold the projected values
  .setK(3)                       // number of principal components to keep

pcaCpu = pca_0e91da90f836


pca_0e91da90f836

In [10]:
// Fit the CPU estimator; spark.time prints the elapsed wall-clock time and returns the model.
val pcaModelCpu = spark.time {
  pcaCpu.fit(vectorDf)
}

pcaModelCpu = PCAModel: uid=pca_0e91da90f836, k=3


Time taken: 25938 ms


PCAModel: uid=pca_0e91da90f836, k=3

In [11]:
// Project the data with the CPU-trained model and print the result without truncating cells.
pcaModelCpu
  .transform(vectorDf)
  .select("pca_features")
  .show(truncate = false)

+--------------------------------------------------------------+
|pca_features                                                  |
+--------------------------------------------------------------+
|[0.15717806935486517,-0.23187006467067675,0.46167188675884124]|
|[0.7158627282461498,0.1315315320096182,0.4442953928397998]    |
|[-1.4731837213812993,1.0180010259858214,1.0826638289743715]   |
|[-0.7029891385777376,1.3338918673605542,-0.39238299534567095] |
|[0.2014368950069466,-0.8703548818324696,0.24322778800733796]  |
|[0.2754334513628797,1.3346318704803797,0.4688778170588576]    |
|[-0.28874653644575704,1.6028036414967575,0.6746504395378663]  |
|[0.531648976741172,-0.41619420672430724,-0.3724919713419888]  |
|[0.006169260868809271,0.2930315481196483,0.2506268644862148]  |
|[0.8957069993141605,-0.06592974003465354,1.1845525664838967]  |
|[-0.2522528082964935,1.3056536989607195,0.5279936240447436]   |
|[0.7076658917620019,0.9454529189449806,1.0475310703805307]    |
|[0.09759584670278604,0.0

### Note

Some columns in the GPU output have the opposite sign from those in the CPU output. This is due to the nature of the SVD algorithm: singular vectors are determined only up to sign, so this does not affect the validity of the results. More details can be found in the [wiki](https://en.wikipedia.org/wiki/Singular_value_decomposition#Relation_to_eigenvalue_decomposition)