## Principal Component Analysis (PCA)

In this notebook, we will demonstrate the end-to-end workflow of Spark RAPIDS accelerated PCA.

In [1]:
import os
import requests
import numpy as np
import pandas as pd
import time

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql import functions as F
from pyspark import SparkConf
from pyspark.sql.functions import pandas_udf

In [3]:
### Download Spark Rapids jar ###

SPARK_RAPIDS_VERSION = "24.08.1"
rapids_jar = f"rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}.jar"

if not os.path.exists(rapids_jar):
    print("Downloading spark rapids jar")
    url = f"https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/{SPARK_RAPIDS_VERSION}/{rapids_jar}"
    # Download into a temporary name first: an interrupted transfer must not leave a
    # truncated file under the final name, because the existence check above would
    # then skip the download on the next run.
    partial_jar = f"{rapids_jar}.part"
    # stream=True avoids holding the whole jar in memory; the timeout is
    # (connect seconds, read seconds) so a stalled connection cannot hang the cell forever.
    with requests.get(url, stream=True, timeout=(10, 300)) as response:
        if response.status_code != 200:
            # The Spark session configured below needs this jar, so stop here with a
            # clear error instead of continuing without it.
            raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")
        with open(partial_jar, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    # Publish the completed download under its final name.
    os.replace(partial_jar, rapids_jar)
    print(f"File '{rapids_jar}' downloaded and saved successfully.")
else:
    print("File already exists. Skipping download.")

Downloading spark rapids jar
File 'rapids-4-spark_2.12-24.08.1.jar' downloaded and saved successfully.


In [None]:
### Configure Spark Session ###
import socket

conda_env = os.environ.get("CONDA_PREFIX")

# Host of the Spark standalone master. Override by exporting SPARK_MASTER_HOST before
# starting the kernel; otherwise this machine's hostname is used.
hostname = os.environ.get("SPARK_MASTER_HOST") or socket.gethostname()

conf = SparkConf()
conf.setMaster(f"spark://{hostname}:7077") # Set to your hostname

# General Spark / PySpark settings
conf.set("spark.task.maxFailures", "1")
conf.set("spark.driver.memory", "10g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.rpc.message.maxSize", "1024")
conf.set("spark.sql.pyspark.jvmStacktrace.enabled", "true")
conf.set("spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled", "false")
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.python.worker.reuse", "true")
conf.set("spark.rapids.ml.uvm.enabled", "true")

# Make the downloaded RAPIDS jar available to the session and enable the plugin
conf.set("spark.jars", rapids_jar)
conf.set("spark.executorEnv.PYTHONPATH", rapids_jar)
conf.set("spark.rapids.memory.gpu.minAllocFraction", "0.0001")
conf.set("spark.plugins", "com.nvidia.spark.SQLPlugin")
conf.set("spark.locality.wait", "0s")
conf.set("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer")
conf.set("spark.rapids.memory.gpu.pooling.enabled", "false")
conf.set("spark.sql.execution.sortBeforeRepartition", "false")
conf.set("spark.rapids.sql.format.parquet.reader.type", "MULTITHREADED")
conf.set("spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel", "20")
conf.set("spark.rapids.sql.multiThreadedRead.numThreads", "20")
conf.set("spark.rapids.sql.python.gpu.enabled", "true")
conf.set("spark.rapids.memory.pinnedPool.size", "2G")
conf.set("spark.python.daemon.module", "rapids.daemon")
conf.set("spark.rapids.sql.batchSizeBytes", "512m")
conf.set("spark.sql.adaptive.enabled", "false")
conf.set("spark.sql.files.maxPartitionBytes", "512m")
conf.set("spark.rapids.sql.concurrentGpuTasks", "1")
conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "20000")
conf.set("spark.rapids.sql.explain", "NONE")

# Create Spark Session
spark = SparkSession.builder.appName("spark-rapids-ml-pca").config(conf=conf).getOrCreate()
sc = spark.sparkContext

### Generate synthetic dataset

Here we generate a 100,000 x 2048 random dataset.

In [7]:
# Dataset shape and element type.
rows, dim = 100000, 2048
dtype = 'float32'

# Seed the legacy global NumPy generator so the dataset is reproducible.
np.random.seed(42)
data = np.random.rand(rows, dim).astype(dtype)

# One pandas row per sample; each "features" cell holds that sample's 1-D array.
pd_data = pd.DataFrame({"features": list(data)})

# Convert to a Spark DataFrame and write it out as parquet, replacing any previous copy.
prepare_df = spark.createDataFrame(pd_data)
prepare_df.write.parquet("PCA_data.parquet", mode="overwrite")

24/10/03 23:19:53 WARN TaskSetManager: Stage 0 contains a task of very large size (160085 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

#### Spark-RAPIDS-ML accepts ArrayType input

Note that the original Spark-ML PCA requires the input column to be a `Vector` type, so the data must first be vectorized:

```python
from pyspark.ml.linalg import Vectors
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data,["features"])
df.show()
```

...whereas the Spark-RAPIDS-ML version does not require extra Vectorization, and can accept an ArrayType column as the input column:

In [8]:
# Load the parquet dataset written above and print its schema.
data_df = spark.read.format("parquet").load("PCA_data.parquet")
data_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)



### Using Spark-RAPIDS-ML PCA (GPU)

Compared to the Spark-ML PCA training API:

```python
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="features")
pca.setOutputCol("pca_features")
```

Spark-RAPIDS-ML provides a drop-in `PCA` class with the same API, so apart from the import line **no code change** is required from the user to enjoy GPU acceleration:

```python
from spark_rapids_ml.feature import PCA
pca = PCA(k=3, inputCol="features")
pca.setOutputCol("pca_features")
```

In [9]:
from spark_rapids_ml.feature import PCA

# GPU estimator: reduce the "features" column to k=2 components, written to "pca_features".
gpu_pca = PCA(k=2, inputCol="features").setOutputCol("pca_features")
gpu_pca

PCA_bcd33d128594

The PCA estimator object can be persisted and reloaded.

In [10]:
# Save the estimator (replacing any earlier copy at this path), then load it back.
estimator_path = "/tmp/pca_estimator"
estimator_writer = gpu_pca.write().overwrite()
estimator_writer.save(estimator_path)
gpu_pca_loaded = PCA.load(estimator_path)

#### Fit

In [11]:
# Time the GPU fit. perf_counter() is a monotonic clock intended for measuring
# intervals, so the result is not affected by system clock adjustments.
start_time = time.perf_counter()
gpu_pca_model = gpu_pca_loaded.fit(data_df)
gpu_fit_time = time.perf_counter() - start_time
print(f"GPU PCA fit took: {gpu_fit_time} sec")

2024-10-03 23:20:00,131 - spark_rapids_ml.feature.PCA - INFO - CUDA managed memory enabled.
2024-10-03 23:20:00,239 - spark_rapids_ml.feature.PCA - INFO - Stage-level scheduling in spark-rapids-ml requires spark.executor.cores, spark.executor.resource.gpu.amount to be set.
2024-10-03 23:20:00,241 - spark_rapids_ml.feature.PCA - INFO - Training spark-rapids-ml with 1 worker(s) ...
2024-10-03 23:20:08,190 - spark_rapids_ml.feature.PCA - INFO - Finished training


GPU PCA fit took: 8.758668899536133 sec


#### Transform

In [12]:
# Time the GPU transform with a monotonic clock.
start_time = time.perf_counter()
# DataFrame.show() prints and returns None, so keep the projected DataFrame in
# `embeddings` and display it in a separate statement.
embeddings = gpu_pca_model.transform(data_df).select("pca_features")
# NOTE(review): show() displays only the top 20 rows, so this timing presumably does
# not cover a transform of the full dataset - confirm if a full benchmark is intended.
embeddings.show(truncate=False)
gpu_transform_time = time.perf_counter() - start_time
print(f"GPU PCA transform took: {gpu_transform_time} sec")

+---------------------------+
|pca_features               |
+---------------------------+
|[-0.24846071, 0.33562037]  |
|[0.5184792, 0.48330337]    |
|[0.2507918, 0.3815673]     |
|[0.39100257, 0.4842953]    |
|[0.4037514, 0.70158374]    |
|[0.30750397, 0.5324805]    |
|[0.6082078, 0.5151396]     |
|[0.21961018, 0.64743024]   |
|[-0.1901558, 0.63220304]   |
|[-0.61287963, 0.5951108]   |
|[0.027350709, -0.03707385] |
|[0.29946682, -0.05652547]  |
|[0.54797435, -0.198609]    |
|[0.6652416, 0.10023773]    |
|[0.12782758, 0.46697623]   |
|[0.43612525, -0.0074159503]|
|[-0.62129164, 0.54278356]  |
|[-0.048607834, 0.7038538]  |
|[-0.6254531, 0.35484123]   |
|[-0.16294907, 0.7283848]   |
+---------------------------+
only showing top 20 rows

GPU PCA transform took: 0.4192674160003662 sec


### Using Spark-ML PCA (CPU)

In [13]:
# Import the CPU implementation under an alias so it does not rebind the name `PCA`,
# which earlier cells use for the spark_rapids_ml estimator (e.g. `PCA.load(...)`).
# Without the alias, re-running those cells after this one would use the wrong class.
from pyspark.ml.feature import PCA as CpuPCA

# CPU estimator configured with the same k, input column and output column as the GPU one.
cpu_pca = CpuPCA(k=2, inputCol="features")
cpu_pca.setOutputCol("pca_features")

PCA_791eff82b929

In [14]:
from pyspark.ml.functions import array_to_vector

# Convert the array-typed "features" column to a vector column of the same name,
# then print the resulting schema.
features_as_vector = array_to_vector(F.col("features")).alias("features")
vector_df = data_df.select(features_as_vector)
vector_df.printSchema()

root
 |-- features: vector (nullable = true)



#### Fit

In [15]:
# Time the CPU fit. perf_counter() is a monotonic clock intended for measuring
# intervals, so the result is not affected by system clock adjustments.
start_time = time.perf_counter()
cpu_pca_model = cpu_pca.fit(vector_df)
pca_fit_time = time.perf_counter() - start_time
print(f"CPU PCA fit took: {pca_fit_time} sec")

24/10/03 23:20:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


CPU PCA fit took: 64.01513171195984 sec


#### Transform

In [16]:
# Time the CPU transform with a monotonic clock.
start_time = time.perf_counter()
# DataFrame.show() prints and returns None, so keep the projected DataFrame in
# `embeddings` and display it in a separate statement.
embeddings = cpu_pca_model.transform(vector_df).select("pca_features")
# NOTE(review): show() displays only a limited number of rows, so this timing presumably
# does not cover a transform of the full dataset - confirm if a full benchmark is intended.
embeddings.show(truncate=False)
pca_transform_time = time.perf_counter() - start_time
print(f"CPU PCA transform took: {pca_transform_time} sec")

+-------------------------------------------+
|pca_features                               |
+-------------------------------------------+
|[0.24926765828229927,0.3425432972889563]   |
|[-0.5175207040808384,0.48893065865444574]  |
|[-0.2505049373829902,0.381272141155778]    |
|[-0.39046980420292005,0.4870705091697811]  |
|[-0.4024088726395023,0.707133448810984]    |
|[-0.3061227832285992,0.5363554872099332]   |
|[-0.6065136982526093,0.5205197626985932]   |
|[-0.21870566838630084,0.6516598402789231]  |
|[0.1910036552854184,0.6336513389989592]    |
|[0.6139537641786907,0.6055187085018856]    |
|[-0.026502904776425647,-0.0366087508156753]|
|[-0.2989311781309336,-0.05136110567458389] |
|[-0.5474468086054212,-0.18779964958125014] |
|[-0.6644746232216499,0.10351178251944647]  |
|[-0.12685301272617464,0.47394431583661295] |
|[-0.4355221246718862,-0.00346289187881239] |
|[0.6222719258951077,0.5488293416698503]    |
|[0.04966907735703511,0.7138677407505005]   |
|[0.6260486995906139,0.35532284504

### Summary

In [27]:
# End-to-end runtime (fit + transform) for each backend, and their ratio.
cpu_total = pca_fit_time + pca_transform_time
gpu_total = gpu_fit_time + gpu_transform_time
speedup = cpu_total / gpu_total

print(f"CPU runtime: ({pca_fit_time:.2f}s + {pca_transform_time:.2f}s)")
print(f"GPU runtime: ({gpu_fit_time:.2f}s + {gpu_transform_time:.2f}s)")
print(f"End-to-end speedup: CPU / GPU = {speedup:.2f}x")

CPU runtime: (64.02s + 0.20s)
GPU runtime: (8.76s + 0.42s)
End-to-end speedup: CPU / GPU = 7.00x
