## Principal Component Analysis (PCA)

In this notebook, we will demonstrate the end-to-end workflow of Spark RAPIDS accelerated PCA.

In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

def get_rapids_jar():
    """Return the file name of the RAPIDS Accelerator jar, downloading it if missing.

    The jar for the pinned ``SPARK_RAPIDS_VERSION`` is looked up in the current
    working directory. When it is not there, it is fetched from Maven Central
    and saved under that name.

    Returns:
        str: File name of the jar, relative to the current working directory.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
            Failing here avoids handing Spark the path of a jar that was never
            written.
    """
    import os

    SPARK_RAPIDS_VERSION = "24.08.1"
    rapids_jar = f"rapids-4-spark_2.12-{SPARK_RAPIDS_VERSION}.jar"

    if os.path.exists(rapids_jar):
        print("File already exists. Skipping download.")
        return rapids_jar

    # Imported only on the download path so a cached jar needs no extra package.
    import requests

    print("Downloading spark rapids jar")
    url = f"https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/{SPARK_RAPIDS_VERSION}/{rapids_jar}"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download the file. Status code: {response.status_code}"
        )

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated jar that a later run would treat as valid.
    partial_jar = f"{rapids_jar}.part"
    with open(partial_jar, "wb") as f:
        f.write(response.content)
    os.replace(partial_jar, rapids_jar)

    print(f"File '{rapids_jar}' downloaded and saved successfully.")
    return rapids_jar

def initialize_spark(rapids_jar: str):
    '''
    Build the Spark configuration used by this notebook and return a session.

    The master URL is derived from the local hostname, and ``rapids_jar`` is
    registered both as a Spark jar and on the executors' PYTHONPATH. The
    session comes from ``getOrCreate()``; this function does not itself check
    whether a session already exists.

    Args:
        rapids_jar: Path to the RAPIDS Accelerator jar.

    Returns:
        The SparkSession named "spark-rapids-ml-pca".
    '''
    import socket

    # Assuming master is on host and default port.
    master_url = f"spark://{socket.gethostname()}:7077"

    # All keys are distinct, so the order in which they are applied is irrelevant.
    settings = {
        # Jar, plugin and Python daemon wiring.
        "spark.jars": rapids_jar,
        "spark.executorEnv.PYTHONPATH": rapids_jar,
        "spark.plugins": "com.nvidia.spark.SQLPlugin",
        "spark.python.daemon.module": "rapids.daemon",
        # Spark core settings.
        "spark.task.maxFailures": "1",
        "spark.driver.memory": "10g",
        "spark.executor.memory": "8g",
        "spark.rpc.message.maxSize": "1024",
        "spark.python.worker.reuse": "true",
        "spark.locality.wait": "0s",
        # spark.sql.* settings.
        "spark.sql.pyspark.jvmStacktrace.enabled": "true",
        "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.cache.serializer": "com.nvidia.spark.ParquetCachedBatchSerializer",
        "spark.sql.execution.sortBeforeRepartition": "false",
        "spark.sql.adaptive.enabled": "false",
        "spark.sql.files.maxPartitionBytes": "512m",
        "spark.sql.execution.arrow.maxRecordsPerBatch": "20000",
        # spark.rapids.* settings.
        "spark.rapids.ml.uvm.enabled": "true",
        "spark.rapids.memory.gpu.minAllocFraction": "0.0001",
        "spark.rapids.memory.gpu.pooling.enabled": "false",
        "spark.rapids.sql.format.parquet.reader.type": "MULTITHREADED",
        "spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel": "20",
        "spark.rapids.sql.multiThreadedRead.numThreads": "20",
        "spark.rapids.sql.python.gpu.enabled": "true",
        "spark.rapids.memory.pinnedPool.size": "2G",
        "spark.rapids.sql.batchSizeBytes": "512m",
        "spark.rapids.sql.concurrentGpuTasks": "1",
        "spark.rapids.sql.explain": "NONE",
    }

    conf = SparkConf()
    conf.setMaster(master_url)
    for key, value in settings.items():
        conf.set(key, value)

    builder = SparkSession.builder.appName("spark-rapids-ml-pca").config(conf=conf)
    return builder.getOrCreate()

# Reuse a `spark` variable that is already defined in this kernel; otherwise
# create the session here. The jar path comes from the RAPIDS_JAR environment
# variable when it is set, else from get_rapids_jar().
if 'spark' in globals():
    print("Using existing Spark session.")
else:
    print("No active Spark session found, initializing manually.")
    if 'RAPIDS_JAR' in os.environ:
        rapids_jar = os.environ['RAPIDS_JAR']
    else:
        rapids_jar = get_rapids_jar()
    spark = initialize_spark(rapids_jar)

No active Spark session found, initializing manually.
File already exists. Skipping download.


24/10/04 18:04:27 WARN Utils: Your hostname, cb4ae00-lcedt resolves to a loopback address: 127.0.1.1; using 10.110.47.100 instead (on interface eno1)
24/10/04 18:04:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/10/04 18:04:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/04 18:04:27 WARN RapidsPluginUtils: RAPIDS Accelerator 24.08.1 using cudf 24.08.0, private revision 9fac64da220ddd6bf5626bd7bd1dd74c08603eac
24/10/04 18:04:27 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
24/10/04 18:04:31 WARN GpuDeviceManager: RMM pool is disabled since spark.rapids.memory.gpu.pooling.enabled is set to false; however, this configuration is deprecated and the behavior may change in a futur

### Generate synthetic dataset

Here we generate a 100,000 x 2048 random dataset.

In [3]:
# Shape and element type of the synthetic dataset.
rows, dim = 100000, 2048
dtype = 'float32'

# Fixed seed so the generated values are the same on every run.
np.random.seed(42)
data = np.random.rand(rows, dim).astype(dtype)

# One "features" column whose cells are the individual rows of `data`.
pd_data = pd.DataFrame({"features": [row for row in data]})

# Hand the data to Spark and write it out as parquet, replacing any earlier copy.
prepare_df = spark.createDataFrame(pd_data)
prepare_df.write.parquet("PCA_data.parquet", mode="overwrite")

24/10/04 18:04:45 WARN TaskSetManager: Stage 0 contains a task of very large size (160085 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

#### Spark-RAPIDS-ML accepts ArrayType input

Note that the original Spark-ML PCA requires the input column to be of `Vector` type, so the data must first be vectorized:

```python
from pyspark.ml.linalg import Vectors
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data,["features"])
df.show()
```

...whereas the Spark-RAPIDS-ML version does not require extra Vectorization, and can accept an ArrayType column as the input column:

In [4]:
# Load the parquet dataset written above and print its schema.
parquet_reader = spark.read.format("parquet")
data_df = parquet_reader.load("PCA_data.parquet")
data_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)



### Using Spark-RAPIDS-ML PCA (GPU)

Compared to the Spark-ML PCA training API:

```python
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="features")
pca.setOutputCol("pca_features")
```

Spark-RAPIDS-ML provides a drop-in class with the same API, so **no code change** other than the import is needed to enjoy GPU acceleration:

```python
from spark_rapids_ml.feature import PCA
pca = PCA(k=3, inputCol="features")
pca.setOutputCol("pca_features")
```

In [5]:
from spark_rapids_ml.feature import PCA

# Two output components, read from "features" and written to "pca_features".
# setOutputCol returns the estimator, so the calls can be chained.
gpu_pca = PCA(k=2, inputCol="features").setOutputCol("pca_features")
gpu_pca

PCA_570681141389

The PCA estimator object can be persisted and reloaded.

In [6]:
# Save the estimator (replacing any earlier copy), then load it back.
estimator_path = "/tmp/pca_estimator"
estimator_writer = gpu_pca.write().overwrite()
estimator_writer.save(estimator_path)
gpu_pca_loaded = PCA.load(estimator_path)

#### Fit

In [7]:
# Time the GPU fit with a monotonic, high-resolution clock; time.time() can
# jump if the system clock is adjusted while the fit is running.
start_time = time.perf_counter()
gpu_pca_model = gpu_pca_loaded.fit(data_df)
gpu_fit_time = time.perf_counter() - start_time
print(f"GPU PCA fit took: {gpu_fit_time} sec")

24/10/04 18:04:58 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 32 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20
2024-10-04 18:04:58,487 - spark_rapids_ml.feature.PCA - INFO - CUDA managed memory enabled.
2024-10-04 18:04:58,570 - spark_rapids_ml.feature.PCA - INFO - Training spark-rapids-ml with 1 worker(s) ...
INFO: Process 2762394 found CUDA visible device(s): 0
2024-10-04 18:05:01,613 - spark_rapids_ml.feature.PCA - INFO - Loading data into python worker memory
2024-10-04 18:05:02,551 - spark_rapids_ml.feature.PCA - INFO - Initializing cuml context
2024-10-04 18:05:03,795 - spark_rapids_ml.feature.PCA - INFO - Invoking cuml fit
2024-10-04 18:05:05,326 - spark_rapids_ml.feature.PCA - INFO - Cuml fit complete
2024-10-04 18:05:06,858 - spark_rapids_ml.feature.PCA - INFO - Finished training


GPU PCA fit took: 8.90433144569397 sec


#### Transform

In [12]:
# Keep the transformed DataFrame itself: show() returns None, so assigning its
# result would leave `embeddings` unusable.
# NOTE(review): only the rows printed by show() are timed here, not a
# transform of the full dataset.
start_time = time.perf_counter()
embeddings = gpu_pca_model.transform(data_df).select("pca_features")
embeddings.show(truncate=False)
gpu_transform_time = time.perf_counter() - start_time
print(f"GPU PCA transform took: {gpu_transform_time} sec")

+---------------------------+
|pca_features               |
+---------------------------+
|[0.062363233, 0.4037608]   |
|[0.49734917, 0.703541]     |
|[0.0035427138, 0.29358602] |
|[-0.06798951, 0.37400067]  |
|[0.10075127, 0.34651726]   |
|[-0.22320557, 0.6660976]   |
|[0.49608234, 0.6761328]    |
|[0.25515205, 0.20352581]   |
|[-0.5102935, 0.319284]     |
|[-0.5109488, 0.2756377]    |
|[0.411546, -0.17954555]    |
|[0.21616393, -0.46268395]  |
|[-0.0924304, 0.65660465]   |
|[0.12355948, 0.9478601]    |
|[0.49234354, 0.63746333]   |
|[-0.86077166, 0.0037032962]|
|[-0.013956882, 0.663955]   |
|[-0.30510652, 0.02372247]  |
|[-0.05999008, 0.28261736]  |
|[0.36605445, 0.9674797]    |
+---------------------------+
only showing top 20 rows

GPU PCA transform took: 0.43911027908325195 sec


### Using Spark-ML PCA (CPU)

In [13]:
# Import the Spark ML estimator under an alias so it does not rebind the name
# `PCA`, which the GPU cells above use for the spark-rapids-ml class.
# Re-running those cells after this one would otherwise pick up the CPU class.
from pyspark.ml.feature import PCA as CpuPCA

cpu_pca = CpuPCA(k=2, inputCol="features")
cpu_pca.setOutputCol("pca_features")

PCA_58add243f20d

In [14]:
from pyspark.ml.functions import array_to_vector

# Convert the array column to a vector column, keeping the name "features".
features_as_vector = array_to_vector(data_df["features"]).alias("features")
vector_df = data_df.select(features_as_vector)
vector_df.printSchema()

root
 |-- features: vector (nullable = true)



#### Fit

In [15]:
# Time the CPU fit with a monotonic, high-resolution clock; time.time() can
# jump if the system clock is adjusted while the fit is running.
start_time = time.perf_counter()
cpu_pca_model = cpu_pca.fit(vector_df)
pca_fit_time = time.perf_counter() - start_time
print(f"CPU PCA fit took: {pca_fit_time} sec")

24/10/04 17:07:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


CPU PCA fit took: 63.37388610839844 sec


#### Transform

In [16]:
# Keep the transformed DataFrame itself: show() returns None, so assigning its
# result would leave `embeddings` unusable.
# NOTE(review): only the rows printed by show() are timed here, not a
# transform of the full dataset.
start_time = time.perf_counter()
embeddings = cpu_pca_model.transform(vector_df).select("pca_features")
embeddings.show(truncate=False)
pca_transform_time = time.perf_counter() - start_time
print(f"CPU PCA transform took: {pca_transform_time} sec")

+-------------------------------------------+
|pca_features                               |
+-------------------------------------------+
|[0.24926765828229927,0.3425432972889563]   |
|[-0.5175207040808384,0.48893065865444574]  |
|[-0.2505049373829902,0.381272141155778]    |
|[-0.39046980420292005,0.4870705091697811]  |
|[-0.4024088726395023,0.707133448810984]    |
|[-0.3061227832285992,0.5363554872099332]   |
|[-0.6065136982526093,0.5205197626985932]   |
|[-0.21870566838630084,0.6516598402789231]  |
|[0.1910036552854184,0.6336513389989592]    |
|[0.6139537641786907,0.6055187085018856]    |
|[-0.026502904776425647,-0.0366087508156753]|
|[-0.2989311781309336,-0.05136110567458389] |
|[-0.5474468086054212,-0.18779964958125014] |
|[-0.6644746232216499,0.10351178251944647]  |
|[-0.12685301272617464,0.47394431583661295] |
|[-0.4355221246718862,-0.00346289187881239] |
|[0.6222719258951077,0.5488293416698503]    |
|[0.04966907735703511,0.7138677407505005]   |
|[0.6260486995906139,0.35532284504

### Summary

In [27]:
# End-to-end runtime (fit + transform) for each backend, and their ratio.
cpu_total_time = pca_fit_time + pca_transform_time
gpu_total_time = gpu_fit_time + gpu_transform_time
speedup = cpu_total_time / gpu_total_time

print(f"CPU runtime: ({pca_fit_time:.2f}s + {pca_transform_time:.2f}s)")
print(f"GPU runtime: ({gpu_fit_time:.2f}s + {gpu_transform_time:.2f}s)")
print(f"End-to-end speedup: CPU / GPU = {speedup:.2f}x")

CPU runtime: (64.02s + 0.20s)
GPU runtime: (8.76s + 0.42s)
End-to-end speedup: CPU / GPU = 7.00x
