# Spark RAPIDS Parquet acceleration



<a target="_blank" href="https://colab.research.google.com/github/NVIDIA/spark-rapids-examples/blob/main/examples/SQL%2BDF-Examples/demo/Spark_parquet_microkernels.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>



Before getting started, be sure to change your runtime to use a GPU hardware accelerator: use the Runtime -> "Change runtime type" menu option to add a GPU.

# Let's get started using the RAPIDS Accelerator for Apache Spark

In [None]:
# Run nvidia-smi to confirm that a GPU is attached to this runtime.
!nvidia-smi

In [None]:
# Print the CPU details of the machine hosting this runtime.
!cat /proc/cpuinfo

In [None]:
# Pinned versions: the PySpark release that gets pip-installed, and the
# RAPIDS Accelerator release requested through spark.jars.packages.
spark_version = "3.5.0"
rapids_version = "24.12.0"

In [None]:
# Install the PySpark release pinned in spark_version.
%pip install --quiet \
  pyspark=={spark_version}

In [None]:
from importlib.resources import files
from pyspark.sql import SparkSession
import glob
import os
import re
import time
import statistics

In [None]:
# Work out which Scala version the installed PySpark was built against by
# inspecting the file name of its bundled spark-sql jar
# (spark-sql_<major.minor>-....jar).
pyspark_files = files('pyspark')
spark_sql_jars = glob.glob(f"{pyspark_files}/*/spark-sql_*jar")
if not spark_sql_jars:
    # Fail with a clear message instead of an opaque unpacking error.
    raise FileNotFoundError(f"No spark-sql jar found under {pyspark_files}")
spark_sql_jar_path = spark_sql_jars[0]
spark_sql_jar = os.path.basename(spark_sql_jar_path)

# The dot between the two number groups is escaped so that it only matches a
# literal '.', not an arbitrary character.
scala_version_match = re.search(r'^spark-sql_(\d+\.\d+)-.*\.jar$', spark_sql_jar)
if scala_version_match is None:
    raise ValueError(f"Cannot parse the Scala version from jar name: {spark_sql_jar}")
scala_version = scala_version_match.group(1)

In [None]:
# Start (or reuse) a local SparkSession with the RAPIDS Accelerator SQL plugin.
# The plugin artifact is requested through spark.jars.packages, using the
# scala_version and rapids_version values set in earlier cells.
session_conf = {
    'spark.driver.memory': '5g',
    'spark.plugins': 'com.nvidia.spark.SQLPlugin',
    'spark.jars.packages': f"com.nvidia:rapids-4-spark_{scala_version}:{rapids_version}",
}

session_builder = SparkSession.builder.appName('Parquet Spark GPU Acceleration').master('local[*]')
for conf_key, conf_value in session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()
# Last expression of the cell, so the notebook displays the session summary.
spark

In [None]:
# Benchmark settings: the number of timed runs per device, and the directory
# the generated Parquet dataset is written to and read back from.
iters = 5
location = "./TMP_DATA"

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from pyspark.sql import functions as F
import random
import string

# Column layout of the generated dataset: four non-nullable columns.
# NOTE(review): this schema object is not referenced by the visible cells that
# build and write the DataFrame - confirm whether it is meant to be applied.
schema = StructType([
    StructField(column_name, column_type(), False)
    for column_name, column_type in (
        ("id", IntegerType),
        ("name", StringType),
        ("age", IntegerType),
        ("salary", IntegerType),
    )
])

# Helper that builds a random string of ASCII letters
def random_string(length=10):
    """Return `length` characters drawn at random from string.ascii_letters.

    Characters are sampled with replacement via random.choices, so the result
    may contain repeats. `length` defaults to 10.
    """
    letters = random.choices(string.ascii_letters, k=length)
    return ''.join(letters)

# Generate a DataFrame with 20M rows and write it out as the Parquet dataset
# that the benchmark cells read back.
#
# The name UDF returns a different value on every call, so it is marked
# non-deterministic. Spark assumes UDFs are deterministic by default and may
# eliminate or duplicate invocations during optimization.
random_name = F.udf(random_string, StringType()).asNondeterministic()

df = (
    spark.range(0, 20_000_000).toDF("id")
    .withColumn("name", random_name())
    .withColumn("age", (F.rand() * 50 + 20).cast(IntegerType()))
    .withColumn("salary", (F.rand() * 100000 + 30000).cast(IntegerType()))
)

# "overwrite" replaces any dataset left at `location` by a previous run.
df.write.mode("overwrite").parquet(location)

In [None]:
# Run the Parquet scan test on the GPU
spark.conf.set("spark.rapids.sql.enabled", True)
gpu_times = []
for i in range(iters):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    df = spark.read.parquet(location).selectExpr(
        "count(name) as rows",
        "avg(salary) as average_salary",
        "median(salary) as median_salary",
        "sum(salary) as total_salary",
        "avg(age) as average_age",
        "median(age) as median_age",
    )
    # The first run prints the aggregated row; later runs only collect it.
    if i == 0:
        df.show()
    else:
        df.collect()
    end = time.perf_counter()
    gpu_times.append(end - start)

# Report the median of the per-run timings.
gpu_median = statistics.median(gpu_times)

print(f"Median execution time of {iters} runs for GPU Parquet scan: {gpu_median:.3f}")

In [None]:
# Run the Parquet scan test on the CPU
spark.conf.set("spark.rapids.sql.enabled", False)
cpu_times = []
for i in range(iters):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    df = spark.read.parquet(location).selectExpr(
        "count(name) as rows",
        "avg(salary) as average_salary",
        "median(salary) as median_salary",
        "sum(salary) as total_salary",
        "avg(age) as average_age",
        "median(age) as median_age",
    )
    # The first run prints the aggregated row; later runs only collect it.
    if i == 0:
        df.show()
    else:
        df.collect()
    end = time.perf_counter()
    cpu_times.append(end - start)

# Report the median of the per-run timings.
cpu_median = statistics.median(cpu_times)
print(f"Median execution time of {iters} runs for CPU Parquet scan: {cpu_median:.3f}")

In [None]:
# Speedup is the CPU median time divided by the GPU median time, so a value
# above 1 means the GPU runs were faster.
# GPU speedup should be in the range of 5-10x
speedup = cpu_median / gpu_median
print("GPU speedup: {:.2f}x".format(speedup))