# Spark RAPIDS get_json_object acceleration



<a target="_blank" href="https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/10min_to_cudf_colab.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>



Before getting started - be sure to change your runtime to use a GPU Hardware accelerator! Use the Runtime -> "Change runtime type" menu option to add a GPU.

# Let's get started using the RAPIDS Accelerator for Apache Spark

In [None]:
# Shell out to nvidia-smi to confirm that a GPU is attached to this runtime.
!nvidia-smi

In [None]:
# Print the host's CPU details (one entry per logical core) for reference
# when comparing the CPU and GPU timings below.
!cat /proc/cpuinfo

In [None]:
# Version pins used by the rest of the notebook:
#   spark_version  - the pyspark release installed with %pip below
#   rapids_version - the rapids-4-spark artifact version requested through
#                    spark.jars.packages when the SparkSession is created
spark_version='3.5.0'
rapids_version='24.12.0'

In [None]:
# Install the pinned PySpark release into the kernel's environment
# ({spark_version} is interpolated from the Python variable defined above).
%pip install --quiet \
  pyspark=={spark_version}

In [None]:
from importlib.resources import files
from pyspark.sql import SparkSession
import glob
import os
import re
import time
import statistics

In [None]:
# Determine the Scala version of the installed PySpark distribution from the
# file name of its bundled spark-sql jar (spark-sql_<scala version>-....jar).
# The result is used to pick the matching rapids-4-spark_<scala version> artifact.
pyspark_files = files('pyspark')
spark_sql_jar_candidates = glob.glob(f"{pyspark_files}/*/spark-sql_*jar")
if not spark_sql_jar_candidates:
    # Fail with a readable message instead of an opaque unpacking error.
    raise ValueError(f"No spark-sql jar found under {pyspark_files}")
spark_sql_jar_path = spark_sql_jar_candidates[0]
spark_sql_jar = os.path.basename(spark_sql_jar_path)

# The dot between the two version components is escaped so that it only
# matches a literal '.', not an arbitrary character.
scala_version_match = re.search(r'^spark-sql_(\d+\.\d+)-.*\.jar$', spark_sql_jar)
if scala_version_match is None:
    raise ValueError(f"Cannot parse a Scala version from jar name {spark_sql_jar!r}")
scala_version = scala_version_match.group(1)

In [None]:
# Create (or reuse) the SparkSession with the RAPIDS Accelerator plugin loaded.
# The settings are applied in the order listed here.
spark_settings = {
    'spark.driver.memory': '5g',
    'spark.plugins': 'com.nvidia.spark.SQLPlugin',
    # Maven coordinates of the RAPIDS jar matching the detected Scala version.
    'spark.jars.packages': f"com.nvidia:rapids-4-spark_{scala_version}:{rapids_version}",
}

session_builder = SparkSession.builder.appName('JSON PySpark RAPIDS=ON/OFF')
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()
# Leave the session as the last expression so the notebook renders its summary.
spark

In [None]:
# Directory where the generated JSON test data is written (as Parquet) and
# later read back by the benchmark cells.
location = "./TMP_DATA"
# Number of timed runs per benchmark; the median of these runs is reported.
iters = 3

In [None]:
def mk_json_column(i):
    """Return the SQL concat() arguments that produce one JSON member.

    The fragment expands to three concat() arguments: an opening double
    quote, the key ``CAST(rand(i) * 10000 AS LONG)`` (``i`` is the seed passed
    to ``rand``), and the literal ``":i`` that closes the key and supplies
    ``i`` as the value.
    """
    seed = str(i)
    return f""" '"', CAST(rand({seed}) * 10000 AS LONG), '":{seed}'"""

# Generate 1,000,000 JSON lines with very sparse keys and write them to
# `location` as Parquet (overwriting any previous run). Each line is a JSON
# object with 100 members whose keys come from mk_json_column().
json_members = """, ',' ,""".join(mk_json_column(i) for i in range(100))

# The ", " in front of '}' makes the closing brace its own concat() argument.
# Without it the SQL ends in two adjacent string literals ('":99''}') and only
# works if the parser happens to merge them.
json_expr = """concat('{', """ + json_members + """, '}') as json"""

spark.range(1000000).selectExpr(json_expr).write.mode("overwrite").parquet(location)

In [None]:
# Test pulling out a few keys using the GPU.
# Each iteration re-reads the Parquet data and runs the full aggregation; the
# first iteration prints the result with show(), later ones just collect().
spark.conf.set("spark.rapids.sql.enabled",True)

gpu_query_exprs = [
    "count(get_json_object(json,'$.0')) as zero",
    "count(get_json_object(json,'$.10')) as ten",
    "count(get_json_object(json,'$.100')) as hundred",
    "count(get_json_object(json,'$.1000')) as thousand",
    "count(get_json_object(json,'$.1001')) as thousandAndOne",
    "avg(octet_length(json)) as len",
]

gpu_times = []
for i in range(iters):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    df = spark.read.parquet(location).selectExpr(*gpu_query_exprs)
    if i == 0:
        df.show()
    else:
        df.collect()
    end = time.perf_counter()
    gpu_times.append(end - start)


print(f"Median execution time of {iters} runs for GPU get_json_object: {statistics.median(gpu_times):.3f}")

In [None]:
#  Run the same test using the CPU. Note that this is an exceptional result
#  because Colab provides very little CPU (2 cores) to go with the GPU (T4).
#  On a 16-core AMD CPU that is not overcommitted, with an NVMe drive to load
#  the data and an A6000 GPU, the GPU takes about 0.662 seconds to complete
#  and the CPU takes about 2.986 seconds, or about a 4.5x speedup, compared to
#  this notebook's ~30x speedup.
spark.conf.set("spark.rapids.sql.enabled",False)

cpu_query_exprs = [
    "count(get_json_object(json,'$.0')) as zero",
    "count(get_json_object(json,'$.10')) as ten",
    "count(get_json_object(json,'$.100')) as hundred",
    "count(get_json_object(json,'$.1000')) as thousand",
    "count(get_json_object(json,'$.1001')) as thousandAndOne",
    "avg(octet_length(json)) as len",
]

cpu_times = []
for i in range(iters):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    df = spark.read.parquet(location).selectExpr(*cpu_query_exprs)
    # The first iteration prints the result; later ones only collect it.
    if i == 0:
        df.show()
    else:
        df.collect()
    end = time.perf_counter()
    cpu_times.append(end - start)

print(f"Median execution time of {iters} runs for CPU get_json_object: {statistics.median(cpu_times):.3f}")