# Install packages

In [None]:
# Major.minor release of sparkMeasure; the install cell appends ".0" to pin the PyPI package.
sparkmeasure_version = '0.27'

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# IPython expands {sparkmeasure_version} before pip runs, so sparkmeasure is
# pinned to "<sparkmeasure_version>.0"; the other packages are left unpinned.
%pip install --quiet \
  tpcds_pyspark \
  pandas \
  sparkmeasure=={sparkmeasure_version}.0 \
  matplotlib

# Import modules

In [None]:
from importlib.resources import files
from pyspark.sql import SparkSession
from tpcds_pyspark import TPCDS
import glob
import os
import pandas as pd
import re
import time

# Init a SparkSession with the RAPIDS Accelerator for Apache Spark

## Detect Scala Version used in PySpark package

In [None]:
# Derive the Scala binary version (e.g. "2.12") from the file name of the
# spark-sql jar found in the Spark installation's jar directory.
spark_sql_jar_paths = glob.glob("/usr/lib/spark/jars/spark-sql_*jar")
if not spark_sql_jar_paths:
    # Fail with an actionable message instead of an opaque unpacking error.
    raise FileNotFoundError(
        "No spark-sql_*jar found under /usr/lib/spark/jars/; "
        "cannot detect the Scala version"
    )
spark_sql_jar_path = spark_sql_jar_paths[0]
spark_sql_jar = os.path.basename(spark_sql_jar_path)

# The dot between the version digits is escaped so it only matches a literal
# "." rather than any character.
scala_version_match = re.search(r'^spark-sql_(\d+\.\d+)-.*\.jar$', spark_sql_jar)
if scala_version_match is None:
    raise ValueError(f"Unexpected spark-sql jar name: {spark_sql_jar!r}")
scala_version = scala_version_match.group(1)
scala_version

## Create Spark Session

In [None]:
# Build the session (or reuse an already running one) with the RAPIDS SQL
# plugin switched on. The surrounding parentheses already allow the call chain
# to span lines, so no backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("NDS Example")
    .config("spark.rapids.sql.enabled", "true")
    .getOrCreate()
)
# Bare last expression: display the session summary as the cell output.
spark

# Verify SQL Acceleration on GPU can be enabled by checking the query plan

In [None]:
# Make sure the RAPIDS SQL plugin is enabled for this session before
# inspecting a plan.
spark.conf.set('spark.rapids.sql.enabled', True)

# Tiny aggregation used purely as a smoke test.
sum_df = (
    spark
    .range(1000)
    .selectExpr('SUM(*)')
)

# Run the query first, then print its plan so it can be checked for
# GPU acceleration (see the section heading).
sum_df.collect()
sum_df.explain()

# TPC-DS App

In [None]:
# Subset of TPC-DS queries to benchmark. Available query names are listed at
# https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark/tpcds_pyspark/Queries
# Use `queries = None` instead to run all of them (takes much longer).
queries = [
    'q14a',
    'q14b',
    'q23a',
    'q23b',
    # Optional extras, disabled by default:
    # 'q24a',
    # 'q24b',
    # 'q88',
]

# Wall-clock start of the demo; read again later to report the total duration.
demo_start = time.time()

# NOTE(review): data_path looks like a placeholder — point it at the bucket
# that actually holds the TPC-DS data before running.
tpcds = TPCDS(
    data_path='gs://GCS_PATH_TO_TPCDS_DATA/',
    num_runs=1,
    queries_repeat_times=1,
    queries=queries,
)

## Register TPC-DS tables before running queries

In [None]:
# Register the TPC-DS tables (per the section heading) so the queries can reference them.
tpcds.map_tables()

## Measure Apache Spark GPU

In [None]:
tpcds.spark.conf.set('spark.rapids.sql.enabled', True)
%time tpcds.run_TPCDS()
gpu_grouped_results = tpcds.grouped_results_pdf.copy()
gpu_grouped_results

## Measure Apache Spark CPU

In [None]:
tpcds.spark.conf.set('spark.rapids.sql.enabled', False)
%time tpcds.run_TPCDS()
cpu_grouped_results = tpcds.grouped_results_pdf.copy()
cpu_grouped_results

## Show Speedup Factors achieved by GPU

In [None]:
# Join the CPU and GPU results per query, derive the speedup factor
# (CPU elapsed time divided by GPU elapsed time), and list the queries that
# are slowest on CPU first.
res = (
    cpu_grouped_results
    .merge(gpu_grouped_results, on='query', how='inner', suffixes=['_cpu', '_gpu'])
    .assign(speedup=lambda merged: merged['elapsedTime_cpu'] / merged['elapsedTime_gpu'])
    .sort_values(by='elapsedTime_cpu', ascending=False)
)
res

In [None]:
# Total wall-clock seconds elapsed since demo_start was recorded in the TPC-DS
# setup cell, covering both the GPU and the CPU run.
demo_dur = time.time() - demo_start
# Format the value directly: the `=` debug specifier would print
# "demo_dur=<unrounded float>" in the middle of the sentence.
print(f"CPU and GPU run took: {demo_dur:.1f} seconds")

In [None]:
# Side-by-side bars per query: CPU elapsed time in blue, GPU elapsed time in green.
res.plot.bar(
    x='query',
    y=['elapsedTime_cpu', 'elapsedTime_gpu'],
    color=['blue', '#76B900'],
    title='TPC-DS query elapsedTime on CPU vs GPU (lower is better)',
)

In [None]:
# One bar per query showing the speedup factor (CPU elapsed time / GPU elapsed time).
res.plot.bar(
    x='query',
    y='speedup',
    color='#76B900',
    title='Speedup factors of TPC-DS queries on GPU',
)