# Welcome to the Profiling Tool for the RAPIDS Accelerator for Apache Spark
To run the tool, enter a log path that points to the DBFS location of your Spark GPU event logs, and an output path where the profiling results should be written. Then select "Run all" to execute the notebook. After the notebook completes, the output tables appear below. More options for running the profiling tool are described here: https://nvidia.github.io/spark-rapids/docs/spark-profiling-tool.html#profiling-tool-options.

## GPU Job Tuning Recommendations
This section lists general suggestions for tuning your applications to run optimally on GPUs.

## Per-Job Profile
The profiler output includes information about the application, data sources, executors, SQL stages, Spark properties, and key application metrics at the job and stage levels.

In [0]:
import json
import requests
import base64
import shlex
import subprocess
import pandas as pd

# Maven Central location of the RAPIDS profiling tool jar and the local path
# it is downloaded to.
TOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/23.04.0/rapids-4-spark-tools_2.12-23.04.0.jar'
TOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'

# Profiling tool output directory.
OUTPUT_DIR = '/tmp'

# Download the tool jar. Fail loudly on an HTTP error so that an error page is
# never written to disk in place of the jar, and bound the wait so a stalled
# connection cannot hang the notebook indefinitely.
response = requests.get(TOOL_JAR_URL, timeout=60)
response.raise_for_status()

# The context manager guarantees the jar is flushed and closed before the
# profiling tool is launched against it.
with open(TOOL_JAR_LOCAL_PATH, "wb") as jar_file:
    jar_file.write(response.content)

In [0]:
# Notebook input: location of the Spark event logs to profile.
dbutils.widgets.text("log_path", "")
eventlog_string = dbutils.widgets.get("log_path")

# Notebook input: directory the profiling tool writes its results to.
# An empty value would make the tool's "-o" option swallow the event log path
# as its argument and make the result readers look under "/", so fall back to
# the default OUTPUT_DIR when the widget is left blank.
dbutils.widgets.text("output_path", "")
outputpath_string = dbutils.widgets.get("output_path").strip() or OUTPUT_DIR

In [0]:
# YAML description of the worker nodes (system, GPU and Spark software
# properties). It is written to disk below so it can be handed to the
# profiling tool through its --worker-info option.
# NOTE(review): the values look like a sample cluster; adjust to match yours.
worker_info = """
  system:
    numCores: 32
    memory: 212992MiB
    numWorkers: 5
  gpu:
    memory: 15109MiB
    count: 4
    name: T4
  softwareProperties:
    spark.driver.maxResultSize: 7680m
    spark.driver.memory: 15360m
    spark.executor.cores: '8'
    spark.executor.instances: '2'
    spark.executor.memory: 47222m
    spark.executorEnv.OPENBLAS_NUM_THREADS: '1'
    spark.scheduler.mode: FAIR
    spark.sql.cbo.enabled: 'true'
    spark.ui.port: '0'
    spark.yarn.am.memory: 640m
"""

# Local file the YAML text above is saved to.
worker_info_path = "/tmp/worker_info.yaml"

# Persist the worker description, replacing any previous copy.
with open(worker_info_path, mode='w') as yaml_file:
    yaml_file.write(worker_info)

In [0]:
# Run the profiling tool's ProfileMain class in a JVM with a 10 GB max heap.
# The classpath holds the downloaded tool jar plus the jars under /databricks/jars.
# Options: --csv, --worker-info <worker YAML file>, --auto-tuner, and
# -o <output path>; the event log path is the trailing positional argument.
# stdout and stderr are both redirected to /tmp/prof_debug.log, so check that
# file if the expected output does not appear.
!java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain --csv --worker-info $worker_info_path --auto-tuner -o $outputpath_string $eventlog_string &> /tmp/prof_debug.log

In [0]:
import os

# Directory under the output path that holds one sub-directory per profiled
# application.
profile_root = outputpath_string + "/rapids_4_spark_profile/"

# Collect the (appId, appName) columns from every application's
# application_information.csv.
app_frames = []
for entry in os.scandir(profile_root):
    info_csv = os.path.join(entry.path, "application_information.csv")
    # Skip stray files and directories without the CSV instead of crashing
    # the whole cell on the first unexpected entry.
    if not os.path.isfile(info_csv):
        continue
    app_frames.append(pd.read_csv(info_csv)[['appId', 'appName']])

# DataFrame.append was removed in pandas 2.0; concatenate once at the end,
# which also avoids re-copying the accumulated frame on every iteration.
if app_frames:
    app_df = pd.concat(app_frames, ignore_index=True)
else:
    app_df = pd.DataFrame(columns=['appId', 'appName'])

## GPU Job Tuning Recommendations

In [0]:
# Header line in profile.log that precedes the recommended configuration.
RECOMMENDATIONS_MARKER = "### D. Recommended Configuration ###"


def _read_recommendations(profile_log_path):
    """Return the text that follows the recommendations header in a profile log.

    Args:
        profile_log_path: Path of an application's ``profile.log`` file.

    Returns:
        Every line after the first line containing the header, joined into
        one string; an empty string when the header is not present.

    Raises:
        OSError: If the log file cannot be opened.
    """
    # The context manager closes the log even if reading fails part-way.
    with open(profile_log_path) as log_file:
        for line in log_file:
            if RECOMMENDATIONS_MARKER in line:
                # The file object is its own iterator, so joining it here
                # consumes exactly the lines after the header.
                return "".join(log_file)
    return ""


app_list = app_df["appId"].tolist()

# Build all rows first and construct the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
recommendation_rows = [
    {
        'app': app,
        'recommendations': _read_recommendations(
            outputpath_string + "/rapids_4_spark_profile/" + app + "/profile.log"
        ),
    }
    for app in app_list
]
app_recommendations = pd.DataFrame(recommendation_rows, columns=['app', 'recommendations'])

display(app_recommendations)

app,recommendations
app-20220210005817-0212,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=1197m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210004538-0189,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=4096m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set. - Average JVM GC time is very high. Other Garbage Collectors can be used for better performance.
app-20220210000414-0117,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=2353m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210005713-0210,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=4096m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210000744-0123,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=4096m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210002521-0154,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=4096m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210004801-0193,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=3158m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210002620-0156,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set. - Average JVM GC time is very high. Other Garbage Collectors can be used for better performance.
app-20220210001501-0135,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=1365m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
app-20220210001417-0134,Spark Properties: --conf spark.executor.cores=8 --conf spark.executor.instances=20 --conf spark.executor.memoryOverhead=5734m --conf spark.rapids.memory.pinnedPool.size=4096m --conf spark.rapids.sql.concurrentGpuTasks=2 --conf spark.sql.files.maxPartitionBytes=1365m --conf spark.sql.shuffle.partitions=200 --conf spark.task.resource.gpu.amount=0.125 Comments: - 'spark.executor.memoryOverhead' was not set. - 'spark.executor.memoryOverhead' must be set if using 'spark.rapids.memory.pinnedPool.size - 'spark.sql.shuffle.partitions' was not set.
