## PCBS benchmark
The benchmark on [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/) is to time the
conversion of incoming RDDs to and from a CachedBatch. Specifically to compare the performance of
ParquetCachedBatchSerializer to DefaultCachedBatchSerializer.

In [1]:
import java.lang.reflect.Method

import org.apache.spark.{sql, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.columnar.{CachedBatch, CachedBatchSerializer}
import org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel.MEMORY_ONLY
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.execution.columnar.DefaultCachedBatch
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
import org.apache.spark.sql.rapids.GpuInMemoryTableScanExec
import com.nvidia.spark.GpuCachedBatchSerializer
import com.nvidia.spark.rapids.ParquetCachedBatch
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.columnar.InMemoryRelation
import com.nvidia.spark.ParquetCachedBatchSerializer

In [2]:
val RAPIDS_JAR = "rapids-plugin-jar-file"

def createSparkSession(): SparkSession = {
    val conf = new SparkConf()
    conf.setMaster("your-spark-master-url")
    conf.setAppName("PCBS Benchmark")
    conf.set("spark.jars", RAPIDS_JAR)
    conf.set("spark.driver.memory", "4G")
    conf.set("spark.executor.memory", "4G")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.locality.wait", "0")
    conf.set("spark.dynamicAllocation.enabled", "false")

    //rapids config
    conf.set("spark.rapids.sql.enabled", "true") 
    conf.set("spark.plugins", "com.nvidia.spark.SQLPlugin")
    conf.set("spark.executor.resource.gpu.amount", "1")
    conf.set("spark.task.resource.gpu.amount", "0.25") 
    conf.set("spark.rapids.sql.concurrentGpuTasks", "2")
    conf.set("spark.rapids.memory.pinnedPool.size", "1G")
    
    conf.set("spark.driver.extraJavaOptions","-ea -Duser.timezone=UTC")
    conf.set("spark.executor.extraJavaOptions","-ea -Duser.timezone=UTC -Dai.rapids.cudf.prefer-pinned=true")
    conf.set("spark.sql.cache.serializer","com.nvidia.spark.ParquetCachedBatchSerializer")
    conf.set("spark.driver.extraClassPath", RAPIDS_JAR)
    conf.set("spark.executor.extraClassPath", RAPIDS_JAR)

    SparkSession.builder.config(conf=conf).getOrCreate()

}

RAPIDS_JAR = /home/yuanli/work/pcbs/re-build/spark-rapids/dist/target/rapids-4-spark_2.12-22.06.0-SNAPSHOT-cuda11.jar


createSparkSession: ()org.apache.spark.sql.SparkSession


/home/yuanli/work/pcbs/re-build/spark-rapids/dist/target/rapids-4-spark_2.12-22.06.0-SNAPSHOT-cuda11.jar

In [4]:
class TestCachedBatchSerializer(
    useCompression: Boolean,
    batchSize: Int) extends DefaultCachedBatchSerializer {

  override def convertInternalRowToCachedBatch(input: RDD[InternalRow],
      schema: Seq[Attribute],
      storageLevel: StorageLevel,
      conf: SQLConf): RDD[CachedBatch] = {
    convertForCacheInternal(input, schema, batchSize, useCompression)
  }
}

defined class TestCachedBatchSerializer


In [5]:
case class CloseableColumnBatchIterator(iter: Iterator[ColumnarBatch]) extends
    Iterator[ColumnarBatch] {
  var cb: ColumnarBatch = _

  private def closeCurrentBatch(): Unit = {
    if (cb != null) {
      cb.close()
      cb = null
    }
  }

  TaskContext.get().addTaskCompletionListener[Unit]((_: TaskContext) => {
    closeCurrentBatch()
  })

  override def hasNext: Boolean = iter.hasNext

  override def next(): ColumnarBatch = {
    closeCurrentBatch()
    cb = iter.next()
    cb
  }
}

defined class CloseableColumnBatchIterator


In [6]:
def callPrivate(obj: AnyRef, methodName: String, parameters:AnyRef*) = {
  val parameterTypes = parameters.map(_.getClass())
  val method = obj.getClass.getDeclaredMethod(methodName, parameterTypes:_*)
  method.setAccessible(true)
  method.invoke(obj, parameters:_*)
}

callPrivate: (obj: AnyRef, methodName: String, parameters: AnyRef*)Object


In [7]:
import scala.reflect.runtime.universe._
def getReflectLogicalPlan(df: DataFrame) : LogicalPlan = {
    val mirror = runtimeMirror(scala.reflect.runtime.universe.getClass.getClassLoader)
    val instanceMirror = mirror.reflect(df)
    val field = scala.reflect.runtime.universe.typeOf[DataFrame].decl(TermName("logicalPlan")).asTerm
    val fieldMirror = instanceMirror.reflectField(field)
    fieldMirror.get.asInstanceOf[LogicalPlan]
}

getReflectLogicalPlan: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.catalyst.plans.logical.LogicalPlan


In [8]:
def readWriteCache(
      acc: Boolean,
      spark: SparkSession,
      ser: CachedBatchSerializer,
      func: DataFrame => (Method, Seq[SparkPlan]),
      verifyFunc: CachedBatch => Any,
      query: SparkSession => sql.DataFrame) = {
    
      val df = query(spark).cache()
      val storageLevel = MEMORY_ONLY
      val logicalPlan = getReflectLogicalPlan(df)
      val plan = spark.sessionState.executePlan(logicalPlan).sparkPlan
      val relation = InMemoryRelation(ser, storageLevel, plan, None, logicalPlan)  
      val (doExecuteMethod, inMemoryScans) = func(df)
        
      val start = System.currentTimeMillis()
      val cb = relation.cacheBuilder.cachedColumnBuffers.first()
      val defaWriteTime = System.currentTimeMillis() - start
      verifyFunc(cb)
      df.unpersist(true)
        
      println(s"write cache with ${doExecuteMethod}, cost ${defaWriteTime} milliseconds.")  
    
      relation.cacheBuilder.cachedColumnBuffers
    
      val startR = System.currentTimeMillis()
      val inMemoryScan = inMemoryScans.head
      val rdd = doExecuteMethod.invoke(inMemoryScan).asInstanceOf[RDD[ColumnarBatch]]
      if (ser.isInstanceOf[ParquetCachedBatchSerializer] && acc) {
        rdd.mapPartitions(iter => CloseableColumnBatchIterator(iter)).count()
        rdd.foreach {
          cb => cb.close()
        }
      } else {
        rdd.count()
      }
      
      val defaReadTime = System.currentTimeMillis() - startR
      println(s"read cache with ${doExecuteMethod}, cost ${defaReadTime} milliseconds.")

      df.unpersist()
}

readWriteCache: (acc: Boolean, spark: org.apache.spark.sql.SparkSession, ser: org.apache.spark.sql.columnar.CachedBatchSerializer, func: org.apache.spark.sql.DataFrame => (java.lang.reflect.Method, Seq[org.apache.spark.sql.execution.SparkPlan]), verifyFunc: org.apache.spark.sql.columnar.CachedBatch => Any, query: org.apache.spark.sql.SparkSession => org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]


In [9]:
def runDefaInternal[T](
      query: SparkSession => DataFrame,
      acc: Boolean,
      ser: CachedBatchSerializer) = {
    val spark = createSparkSession()
    if (acc) {
      spark.conf.set("spark.rapids.sql.enabled", "true")
    } else {
      spark.conf.set("spark.rapids.sql.enabled", "false")
    }

    println(s"spark.rapids.sql.enabled is ${acc} .") 
    readWriteCache(acc, spark, ser, { df =>
        val doExecuteMethod =
          classOf[InMemoryTableScanExec].getDeclaredMethod("doExecute")
        doExecuteMethod.setAccessible(true)
        val inMemScans = df.queryExecution.executedPlan.collect {
          case m: InMemoryTableScanExec => m
        }
        (doExecuteMethod, inMemScans)
        }, cb =>
        cb match {
          case _: T =>
          case other => throw new IllegalStateException(s"Unexpected cached batch type: ${other.getClass.getName}")
        }, query )
}

             case _: T =>
                     ^
runDefaInternal: [T](query: org.apache.spark.sql.SparkSession => org.apache.spark.sql.DataFrame, acc: Boolean, ser: org.apache.spark.sql.columnar.CachedBatchSerializer)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]


In [10]:
def runPcbsInternal(query: SparkSession => DataFrame, acc: Boolean) = {
  val spark = createSparkSession()
  if (acc) {
      spark.conf.set("spark.rapids.sql.enabled", "true")
  } else {
      spark.conf.set("spark.rapids.sql.enabled", "false")
  }
  println(s"spark.rapids.sql.enabled is ${acc} .") 

  readWriteCache(acc, spark, new ParquetCachedBatchSerializer(), { df =>
    val doExecuteMethod =
      classOf[GpuInMemoryTableScanExec].getDeclaredMethod("doExecuteColumnar")
    doExecuteMethod.setAccessible(true)
    val inMemScans = df.queryExecution.executedPlan.collect {
      case g: GpuInMemoryTableScanExec => g
      case m: InMemoryTableScanExec => m
    }
    (doExecuteMethod, inMemScans)
  }, cb =>
   cb match {
      case _: ParquetCachedBatch =>
      case other => throw new IllegalStateException(s"Unexpected cached batch type: ${other.getClass.getName}")
   }, query)
}

runPcbsInternal: (query: org.apache.spark.sql.SparkSession => org.apache.spark.sql.DataFrame, acc: Boolean)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]


In [11]:
def runPcbs(query: SparkSession => DataFrame): Unit = {
    println("----------write and read cache with Rapids PBCS----------")
    runPcbsInternal(query, true)
    println("----------write and read cache with Spark Default Cache----------")
    val pcbs = new ParquetCachedBatchSerializer()
    runDefaInternal[ParquetCachedBatch](query, false, pcbs)
}

runPcbs: (query: org.apache.spark.sql.SparkSession => org.apache.spark.sql.DataFrame)Unit


In [12]:
runPcbs(spark => spark.read.parquet("your-parquet-file"))

----------write and read cache with Rapids PBCS----------
spark.rapids.sql.enabled is true .
write cache with public org.apache.spark.rdd.RDD org.apache.spark.sql.rapids.GpuInMemoryTableScanExec.doExecuteColumnar(), cost 671 milliseconds.
read cache with public org.apache.spark.rdd.RDD org.apache.spark.sql.rapids.GpuInMemoryTableScanExec.doExecuteColumnar(), cost 269 milliseconds.
----------write and read cache with Spark Default Cache----------
spark.rapids.sql.enabled is false .
write cache with public org.apache.spark.rdd.RDD org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecute(), cost 1487 milliseconds.
read cache with public org.apache.spark.rdd.RDD org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecute(), cost 990 milliseconds.
