In [0]:
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType}
import scala.util.Properties

In [1]:
// Input and output locations on Google Cloud Storage.
// The evaluation and transform inputs point at the same "test" directory.
val trainPath: String = "gs://dataproc-nv-demo/criteo/train/"
val evalPath: String = "gs://dataproc-nv-demo/criteo/test/"
val transPath: String = "gs://dataproc-nv-demo/criteo/test/"
val modelPath: String = "gs://dataproc-nv-demo/criteo/model/criteo"

// HDFS alternative: swap these in for the four definitions above.
// val trainPath = "hdfs:///criteo/train/"
// val evalPath  = "hdfs:///criteo/test/"
// val transPath = "hdfs:///criteo/test/"
// val modelPath = "hdfs:///criteo/model/criteo"

In [2]:
// All Spark settings for this notebook, applied in the order listed.
val conf = new SparkConf().setAll(Seq(
  // Executor shape: 20 executors; executor cores and per-task cpus are both 7.
  "spark.executor.instances" -> "20",
  "spark.executor.cores" -> "7",
  "spark.task.cpus" -> "7",
  // Memory sizing.
  "spark.executor.memory" -> "24g",
  "spark.rapids.memory.pinnedPool.size" -> "2G",
  "spark.executor.memoryOverhead" -> "16G",
  "spark.executor.extraJavaOptions" -> "-Dai.rapids.cudf.prefer-pinned=true",
  // Scheduling and file splitting.
  "spark.locality.wait" -> "0s",
  "spark.sql.files.maxPartitionBytes" -> "512m",
  // GPU resources: the executor amount and the per-task amount are both 1.
  "spark.executor.resource.gpu.amount" -> "1",
  "spark.task.resource.gpu.amount" -> "1",
  // SQL plugin and its spark.rapids.* settings.
  "spark.plugins" -> "com.nvidia.spark.SQLPlugin",
  "spark.rapids.sql.hasNans" -> "false",
  "spark.rapids.sql.batchSizeBytes" -> "512M",
  "spark.rapids.sql.reader.batchSizeBytes" -> "768M",
  "spark.rapids.sql.variableFloatAgg.enabled" -> "true",
  "spark.rapids.memory.gpu.pooling.enabled" -> "false"
))
// conf.set("spark.rapids.memory.gpu.allocFraction", "0.1")

// Create (or reuse) the session with Hive support and the settings above.
val spark = SparkSession.builder
  .appName("criteo-gpu")
  .enableHiveSupport()
  .config(conf)
  .getOrCreate

// Shared reader used for every dataset load.
// NOTE(review): "header" is normally a CSV option; presumably it has no effect on Parquet reads — confirm.
val reader = spark.read.option("header", true)

In [3]:
// Load the three Parquet inputs through the shared reader, in train / eval / transform order.
val Seq(trainSet, evalSet, transSet) =
  Seq(trainPath, evalPath, transPath).map(path => reader.parquet(path))

In [4]:
/**
 * Builds the feature column names `_c1` up to `_c(length - 1)`.
 *
 * @param length exclusive upper bound of the column index; index 0 is skipped
 * @return the names in ascending index order, or an empty list when `length <= 1`
 */
def getFeatureNames(length: Int): List[String] =
  List.range(1, length).map(index => s"_c$index")

// Column `_c0` holds the label; `_c1` through `_c39` are used as features.
val labelColName = "_c0"
val featureNames = getFeatureNames(40)

In [5]:
// Booster and training parameters. Values keep their original types (String, Double, Int),
// so the map is typed as Map[String, Any].
val commParamMap: Map[String, Any] = Map(
  "eval_metric" -> "logloss",
  "eta" -> 0.1,
  "gamma" -> 0.1,
  "missing" -> 0.0,
  "max_depth" -> 10,
  "max_leaves" -> 256,
  "objective" -> "binary:logistic",
  "grow_policy" -> "depthwise",
  "min_child_weight" -> 30,
  "lambda" -> 1,
  "scale_pos_weight" -> 2,
  "subsample" -> 1,
  "num_round" -> 100
)

// Final parameter set: the common parameters plus the tree method, worker count and thread count.
val xgbParamFinal: Map[String, Any] = commParamMap ++ Seq(
  "tree_method" -> "gpu_hist",
  "num_workers" -> 20,
  "nthread" -> 7
)

In [6]:
// Build the classifier from the final parameter map, then set the label column and the
// list of feature columns.
// NOTE(review): the setFeaturesCols line was originally tagged "=== diff ==="; presumably it
// marks the line that differs from another variant of this notebook — confirm.
val xgbClassifier = new XGBoostClassifier(xgbParamFinal)
  .setLabelCol(labelColName)
  .setFeaturesCols(featureNames)

// Register the evaluation dataset under the name "eval".
xgbClassifier.setEvalSets(Map("eval" -> evalSet))


/**
 * Evaluates `block` once, prints the wall-clock time it took, and returns both.
 *
 * @param phase label printed inside the brackets of the timing line
 * @param block by-name expression to time
 * @tparam R result type of `block`
 * @return the value produced by `block` paired with the elapsed time in seconds
 */
def benchmark[R](phase: String)(block: => R): (R, Float) = {
  val startMillis = System.currentTimeMillis
  val outcome = block // the by-name argument is evaluated here, exactly once
  val elapsedSeconds = (System.currentTimeMillis - startMillis).toFloat / 1000
  println(s"Elapsed time [$phase]: ${elapsedSeconds}s")
  (outcome, elapsedSeconds)
}

 // Start training
println("\n------ Training ------")
// Fit the classifier on the training set inside the timer; only the fitted model is kept.
val xgbClassificationModel = benchmark("train")(xgbClassifier.fit(trainSet))._1


In [7]:
println("\n------ Transforming ------")
// Score the transform set and mark the result for caching so the preview and the
// evaluation below can reuse it.
// NOTE(review): no action runs inside the timed block, so the reported time presumably
// excludes the actual prediction work — confirm.
val (results, _) = benchmark("transform") {
  xgbClassificationModel.transform(transSet).cache()
}
// Preview the label next to the model's output columns.
z.show(results.select(labelColName, "rawPrediction", "probability", "prediction").limit(10))

println("\n------Accuracy of Evaluation------")
// MulticlassClassificationEvaluator uses the "f1" metric unless told otherwise, so the
// metric is set explicitly to match the "Accuracy" label printed above.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol(labelColName)
  .setMetricName("accuracy")
val accuracy = evaluator.evaluate(results)
println(accuracy)

In [8]:
// Save the trained model, overwriting anything already stored at modelPath.
xgbClassificationModel.write.overwrite.save(modelPath)

// Load the saved model back and time a transform with the reloaded copy.
val modelFromDisk = XGBoostClassificationModel.load(modelPath)

val results2 = benchmark("transform2")(modelFromDisk.transform(transSet))._1
z.show(results2.limit(2))