// NOTE(review): GitHub page chrome ("Skip to content", branch switcher, contributor
// banner) was captured by the copy-paste; it is not part of the source file.
package io.insightedge.demo.ctr

import com.gigaspaces.spark.context.GigaSpacesConfig
import com.gigaspaces.spark.implicits._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import{OneHotEncoder, StringIndexer, VectorAssembler}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.functions._
import org.apache.spark.sql.insightedge._
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
/**
  * @author Oleksiy_Dyagilev
  */
object CtrDemo3 {

  /**
    * Entry point: trains a click-through-rate model with a grid-searched
    * logistic-regression pipeline over one-hot-encoded categorical features,
    * then writes per-id click probabilities for the test set to CSV.
    *
    * NOTE(review): the extraction of this file lost several expression lines
    * (dataset loading, assembler input columns, the CSV write chain). They are
    * reconstructed below and marked — confirm against the canonical demo.
    */
  def main(args: Array[String]): Unit = {
    if (args.length < 5) {
      // was "CtrDemo1" — fixed to match this object's name; also exit instead of
      // falling through to a pattern-match failure on the short args array
      System.err.println("Usage: CtrDemo3 <spark master url> <grid locator> <train collection> <test collection> <prediction result path>")
      System.exit(1)
    }
    val Array(master, gridLocator, trainCollection, testCollection, predictResPath) = args

    // Configure InsightEdge settings
    val gsConfig = GigaSpacesConfig("insightedge-space", None, Some(gridLocator))
    val sc = new SparkContext(new SparkConf().setAppName("CtrDemo3").setMaster(master).setGigaSpaceConfig(gsConfig))
    val sqlContext = new SQLContext(sc)

    // load training and test collections from the data grid
    // NOTE(review): reconstructed — confirm the exact InsightEdge DataFrameReader API
    val trainDf =
    // add fictive `click` column to test dataset so that we can union test and train correctly later
    val testDf ="click", lit(-1))

    // use one-hot-encoder to convert categorical features into a vector
    val (encodedTrainDf, encodedTestDf) = encodeLabels(trainDf, testDf)

    // assemble multiple feature vectors into a single one
    // NOTE(review): input columns reconstructed from categoricalColumnsVectors — confirm
    val assembler = new VectorAssembler()

    // Train a model
    val lr = new LogisticRegression()

    val pipeline = new Pipeline().setStages(Array(assembler, lr))

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.01, 0.1 /*, 1.0 */))
      .addGrid(lr.elasticNetParam, Array(0.0 /*, 0.5, 1.0 */))
      .addGrid(lr.fitIntercept, Array(false /*, true */))

    val cv = new CrossValidator()
      .setEvaluator(new BinaryClassificationEvaluator().setLabelCol("click"))

    val cvModel =

    // output train results: one line per parameter combination, then the winner
    println("Grid search results:")
      .foreach { case (params, metric) => println(s"$params -> avg metric $metric") }
    println("Best set of parameters found: " +

    import sqlContext.implicits._

    // predict test dataset: keep the probability of the positive class (vector index 1)
    val predictionDf = cvModel.transform(encodedTestDf).select("id", "probability").map {
      case Row(id: String, probability: Vector) => (id, probability(1))
    }.toDF("id", "click")

    // save prediction to csv (spark-csv data source, Spark 1.x style);
    // SaveMode.Overwrite so reruns don't fail on an existing output path
      .option("header", "true")
      .save(predictResPath)
  }

  // Categorical feature columns to one-hot encode.
  // NOTE(review): the active (uncommented) entries were lost in extraction — this set
  // is a plausible reconstruction of the low-cardinality columns; confirm against the
  // canonical demo. High-cardinality columns stay commented out to keep the feature
  // space manageable.
  val categoricalColumns = Seq(
    // "device_id",
    // "device_ip",
    // "device_model",
    // "C1",
    // "banner_pos",
    // "site_id",
    // "site_domain",
    // "site_category",
    // "app_id",
    // "app_domain",
    // "app_category",
    // "C14",
    "device_type",
    "device_conn_type",
    "C15",
    "C16",
    "C17",
    "C18",
    "C19",
    "C20",
    "C21"
  )

  // Names of the one-hot vector columns fed into the VectorAssembler
  val categoricalColumnsVectors =

  /**
    * Index + one-hot encode a single categorical column on both dataframes.
    * The StringIndexer is fitted on `unionDf` so labels that appear only in the
    * test set still receive an index.
    */
  def encodeLabel(unionDf: DataFrame, df1: DataFrame, df2: DataFrame, col: String): (DataFrame, DataFrame) = {
    println(s"Encoding label $col")

    val indexer = new StringIndexer()

    def transform(df: DataFrame) = {
      val indexed = indexer.transform(df)
      val encoder = new OneHotEncoder()

    (transform(df1), transform(df2))
  }

  /** One-hot encode every categorical column on the train and test dataframes. */
  def encodeLabels(trainDf: DataFrame, testDf: DataFrame): (DataFrame, DataFrame) = {
    // we have to encode categorical features on a union dataset,
    // since there might be labels in a test that don't exist in training
    val unionDf = trainDf.unionAll(testDf)
    categoricalColumns.foldLeft(trainDf -> testDf) { case ((df1, df2), col) => encodeLabel(unionDf, df1, df2, col) }

  /** Name of the one-hot vector column derived from `col`. */
  def vectorCol(col: String) = col + "_vector"

  /** Name of the string-index column derived from `col`. */
  def indexCol(col: String) = col + "_index"