From 5636f7bb69ca774250926b7e997544f98019e022 Mon Sep 17 00:00:00 2001 From: jtengyp Date: Tue, 24 Oct 2017 12:36:59 +0800 Subject: [PATCH] add optimizer and iteration parameters for LDA --- bin/functions/hibench_prop_env_mapping.py | 2 ++ bin/workloads/ml/lda/spark/run.sh | 2 +- conf/workloads/ml/lda.conf | 3 +++ .../scala/com/intel/sparkbench/ml/LDAExample.scala | 10 +++++++--- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py index a9a53f7d9..8dd2b535a 100644 --- a/bin/functions/hibench_prop_env_mapping.py +++ b/bin/functions/hibench_prop_env_mapping.py @@ -132,6 +132,8 @@ NUM_TOPICS_LDA="hibench.lda.num_of_topics", DOC_LEN_MIN_LDA="hibench.lda.doc_len_min", DOC_LEN_MAX_LDA="hibench.lda.doc_len_max", + NUM_ITERATIONS_LDA="hibench.lda.num_iterations", + OPTIMIZER_LDA="hibench.lda.optimizer", MAXRESULTSIZE_LDA="hibench.lda.maxresultsize", # For Pagerank PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs", diff --git a/bin/workloads/ml/lda/spark/run.sh b/bin/workloads/ml/lda/spark/run.sh index 48d7bab5d..2dce15812 100755 --- a/bin/workloads/ml/lda/spark/run.sh +++ b/bin/workloads/ml/lda/spark/run.sh @@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true SIZE=`dir_size $INPUT_HDFS` START_TIME=`timestamp` -run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA +run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA END_TIME=`timestamp` gen_report ${START_TIME} ${END_TIME} ${SIZE} diff --git a/conf/workloads/ml/lda.conf b/conf/workloads/ml/lda.conf index b8bc6ff2b..f8ca82dca 100644 --- a/conf/workloads/ml/lda.conf +++ b/conf/workloads/ml/lda.conf @@ -44,5 +44,8 @@ hibench.lda.doc_len_max ${hibench.lda.${hibench.scale. 
hibench.lda.maxresultsize ${hibench.lda.${hibench.scale.profile}.maxresultsize} hibench.lda.partitions ${hibench.default.map.parallelism} +hibench.lda.optimizer online +hibench.lda.num_iterations 10 + hibench.workload.input ${hibench.hdfs.data.dir}/LDA/Input hibench.workload.output ${hibench.hdfs.data.dir}/LDA/Output diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala index d568bc92a..c9a55f0b1 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala @@ -29,13 +29,17 @@ object LDAExample { var inputPath = "" var outputPath = "" var numTopics: Int = 10 + var maxIterations: Int = 10 + var optimizer = "online" var maxResultSize = "1g" - if (args.length == 4) { + if (args.length == 6) { inputPath = args(0) outputPath = args(1) numTopics = args(2).toInt - maxResultSize = args(3) + maxIterations = args(3).toInt + optimizer = args(4) + maxResultSize = args(5) } else { System.err.println( s"Usage: $LDAExample " @@ -51,7 +55,7 @@ object LDAExample { val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath) // Cluster the documents into numTopics topics using LDA - val ldaModel = new LDA().setK(numTopics).setOptimizer("online").run(corpus) + val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus) // Save and load model. ldaModel.save(sc, outputPath)