Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bin/functions/hibench_prop_env_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@
NUM_TOPICS_LDA="hibench.lda.num_of_topics",
DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
NUM_ITERATIONS_LDA="hibench.lda.num_iterations",
OPTIMIZER_LDA="hibench.lda.optimizer",
MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
# For Pagerank
PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
Expand Down
2 changes: 1 addition & 1 deletion bin/workloads/ml/lda/spark/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true

SIZE=`dir_size $INPUT_HDFS`
START_TIME=`timestamp`
run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA
END_TIME=`timestamp`

gen_report ${START_TIME} ${END_TIME} ${SIZE}
Expand Down
3 changes: 3 additions & 0 deletions conf/workloads/ml/lda.conf
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,8 @@ hibench.lda.doc_len_max ${hibench.lda.${hibench.scale.profile}.doc_len_max}
hibench.lda.maxresultsize ${hibench.lda.${hibench.scale.profile}.maxresultsize}
hibench.lda.partitions ${hibench.default.map.parallelism}

hibench.lda.optimizer online
hibench.lda.num_iterations 10

hibench.workload.input ${hibench.hdfs.data.dir}/LDA/Input
hibench.workload.output ${hibench.hdfs.data.dir}/LDA/Output
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,17 @@ object LDAExample {
var inputPath = ""
var outputPath = ""
var numTopics: Int = 10
var maxIterations: Int = 10
var optimizer = "online"
var maxResultSize = "1g"

if (args.length == 4) {
if (args.length == 6) {
inputPath = args(0)
outputPath = args(1)
numTopics = args(2).toInt
maxResultSize = args(3)
maxIterations = args(3).toInt
optimizer = args(4)
maxResultSize = args(5)
} else {
System.err.println(
s"Usage: $LDAExample <INPUT_PATH> <OUTPUT_PATH> <NUM_TOPICS> <MAX_RESULT_SIZE>"
Expand All @@ -51,7 +55,7 @@ object LDAExample {
val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath)

// Cluster the documents into numTopics topics using LDA
val ldaModel = new LDA().setK(numTopics).setOptimizer("online").run(corpus)
val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus)

// Save and load model.
ldaModel.save(sc, outputPath)
Expand Down