Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions autogen/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven module for the HiBench data-generation tools ("autogen").
  Builds a jar-with-dependencies so the tools can be run standalone
  (referenced from conf/hibench.conf as hibench.hibench.datatool.dir).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <parent>
    <groupId>com.intel.hibench</groupId>
    <artifactId>hibench</artifactId>
    <version>6.0-SNAPSHOT</version>
  </parent>

  <!-- groupId matches the parent; stated explicitly for clarity. -->
  <groupId>com.intel.hibench</groupId>
  <artifactId>autogen</artifactId>
  <packaging>jar</packaging>
  <name>HiBench data generation tools</name>

  <dependencies>
    <!-- Mahout is needed by the kmeans/bayes dataset generators.
         Versions are managed via properties in the parent POM. -->
    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-core</artifactId>
      <version>${mahout.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-math</artifactId>
      <version>${mahout.version}</version>
    </dependency>
    <!-- Random-number utilities used by the data generators. -->
    <dependency>
      <groupId>org.uncommons.maths</groupId>
      <artifactId>uncommons-maths</artifactId>
      <version>${uncommons-maths.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Produce autogen-*-jar-with-dependencies.jar during `package`,
           so the data tools carry their full runtime classpath. -->
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>${maven-assembly-plugin.version}</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
2 changes: 1 addition & 1 deletion bin/functions/hibench_prop_env_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
HIBENCH_HOME="hibench.home",
HIBENCH_CONF="hibench.configure.dir",

DEPENDENCY_DIR="hibench.dependency.dir",
#DEPENDENCY_DIR="hibench.dependency.dir",
REPORT_COLUMN_FORMATS="hibench.report.formats",
SPARKBENCH_JAR="hibench.sparkbench.jar",
NUM_MAPS="hibench.default.map.parallelism",
Expand Down
4 changes: 2 additions & 2 deletions bin/functions/workload-functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -292,12 +292,12 @@ function ensure-hivebench-release(){
}

function ensure-mahout-release (){
if [ ! -e ${DEPENDENCY_DIR}"/mahout/target/"$MAHOUT_RELEASE".tar.gz" ]; then
if [ ! -e ${HIBENCH_HOME}"/mahout/target/"$MAHOUT_RELEASE".tar.gz" ]; then
assert 0 "Error: The mahout bin file hasn't be downloaded by maven, please check!"
exit
fi

cd ${DEPENDENCY_DIR}"/mahout/target"
cd ${HIBENCH_HOME}"/mahout/target"
if [ ! -d $MAHOUT_HOME ]; then
tar zxf $MAHOUT_RELEASE".tar.gz"
fi
Expand Down
3 changes: 3 additions & 0 deletions bin/workloads/micro/wordcount/prepare/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@ run-hadoop-job ${HADOOP_EXAMPLES_JAR} randomtextwriter \
${INPUT_HDFS}
END_TIME=`timestamp`

show_bannar finish
leave_bench

40 changes: 40 additions & 0 deletions bin/workloads/ml/kmeans/hadoop/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run the Hadoop (Mahout) K-means workload against data produced by prepare.sh.

# Locate the HiBench root relative to this script and load the framework.
current_dir=$(dirname "$0")
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/ml/kmeans.conf
. "${root_dir}/bin/functions/load-bench-config.sh"

enter_bench HadoopKmeans ${workload_config}
show_bannar start

# Make sure the Mahout distribution has been downloaded and unpacked.
ensure-mahout-release

# Clear any output left over from a previous run.
rmr-hdfs $OUTPUT_HDFS || true

# Measure input size before the job so gen_report can compute throughput.
SIZE=$(dir_size $INPUT_HDFS)
OPTION="-i ${INPUT_SAMPLE} -c ${INPUT_CLUSTER} -o ${OUTPUT_HDFS} -x ${MAX_ITERATION} -ow -cl -cd 0.5 -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -xm mapreduce"
CMD="${MAHOUT_HOME}/bin/mahout kmeans ${OPTION}"

# Time the job and collect system metrics while it runs.
MONITOR_PID=$(start-monitor)
START_TIME=$(timestamp)
execute_withlog $CMD
END_TIME=$(timestamp)
stop-monitor $MONITOR_PID

gen_report ${START_TIME} ${END_TIME} ${SIZE}
show_bannar finish
leave_bench
37 changes: 37 additions & 0 deletions bin/workloads/ml/kmeans/prepare/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generate the K-means input dataset (samples + initial clusters) on HDFS.

# Locate the HiBench root relative to this script and load the framework.
current_dir=$(dirname "$0")
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/ml/kmeans.conf
. "${root_dir}/bin/functions/load-bench-config.sh"

enter_bench HadoopPrepareKmeans ${workload_config}
show_bannar start

# Remove stale input, then make sure Mahout is available for the generator.
rmr-hdfs $INPUT_HDFS || true
ensure-mahout-release

START_TIME=$(timestamp)

# Dataset-generator options come from conf/workloads/ml/kmeans.conf
# (scaled by hibench.scale.profile).
OPTION="-sampleDir ${INPUT_SAMPLE} -clusterDir ${INPUT_CLUSTER} -numClusters ${NUM_OF_CLUSTERS} -numSamples ${NUM_OF_SAMPLES} -samplesPerFile ${SAMPLES_PER_INPUTFILE} -sampleDimension ${DIMENSIONS}"

# The generator needs Mahout classes on the Hadoop classpath.
export HADOOP_CLASSPATH=$(${MAHOUT_HOME}/bin/mahout classpath)
export_withlog HADOOP_CLASSPATH

run-hadoop-job ${DATATOOLS} org.apache.mahout.clustering.kmeans.GenKMeansDataset -D hadoop.job.history.user.location=${INPUT_SAMPLE} ${OPTION}
END_TIME=$(timestamp)

show_bannar finish
leave_bench
35 changes: 35 additions & 0 deletions bin/workloads/ml/kmeans/spark/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run the Spark K-means workload against data produced by prepare.sh.

# Locate the HiBench root relative to this script and load the framework.
current_dir=$(dirname "$0")
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/ml/kmeans.conf
. "${root_dir}/bin/functions/load-bench-config.sh"

enter_bench ScalaSparkKmeans ${workload_config}
show_bannar start

# Clear any output left over from a previous run.
rmr-hdfs $OUTPUT_HDFS || true

# Measure input size before the job so gen_report can compute throughput.
SIZE=$(dir_size $INPUT_HDFS)
START_TIME=$(timestamp)

# K and MAX_ITERATION come from conf/workloads/ml/kmeans.conf.
run-spark-job com.intel.hibench.sparkbench.ml.DenseKMeans -k $K --numIterations $MAX_ITERATION $INPUT_HDFS/samples
END_TIME=$(timestamp)

gen_report ${START_TIME} ${END_TIME} ${SIZE}
show_bannar finish
leave_bench
2 changes: 1 addition & 1 deletion conf/hadoop.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ hibench.hadoop.configure.dir ${hibench.hadoop.home}/etc/hadoop

hibench.hdfs.master hdfs://localhost:8020

#cdh4, cdh5, apache, hdp
#cdh5, apache, hdp
hibench.hadoop.release apache

#only hadoop2 is supported
Expand Down
20 changes: 9 additions & 11 deletions conf/hibench.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

hibench.scale.profile large
hibench.scale.profile small

hibench.default.map.parallelism 12
hibench.default.shuffle.parallelism 12
Expand Down Expand Up @@ -30,11 +30,9 @@ hibench.configure.dir ${hibench.home}/conf
hibench.hdfs.data.dir ${hibench.hdfs.master}/HiBench

# path of hibench datatools
hibench.hibench.datatool.dir ${hibench.home}/src/autogen/target/autogen-5.0-SNAPSHOT-jar-with-dependencies.jar


hibench.dependency.dir ${hibench.home}/src
hibench.hibench.datatool.dir ${hibench.home}/autogen/target/autogen-6.0-SNAPSHOT-jar-with-dependencies.jar

hibench.sparkbench.jar ${hibench.home}/sparkbench/common/target/sparkbench-common-6.0-SNAPSHOT-${hibench.spark.version}-jar-with-dependencies.jar

#======================================================
# workload home/input/ouput path
Expand All @@ -45,19 +43,19 @@ hibench.hive.release hive-0.12.0-bin
hibench.hivebench.template.dir ${hibench.dependency.dir}/hivebench/hive_template
hibench.hive.dir.name.input ${hibench.workload.dir.name.input}
hibench.hive.dir.name.ouput ${hibench.workload.dir.name.output}
hibench.kmeans.dir.name.input ${hibench.workload.dir.name.input}
hibench.kmeans.dir.name.output ${hibench.workload.dir.name.output}
hibench.bayes.dir.name.input ${hibench.workload.dir.name.input}
hibench.bayes.dir.name.output ${hibench.workload.dir.name.output}
hibench.pagerank.dir.name.input ${hibench.workload.dir.name.input}
hibench.pagerank.dir.name.output ${hibench.workload.dir.name.output}
hibench.pagerank.pegasus.dir ${hibench.dependency.dir}/pegasus/target/pegasus-2.0-SNAPSHOT.jar
hibench.mahout.home ${hibench.dependency.dir}/mahout/target/${hibench.mahout.release}


hibench.mahout.release.apache mahout-distribution-0.9
hibench.mahout.release.hdp mahout-distribution-0.9
hibench.mahout.release.cdh4 mahout-0.7-cdh4.7.1
hibench.mahout.release.cdh5 mahout-0.9-cdh5.1.0
hibench.mahout.release ${hibench.mahout.release.${hibench.hadoop.release}}
hibench.mahout.release.cdh5 mahout-0.9-cdh5.1.0
hibench.mahout.release ${hibench.mahout.release.${hibench.hadoop.release}}
hibench.mahout.home ${hibench.home}/mahout/target/${hibench.mahout.release}


hibench.nutch.dir.name.input ${hibench.workload.dir.name.input}
hibench.nutch.dir.name.output ${hibench.workload.dir.name.output}
Expand Down
4 changes: 2 additions & 2 deletions conf/spark.conf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
hibench.spark.home /home/yuzhou/cluster/spark/1.6.0
hibench.spark.home /home/yuzhou/cluster/spark/spark-1.6.2-bin-hadoop2.6
hibench.spark.version spark1.6


hibench.yarn.executor.num 2
hibench.yarn.executor.cores 4

Expand All @@ -9,7 +10,6 @@ hibench.spark.master yarn-client
spark.executor.memory 2g
spark.driver.memory 1g


# set spark parallelism property according to hibench's parallelism value
spark.default.parallelism ${hibench.default.map.parallelism}

Expand Down
58 changes: 58 additions & 0 deletions conf/workloads/ml/kmeans.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# K-means workload configuration.
# One group of settings per scale profile; the active group is selected
# below via ${hibench.scale.profile}.

# --- tiny profile ---
hibench.kmeans.tiny.num_of_clusters 5
hibench.kmeans.tiny.dimensions 3
hibench.kmeans.tiny.num_of_samples 30000
hibench.kmeans.tiny.samples_per_inputfile 6000
hibench.kmeans.tiny.max_iteration 5
hibench.kmeans.tiny.k 10
hibench.kmeans.tiny.convergedist 0.5
# --- small profile ---
hibench.kmeans.small.num_of_clusters 5
hibench.kmeans.small.dimensions 20
hibench.kmeans.small.num_of_samples 3000000
hibench.kmeans.small.samples_per_inputfile 600000
hibench.kmeans.small.max_iteration 5
hibench.kmeans.small.k 10
hibench.kmeans.small.convergedist 0.5
# --- large profile ---
hibench.kmeans.large.num_of_clusters 5
hibench.kmeans.large.dimensions 20
hibench.kmeans.large.num_of_samples 20000000
hibench.kmeans.large.samples_per_inputfile 4000000
hibench.kmeans.large.max_iteration 5
hibench.kmeans.large.k 10
hibench.kmeans.large.convergedist 0.5
# --- huge profile ---
hibench.kmeans.huge.num_of_clusters 5
hibench.kmeans.huge.dimensions 20
hibench.kmeans.huge.num_of_samples 100000000
hibench.kmeans.huge.samples_per_inputfile 20000000
hibench.kmeans.huge.max_iteration 5
hibench.kmeans.huge.k 10
hibench.kmeans.huge.convergedist 0.5
# --- gigantic profile ---
hibench.kmeans.gigantic.num_of_clusters 5
hibench.kmeans.gigantic.dimensions 20
hibench.kmeans.gigantic.num_of_samples 200000000
hibench.kmeans.gigantic.samples_per_inputfile 40000000
hibench.kmeans.gigantic.max_iteration 5
hibench.kmeans.gigantic.k 10
hibench.kmeans.gigantic.convergedist 0.5
# --- bigdata profile ---
hibench.kmeans.bigdata.num_of_clusters 5
hibench.kmeans.bigdata.dimensions 20
hibench.kmeans.bigdata.num_of_samples 1200000000
hibench.kmeans.bigdata.samples_per_inputfile 40000000
hibench.kmeans.bigdata.max_iteration 10
hibench.kmeans.bigdata.k 10
hibench.kmeans.bigdata.convergedist 0.5

# Effective settings, resolved from the active scale profile.
hibench.kmeans.num_of_clusters ${hibench.kmeans.${hibench.scale.profile}.num_of_clusters}
hibench.kmeans.dimensions ${hibench.kmeans.${hibench.scale.profile}.dimensions}
hibench.kmeans.num_of_samples ${hibench.kmeans.${hibench.scale.profile}.num_of_samples}
hibench.kmeans.samples_per_inputfile ${hibench.kmeans.${hibench.scale.profile}.samples_per_inputfile}
hibench.kmeans.max_iteration ${hibench.kmeans.${hibench.scale.profile}.max_iteration}
hibench.kmeans.k ${hibench.kmeans.${hibench.scale.profile}.k}
hibench.kmeans.convergedist ${hibench.kmeans.${hibench.scale.profile}.convergedist}
# HDFS locations for generated samples and initial clusters.
hibench.kmeans.base.hdfs ${hibench.hdfs.data.dir}/Kmeans
hibench.kmeans.input.sample ${hibench.workload.input}/samples
hibench.kmeans.input.cluster ${hibench.workload.input}/cluster

# Spark job jar for the DenseKMeans workload.
hibench.sparkbench.jar ${hibench.home}/sparkbench/ml/target/sparkbench-ml-6.0-SNAPSHOT-${hibench.spark.version}-jar-with-dependencies.jar

hibench.workload.input ${hibench.hdfs.data.dir}/Kmeans/Input
hibench.workload.output ${hibench.hdfs.data.dir}/Kmeans/Output
57 changes: 57 additions & 0 deletions mahout/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven module that downloads the Mahout binary distributions HiBench
  depends on (Apache release and CDH5 build). The tarballs land in
  mahout/target/, where bin/functions/workload-functions.sh
  (ensure-mahout-release) expects to find and unpack them.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <parent>
    <groupId>com.intel.hibench</groupId>
    <artifactId>hibench</artifactId>
    <version>6.0-SNAPSHOT</version>
  </parent>

  <!-- groupId matches the parent; stated explicitly for clarity. -->
  <groupId>com.intel.hibench</groupId>
  <artifactId>mahout</artifactId>
  <packaging>jar</packaging>

  <properties>
    <!-- NOTE(review): downloads use plain http; integrity is only as strong
         as the md5 values below — consider https mirrors if available. -->
    <repo1>http://archive.apache.org</repo1>
    <file1>dist/mahout/0.9/mahout-distribution-0.9.tar.gz</file1>
    <checksum1>09b999fbee70c9853789ffbd8f28b8a3</checksum1>
    <repo2>http://archive.cloudera.com</repo2>
    <file2>cdh5/cdh/5/mahout-0.9-cdh5.1.0.tar.gz</file2>
    <checksum2>aa953e0353ac104a22d314d15c88d78f</checksum2>
  </properties>

  <build>
    <plugins>
      <plugin>
        <groupId>com.googlecode.maven-download-plugin</groupId>
        <artifactId>download-maven-plugin</artifactId>
        <version>${download-maven-plugin.version}</version>
        <executions>
          <!-- Both executions run unconditionally during process-sources;
               which tarball is actually used is selected at run time by
               hibench.hadoop.release (see conf/hibench.conf). -->
          <execution>
            <id>download-mahout-apache</id>
            <phase>process-sources</phase>
            <goals>
              <goal>wget</goal>
            </goals>
            <configuration>
              <url>${repo1}/${file1}</url>
              <md5>${checksum1}</md5>
            </configuration>
          </execution>
          <execution>
            <id>extra-download-execution</id>
            <phase>process-sources</phase>
            <goals>
              <goal>wget</goal>
            </goals>
            <configuration>
              <url>${repo2}/${file2}</url>
              <md5>${checksum2}</md5>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
Loading