Adding Spark 2.0.1 to the demo (#429)
tgianos committed Oct 31, 2016
1 parent 48395c1 commit 097af0f
Showing 15 changed files with 283 additions and 34 deletions.
18 changes: 7 additions & 11 deletions README.md
@@ -89,18 +89,14 @@ application (hadoop)
* Review the Genie UI again and notice that clusters, commands and applications now have data in them
* Run some jobs. We recommend running the Hadoop job first so the other jobs have something interesting to show.
Available jobs include:
-* `./gradlew demoRunProdHadoopJob`
-  * See the MR job at `http://localhost:8088`
-* `./gradlew demoRunTestHadoopJob`
-  * See the MR job at `http://localhost:8089`
-* `./gradlew demoRunProdHDFSJob`
-  * Runs a `dfs -ls` on the input directory and stores results in stdout
-* `./gradlew demoRunTestHDFSJob`
-  * Runs a `dfs -ls` on the input directory and stores results in stdout
-* `./gradlew demoRunProdYarnJob`
-  * Lists all yarn applications from the resource manager into stdout
-* `./gradlew demoRunTestYarnJob`
+* `./gradlew demoRunProdHadoopJob` or `./gradlew demoRunTestHadoopJob`
+  * See the MR job at `http://localhost:8088` or `http://localhost:8089` respectively
+* `./gradlew demoRunProdHDFSJob` or `./gradlew demoRunTestHDFSJob`
+  * Runs a `dfs -ls` on the input directory on HDFS and stores results in stdout
+* `./gradlew demoRunProdYarnJob` or `./gradlew demoRunTestYarnJob`
  * Lists all yarn applications from the resource manager into stdout
+* `./gradlew demoRunProdSparkSubmitJob` or `./gradlew demoRunTestSparkSubmitJob`
+  * Runs the SparkPi example with input of 10. Results stored in stdout
* For each of these jobs you can see their status, output and other information via the Genie UI
* For how everything is configured and run you can view the scripts in `genie-demo/src/main/docker/client/example`
* Once you're done trying everything out you can shut down the demo
28 changes: 22 additions & 6 deletions genie-demo/build.gradle
@@ -22,30 +22,46 @@ task demoInit(type: Exec, group: "Demo", description: "Setup the configuration m
commandLine "docker", "exec", "docker_genie-client_1", "./init_demo.py"
}

-task demoRunProdHadoopJob(type: Exec, group: "Demo", description: "Run example Hadoop job on production cluster") {
+task demoRunProdHadoopJob(type: Exec, group: "Demo", description: "Run demo Hadoop job on production cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hadoop_job.py", "sla"
}

-task demoRunTestHadoopJob(type: Exec, group: "Demo", description: "Run example Hadoop job on test cluster") {
+task demoRunTestHadoopJob(type: Exec, group: "Demo", description: "Run demo Hadoop job on test cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hadoop_job.py", "test"
}

-task demoRunProdHDFSJob(type: Exec, group: "Demo", description: "Run example HDFS job on production cluster") {
+task demoRunProdHDFSJob(type: Exec, group: "Demo", description: "Run demo HDFS job on production cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hdfs_job.py", "sla"
}

-task demoRunTestHDFSJob(type: Exec, group: "Demo", description: "Run example HDFS job on test cluster") {
+task demoRunTestHDFSJob(type: Exec, group: "Demo", description: "Run demo HDFS job on test cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hdfs_job.py", "test"
}

-task demoRunProdYarnJob(type: Exec, group: "Demo", description: "Run example YARN job on production cluster") {
+task demoRunProdYarnJob(type: Exec, group: "Demo", description: "Run demo YARN job on production cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_yarn_job.py", "sla"
}

-task demoRunTestYarnJob(type: Exec, group: "Demo", description: "Run example YARN job on test cluster") {
+task demoRunTestYarnJob(type: Exec, group: "Demo", description: "Run demo YARN job on test cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_yarn_job.py", "test"
}

+task demoRunProdSparkShellJob(type: Exec, group: "Demo", description: "Run demo Spark Shell job on production cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_shell_job.py", "sla"
+}
+
+task demoRunTestSparkShellJob(type: Exec, group: "Demo", description: "Run demo Spark Shell job on test cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_shell_job.py", "test"
+}
+
+task demoRunProdSparkSubmitJob(type: Exec, group: "Demo", description: "Run demo Spark Submit job on production cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_submit_job.py", "sla"
+}
+
+task demoRunTestSparkSubmitJob(type: Exec, group: "Demo", description: "Run demo Spark Submit job on test cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_submit_job.py", "test"
+}

task demoStop(type: Exec, group: "Demo", description: "Bring down the Genie demo") {
    workingDir dockerDir
    environment("GENIE_VERSION", project.version)
1 change: 1 addition & 0 deletions genie-demo/src/main/docker/apache/Dockerfile
@@ -2,6 +2,7 @@ from httpd:2.4
MAINTAINER NetflixOSS <netflixoss@netflix.com>
COPY ./files/ /usr/local/apache2/htdocs/
ADD http://apache.cs.utah.edu/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz /usr/local/apache2/htdocs/applications/hadoop/2.7.1/
+ADD http://d3kbcqa49mib13.cloudfront.net/spark-2.0.1-bin-hadoop2.7.tgz /usr/local/apache2/htdocs/applications/spark/2.0.1/
#ADD http://apache.cs.utah.edu/pig/pig-0.16.0/pig-0.16.0.tar.gz /usr/local/apache2/htdocs/applications/pig/0.16.0/
#ADD http://apache.cs.utah.edu/hive/hive-2.1.0/apache-hive-2.1.0-bin.tar.gz /usr/local/apache2/htdocs/applications/hive/2.1.0/
RUN chmod -R +r /usr/local/apache2/htdocs/
@@ -0,0 +1,32 @@
#!/bin/bash

set -o errexit -o nounset -o pipefail

start_dir=$(pwd)
cd $(dirname ${BASH_SOURCE[0]})
SPARK_BASE=$(pwd)
cd ${start_dir}

export SPARK_DAEMON_JAVA_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps"

SPARK_DEPS=${SPARK_BASE}/dependencies

export SPARK_VERSION="2.0.1"

tar xzf ${SPARK_DEPS}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz -C ${SPARK_DEPS}

# Set the required environment variables.
export SPARK_HOME=${SPARK_DEPS}/spark-${SPARK_VERSION}-bin-hadoop2.7
export SPARK_CONF_DIR=${SPARK_HOME}/conf
export SPARK_LOG_DIR=${GENIE_JOB_DIR}
export SPARK_LOG_FILE=spark.log
export SPARK_LOG_FILE_PATH=${GENIE_JOB_DIR}/${SPARK_LOG_FILE}
export CURRENT_JOB_WORKING_DIR=${GENIE_JOB_DIR}
export CURRENT_JOB_TMP_DIR=${CURRENT_JOB_WORKING_DIR}/tmp

# Make sure the Spark scripts are on the PATH
export PATH=$PATH:${SPARK_HOME}/bin

# Delete the tarball to save space
rm ${SPARK_DEPS}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz

@@ -0,0 +1,6 @@
#!/bin/bash

set -o errexit -o nounset -o pipefail

# copy hive-site.xml configuration
#cp ${GENIE_COMMAND_DIR}/config/* ${SPARK_CONF_DIR}
@@ -0,0 +1,6 @@
#!/bin/bash

set -o errexit -o nounset -o pipefail

# copy hive-site.xml configuration
#cp ${GENIE_COMMAND_DIR}/config/* ${SPARK_CONF_DIR}
@@ -0,0 +1,11 @@
id: spark201
name: spark
user: genieDemo
status: ACTIVE
description: Spark Application
setupFile: http://genie-apache/applications/spark/2.0.1/setup.sh
version: 2.0.1
type: spark
tags: ['type:spark', 'ver:2.0.1', 'ver:2.0']
dependencies:
- http://genie-apache/applications/spark/2.0.1/spark-2.0.1-bin-hadoop2.7.tgz
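
For reference, a descriptor like this is registered through the pygenie client, exactly as init_demo.py below does for every application and command. A minimal sketch, assuming the demo's Genie URL and that it runs from the example directory containing the YAML files:

import yaml

from pygenie.client import Genie
from pygenie.conf import GenieConf

# Point the client at the demo's Genie instance.
genie_conf = GenieConf()
genie_conf.genie.url = "http://genie:8080"
genie = Genie(genie_conf)

# Load the descriptor above and register it; Genie returns the
# application id (the YAML pins it to "spark201").
with open("applications/spark201.yml") as _file:
    spark_application = yaml.load(_file)
spark_application_id = genie.create_application(spark_application)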
@@ -0,0 +1,11 @@
id: sparkshell201
name: Spark Shell
user: genieDemo
description: Spark Shell Command
status: ACTIVE
setupFile: http://genie-apache/commands/spark/2.0.1/setupShell.sh
configs: []
executable: ${SPARK_HOME}/bin/spark-shell
version: 2.0.1
tags: ['type:spark-shell', 'ver:2.0.1']
checkDelay: 5000
@@ -0,0 +1,11 @@
id: sparksubmit201
name: Spark Submit
user: genieDemo
description: Spark Submit Command
status: ACTIVE
setupFile: http://genie-apache/commands/spark/2.0.1/setupSubmit.sh
configs: []
executable: ${SPARK_HOME}/bin/spark-submit --master yarn --deploy-mode client
version: 2.0.1
tags: ['type:spark-submit', 'ver:2.0.1']
checkDelay: 5000
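
The executable here is only a prefix: Genie appends each job's command-line arguments to it at launch, which is how the demo's SparkPi job is built. A hedged sketch of what a submission script for this command could look like, modeled on run_spark_shell_job.py below; the examples jar path is an assumption about the spark-2.0.1-bin-hadoop2.7 layout, not something this commit pins down:

import pygenie

pygenie.conf.DEFAULT_GENIE_URL = "http://genie:8080"

# Genie resolves the command via tags and prepends the spark-submit
# executable defined above to the arguments given below.
job = pygenie.jobs.GenieJob() \
    .job_name('Genie Demo Spark Submit Job') \
    .genie_username('root') \
    .job_version('3.0.0')

# Route to the prod (sla) cluster and the spark-submit command.
job.cluster_tags(['sched:sla', 'type:yarn'])
job.command_tags(['type:spark-submit'])

# SparkPi with input 10, per the README. NOTE: the jar path is hypothetical --
# it assumes where the Spark distribution unpacked by setup.sh keeps its examples.
job.command_arguments(
    '--class org.apache.spark.examples.SparkPi '
    '${SPARK_HOME}/examples/jars/spark-examples_2.11-2.0.1.jar 10'
)

running_job = job.execute()
running_job.wait()
print('Job {} finished with status {}'.format(running_job.job_id, running_job.status))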
51 changes: 40 additions & 11 deletions genie-demo/src/main/docker/client/example/init_demo.py
@@ -15,11 +15,12 @@
# limitations under the License.

import logging
+
import yaml
-from pygenie.conf import GenieConf
from pygenie.client import Genie
+from pygenie.conf import GenieConf

-logging.basicConfig(level=logging.ERROR)
+logging.basicConfig(level=logging.WARNING)

LOGGER = logging.getLogger(__name__)

@@ -28,38 +28,66 @@ def load_yaml(yaml_file):
    with open(yaml_file) as _file:
        return yaml.load(_file)

+
genie_conf = GenieConf()
genie_conf.genie.url = "http://genie:8080"

genie = Genie(genie_conf)

hadoop_application = load_yaml("applications/hadoop271.yml")
hadoop_application_id = genie.create_application(hadoop_application)
-LOGGER.info("Created hadoop application with id = [%s]" % hadoop_application_id)
+LOGGER.warn("Created Hadoop application with id = [%s]" % hadoop_application_id)

+spark_application = load_yaml("applications/spark201.yml")
+spark_application_id = genie.create_application(spark_application)
+LOGGER.warn("Created Spark application with id = [%s]" % spark_application_id)
+
hadoop_command = load_yaml("commands/hadoop271.yml")
hadoop_command_id = genie.create_command(hadoop_command)
LOGGER.info("Created hadoop command with id = [%s]" % hadoop_command_id)
LOGGER.warn("Created Hadoop command with id = [%s]" % hadoop_command_id)

hdfs_command = load_yaml("commands/hdfs271.yml")
hdfs_command_id = genie.create_command(hdfs_command)
LOGGER.info("Created HDFS command with id = [%s]" % hdfs_command_id)
LOGGER.warn("Created HDFS command with id = [%s]" % hdfs_command_id)

yarn_command = load_yaml("commands/yarn271.yml")
yarn_command_id = genie.create_command(yarn_command)
LOGGER.info("Created yarn command with id = [%s]" % yarn_command_id)
LOGGER.warn("Created Yarn command with id = [%s]" % yarn_command_id)

+spark_shell_command = load_yaml("commands/sparkShell201.yml")
+spark_shell_command_id = genie.create_command(spark_shell_command)
+LOGGER.warn("Created Spark Shell command with id = [%s]" % spark_shell_command_id)
+
+spark_submit_command = load_yaml("commands/sparkSubmit201.yml")
+spark_submit_command_id = genie.create_command(spark_submit_command)
+LOGGER.warn("Created Spark Submit command with id = [%s]" % spark_submit_command_id)
+
genie.set_application_for_command(hadoop_command_id, [hadoop_application_id])
+LOGGER.warn("Set applications for Hadoop command to = [%s]" % hadoop_application_id)
genie.set_application_for_command(hdfs_command_id, [hadoop_application_id])
+LOGGER.warn("Set applications for HDFS command to = [%s]" % hadoop_application_id)
genie.set_application_for_command(yarn_command_id, [hadoop_application_id])
+LOGGER.warn("Set applications for Yarn command to = [%s]" % hadoop_application_id)
+genie.set_application_for_command(spark_shell_command_id, [hadoop_application_id, spark_application_id])
+LOGGER.warn("Set applications for Spark Shell command to = [%s]" % [hadoop_application_id, spark_application_id])
+genie.set_application_for_command(spark_submit_command_id, [hadoop_application_id, spark_application_id])
+LOGGER.warn("Set applications for Spark Submit command to = [%s]" % [hadoop_application_id, spark_application_id])

prod_cluster = load_yaml("clusters/prod.yml")
prod_cluster_id = genie.create_cluster(prod_cluster)
LOGGER.info("Created prod cluster with id = [%s]" % prod_cluster_id)
LOGGER.warn("Created prod cluster with id = [%s]" % prod_cluster_id)

test_cluster = load_yaml("clusters/test.yml")
test_cluster_id = genie.create_cluster(test_cluster)
-LOGGER.info("Created test cluster with id = [%s]" % test_cluster_id)
+LOGGER.warn("Created test cluster with id = [%s]" % test_cluster_id)

-genie.set_commands_for_cluster(prod_cluster_id, [hadoop_command_id, hdfs_command_id, yarn_command_id])
-genie.set_commands_for_cluster(test_cluster_id, [hadoop_command_id, hdfs_command_id, yarn_command_id])
+genie.set_commands_for_cluster(
+    prod_cluster_id,
+    [hadoop_command_id, hdfs_command_id, yarn_command_id, spark_shell_command_id, spark_submit_command_id]
+)
+LOGGER.warn("Added all commands to the prod cluster with id = [%s]" % prod_cluster_id)
+genie.set_commands_for_cluster(
+    test_cluster_id,
+    [hadoop_command_id, hdfs_command_id, yarn_command_id, spark_shell_command_id, spark_submit_command_id]
+)
+LOGGER.warn("Added all commands to the test cluster with id = [%s]" % test_cluster_id)
4 changes: 2 additions & 2 deletions genie-demo/src/main/docker/client/example/run_hadoop_job.py
@@ -34,9 +34,9 @@

# Create a job instance and fill in the required parameters
job = pygenie.jobs.HadoopJob() \
-    .job_name('GenieDockerExampleHadoopJob') \
+    .job_name('Genie Demo Hadoop Job') \
    .genie_username('root') \
-    .job_version('2.7.1')
+    .job_version('3.0.0')

# Set cluster criteria which determine the cluster to run the job on
job.cluster_tags(['sched:' + str(sys.argv[1]), 'type:yarn'])
4 changes: 2 additions & 2 deletions genie-demo/src/main/docker/client/example/run_hdfs_job.py
@@ -34,9 +34,9 @@

# Create a job instance and fill in the required parameters
job = pygenie.jobs.HadoopJob() \
-    .job_name('GenieDemoHDFSJob') \
+    .job_name('Genie Demo HDFS Job') \
    .genie_username('root') \
-    .job_version('2.7.1')
+    .job_version('3.0.0')

# Set cluster criteria which determine the cluster to run the job on
job.cluster_tags(['sched:' + str(sys.argv[1]), 'type:yarn'])
64 changes: 64 additions & 0 deletions genie-demo/src/main/docker/client/example/run_spark_shell_job.py
@@ -0,0 +1,64 @@
#!/usr/bin/python2.7

# Copyright 2016 Netflix, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##################################################################################
# This script assumes init_demo.py has already been run to configure Genie and
# that this script is executed on the host where Genie is running. If it's
# executed on another host, change the Genie URL below.
##################################################################################

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import pygenie
import sys

logging.basicConfig(level=logging.ERROR)

LOGGER = logging.getLogger(__name__)

pygenie.conf.DEFAULT_GENIE_URL = "http://genie:8080"

# Create a job instance and fill in the required parameters
job = pygenie.jobs.GenieJob() \
    .job_name('Genie Demo Spark Shell Job') \
    .genie_username('root') \
    .job_version('3.0.0')

# Set cluster criteria which determine the cluster to run the job on
job.cluster_tags(['sched:' + str(sys.argv[1]), 'type:yarn'])

# Set command criteria which will determine what command Genie executes for the job
job.command_tags(['type:spark-shell'])

# Any command line arguments to run along with the command. In this case we
# simply pass --help so the shell prints its usage and exits. A real job could
# pass a script or query via an attachment or file dependency instead.
job.command_arguments(
    "--help"
)

# Submit the job to Genie
running_job = job.execute()

print('Job {} is {}'.format(running_job.job_id, running_job.status))
print(running_job.job_link)

# Block and wait until job is done
running_job.wait()

print('Job {} finished with status {}'.format(running_job.job_id, running_job.status))