Adding Spark 2.0.1 to the demo (#429)
tgianos committed Oct 31, 2016
1 parent 48395c1 commit 097af0f
Showing 15 changed files with 283 additions and 34 deletions.
18 changes: 7 additions & 11 deletions README.md
@@ -89,18 +89,14 @@ application (hadoop)
* Review the Genie UI again and notice that clusters, commands and applications now have data in them
* Run some jobs. We recommend running the Hadoop job first so the other jobs have something interesting to show.
Available jobs include:
-* `./gradlew demoRunProdHadoopJob`
-  * See the MR job at `http://localhost:8088`
-* `./gradlew demoRunTestHadoopJob`
-  * See the MR job at `http://localhost:8089`
-* `./gradlew demoRunProdHDFSJob`
-  * Runs a `dfs -ls` on the input directory and stores results in stdout
-* `./gradlew demoRunTestHDFSJob`
-  * Runs a `dfs -ls` on the input directory and stores results in stdout
-* `./gradlew demoRunProdYarnJob`
-  * Lists all yarn applications from the resource manager into stdout
-* `./gradlew demoRunTestYarnJob`
+* `./gradlew demoRunProdHadoopJob` or `./gradlew demoRunTestHadoopJob`
+  * See the MR job at `http://localhost:8088` or `http://localhost:8089` respectively
+* `./gradlew demoRunProdHDFSJob` or `./gradlew demoRunTestHDFSJob`
+  * Runs a `dfs -ls` on the input directory on HDFS and stores results in stdout
+* `./gradlew demoRunProdYarnJob` or `./gradlew demoRunTestYarnJob`
  * Lists all yarn applications from the resource manager into stdout
+* `./gradlew demoRunProdSparkSubmitJob` or `./gradlew demoRunTestSparkSubmitJob`
+  * Runs the SparkPi example with input of 10. Results stored in stdout
* For each of these jobs you can see their status, output and other information via the Genie UI
* For how everything is configured and run you can view the scripts in `genie-demo/src/main/docker/client/example`
* Once you're done trying everything out you can shut down the demo
28 changes: 22 additions & 6 deletions genie-demo/build.gradle
@@ -22,30 +22,46 @@ task demoInit(type: Exec, group: "Demo", description: "Setup the configuration m
commandLine "docker", "exec", "docker_genie-client_1", "./init_demo.py"
}

-task demoRunProdHadoopJob(type: Exec, group: "Demo", description: "Run example Hadoop job on production cluster") {
+task demoRunProdHadoopJob(type: Exec, group: "Demo", description: "Run demo Hadoop job on production cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hadoop_job.py", "sla"
}

-task demoRunTestHadoopJob(type: Exec, group: "Demo", description: "Run example Hadoop job on test cluster") {
+task demoRunTestHadoopJob(type: Exec, group: "Demo", description: "Run demo Hadoop job on test cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hadoop_job.py", "test"
}

-task demoRunProdHDFSJob(type: Exec, group: "Demo", description: "Run example HDFS job on production cluster") {
+task demoRunProdHDFSJob(type: Exec, group: "Demo", description: "Run demo HDFS job on production cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hdfs_job.py", "sla"
}

-task demoRunTestHDFSJob(type: Exec, group: "Demo", description: "Run example HDFS job on test cluster") {
+task demoRunTestHDFSJob(type: Exec, group: "Demo", description: "Run demo HDFS job on test cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_hdfs_job.py", "test"
}

-task demoRunProdYarnJob(type: Exec, group: "Demo", description: "Run example YARN job on production cluster") {
+task demoRunProdYarnJob(type: Exec, group: "Demo", description: "Run demo YARN job on production cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_yarn_job.py", "sla"
}

-task demoRunTestYarnJob(type: Exec, group: "Demo", description: "Run example YARN job on test cluster") {
+task demoRunTestYarnJob(type: Exec, group: "Demo", description: "Run demo YARN job on test cluster") {
    commandLine "docker", "exec", "docker_genie-client_1", "./run_yarn_job.py", "test"
}

+task demoRunProdSparkShellJob(type: Exec, group: "Demo", description: "Run demo Spark Shell job on production cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_shell_job.py", "sla"
+}
+
+task demoRunTestSparkShellJob(type: Exec, group: "Demo", description: "Run demo Spark Shell job on test cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_shell_job.py", "test"
+}
+
+task demoRunProdSparkSubmitJob(type: Exec, group: "Demo", description: "Run demo Spark Submit job on production cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_submit_job.py", "sla"
+}
+
+task demoRunTestSparkSubmitJob(type: Exec, group: "Demo", description: "Run demo Spark Submit job on test cluster") {
+    commandLine "docker", "exec", "docker_genie-client_1", "./run_spark_submit_job.py", "test"
+}

task demoStop(type: Exec, group: "Demo", description: "Bring down the Genie demo") {
    workingDir dockerDir
    environment("GENIE_VERSION", project.version)
1 change: 1 addition & 0 deletions genie-demo/src/main/docker/apache/Dockerfile
@@ -2,6 +2,7 @@ from httpd:2.4
MAINTAINER NetflixOSS <netflixoss@netflix.com>
COPY ./files/ /usr/local/apache2/htdocs/
ADD http://apache.cs.utah.edu/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz /usr/local/apache2/htdocs/applications/hadoop/2.7.1/
+ADD http://d3kbcqa49mib13.cloudfront.net/spark-2.0.1-bin-hadoop2.7.tgz /usr/local/apache2/htdocs/applications/spark/2.0.1/
#ADD http://apache.cs.utah.edu/pig/pig-0.16.0/pig-0.16.0.tar.gz /usr/local/apache2/htdocs/applications/pig/0.16.0/
#ADD http://apache.cs.utah.edu/hive/hive-2.1.0/apache-hive-2.1.0-bin.tar.gz /usr/local/apache2/htdocs/applications/hive/2.1.0/
RUN chmod -R +r /usr/local/apache2/htdocs/
@@ -0,0 +1,32 @@
#!/bin/bash

set -o errexit -o nounset -o pipefail

start_dir=$(pwd)
cd $(dirname ${BASH_SOURCE[0]})
SPARK_BASE=$(pwd)
cd ${start_dir}

export SPARK_DAEMON_JAVA_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps"

SPARK_DEPS=${SPARK_BASE}/dependencies

export SPARK_VERSION="2.0.1"

tar xzf ${SPARK_DEPS}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz -C ${SPARK_DEPS}

# Set the required environment variables.
export SPARK_HOME=${SPARK_DEPS}/spark-${SPARK_VERSION}-bin-hadoop2.7
export SPARK_CONF_DIR=${SPARK_HOME}/conf
export SPARK_LOG_DIR=${GENIE_JOB_DIR}
export SPARK_LOG_FILE=spark.log
export SPARK_LOG_FILE_PATH=${GENIE_JOB_DIR}/${SPARK_LOG_FILE}
export CURRENT_JOB_WORKING_DIR=${GENIE_JOB_DIR}
export CURRENT_JOB_TMP_DIR=${CURRENT_JOB_WORKING_DIR}/tmp

# Make sure the Spark scripts are on the PATH
export PATH=$PATH:${SPARK_HOME}/bin

# Delete the tarball to save space
rm ${SPARK_DEPS}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz

@@ -0,0 +1,6 @@
#!/bin/bash

set -o errexit -o nounset -o pipefail

# copy hive-site.xml configuration
#cp ${GENIE_COMMAND_DIR}/config/* ${SPARK_CONF_DIR}
@@ -0,0 +1,6 @@
#!/bin/bash

set -o errexit -o nounset -o pipefail

# copy hive-site.xml configuration
#cp ${GENIE_COMMAND_DIR}/config/* ${SPARK_CONF_DIR}
@@ -0,0 +1,11 @@
id: spark201
name: spark
user: genieDemo
status: ACTIVE
description: Spark Application
setupFile: http://genie-apache/applications/spark/2.0.1/setup.sh
version: 2.0.1
type: spark
tags: ['type:spark', 'ver:2.0.1', 'ver:2.0']
dependencies:
- http://genie-apache/applications/spark/2.0.1/spark-2.0.1-bin-hadoop2.7.tgz
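
For reference, a descriptor like this is registered through the pygenie client, exactly as init_demo.py below does for every application and command. A minimal sketch, assuming the demo's Genie URL and that it runs from the example directory containing the YAML files:

import yaml

from pygenie.client import Genie
from pygenie.conf import GenieConf

# Point the client at the demo's Genie instance.
genie_conf = GenieConf()
genie_conf.genie.url = "http://genie:8080"
genie = Genie(genie_conf)

# Load the descriptor above and register it; Genie returns the
# application id (the YAML pins it to "spark201").
with open("applications/spark201.yml") as _file:
    spark_application = yaml.load(_file)
spark_application_id = genie.create_application(spark_application)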
@@ -0,0 +1,11 @@
id: sparkshell201
name: Spark Shell
user: genieDemo
description: Spark Shell Command
status: ACTIVE
setupFile: http://genie-apache/commands/spark/2.0.1/setupShell.sh
configs: []
executable: ${SPARK_HOME}/bin/spark-shell
version: 2.0.1
tags: ['type:spark-shell', 'ver:2.0.1']
checkDelay: 5000
@@ -0,0 +1,11 @@
id: sparksubmit201
name: Spark Submit
user: genieDemo
description: Spark Submit Command
status: ACTIVE
setupFile: http://genie-apache/commands/spark/2.0.1/setupSubmit.sh
configs: []
executable: ${SPARK_HOME}/bin/spark-submit --master yarn --deploy-mode client
version: 2.0.1
tags: ['type:spark-submit', 'ver:2.0.1']
checkDelay: 5000
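
The executable here is only a prefix: Genie appends each job's command-line arguments to it at launch, which is how the demo's SparkPi job is built. A hedged sketch of what a submission script for this command could look like, modeled on run_spark_shell_job.py below; the examples jar path is an assumption about the spark-2.0.1-bin-hadoop2.7 layout, not something this commit pins down:

import pygenie

pygenie.conf.DEFAULT_GENIE_URL = "http://genie:8080"

# Genie resolves the command via tags and prepends the spark-submit
# executable defined above to the arguments given below.
job = pygenie.jobs.GenieJob() \
    .job_name('Genie Demo Spark Submit Job') \
    .genie_username('root') \
    .job_version('3.0.0')

# Route to the prod (sla) cluster and the spark-submit command.
job.cluster_tags(['sched:sla', 'type:yarn'])
job.command_tags(['type:spark-submit'])

# SparkPi with input 10, per the README. NOTE: the jar path is hypothetical --
# it assumes where the Spark distribution unpacked by setup.sh keeps its examples.
job.command_arguments(
    '--class org.apache.spark.examples.SparkPi '
    '${SPARK_HOME}/examples/jars/spark-examples_2.11-2.0.1.jar 10'
)

running_job = job.execute()
running_job.wait()
print('Job {} finished with status {}'.format(running_job.job_id, running_job.status))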
51 changes: 40 additions & 11 deletions genie-demo/src/main/docker/client/example/init_demo.py
@@ -15,11 +15,12 @@
# limitations under the License.

import logging
+
import yaml
-from pygenie.conf import GenieConf
from pygenie.client import Genie
+from pygenie.conf import GenieConf

-logging.basicConfig(level=logging.ERROR)
+logging.basicConfig(level=logging.WARNING)

LOGGER = logging.getLogger(__name__)

@@ -28,38 +28,66 @@ def load_yaml(yaml_file):
    with open(yaml_file) as _file:
        return yaml.load(_file)

+
genie_conf = GenieConf()
genie_conf.genie.url = "http://genie:8080"

genie = Genie(genie_conf)

hadoop_application = load_yaml("applications/hadoop271.yml")
hadoop_application_id = genie.create_application(hadoop_application)
-LOGGER.info("Created hadoop application with id = [%s]" % hadoop_application_id)
+LOGGER.warn("Created Hadoop application with id = [%s]" % hadoop_application_id)

+spark_application = load_yaml("applications/spark201.yml")
+spark_application_id = genie.create_application(spark_application)
+LOGGER.warn("Created Spark application with id = [%s]" % spark_application_id)
+
hadoop_command = load_yaml("commands/hadoop271.yml")
hadoop_command_id = genie.create_command(hadoop_command)
LOGGER.info("Created hadoop command with id = [%s]" % hadoop_command_id)
LOGGER.warn("Created Hadoop command with id = [%s]" % hadoop_command_id)

hdfs_command = load_yaml("commands/hdfs271.yml")
hdfs_command_id = genie.create_command(hdfs_command)
LOGGER.info("Created HDFS command with id = [%s]" % hdfs_command_id)
LOGGER.warn("Created HDFS command with id = [%s]" % hdfs_command_id)

yarn_command = load_yaml("commands/yarn271.yml")
yarn_command_id = genie.create_command(yarn_command)
LOGGER.info("Created yarn command with id = [%s]" % yarn_command_id)
LOGGER.warn("Created Yarn command with id = [%s]" % yarn_command_id)

+spark_shell_command = load_yaml("commands/sparkShell201.yml")
+spark_shell_command_id = genie.create_command(spark_shell_command)
+LOGGER.warn("Created Spark Shell command with id = [%s]" % spark_shell_command_id)
+
+spark_submit_command = load_yaml("commands/sparkSubmit201.yml")
+spark_submit_command_id = genie.create_command(spark_submit_command)
+LOGGER.warn("Created Spark Submit command with id = [%s]" % spark_submit_command_id)
+
genie.set_application_for_command(hadoop_command_id, [hadoop_application_id])
+LOGGER.warn("Set applications for Hadoop command to = [%s]" % hadoop_application_id)
genie.set_application_for_command(hdfs_command_id, [hadoop_application_id])
+LOGGER.warn("Set applications for HDFS command to = [%s]" % hadoop_application_id)
genie.set_application_for_command(yarn_command_id, [hadoop_application_id])
+LOGGER.warn("Set applications for Yarn command to = [%s]" % hadoop_application_id)
+genie.set_application_for_command(spark_shell_command_id, [hadoop_application_id, spark_application_id])
+LOGGER.warn("Set applications for Spark Shell command to = [%s]" % [hadoop_application_id, spark_application_id])
+genie.set_application_for_command(spark_submit_command_id, [hadoop_application_id, spark_application_id])
+LOGGER.warn("Set applications for Spark Submit command to = [%s]" % [hadoop_application_id, spark_application_id])

prod_cluster = load_yaml("clusters/prod.yml")
prod_cluster_id = genie.create_cluster(prod_cluster)
LOGGER.info("Created prod cluster with id = [%s]" % prod_cluster_id)
LOGGER.warn("Created prod cluster with id = [%s]" % prod_cluster_id)

test_cluster = load_yaml("clusters/test.yml")
test_cluster_id = genie.create_cluster(test_cluster)
-LOGGER.info("Created test cluster with id = [%s]" % test_cluster_id)
+LOGGER.warn("Created test cluster with id = [%s]" % test_cluster_id)

-genie.set_commands_for_cluster(prod_cluster_id, [hadoop_command_id, hdfs_command_id, yarn_command_id])
-genie.set_commands_for_cluster(test_cluster_id, [hadoop_command_id, hdfs_command_id, yarn_command_id])
+genie.set_commands_for_cluster(
+    prod_cluster_id,
+    [hadoop_command_id, hdfs_command_id, yarn_command_id, spark_shell_command_id, spark_submit_command_id]
+)
+LOGGER.warn("Added all commands to the prod cluster with id = [%s]" % prod_cluster_id)
+genie.set_commands_for_cluster(
+    test_cluster_id,
+    [hadoop_command_id, hdfs_command_id, yarn_command_id, spark_shell_command_id, spark_submit_command_id]
+)
+LOGGER.warn("Added all commands to the test cluster with id = [%s]" % test_cluster_id)
4 changes: 2 additions & 2 deletions genie-demo/src/main/docker/client/example/run_hadoop_job.py
@@ -34,9 +34,9 @@

# Create a job instance and fill in the required parameters
job = pygenie.jobs.HadoopJob() \
-    .job_name('GenieDockerExampleHadoopJob') \
+    .job_name('Genie Demo Hadoop Job') \
    .genie_username('root') \
-    .job_version('2.7.1')
+    .job_version('3.0.0')

# Set cluster criteria which determine the cluster to run the job on
job.cluster_tags(['sched:' + str(sys.argv[1]), 'type:yarn'])
4 changes: 2 additions & 2 deletions genie-demo/src/main/docker/client/example/run_hdfs_job.py
@@ -34,9 +34,9 @@

# Create a job instance and fill in the required parameters
job = pygenie.jobs.HadoopJob() \
-    .job_name('GenieDemoHDFSJob') \
+    .job_name('Genie Demo HDFS Job') \
    .genie_username('root') \
-    .job_version('2.7.1')
+    .job_version('3.0.0')

# Set cluster criteria which determine the cluster to run the job on
job.cluster_tags(['sched:' + str(sys.argv[1]), 'type:yarn'])
64 changes: 64 additions & 0 deletions genie-demo/src/main/docker/client/example/run_spark_shell_job.py
@@ -0,0 +1,64 @@
#!/usr/bin/python2.7

# Copyright 2016 Netflix, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##################################################################################
# This script assumes init_demo.py has already been run to configure Genie and
# that this script is executed on the host where Genie is running. If it's
# executed on another host, change the Genie URL below.
##################################################################################

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import pygenie
import sys

logging.basicConfig(level=logging.ERROR)

LOGGER = logging.getLogger(__name__)

pygenie.conf.DEFAULT_GENIE_URL = "http://genie:8080"

# Create a job instance and fill in the required parameters
job = pygenie.jobs.GenieJob() \
    .job_name('Genie Demo Spark Shell Job') \
    .genie_username('root') \
    .job_version('3.0.0')

# Set cluster criteria which determine the cluster to run the job on
job.cluster_tags(['sched:' + str(sys.argv[1]), 'type:yarn'])

# Set command criteria which will determine what command Genie executes for the job
job.command_tags(['type:spark-shell'])

# Any command line arguments to run along with the command. In this case we
# simply pass --help so the shell prints its usage and exits. A real job could
# pass a script or query via an attachment or file dependency instead.
job.command_arguments(
    "--help"
)

# Submit the job to Genie
running_job = job.execute()

print('Job {} is {}'.format(running_job.job_id, running_job.status))
print(running_job.job_link)

# Block and wait until job is done
running_job.wait()

print('Job {} finished with status {}'.format(running_job.job_id, running_job.status))