In [1]:
# Locate the local Spark installation before the pyspark imports in the next cell.
# NOTE(review): presumably init() is what makes `pyspark` importable here - confirm.
import findspark
findspark.init()
# Last expression of the cell: displays the path findspark resolved (see cell output).
findspark.find()

'C:\\Spark\\spark-hadoop'

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores,
# with a 12 GB driver heap and the spark-deep-learning package pulled in via
# spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .config(
        "spark.jars.packages",
        "databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11",
    )
    .appName("ch31_DeepLearningLappy")
    .getOrCreate()
)

#.config("spark.jars.packages","databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11,databricks:tensorframes:0.5.0-s_2.11")\
# .config("spark.driver.memory", "8g")\
# .config("spark.executor.memory", "3g")\
# .config("spark.executor.cores", "4")\
# .config("spark.executor.instances", "10")\


#!/usr/bin/env bash

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read in YARN client mode
#SPARK_EXECUTOR_INSTANCES="2" #Number of workers to start (Default: 2)
#SPARK_EXECUTOR_CORES="1" #Number of cores for the workers (Default: 1).
#SPARK_EXECUTOR_MEMORY="1G" #Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
#SPARK_DRIVER_MEMORY="512M" #Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
#SPARK_YARN_APP_NAME="spark" #The name of your application (Default: Spark)
#SPARK_YARN_QUEUE="default" #The hadoop queue to use for allocation requests (Default: default)
#SPARK_YARN_DIST_FILES="" #Comma separated list of files to be distributed with the job.
#SPARK_YARN_DIST_ARCHIVES="" #Comma separated list of archives to be distributed with the job.

In [3]:
# Keep a shorthand handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
# Bare expression: render the session object as this cell's output.
spark

In [5]:
from sparkdl.image import imageIO

from pyspark.sql.functions import lit
from pyspark.sql.functions import expr
from pyspark.sql.types import DoubleType

from pyspark.ml.image import ImageSchema

Using TensorFlow backend.


In [6]:
# In[15]:
# Root folder holding one sub-directory of images per flower class.
img_dir = "D:/Learn/GitRepos/Spark-The-Definitive-Guide/data/deep-learning-images"
# Windows-style path to the combined image folder. Written as a raw string so
# the backslashes are taken literally: sequences such as "\L", "\G" and "\d"
# are invalid escapes that Python warns about and will eventually reject.
# The resulting value is unchanged.
img_dir_comb = r"D:\Learn\GitRepos\Spark-The-Definitive-Guide\data\deep-learning-images\Combined30"

In [7]:
# Read every image under <img_dir>/tulips and tag each row with label 1.
tulips_df = (
    ImageSchema.readImages(img_dir + "/tulips")
    .withColumn("label", lit(1))
)
# Row count of the loaded tulip images (displayed as the cell output).
tulips_df.count()

# Signature: ImageSchema.readImages(path, recursive=False, 
#                                   numPartitions=-1, dropImageFailures=False, 
#                                   sampleRatio=1.0, seed=0)
# Docstring:
# Reads the directory of images from the local or remote source.

# .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,
#     there may be a race condition where one job overwrites the hadoop configs of another.

# .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but
#     potentially non-deterministic.

# :param str path: Path to the image directory.
# :param bool recursive: Recursive search flag.
# :param int numPartitions: Number of DataFrame partitions.
# :param bool dropImageFailures: Drop the files that are not valid images.
# :param float sampleRatio: Fraction of the images loaded.
# :param int seed: Random number seed.
# :return: a :class:`DataFrame` with a single column of "images",
#        see ImageSchema for details.

30

In [8]:
# In[16]:
# Read every image under <img_dir>/daisy through sparkdl's custom-decoder
# reader (decoding with imageIO.PIL_decode) and tag each row with label 0.
daisy_df = (
    imageIO.readImagesWithCustomFn(
        img_dir + "/daisy",
        decode_f=imageIO.PIL_decode,
    )
    .withColumn("label", lit(0))
)
# Row count of the loaded daisy images (displayed as the cell output).
daisy_df.count()

# Signature: imageIO.readImagesWithCustomFn(path, decode_f, numPartition=None)
# Docstring:
# Read a directory of images (or a single image) into a DataFrame using a custom library to
# decode the images.

# :param path: str, file path.
# :param decode_f: function to decode the raw bytes into an array compatible with one of the
#     supported OpenCv modes. see @imageIO.PIL_decode for an example.

30

In [9]:
# In[ ]:
# Split each class 60/40 into train and test portions.
# A fixed seed makes the split repeatable across kernel restarts; without one
# Spark draws a new random seed on every call, so downstream results change
# from run to run.
SPLIT_SEED = 42
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4], seed=SPLIT_SEED)
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4], seed=SPLIT_SEED)

In [10]:
# In[ ]:
# Stack the tulip and daisy portions into a single training set and a single
# test set, and mark both for caching since they are reused by later cells.
# `union` replaces `unionAll`, which has been deprecated since Spark 2.0; both
# append rows by column position and keep duplicates.
train_df = tulips_train.union(daisy_train).cache()
test_df = tulips_test.union(daisy_test).cache()

In [11]:
# Print the column layout of the training set (an `image` struct plus `label`).
train_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = false)
 |    |-- width: integer (nullable = false)
 |    |-- nChannels: integer (nullable = false)
 |    |-- mode: integer (nullable = false)
 |    |-- data: binary (nullable = false)
 |-- label: integer (nullable = false)



In [12]:
# Preview the first 10 rows of the training set.
train_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [13]:
# Preview the first 10 rows of the test set.
test_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [14]:
# Recombine the training and test rows into one DataFrame of all labelled images.
# `union` replaces the deprecated `unionAll` (same positional, duplicate-keeping
# semantics).
image_df = train_df.union(test_df)

In [15]:
# Preview the first 10 rows of the combined DataFrame.
image_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [16]:
from keras.applications import InceptionV3
from sparkdl.udf.keras_image_model import registerKerasImageUDF

In [17]:
# Expose an ImageNet-pretrained InceptionV3 network to Spark SQL as a UDF named
# "my_keras_inception_udf". The call's return value is shown as the cell output.
registerKerasImageUDF(
    udf_name="my_keras_inception_udf",
    keras_model_or_file_path=InceptionV3(weights="imagenet"),
)

# Signature: registerKerasImageUDF(udf_name, keras_model_or_file_path, preprocessor=None)
# Docstring:
# Create a Keras image model as a Spark SQL UDF.
# The UDF takes a column (formatted in :py:const:`sparkdl.image.imageIO.imageSchema`)
# and produces the output of the given Keras model (e.g.
# for `Inception V3 <https://keras.io/applications/#inceptionv3>`_
# it produces a real valued score vector over the ImageNet object categories).
# For other models, the output could have different meanings.
# Please consult the actual models specification.


INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.




INFO:tensorflow:Froze 378 variables.
INFO:tensorflow:Converted 378 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.


<sparkdl.graph.builder.GraphFunction at 0x24b177cbbe0>