## Part 1

In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\Spark\\spark-hadoop'

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]")\
.config("spark.driver.memory", "12g")\
.config("spark.jars.packages","databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11")\
.appName("ch31_DeepLearningLappy-docs").getOrCreate()

#.config("spark.jars.packages","databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11,databricks:tensorframes:0.5.0-s_2.11")\
# .config("spark.driver.memory", "8g")\
# .config("spark.executor.memory", "3g")\
# .config("spark.executor.cores", "4")\
# .config("spark.executor.instances", "10")\


#!/usr/bin/env bash

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read in YARN client mode
#SPARK_EXECUTOR_INSTANCES="2" #Number of workers to start (Default: 2)
#SPARK_EXECUTOR_CORES="1" #Number of cores for the workers (Default: 1).
#SPARK_EXECUTOR_MEMORY="1G" #Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
#SPARK_DRIVER_MEMORY="512M" #Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
#SPARK_YARN_APP_NAME="spark" #The name of your application (Default: Spark)
#SPARK_YARN_QUEUE="default" #The hadoop queue to use for allocation requests (Default: default)
#SPARK_YARN_DIST_FILES="" #Comma separated list of files to be distributed with the job.
#SPARK_YARN_DIST_ARCHIVES="" #Comma separated list of archives to be distributed with the job.

In [3]:
sc = spark.sparkContext

In [4]:
spark

In [5]:
from sparkdl.image import imageIO
from pyspark.ml.image import ImageSchema

Using TensorFlow backend.


In [6]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import expr
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

In [7]:
# In[15]:
img_dir = "D:/Learn/GitRepos/Spark-The-Definitive-Guide/data/deep-learning-images"
img_dir_comb = "D:\Learn\GitRepos\Spark-The-Definitive-Guide\data\deep-learning-images\Combined30"

In [8]:
tulips_df = ImageSchema.readImages(img_dir + "/tulips").withColumn("label", lit(1))
tulips_df.count()

# Signature: ImageSchema.readImages(path, recursive=False, 
#                                   numPartitions=-1, dropImageFailures=False, 
#                                   sampleRatio=1.0, seed=0)
# Docstring:
# Reads the directory of images from the local or remote source.

# .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,
#     there may be a race condition where one job overwrites the hadoop configs of another.

# .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but
#     potentially non-deterministic.

# :param str path: Path to the image directory.
# :param bool recursive: Recursive search flag.
# :param int numPartitions: Number of DataFrame partitions.
# :param bool dropImageFailures: Drop the files that are not valid images.
# :param float sampleRatio: Fraction of the images loaded.
# :param int seed: Random number seed.
# :return: a :class:`DataFrame` with a single column of "images",
#        see ImageSchema for details.

30

In [9]:
# In[16]:
daisy_df = imageIO.readImagesWithCustomFn(img_dir + "/daisy", decode_f=imageIO.PIL_decode).withColumn("label", lit(0))
daisy_df.count()

# Signature: imageIO.readImagesWithCustomFn(path, decode_f, numPartition=None)
# Docstring:
# Read a directory of images (or a single image) into a DataFrame using a custom library to
# decode the images.

# :param path: str, file path.
# :param decode_f: function to decode the raw bytes into an array compatible with one of the
#     supported OpenCv modes. see @imageIO.PIL_decode for an example.

30

In [10]:
# In[ ]:
tulips_train, tulips_test = tulips_df.randomSplit([0.8, 0.2])
daisy_train, daisy_test = daisy_df.randomSplit([0.8, 0.2])

In [11]:
# In[ ]:
train_images_df = tulips_train.unionAll(daisy_train).cache()
test_images_df = tulips_test.unionAll(daisy_test).cache()

In [12]:
print(train_images_df.count())
print(test_images_df.count())

48
12


In [13]:
train_images_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = false)
 |    |-- width: integer (nullable = false)
 |    |-- nChannels: integer (nullable = false)
 |    |-- mode: integer (nullable = false)
 |    |-- data: binary (nullable = false)
 |-- label: integer (nullable = false)



In [14]:
print(train_images_df.schema)

StructType(List(StructField(image,StructType(List(StructField(origin,StringType,true),StructField(height,IntegerType,false),StructField(width,IntegerType,false),StructField(nChannels,IntegerType,false),StructField(mode,IntegerType,false),StructField(data,BinaryType,false))),true),StructField(label,IntegerType,false)))


## Part 2 

In [15]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [16]:
from sparkdl import DeepImageFeaturizer

In [17]:
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

# Init signature: DeepImageFeaturizer(inputCol=None, outputCol=None, 
#                                     modelName=None, 
#                                     scaleHint='SCALE_AREA_AVERAGING')
# Docstring:     
# Applies the model specified by its popular name, with its prediction layer(s) chopped off,
# to the image column in DataFrame. The output is a MLlib Vector so that DeepImageFeaturizer
# can be used in a MLlib Pipeline.
# The input image column should be ImageSchema.

In [18]:
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

p = Pipeline(stages=[featurizer, lr])

model = p.fit(train_images_df)    # train_images_df is a dataset of images and labels

In [19]:
# Inspect training error
fitted_train_df = model.transform(train_images_df.limit(10))

In [20]:
fitted_train_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = false)
 |    |-- width: integer (nullable = false)
 |    |-- nChannels: integer (nullable = false)
 |    |-- mode: integer (nullable = false)
 |    |-- data: binary (nullable = false)
 |-- label: integer (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
predictionAndLabels = fitted_train_df.select("prediction", "label")

In [22]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Training set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Training set accuracy = 1.0


In [23]:
# Inspect testing error
fitted_test_df = model.transform(test_images_df.limit(10))
predictionAndLabels = fitted_test_df.select("prediction", "label")

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 1.0


In [24]:
fitted_test_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = false)
 |    |-- width: integer (nullable = false)
 |    |-- nChannels: integer (nullable = false)
 |    |-- mode: integer (nullable = false)
 |    |-- data: binary (nullable = false)
 |-- label: integer (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [25]:
fitted_test_df.select("probability").take(1)

[Row(probability=DenseVector([0.0148, 0.9852]))]

In [26]:
fitted_test_df.select("probability").take(1)[0][0][1]

0.9851838298776144

In [27]:
# COMMAND ----------
def _p11(v):
  return float(v[1])

p11 = udf(_p11, DoubleType())

In [28]:
# COMMAND ----------
def _p1(v):
  return float(v.array[1])

p1 = udf(_p1, DoubleType())

In [29]:
fitted_test_df.withColumn("p_1", p11(fitted_test_df.probability)).show(5)

+--------------------+-----+--------------------+--------------------+--------------------+----------+------------------+
|               image|label|            features|       rawPrediction|         probability|prediction|               p_1|
+--------------------+-----+--------------------+--------------------+--------------------+----------+------------------+
|[file:/D:/Learn/G...|    1|[0.0,0.4272569119...|[-4.1971090928960...|[0.01481617012238...|       1.0|0.9851838298776144|
|[file:/D:/Learn/G...|    1|[1.04818153381347...|[-2.9468799135994...|[0.04988418288782...|       1.0|0.9501158171121739|
|[file:/D:/Learn/G...|    1|[0.0,0.0,2.699823...|[-2.0780772894035...|[0.11124592357950...|       1.0|0.8887540764204914|
|[file:/D:/Learn/G...|    1|[0.0,0.8529530763...|[-3.5127146706501...|[0.02895261692765...|       1.0| 0.971047383072343|
|[file:/D:/Learn/G...|    1|[0.05848193168640...|[-3.5488074453439...|[0.02795496140989...|       1.0|0.9720450385901044|
+--------------------+--

In [30]:
df = fitted_test_df.withColumn("p_1", p1(fitted_test_df.probability))

In [31]:
df.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+--------------------+
|               image|label|            features|       rawPrediction|         probability|prediction|                 p_1|
+--------------------+-----+--------------------+--------------------+--------------------+----------+--------------------+
|[file:/D:/Learn/G...|    1|[0.0,0.4272569119...|[-4.1971090928960...|[0.01481617012238...|       1.0|  0.9851838298776144|
|[file:/D:/Learn/G...|    1|[1.04818153381347...|[-2.9468799135994...|[0.04988418288782...|       1.0|  0.9501158171121739|
|[file:/D:/Learn/G...|    1|[0.0,0.0,2.699823...|[-2.0780772894035...|[0.11124592357950...|       1.0|  0.8887540764204914|
|[file:/D:/Learn/G...|    1|[0.0,0.8529530763...|[-3.5127146706501...|[0.02895261692765...|       1.0|   0.971047383072343|
|[file:/D:/Learn/G...|    1|[0.05848193168640...|[-3.5488074453439...|[0.02795496140989...|       1.0|  0.9720450385901044|
|[file:/

In [32]:
wrong_df = df.orderBy(expr("abs(p_1 - label)"), ascending=False)

In [33]:
wrong_df.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+--------------------+
|               image|label|            features|       rawPrediction|         probability|prediction|                 p_1|
+--------------------+-----+--------------------+--------------------+--------------------+----------+--------------------+
|[file:/D:/Learn/G...|    0|[0.0,0.0,0.0,0.0,...|[1.40196189433121...|[0.80249502700031...|       0.0|  0.1975049729996854|
|[file:/D:/Learn/G...|    1|[0.0,0.0,2.699823...|[-2.0780772894035...|[0.11124592357950...|       1.0|  0.8887540764204914|
|[file:/D:/Learn/G...|    0|[0.0,1.2654095888...|[2.56011979726147...|[0.92825043683049...|       0.0| 0.07174956316950797|
|[file:/D:/Learn/G...|    1|[1.04818153381347...|[-2.9468799135994...|[0.04988418288782...|       1.0|  0.9501158171121739|
|[file:/D:/Learn/G...|    0|[0.29323047399520...|[3.00433183848632...|[0.95276944159999...|       0.0|  0.0472305584000098|
|[file:/

In [34]:
# COMMAND ----------
image1_df = imageIO.readImagesWithCustomFn(img_dir_comb, decode_f=imageIO.PIL_decode)
image1_df.count() 
#Will not recurse through directories

0

In [35]:
train_images_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [36]:
train_images_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [37]:
image_df = train_images_df.unionAll(test_images_df)

In [38]:
image_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [39]:
predictions_transfer_df = model.transform(image_df).cache()

In [40]:
predictions_transfer_df.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|               image|label|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|[file:/D:/Learn/G...|    1|[0.0,0.0,0.0,0.0,...|[-3.2110744003065...|[0.03875109380452...|       1.0|
|[file:/D:/Learn/G...|    1|[0.0,0.9765790700...|[-4.0164494456135...|[0.01769796022814...|       1.0|
|[file:/D:/Learn/G...|    1|[1.43618118762969...|[-3.2169692472658...|[0.03853211006698...|       1.0|
|[file:/D:/Learn/G...|    1|[0.0,0.0,0.0,0.0,...|[-3.4733017899780...|[0.03008149686384...|       1.0|
|[file:/D:/Learn/G...|    1|[0.25410449504852...|[-3.4679241210213...|[0.03023879608859...|       1.0|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows

