In [1]:
import findspark
# Runs before the first pyspark import in this notebook; findspark is used here
# to make the local Spark installation importable from this kernel.
findspark.init()
# Last expression of the cell: echoes the Spark home path that findspark resolved.
findspark.find()

'C:\\Spark\\spark-hadoop'

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a local-mode session that uses every available core.
# The spark-deep-learning package is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "28g")
    .config("spark.jars.packages", "databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11")
    .appName("ch31_DeepLearningLappy")
    .getOrCreate()
)

# Alternative builder options, kept for reference only (not applied to the session above):
# .config("spark.jars.packages","databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11,databricks:tensorframes:0.5.0-s_2.11")\
# .config("spark.driver.memory", "8g")\
# .config("spark.executor.memory", "3g")\
# .config("spark.executor.cores", "4")\
# .config("spark.executor.instances", "10")\


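# --- Reference only: excerpt of a spark-env.sh template (bash); nothing below is executed by this notebook ---
#!/usr/bin/env bash

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.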

# Options read in YARN client mode
#SPARK_EXECUTOR_INSTANCES="2" #Number of workers to start (Default: 2)
#SPARK_EXECUTOR_CORES="1" #Number of cores for the workers (Default: 1).
#SPARK_EXECUTOR_MEMORY="1G" #Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
#SPARK_DRIVER_MEMORY="512M" #Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
#SPARK_YARN_APP_NAME="spark" #The name of your application (Default: Spark)
#SPARK_YARN_QUEUE="default" #The hadoop queue to use for allocation requests (Default: default)
#SPARK_YARN_DIST_FILES="" #Comma separated list of files to be distributed with the job.
#SPARK_YARN_DIST_ARCHIVES="" #Comma separated list of archives to be distributed with the job.

In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
# Bare expression so the notebook renders the session object as this cell's result.
spark

In [5]:
from sparkdl.image import imageIO
from sparkdl import DeepImageFeaturizer
from sparkdl.udf.keras_image_model import registerKerasImageUDF
from sparkdl import DeepImagePredictor

Using TensorFlow backend.


In [6]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import expr
from pyspark.sql.types import DoubleType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.image import ImageSchema

In [7]:
from pyspark.sql.functions import udf

In [37]:
import keras
from keras import applications
from keras.applications import InceptionV3
from keras_applications import inception_v3

In [9]:
# Root folder holding the per-flower image sub-folders ("tulips", "daisy").
# Set the DEEP_LEARNING_IMAGES_DIR environment variable to point at another
# copy of the data; when it is unset, the original hard-coded location is used.
import os

img_dir = os.environ.get(
    "DEEP_LEARNING_IMAGES_DIR",
    "D:/Learn/GitRepos/Spark-The-Definitive-Guide/data/deep-learning-images",
)

In [10]:
# Tulip images form class 1: read them with Spark's ImageSchema reader and
# attach the constant label column.
tulips_df = (
    ImageSchema.readImages(img_dir + "/tulips")
    .withColumn("label", lit(1))
)
# Row count shown as the cell result.
tulips_df.count()

30

In [11]:
# Daisy images form class 0: read them through sparkdl's imageIO with the PIL
# decoder and attach the constant label column.
daisy_df = (
    imageIO.readImagesWithCustomFn(img_dir + "/daisy", decode_f=imageIO.PIL_decode)
    .withColumn("label", lit(0))
)
# Row count shown as the cell result.
daisy_df.count()

30

In [12]:
# 60/40 train/test split, done per class so both classes appear in each side.
# A fixed seed is passed so that, for the same input data, the split (and the
# accuracy computed from it) does not change on every kernel restart.
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4], seed=42)
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4], seed=42)

In [13]:
# Stack the two classes into the final train / test frames and mark them for
# caching, since both are reused by several later cells.
# union() is used instead of unionAll(), which the PySpark 2.x API marks as
# deprecated; both append rows by column position without de-duplicating.
train_df = tulips_train.union(daisy_train).cache()
test_df = tulips_test.union(daisy_test).cache()

In [14]:
# InceptionV3-based featurizer stage: reads the "image" column and writes "features".
featurizer = DeepImageFeaturizer(
    modelName="InceptionV3",
    inputCol="image",
    outputCol="features",
)

In [15]:
# Classifier stage trained against the "label" column, with elastic-net
# regularisation (regParam / elasticNetParam) and a single optimiser iteration.
lr = LogisticRegression(
    labelCol="label",
    maxIter=1,
    regParam=0.05,
    elasticNetParam=0.3,
)

In [16]:
# Two-stage pipeline: the featurizer runs first, then the logistic regression.
p = Pipeline(stages=[featurizer, lr])

In [17]:
# Fit the whole pipeline on the training images.
p_model = p.fit(train_df)

In [18]:
# Apply the fitted pipeline to the held-out images.
tested_df = p_model.transform(test_df)

In [19]:
# Accuracy evaluator; the prediction/label column names are spelled out and
# equal the evaluator's defaults.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

In [20]:
# Report accuracy on the scored test set; only the two columns the evaluator
# reads are passed to it.
print(
  "Test set accuracy = "
  + str(evaluator.evaluate(tested_df.select("prediction", "label")))
)

Test set accuracy = 0.7916666666666666


In [21]:
# COMMAND ----------
def _p1(v):
  """Return element 1 of ``v.array`` as a Python float.

  ``v`` is expected to expose an indexable ``array`` attribute; in this
  notebook it is applied to the ``probability`` column, so index 1 is
  presumably the probability of label 1 (tulip) -- confirm against the model.
  """
  second_entry = v.array[1]
  return float(second_entry)

# Wrap the plain Python helper as a Spark UDF that yields a double column.
p1 = udf(f=_p1, returnType=DoubleType())

In [22]:
# Add "p_1": the value the p1 UDF extracts from each row's probability vector.
df = tested_df.withColumn(
  "p_1",
  p1(tested_df["probability"]),
)

In [23]:
# Preview the first 10 scored rows, including the new p_1 column.
df.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+------------------+
|               image|label|            features|       rawPrediction|         probability|prediction|               p_1|
+--------------------+-----+--------------------+--------------------+--------------------+----------+------------------+
|[file:/D:/Learn/G...|    1|[0.0,0.3294818997...|[-5.3918399720315...|[0.00453294598533...|       1.0|0.9954670540146618|
|[file:/D:/Learn/G...|    1|[0.0,0.0,0.359686...|[-4.6291384043407...|[0.00966876956653...|       1.0|0.9903312304334672|
|[file:/D:/Learn/G...|    1|[0.16776393353939...|[-8.804625524635,...|[1.50014957699018...|       1.0| 0.999849985042301|
|[file:/D:/Learn/G...|    1|[0.12303753197193...|[-11.211174730091...|[1.35220583753356...|       1.0|0.9999864779416247|
|[file:/D:/Learn/G...|    1|[0.27778720855712...|[-6.5460241641661...|[0.00143375421856...|       1.0|0.9985662457814325|
|[file:/D:/Learn/G...|  

In [24]:
# Sort by |p_1 - label| descending, so the rows where the score is furthest
# from the true label come first.
wrong_df = df.orderBy(expr("abs(p_1 - label)").desc())

In [25]:
# Show the 10 rows whose p_1 is furthest from the label.
wrong_df.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+-------------------+
|               image|label|            features|       rawPrediction|         probability|prediction|                p_1|
+--------------------+-----+--------------------+--------------------+--------------------+----------+-------------------+
|[file:/D:/Learn/G...|    0|[0.0,0.0,0.245340...|[-1.8436742601419...|[0.13661732352612...|       1.0| 0.8633826764738716|
|[file:/D:/Learn/G...|    0|[0.60087662935256...|[-1.6599966513645...|[0.15976244645864...|       1.0| 0.8402375535413534|
|[file:/D:/Learn/G...|    0|[0.0,0.0,0.0,0.0,...|[-1.1559568327055...|[0.23940272580155...|       1.0| 0.7605972741984476|
|[file:/D:/Learn/G...|    1|[0.0,0.0,0.0,0.0,...|[0.54781988125848...|[0.63362963904372...|       0.0| 0.3663703609562728|
|[file:/D:/Learn/G...|    0|[0.58451932668685...|[-0.0743093599174...|[0.48143120379124...|       1.0| 0.5185687962087551|
|[file:/D:/Learn

In [26]:
# COMMAND ----------
# image_df = imageIO.readImagesWithCustomFn(img_dir, decode_f=imageIO.PIL_decode)
# image_df.count()

In [27]:
# Peek at the training frame (image struct + label).
train_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [28]:
# Peek at the test frame (image struct + label).
test_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    0|
+--------------------+-----+
only showing top 10 rows



In [29]:
# Recombine train and test rows into one frame holding every labelled image.
# union() is used instead of unionAll(), which the PySpark 2.x API marks as
# deprecated; both append rows by column position without de-duplicating.
image_df = train_df.union(test_df)

In [30]:
# Peek at the combined frame.
image_df.show(10)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
|[file:/D:/Learn/G...|    1|
+--------------------+-----+
only showing top 10 rows



In [31]:
# Predictor built on the named InceptionV3 model: reads "image" and writes
# "predicted_labels", with decoded predictions limited to the top 10 entries.
predictor = DeepImagePredictor(
  modelName="InceptionV3",
  inputCol="image",
  outputCol="predicted_labels",
  topK=10,
  decodePredictions=True,
)

In [32]:
# Run the InceptionV3 predictor over every image and mark the result for caching.
predictions_base_model_df = predictor.transform(image_df).cache()

INFO:tensorflow:Froze 378 variables.
INFO:tensorflow:Converted 378 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.


In [33]:
# Score the same images with the fitted featurizer + logistic-regression pipeline.
# NOTE(review): image_df includes the rows p_model was trained on.
predictions_transfer_df = p_model.transform(image_df)

In [34]:
# Action that forces the pipeline output to be computed; the row count is the cell result.
predictions_transfer_df.count()

60

In [41]:
#predictions_base_model_df.count()

# File "d:\learn\.virtualenvs\pyspark-lab\lib\site-packages\keras_applications\__init__.py", line 39, in get_keras_submodule
#     raise ImportError('You need to first `import keras` '
# ImportError: You need to first `import keras` in order to use `keras_applications`. For instance, you can do:

# ```
# import keras
# from keras_applications import vgg16
# ```

# Or, preferably, this equivalent formulation:

# ```
# from keras import applications