In [1]:
import findspark
findspark.init('/usr/hdp/current/spark2-client')
findspark.find()

'/usr/hdp/current/spark2-client'

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]")\
.appName("ch31_DeepLearning").getOrCreate()

# .config("spark.driver.memory", "8g")\
# .config("spark.executor.memory", "3g")\
# .config("spark.executor.cores", "5")\
# .config("spark.executor.instances", "10")\


# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
# Options read in YARN client mode
#SPARK_EXECUTOR_INSTANCES="2" #Number of workers to start (Default: 2)
#SPARK_EXECUTOR_CORES="1" #Number of cores for the workers (Default: 1).
#SPARK_EXECUTOR_MEMORY="1G" #Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
#SPARK_DRIVER_MEMORY="512M" #Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)

In [3]:
sc = spark.sparkContext

In [4]:
for x in sc._conf.getAll():
    if "/proxy/" in x[1]:
        print(x[1])

In [5]:
# pip uninstall pandas tensorflow keras tensorframes tensorflowonspark kafka jieba Pillow sparkdl -y
# rm -rf ~/.cache/pip/*
# pip install --upgrade pandas tensorflow keras tensorframes tensorflowonspark kafka jieba Pillow sparkdl 

from sparkdl import readImages

Using TensorFlow backend.


In [6]:
img_dir = '/user/kranthidr/dataSets/spark-guide/deep-learning-images/'


In [7]:
# COMMAND ----------
from pyspark.sql.functions import lit

In [8]:
tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1))

In [9]:
daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0))

In [10]:
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])

In [11]:
train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

In [12]:
# COMMAND ----------
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

In [13]:
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

In [14]:
lr = LogisticRegression(maxIter=1, regParam=0.05, elasticNetParam=0.3, labelCol="label")

In [15]:
p = Pipeline(stages=[featurizer, lr])

In [16]:
p_model = p.fit(train_df)

INFO:tensorflow:Froze 376 variables.


2018-09-07 02:13:01,283 INFO (MainThread-25903) Froze 376 variables.


INFO:tensorflow:Converted 376 variables to const ops.


2018-09-07 02:13:01,441 INFO (MainThread-25903) Converted 376 variables to const ops.


INFO:tensorflow:Froze 0 variables.


2018-09-07 02:13:37,641 INFO (MainThread-25903) Froze 0 variables.


INFO:tensorflow:Converted 0 variables to const ops.


2018-09-07 02:13:37,675 INFO (MainThread-25903) Converted 0 variables to const ops.
2018-09-07 02:13:38,441 INFO (MainThread-25903) Fetch names: ['sdl_flattened_mixed10/concat:0']
2018-09-07 02:13:38,443 INFO (MainThread-25903) Spark context = <SparkContext master=local[*] appName=ch31_DeepLearning>


Py4JJavaError: An error occurred while calling o216.loadClass.
: java.lang.ClassNotFoundException: org.tensorframes.impl.DebugRowOps
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
tested_df = p_model.transform(test_df)

In [None]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

In [None]:
print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select(
  "prediction", "label"))))

In [None]:
# COMMAND ----------

from pyspark.sql.types import DoubleType
from pyspark.sql.functions import expr
# a simple UDF to convert the value to a double
def _p1(v):
  return float(v.array[1])
p1 = udf(_p1, DoubleType())
df = tested_df.withColumn("p_1", p1(tested_df.probability))
wrong_df = df.orderBy(expr("abs(p_1 - label)"), ascending=False)
wrong_df.select("filePath", "p_1", "label").limit(10).show()


# COMMAND ----------

from sparkdl import readImages, DeepImagePredictor
image_df = readImages(img_dir)
predictor = DeepImagePredictor(
  inputCol="image",
  outputCol="predicted_labels",
  modelName="InceptionV3",
  decodePredictions=True,
  topK=10)
predictions_df = predictor.transform(image_df)


# COMMAND ----------

df = p_model.transform(image_df)


# COMMAND ----------

from keras.applications import InceptionV3
from sparkdl.udf.keras_image_model import registerKerasImageUDF
from keras.applications import InceptionV3
registerKerasImageUDF("my_keras_inception_udf", InceptionV3(weights="imagenet"))


# COMMAND ----------