In [27]:
%pip list


Package                       Version
----------------------------- -----------
aiobotocore                   2.5.0
aiohttp                       3.8.4
aioitertools                  0.11.0
aiosignal                     1.3.1
alembic                       1.10.3
anyio                         3.6.2
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asttokens                     2.2.1
async-generator               1.10
async-timeout                 4.0.2
attrs                         22.2.0
autovizwidget                 0.20.5
Babel                         2.12.1
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.12.2
bleach                        6.0.0
blinker                       1.6.2
boto3                         1.26.76
botocore                      1.20.106
brotlipy                      0.7.0
cachetools                    5.3.0
certifi                       2022.12.7
certipy                       0.1.3
cffi

**Table of contents**<a id='toc0_'></a>    
- 1. [Launch a Spark session](#toc1_)    
- 2. [Libraries imports](#toc2_)    
- 3. [Define paths to the S3 bucket](#toc3_)    
- 4. [ Get images path and label into a spark df](#toc4_)    
- 5. [Build model for feature extraction and broadcast its weights.](#toc5_)    
- 6. [Functions](#toc6_)    
- 7. [Actions](#toc7_)    
- 8. [Loading the results](#toc8_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1.  [&#9650;](#toc0_) <a id='toc1_'></a>Launch a Spark session

This notebook is designed to be launched via a spark kernel inside an EMR cluster.

If so, the next cell automatically launches a Spark Session and returns its major attributes.

# 2.  [&#9650;](#toc0_) <a id='toc2_'></a>Libraries imports

In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import io

import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model

from pyspark.sql.functions import col, element_at, split
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml import Pipeline

from typing import Iterator

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1689584546333_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 3.  [&#9650;](#toc0_) <a id='toc3_'></a>Define paths to the S3 bucket

In [4]:
PATH = 's3://oc-cloud-computing'
IMAGE_SUBSET_PATH = os.path.join(PATH, "images_subset")
RESULT_PATH = os.path.join(PATH, 'results')

print('PATH: '+ PATH \
      +'\nIMAGE_SUBSET_PATH: '+  IMAGE_SUBSET_PATH \
      +'\nRESULT_PATH: '+RESULT_PATH)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

PATH: s3://oc-cloud-computing
IMAGE_SUBSET_PATH: s3://oc-cloud-computing/images_subset
RESULT_PATH: s3://oc-cloud-computing/results

# 4.  [&#9650;](#toc0_) <a id='toc4_'></a> Get images path and label into a spark df

In [14]:
images = spark.read.format("binaryFile") \
    .option("pathGlobFilter", "*.jpg") \
    .option("recursiveFileLookup", "true") \
    .load(IMAGE_SUBSET_PATH)
    
# Add a label column from the image path
# and select the path, content and label only.
images = (
    images
    .withColumn('label', element_at(split(images['path'], '/'),-2))
    .select(
        col("path"),
        col("label"),
        col("content"),
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
images.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------------------------------------------------------+------------------+
|path                                                                 |label             |
+---------------------------------------------------------------------+------------------+
|s3://oc-cloud-computing/images_subset/Huckleberry/r_162_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Huckleberry/r_138_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Huckleberry/r_136_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Huckleberry/r_311_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Huckleberry/r_322_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Huckleberry/r_278_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Huckleberry/r_255_100.jpg      |Huckleberry       |
|s3://oc-cloud-computing/images_subset/Tomato not Ripened/r_61_100.jpg|Tomato not Ripened|

# 5.  [&#9650;](#toc0_) <a id='toc5_'></a>Build model for feature extraction and broadcast its weights.

In [7]:
# DL or load mobilenetV2
model = MobileNetV2(
    weights='imagenet',
    include_top=True,
    input_shape=(224, 224, 3)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224.h5

In [8]:
# Create a model without the last layer
new_model = Model(
    inputs=model.input,
    outputs=model.layers[-2].output
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
new_model.summary()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                             

In [10]:
# Make a broadcast object to broadcast the model weights to
# each worker.
broadcast_weights = sc.broadcast(new_model.get_weights())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
broadcast_weights

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.broadcast.Broadcast object at 0x7f5ece1160d0>

In [None]:
# spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# 6.  [&#9650;](#toc0_) <a id='toc6_'></a>Functions

In [12]:
def model_fn():
    """
    Returns a MobileNetV2 model with top layer removed 
    and set weights to the broadcasted pretrained weights.
    """
    model = MobileNetV2(weights='imagenet',
                        include_top=True,
                        input_shape=(224, 224, 3))
    
    for layer in model.layers:
        layer.trainable = False
        
    new_model = Model(inputs=model.input,
                      outputs=model.layers[-2].output)
    new_model.set_weights(broadcast_weights.value)
    return new_model


def preprocess(content):
    """
    - Takes raw image bytes and transforms it to a pillow image.
    - Resize the images to the Mobilenet input size 
    and preprocess the pixel values.    
    """
    img = Image.open(io.BytesIO(content)).resize([224, 224])
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, content_series):
    """
    Featurize a pd.Series of images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(content_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, the output features can be multi-dimensional tensors.
    # Thus, we flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)


@pandas_udf('array<float>')
def featurize_udf(
    content_series_iter: Iterator[pd.Series]
) -> Iterator[pd.Series]:
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
    
    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                                is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 7.  [&#9650;](#toc0_) <a id='toc7_'></a>Actions

In [20]:
# Build a DF with the path, label and features of each image
features_df = (
    images
    .repartition(8)
    .select(
        col("path"),
        col("label"),
        featurize_udf("content").alias("features")
   )
)

#MLLib needs some post processing of the features column format
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

features_df = (
    features_df
    .select(
        col("path"),  
        col("label"),
        list_to_vector_udf(features_df["features"]).alias("features")
   )
)

# Define a pipeline to Standardize the features
# and compute the PCA projection onto the 300 first PCs.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True
)

pca = PCA(
    k=300,
    inputCol=scaler.getOutputCol(),
    outputCol="pca_features",
)

pipeline = Pipeline(stages=[scaler , pca])
model = pipeline.fit(features_df)
features_df = model.transform(features_df)

# Write results
(
    features_df
    .drop('scaled_features')
    .write.mode("overwrite")
    .parquet(RESULT_PATH)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 8.  [&#9650;](#toc0_) <a id='toc8_'></a>Loading the results

In [21]:
df = pd.read_parquet(RESULT_PATH, engine='pyarrow')
df.shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(50, 4)

In [22]:
df.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Index(['path', 'label', 'features', 'pca_features'], dtype='object')

In [23]:
df.loc[0, 'features']["values"].shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(1280,)

In [24]:
df.loc[0, 'pca_features']["values"].shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(300,)

In [25]:
df.loc[0, 'pca_features']["values"]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

array([-1.25796514e+01, -4.25875415e+00,  3.69081670e+00, -2.35523345e+00,
       -1.20642283e+00,  1.28430673e-01, -5.34792324e-01, -7.05377030e+00,
       -7.06826161e-01, -1.20758157e+01,  1.24143635e-01, -8.28513671e-09,
        2.24670543e-14,  6.04377659e-15, -5.16427179e-15, -4.31946146e-15,
       -5.44476722e+00,  4.40446291e-15,  9.88792381e-15,  1.33613671e-01,
       -3.05828656e+00, -1.39063007e+00, -1.23900774e+01, -8.45689306e+00,
        6.66274738e+00,  8.02657146e-01, -8.71539236e+00,  3.50754569e+00,
        2.76121055e+00, -3.91154076e+00,  3.46485471e+00,  1.19850807e+00,
        1.00190964e+00, -6.52015677e+00,  8.61906603e+00,  4.57432580e-01,
        8.35096213e-02,  4.77904281e+00, -5.97565414e+00,  3.22152064e+00,
       -1.81016704e-01, -2.82596644e+00,  4.04521017e+00,  8.48694051e-01,
       -9.12946248e-01,  6.35104802e-02,  1.91275852e-02,  5.67157561e+00,
       -4.09040476e+00, -3.63237135e-01, -4.09040476e+00, -4.09040476e+00,
       -4.09040476e+00, -

Results have the right dimension.