**GOALS**:
- Discover spark and use it locally.
- Extract image features and reduce its dimensionality with PCA.

In [1]:
import pandas as pd
from PIL import Image
import numpy as np
import io
import os
import shutil
import random

import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, element_at, split
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml import Pipeline

2023-07-10 11:27:44.069662: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-10 11:27:44.148334: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-10 11:27:44.149771: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Define paths and build a small subset of images to experiment with my local machine and avoid huge cost on the cloud.

In [2]:
PATH = os.getcwd()
DATA_PATH = os.path.join(PATH ,'fruits_360')
RESULT_PATH = os.path.join(PATH ,'Results')
TRAIN_PATH = os.path.join(DATA_PATH , 'Training')
TEST_PATH = os.path.join(DATA_PATH , 'Test')
IMAGE_SUBSET_PATH = os.path.join(PATH, 'images_subset')

if not os.path.exists(RESULT_PATH):
    os.mkdir(RESULT_PATH)

if not os.path.exists(IMAGE_SUBSET_PATH):
    os.mkdir(IMAGE_SUBSET_PATH)

print('PATH: '+ PATH \
      +'\nDATA_PATH: '+  DATA_PATH \
      +'\nRESULT_PATH: '+RESULT_PATH
        +'\nTRAIN_PATH: '+  TRAIN_PATH \
      +'\nTEST_PATH: '+TEST_PATH)

PATH: /home/louberehc/OCR/projets/8_cloud_computing
DATA_PATH: /home/louberehc/OCR/projets/8_cloud_computing/fruits_360
RESULT_PATH: /home/louberehc/OCR/projets/8_cloud_computing/Results
TRAIN_PATH: /home/louberehc/OCR/projets/8_cloud_computing/fruits_360/Training
TEST_PATH: /home/louberehc/OCR/projets/8_cloud_computing/fruits_360/Test


In [3]:
!tree /home/louberehc/OCR/projets/8_cloud_computing/fruits_360/Training -L 1

[01;34m/home/louberehc/OCR/projets/8_cloud_computing/fruits_360/Training[0m
├── [01;34mApple Braeburn[0m
├── [01;34mApple Crimson Snow[0m
├── [01;34mApple Golden 1[0m
├── [01;34mApple Golden 2[0m
├── [01;34mApple Golden 3[0m
├── [01;34mApple Granny Smith[0m
├── [01;34mApple Pink Lady[0m
├── [01;34mApple Red 1[0m
├── [01;34mApple Red 2[0m
├── [01;34mApple Red 3[0m
├── [01;34mApple Red Delicious[0m
├── [01;34mApple Red Yellow 1[0m
├── [01;34mApple Red Yellow 2[0m
├── [01;34mApricot[0m
├── [01;34mAvocado[0m
├── [01;34mAvocado ripe[0m
├── [01;34mBanana[0m
├── [01;34mBanana Lady Finger[0m
├── [01;34mBanana Red[0m
├── [01;34mBeetroot[0m
├── [01;34mBlueberry[0m
├── [01;34mCactus fruit[0m
├── [01;34mCantaloupe 1[0m
├── [01;34mCantaloupe 2[0m
├── [01;34mCarambula[0m
├── [01;34mCauliflower[0m
├── [01;34mCherry 1[0m
├── [01;34mCherry 2[0m
├── [01;34mCherry Rainier[0m
├── [01;34mCherry Wax Black[0m
├── [01;34mCherry Wax Red[0m
├── [0

In [28]:
!tree /home/louberehc/OCR/projets/8_cloud_computing/fruits_360/Test -L 1


[01;34m/home/louberehc/OCR/projets/8_cloud_computing/fruits_360/Test[0m
├── [01;34mApple Braeburn[0m
├── [01;34mApple Crimson Snow[0m
├── [01;34mApple Golden 1[0m
├── [01;34mApple Golden 2[0m
├── [01;34mApple Golden 3[0m
├── [01;34mApple Granny Smith[0m
├── [01;34mApple Pink Lady[0m
├── [01;34mApple Red 1[0m
├── [01;34mApple Red 2[0m
├── [01;34mApple Red 3[0m
├── [01;34mApple Red Delicious[0m
├── [01;34mApple Red Yellow 1[0m
├── [01;34mApple Red Yellow 2[0m
├── [01;34mApricot[0m
├── [01;34mAvocado[0m
├── [01;34mAvocado ripe[0m
├── [01;34mBanana[0m
├── [01;34mBanana Lady Finger[0m
├── [01;34mBanana Red[0m
├── [01;34mBeetroot[0m
├── [01;34mBlueberry[0m
├── [01;34mCactus fruit[0m
├── [01;34mCantaloupe 1[0m
├── [01;34mCantaloupe 2[0m
├── [01;34mCarambula[0m
├── [01;34mCauliflower[0m
├── [01;34mCherry 1[0m
├── [01;34mCherry 2[0m
├── [01;34mCherry Rainier[0m
├── [01;34mCherry Wax Black[0m
├── [01;34mCherry Wax Red[0m
├── [01;34

In [42]:
# Copy some random images from the training dataset in the subset dir.
# 10 images per fruits among the 5 first fruits appearing in listdir.
n_images = 10
n_fruit_types = 5

for fruit_name in os.listdir(TRAIN_PATH)[:n_fruit_types]:
    train_fruit_path = os.path.join(TRAIN_PATH, fruit_name)
    subset_fruit_path = os.path.join(IMAGE_SUBSET_PATH, fruit_name)
    
    if not os.path.exists(subset_fruit_path):
        os.mkdir(subset_fruit_path)
    
    random_relative_filenames = random.sample(
        os.listdir(train_fruit_path),
        k=n_images
    )

    for fn in random_relative_filenames:
        shutil.copy(
            os.path.join(train_fruit_path, fn),
            os.path.join(subset_fruit_path, fn)
        )

In [43]:
!tree images_subset

[01;34mimages_subset[0m
├── [01;34mGrape White 4[0m
│   ├── [01;35m21_100.jpg[0m
│   ├── [01;35m233_100.jpg[0m
│   ├── [01;35m245_100.jpg[0m
│   ├── [01;35m285_100.jpg[0m
│   ├── [01;35m39_100.jpg[0m
│   ├── [01;35mr_191_100.jpg[0m
│   ├── [01;35mr_235_100.jpg[0m
│   ├── [01;35mr_254_100.jpg[0m
│   ├── [01;35mr_256_100.jpg[0m
│   └── [01;35mr_94_100.jpg[0m
├── [01;34mHuckleberry[0m
│   ├── [01;35m203_100.jpg[0m
│   ├── [01;35m271_100.jpg[0m
│   ├── [01;35m48_100.jpg[0m
│   ├── [01;35mr_136_100.jpg[0m
│   ├── [01;35mr_138_100.jpg[0m
│   ├── [01;35mr_162_100.jpg[0m
│   ├── [01;35mr_255_100.jpg[0m
│   ├── [01;35mr_278_100.jpg[0m
│   ├── [01;35mr_311_100.jpg[0m
│   └── [01;35mr_322_100.jpg[0m
├── [01;34mKumquats[0m
│   ├── [01;35m171_100.jpg[0m
│   ├── [01;35m198_100.jpg[0m
│   ├── [01;35m205_100.jpg[0m
│   ├── [01;35m292_100.jpg[0m
│   ├── [01;35mr_102_100.jpg[0m
│   ├── [01;35mr_128_100.jpg[0m
│   ├── [01;35mr_315_100.jpg[0m
│

# Launch a Spark Session

First, I had errors of modules not found when using PandasUDF.

To avoid that, I had to create a conda environment (p8_env) with packages called in such functions (tensorflow, numpy...) , conda-pack it, and provide
it to the spark session. 

This makes the session much longer to open because the dependencies are installed on the driver and the executors, but then, it works.

It introduces some new problems with the heartbeater, but it works for what I am willing to do, so I neglect it for now.

In [4]:
os.environ['PYSPARK_PYTHON'] = "/home/louberehc/miniconda3/envs/p8_env/bin/python" 
os.environ['PYSPARK_DRIVER_PYTHON'] = "/home/louberehc/miniconda3/envs/p8_env/bin/python" 

spark = (
    SparkSession
    .builder
    .appName('P8')
    .master('local[*]')
    .config("spark.sql.parquet.writeLegacyFormat", 'true')
    .config(
        "spark.archives",  # 'spark.yarn.dist.archives' in YARN.
        "p8_env.tar.gz#environment"
    )
    .getOrCreate()
)

23/07/10 11:28:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/10 11:28:47 ERROR Inbox: Ignoring error
java.lang.NullPointerException
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$register(BlockManagerMasterEndpoint.scala:600)
	at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$receiveAndReply$1.applyOrElse(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:103)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:100)
	at org.apache.spark.rpc.netty.MessageLoop.org$apache$spark$rpc$netty$MessageLoop$$receiveLoop(MessageLoop.scala:75)
	at org.apache.spark.rpc.netty.MessageLoop$$

In [5]:
sc = spark.sparkContext

In [6]:
spark

# Load Images

In [7]:
images = spark.read.format("binaryFile") \
    .option("pathGlobFilter", "*.jpg") \
    .option("recursiveFileLookup", "true") \
    .load(IMAGE_SUBSET_PATH)
    
# Add a label column from the image path
images = images.withColumn('label', element_at(split(images['path'], '/'),-2))

In [8]:
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)



In [9]:
images.count()

                                                                                

50

In [10]:
# Inspect path and label
print(images.select('path','label').show(10,False))

+------------------------------------------------------------------------------------------------+------------------+
|path                                                                                            |label             |
+------------------------------------------------------------------------------------------------+------------------+
|file:/home/louberehc/OCR/projets/8_cloud_computing/images_subset/Huckleberry/r_162_100.jpg      |Huckleberry       |
|file:/home/louberehc/OCR/projets/8_cloud_computing/images_subset/Huckleberry/r_138_100.jpg      |Huckleberry       |
|file:/home/louberehc/OCR/projets/8_cloud_computing/images_subset/Huckleberry/r_136_100.jpg      |Huckleberry       |
|file:/home/louberehc/OCR/projets/8_cloud_computing/images_subset/Huckleberry/r_311_100.jpg      |Huckleberry       |
|file:/home/louberehc/OCR/projets/8_cloud_computing/images_subset/Huckleberry/r_322_100.jpg      |Huckleberry       |
|file:/home/louberehc/OCR/projets/8_cloud_computing/imag

In [11]:
print(images.select('content').show(1,False))

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Build model for feature extraction and broadcast its weights.

In [12]:
# DL or load mobilenetV2
model = MobileNetV2(
    weights='imagenet',
    include_top=True,
    input_shape=(224, 224, 3)
)

2023-07-10 11:29:09.729369: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [13]:
# Create a model without the last layer
new_model = Model(
    inputs=model.input,
    outputs=model.layers[-2].output
)

In [14]:
new_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                             

In [15]:
# Make a broadcast object to broadcast the model weights to
# each worker.
broadcast_weights = sc.broadcast(new_model.get_weights())

In [16]:
broadcast_weights

<pyspark.broadcast.Broadcast at 0x7f0510ecada0>

In [17]:
# spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# Functions 

In [18]:
def model_fn():
    """
    Returns a MobileNetV2 model with top layer removed 
    and broadcasted pretrained weights.
    """
    model = MobileNetV2(weights='imagenet',
                        include_top=True,
                        input_shape=(224, 224, 3))
    
    for layer in model.layers:
        layer.trainable = False
        
    new_model = Model(inputs=model.input,
                      outputs=model.layers[-2].output)
    new_model.set_weights(broadcast_weights.value)
    return new_model


def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.
    """
    img = Image.open(io.BytesIO(content)).resize([224, 224])
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, content_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(content_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)


@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
    
    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                                is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)



In [19]:
# Build a DF with the path, label and features of each image
features_df = (
    images
    .repartition(16)
    .select(
        col("path"),
        col("label"),
        featurize_udf("content").alias("features")
   )
)

#MLLib needs some post processing of the features column format
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

features_df = (
    features_df
    .select(
        col("path"),  
        col("label"),
        list_to_vector_udf(features_df["features"]).alias("features")
   )
)

# Define a pipeline to Standardize the features
# and compute the PCA projection onto the 300 first PCs.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True
)

pca = PCA(
    k=300,
    inputCol=scaler.getOutputCol(),
    outputCol="pca_features",
)

pipeline = Pipeline(stages=[scaler , pca])
model = pipeline.fit(features_df)
features_df = model.transform(features_df)

# Write results
(
    features_df
    .drop('scaled_features')
    .write.mode("overwrite")
    .parquet(RESULT_PATH)
)

2023-07-10 11:29:46.908120: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-07-10 11:29:46.911812: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-07-10 11:29:46.933860: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would li

I am not going for optimization right now, but it is not fast... It took ~2 minutes to compute the features and its PCA projection for 50 images.

# Loading the results

In [20]:
df = pd.read_parquet(RESULT_PATH, engine='pyarrow')
df.shape

(50, 4)

In [21]:
df.columns

Index(['path', 'label', 'features', 'pca_features'], dtype='object')

In [22]:
df.loc[0, 'features']["values"].shape

(1280,)

In [23]:
df.loc[0, 'pca_features']["values"].shape

(300,)

In [24]:
df.loc[0, 'pca_features']["values"]

array([ 3.98710174e+00,  1.10383853e+01,  2.38947932e+01, -7.69875881e+00,
        2.37189645e+00, -4.17323052e+00, -8.75384999e-01,  7.58112684e+00,
        8.46811258e-01, -1.63814549e-01, -1.94542496e+00, -1.54144326e-01,
       -2.80772266e+00, -3.84350279e+00, -3.65789436e-01,  4.48795007e-03,
       -5.93215632e-01,  7.51930420e-01, -1.21083240e+00,  3.10585734e-01,
       -9.34863532e-01, -8.04337676e-01, -1.59773086e+00, -1.16569854e+01,
        4.49314885e+00,  3.54640000e+00,  6.11986943e+00, -1.55038903e+00,
       -1.06345044e+01,  1.89692596e+00,  6.92818642e+00, -6.49194137e-02,
       -8.11849837e-01, -3.15826681e-01,  3.81113795e+00,  9.31022245e-01,
       -1.36885986e+00, -6.04712899e-01, -2.13370353e-01, -2.28626013e+00,
        6.87706819e-01,  1.70728531e-01, -1.34922969e+00,  5.75543040e-01,
        1.64755088e-01, -1.87366122e-01,  3.14558154e-01,  4.69869065e-01,
       -1.30640922e-01, -6.58373160e-07, -4.26451166e-07,  7.44523201e-07,
       -1.36457132e-06,  

Results have the right dimension.

In [25]:
sc.stop()