# P8 Déployer un modèle dans le cloud - Local PySpark and local dataset

In [27]:
import requests

import PIL.Image
import boto3

import pyspark
import numpy as np
import torch

import os

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import Row

import datetime 

## Common (Spark Context & Session)

In [28]:
def start_spark() -> SparkSession:
    """
    Get the Spark session used by this notebook, creating it if none exists yet.

    The session is requested with the "local" master and the application
    name "Cloud-Fruits-P8".

    Returns: (SparkSession) my Spark session
    """
    builder = SparkSession.builder
    builder = builder.master("local")
    builder = builder.appName("Cloud-Fruits-P8")
    return builder.getOrCreate()

## Check files from local folder

In [20]:
# Root directory of the local copy of the dataset (keeps its trailing slash,
# because the sub-folder names below are appended by plain concatenation).
dataset_folder = "/opt/spark/fruits-360-dataset/"

# Prefix put in front of the "Training" / "Test" sub-folder names.
# NOTE(review): presumably selects a reduced copy of the dataset - confirm.
light = "Light"

# <root><prefix>Training and <root><prefix>Test
training_folder, test_folder = (
    f"{dataset_folder}{light}{split_name}" for split_name in ("Training", "Test")
)

In [21]:
# Sanity check: number of entries found directly inside the training folder.
len(os.listdir(training_folder))

4

In [22]:
def load_image(img: Row) -> PIL.Image.Image:
    """
    Load an image from a pyspark row and return it as an RGB PIL image.

    Args:
        img (Row): a row that contains the image to be loaded.
            It should have the attributes specified in `pyspark.ml.image.ImageSchema.columnSchema`
            (this function reads `data`, `height`, `width` and `nChannels`).
    Returns: (PIL.Image.Image) RGB image data
    """
    height, width, channels = img.height, img.width, img.nChannels

    # Interpret the raw byte buffer as a (height, width, channels) uint8 array;
    # the strides describe a tightly packed, row-major pixel layout.
    pixels = np.ndarray(
        buffer=img.data,
        dtype=np.uint8,
        shape=(height, width, channels),
        strides=(width * channels, channels, 1),
    )

    if channels == 1:
        # PIL.Image.fromarray does not accept a trailing axis of length 1:
        # drop that axis, then expand the greyscale image to three channels.
        return PIL.Image.fromarray(pixels[:, :, 0]).convert("RGB")

    # Pick channels 2, 1, 0 in that order: this rotates BGR to RGB and, for a
    # 4-channel (BGRA) image, leaves the alpha channel out. Reversing *all*
    # channels of a BGRA image would yield ARGB and therefore wrong colours.
    return PIL.Image.fromarray(pixels[:, :, 2::-1])

In [23]:
def process_image(img: Row) -> Row:
    """
    Turn one image row into a row of encoded features.

    The image is decoded with `load_image` and passed to the `encode` method of
    the object held by the `broadcastEncoder` broadcast variable; every value
    of the resulting encoding becomes its own field.

    Args:
        img (Row): a row that contains the image to be processed.
            It should have the attributes specified in `pyspark.ml.image.ImageSchema.columnSchema`.
    Returns: (Row) the image encoding, represented by a Row with fields:
        origin (str): path the image was read from
        label (str): the image label (name of the folder that contains the file)
        x0: first feature of image encoding
        (...)
        x_(n-1): last feature of image encoding
    """
    source_path = img.origin
    print(f"...... process image {img.origin}")

    # The label is the second-to-last path component, i.e. the parent folder.
    label = source_path.split('/')[-2]

    picture = load_image(img=img)
    encoded_values = broadcastEncoder.value.encode(picture)

    # One field per feature: x0, x1, ...
    features = {}
    for i, value in enumerate(encoded_values):
        features[f'x{i}'] = value

    return Row(origin=source_path, label=label, **features)

## Encoder

In [24]:
from encoder import Encoder

# Broadcasting a user-defined object in a PySpark application works when its
# class is defined in a separate module (i.e., a different file), hence the
# import of Encoder from encoder.py.

## Start Session and process files

In [38]:
# Driver cell: read the training images, encode each one with the broadcast
# encoder and save the resulting feature table as a single parquet file.
spark = start_spark()

try:
    print("... list images from Fruits 360 dataset")
    # Matches <training_folder>/<any sub-folder>/<any file>.jpg
    df = spark.read.format('Image').load(training_folder + "/*/*.jpg")

    print("... load image encoder")
    # process_image looks this variable up by name, so the name must not change.
    broadcastEncoder = spark.sparkContext.broadcast(Encoder())

    print("... encode images and write output to file")
    output = df.select('image.*').rdd.map(process_image).toDF()

    currentDate = datetime.datetime.today()

    # exist_ok avoids the separate "does it exist?" check.
    os.makedirs('output', exist_ok=True)

    # NOTE(review): the ':' in the timestamp is not a valid path character on
    # Windows and has historically been awkward in Hadoop paths - consider a
    # format such as "%Y%m%d-%H%M%S" if nothing depends on the current layout.
    output_path = f'output/{currentDate.strftime("%Y%m%d:%H%M%S")}/output.parquet'

    # The coalesce method reduces the number of partitions in a DataFrame;
    # coalesce(1) consolidates the data in one partition only.
    # (CSV alternative: output.coalesce(1).write.csv(<path>, header="true", mode="overwrite"))
    output.coalesce(1).write.parquet(output_path)  # save as parquet file
finally:
    # Always release Spark, even when reading, encoding or writing failed.
    # Stopping the session also stops its underlying SparkContext.
    spark.stop()

... list images from Fruits 360 dataset


                                                                                

... load image encoder
... encode images and write output to file


...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_68_100.jpg
[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_68_100.jpg
[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_72_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_69_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_71_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_70_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_324_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_326_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Raspberry/r_88_100.jpg
...... 

...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/217_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/218_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/38_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/37_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/10_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/11_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/35_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/r_307_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/r_309_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/r_310_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/r_322_100.jpg
...... process ima

...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Apricot/r_110_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Apricot/r_284_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Apricot/r_112_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Apricot/r_108_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/279_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/280_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/282_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/281_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/152_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/153_100.jpg
...... process image file:/opt/spark/fruits-360-dataset/LightTraining/Banana/283_100.jpg
...... pr