# Projet 8 : Déployez un modèle dans le cloud
## Notebook preprocessing et featuring

*Julie Neury-Ormanni*

### Import des librairies

In [15]:
#imports
import pandas as pd
from PIL import Image
import numpy as np
import time
import io
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from pyspark.sql.functions import col, pandas_udf, element_at, PandasUDFType, split, udf
from pyspark.ml.linalg import Vectors, VectorUDT
import gc

from pyspark.sql.types import ArrayType, StructField, StructType, FloatType, StringType
from pyspark.ml.feature import PCA, VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Chargement des images depuis S3

In [3]:
path_image = 's3://p8neurybucket/Training-fruit/**'

images = spark.read.format("binaryFile") \
  .option("recursiveFileLookup", "true") \
  .load(path_image).limit(100)

images.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|s3://p8neurybucke...|2022-04-29 12:48:08|  5278|[FF D8 FF E0 00 1...|
|s3://p8neurybucke...|2022-04-29 13:27:25|  5277|[FF D8 FF E0 00 1...|
|s3://p8neurybucke...|2022-04-29 13:27:34|  5277|[FF D8 FF E0 00 1...|
|s3://p8neurybucke...|2022-04-29 12:48:08|  5272|[FF D8 FF E0 00 1...|
|s3://p8neurybucke...|2022-04-29 13:27:36|  5264|[FF D8 FF E0 00 1...|
+--------------------+-------------------+------+--------------------+
only showing top 5 rows

### Preprocessing des images et transfer learning

In [4]:
def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.
    """
    img = Image.open(io.BytesIO(content)).resize([224, 224])
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, content_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(content_series.map(preprocess))
    preds = model.predict(input)
  # For some layers, output features will be multi-dimensional tensors.
  # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    #ouput = np.array(output)
    return pd.Series(output)

model = ResNet50(weights=None, include_top=False)
bc_model_weights = sc.broadcast(model.get_weights())

def model_fn():
    """
    Returns a ResNet50 model with top layer removed and broadcasted pretrained weights.
    """
    model = ResNet50(weights=None, include_top=False)
    model.set_weights(bc_model_weights.value)
    return model

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Extraction des features

In [5]:
from typing import Iterator, Tuple
@pandas_udf('array<float>')
def featurize_udf(content_series_iter: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
    :param content_series_iter: This argument is an iterator over batches of data, where each batch
    is a pandas Series of image data.
    '''
  # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
  # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
df_feat = images.repartition(16).select(col("path"), featurize_udf("content").alias("feats"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Getting the label by splitting the path of the image and getting its last directory
df_feat = (df_feat.withColumn('label',element_at(split(df_feat['path'],"/"),-2)))

df_feat.show(n=3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+---------------+
|                path|               feats|          label|
+--------------------+--------------------+---------------+
|s3://p8neurybucke...|[0.0, 5.6815786, ...| Apple Braeburn|
|s3://p8neurybucke...|[0.0, 5.8709025, ...| Apple Braeburn|
|s3://p8neurybucke...|[0.0, 5.6673794, ...|Apple Pink Lady|
+--------------------+--------------------+---------------+
only showing top 3 rows

### Réduction des features par ACP

In [9]:
def pca_transformation(df, n_components=50, col_image='feats'):
    
    """
    Applique un algorithme de PCA sur l'ensemble des images pour réduire la dimension de chaque image 
    du jeu de données.
    
    Paramètres:
    df(pyspark dataFrame): contient une colonne avec les données images
    n_components(int): nombre de dimensions à conserver
    col_image(string): nom de la colonne où récupérer les données images
    """

    # Initilisation du temps de calcul
    start_time = time.time()

    # Les données images sont converties au format vecteur dense
    ud_f = udf(lambda r: Vectors.dense(r), VectorUDT())
    df = df.withColumn('feats', ud_f('feats'))
    
    standardizer = StandardScaler(inputCol='feats', outputCol="scaledFeatures",
                                  withStd=True, withMean=True)
    model_std = standardizer.fit(df)
    df = model_std.transform(df)

    # Entrainement de l'algorithme
    pca = PCA(k=n_components, inputCol='scaledFeatures', outputCol='pcaFeatures')
    model_pca = pca.fit(df)

    # Transformation des images sur les k premières composantes
    df = model_pca.transform(df)

    df = df.filter(df.pcaFeatures.isNotNull())
    
    # Affiche le temps de calcul
    print("Temps d'execution {:.2f} minutes".format((time.time() - start_time)/60))


    return df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
print("---Réduction dimmensionnelle---")
pca_df = pca_transformation(df_feat)
pca_df.show(n=1)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

---R?duction dimmensionnelle---
Temps d'execution 28.88 minutes
+--------------------+--------------------+--------------+--------------------+--------------------+
|                path|               feats|         label|      scaledFeatures|         pcaFeatures|
+--------------------+--------------------+--------------+--------------------+--------------------+
|s3://p8neurybucke...|[0.0,5.4075827598...|Apple Braeburn|[0.0,-0.477697638...|[0.13785683017128...|
+--------------------+--------------------+--------------+--------------------+--------------------+
only showing top 1 row

### Export des features en parquet

In [12]:
pca_df.select(['path', 'label', 'pcaFeatures']).write.parquet('s3://p8neurybucket/output_feat.parquet')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
parquetFile = spark.read.parquet('s3://p8neurybucket/output_feat.parquet')
parquetFile.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------+--------------------+
|                path|          label|         pcaFeatures|
+--------------------+---------------+--------------------+
|s3://p8neurybucke...| Apple Braeburn|[0.09293418329549...|
|s3://p8neurybucke...| Apple Braeburn|[-135952.93232378...|
|s3://p8neurybucke...| Apple Braeburn|[0.01688021266067...|
|s3://p8neurybucke...|Apple Pink Lady|[0.12184944035153...|
|s3://p8neurybucke...| Apple Braeburn|[-0.0663827312183...|
+--------------------+---------------+--------------------+
only showing top 5 rows

In [16]:
sparse_format_udf = udf(lambda x: ','.join([str(elem) for elem in x]), StringType())

parquetFile = parquetFile.withColumn('pcaFeaturesString', sparse_format_udf(parquetFile.pcaFeatures))

parquetFile.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------+--------------------+--------------------+
|                path|          label|         pcaFeatures|   pcaFeaturesString|
+--------------------+---------------+--------------------+--------------------+
|s3://p8neurybucke...| Apple Braeburn|[0.09293418329549...|0.092934183295497...|
|s3://p8neurybucke...| Apple Braeburn|[-135952.93232378...|-135952.932323783...|
|s3://p8neurybucke...| Apple Braeburn|[0.01688021266067...|0.016880212660676...|
|s3://p8neurybucke...|Apple Pink Lady|[0.12184944035153...|0.121849440351535...|
|s3://p8neurybucke...| Apple Braeburn|[-0.0663827312183...|-0.06638273121833...|
+--------------------+---------------+--------------------+--------------------+
only showing top 5 rows

In [17]:
parquetFile.select(['path', 'label', 'pcaFeaturesString']) \
    .write.option("delimiter", "\t").option("header", True) \
    .csv('s3://p8-s3-cindygs/output_feat.csv')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o267.csv.
: java.io.IOException: com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: All access to this object has been disabled (Service: Amazon S3; Status Code: 403; Error Code: AllAccessDisabled; Request ID: EYGQPJTP8AE0W45Y; S3 Extended Request ID: a85D9DfcPn+NZSTSqqAlnLElriu1iTwuz+1ypGZbLeyPdfZ7Q8L54SEHthequrGLagdd4fcg97w=; Proxy: null), S3 Extended Request ID: a85D9DfcPn+NZSTSqqAlnLElriu1iTwuz+1ypGZbLeyPdfZ7Q8L54SEHthequrGLagdd4fcg97w=
	at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.list(Jets3tNativeFileSystemStore.java:421)
	at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.isFolderUsingFolderObject(Jets3tNativeFileSystemStore.java:247)
	at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.isFolder(Jets3tNativeFileSystemStore.java:210)
	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.getFileStatus(S3NativeFileSystem.java:515)
	at org.apache.had