<a href="https://colab.research.google.com/github/MadesLaurence/OpenClassrooms-Fruits-Project-8-/blob/main/P8_03_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective
Set-up a first big data architecture using AWS products (Mobile application with a fruit pictures classifier engine)

# Data
Link to upload data: https://www.kaggle.com/moltean/fruits

# Table of contents <a class="anchor" id="chapter0"></a> 
* [Imports and declarations](#chapter1)
    * [Import packages](#sub1_1)
    * [Declare constants](#sub1_2)
* [Exploration of the full dataset](#chapter2)
    * [Get picture information from local full dataset](#sub2_1)
    * [Explore picture information](#sub2_2)
    * [Get class information](#sub2_3) 
    * [Target label encoding](#sub2_4)
* [Preparation of the local sampled picture set](#chapter3)
    * [Create a local sampled picture set](#sub3_1)
    * [Get picture information from local sampled picture set](#sub3_2)
* [Create and configure a Spark Session](#chapter4)
* [Load data](#chapter5)
    * [Load pictures](#sub5_1)
    * [Distinguish Target and Dataset](#sub5_2)
    * Index Target](#sub5_3)
* [Features extraction](#chapter6)
    * [Instantiate a ResNet50 model with pre-trained weights](#sub6_1)
    * [Functions](#sub6_2)
    * [Features extraction](#sub6_3)
* [Train a new model using pre-computed features](#chapter7)
    * [Prepare my new model](#sub7_1)
    * [Index target in datasets](#sub7_2)
    * [Train my new model on Training dataset](#sub7_3)
    * [Predict fruit class with my new model on Test dataset](#sub7_4)
* [Visualise classification results](#chapter8)
    * [Reduce dimension with PCA](#sub8_1)
    * [Draw predictions in the first main plan](#sub8_2)
* [Go to End](#chapter100)

# Imports and declarations <a class="anchor" id="chapter1"></a>

## Import packages <a class="anchor" id="sub1_1"></a>

In [117]:
pip install pyspark



In [118]:
#import P8_02_module as MyMod

import numpy as np
import pandas as pd 

import os
from os import path
import glob
import shutil
import time

import matplotlib.pyplot as plt
#from matplotlib.image import imread

#from cv2 import cv2
import PIL
from PIL import Image
#, ImageDraw, ImageOps, ImageFilter

#from sklearn import cluster
#from sklearn import decomposition
#from sklearn import metrics
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.model_selection import learning_curve
#from sklearn.utils import shuffle

#import boto3

# Pyspark
#import findspark
#findspark.init("C:\spark\spark-3.2.1-bin-hadoop3.2") # Path to hadoop 
import pyspark
print("PySpark version:{}".format(pyspark.__version__)) # Verify PySpark version
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import element_at, split
from pyspark.sql.functions import col, pandas_udf, PandasUDFType

from pyspark.ml.feature import StringIndexer
#import pyarrow
#print("PyArrow version:{}".format(pyarrow.__version__)) # Verify PySpark version

# ?
from pyspark.sql.types import *
#import pyspark.sql.functions as F
#from pyspark.ml.image import ImageSchema # RDD (Resilient Distributed Dataset) from .jpg file
#from pyspark.ml.linalg import DenseVector, VectorUDT

import keras

# Tensorflow
import tensorflow as tf
import io
from typing import Iterator

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential

#from tensorflow.keras.preprocessing.image import ImageDataGenerator
#from tensorflow.keras.optimizers import Adam
#from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
#, decode_predictions
#from tensorflow.keras.preprocessing import image
#from tensorflow.keras.utils import image_dataset_from_directory
#from tensorflow.keras.losses import BinaryCrossentropy
#from tensorflow.keras.metrics import BinaryAccuracy

from google.colab import drive
drive.mount('/content/drive')

PySpark version:3.2.1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Declare constants <a class="anchor" id="sub1_2"></a>

In [119]:
# Sample limitations
GET_PICTURES_NB_PER_CLASS = 2

# Local repositories
LOCAL_SRC_PATH = '../fruits-360-original-size/'
LOCAL_DEST_PATH = 'C:/fruits-360-sample/'
GOOGLE_DEST_PATH = 'drive/MyDrive/fruits-360-sample/'

# Image size
IMAGE_RESIZE = 224

# CNN model
BATCH_SIZE = 16
RESNET50_POOLING_AVERAGE = 'avg'
DENSE_LAYER_ACTIVATION = 'softmax'
OBJECTIVE_FUNCTION = 'categorical_crossentropy'
LOSS_METRICS = ['accuracy']
NUM_EPOCHS = 30
EARLY_STOP_PATIENCE = 3  # EARLY_STOP_PATIENCE must be < NUM_EPOCHS

# S3 Bucket
BUCKET_NAME = "moncompartimentamoi"

# KMeans hyper-param
#KMEANS_N_CLUSTERS = 90

# Exploration of the full dataset <a class="anchor" id="chapter2"></a>

## Get picture information from local full dataset <a class="anchor" id="sub2_1"></a>

In [120]:
def rep_2_picture_info(path):

    # Initiate Dataframe with Dataset names, Target class names and Picture names
    df = pd.DataFrame(columns = ['FullFileName', 'Dataset', 'Target', 'Picture', 'FileSize (in KB)']) 

    for file in glob.iglob(path+'**/*.jpg', recursive = True):

        lst = file.split('\\')
        
        # update DataFrame
        lst.append(os.path.getsize(file) / 1024)  # in KBytes    
        lst[0] = lst[0] + "/" + lst[1] + "/" + lst[2] + "/" + lst[3]
        df.loc[len(df)] = lst

    return df

In [121]:
df_main = rep_2_picture_info(LOCAL_SRC_PATH)
df_main

Unnamed: 0,FullFileName,Dataset,Target,Picture,FileSize (in KB)


## Explore picture information <a class="anchor" id="sub2_2"></a>

### Assess volumes and modalities

In [None]:
df_main.describe()

Unnamed: 0,FullFileName,Dataset,Target,Picture,FileSize (in KB)
count,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0
top,,,,,
freq,,,,,


 > 12 455 pictures, 24 target classes, 3 datasets  
 > 958 picture names mean that some pictures have the same name and are not classified in the same repository

### Count pictures by target class and dataset

In [None]:
pd.DataFrame(df_main.groupby(['Target', 'Dataset'])['Picture'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Picture
Target,Dataset,Unnamed: 2_level_1


### Count pictures by target class

In [None]:
pd.DataFrame(df_main.groupby(['Target'])['Picture'].count())

Unnamed: 0_level_0,Picture
Target,Unnamed: 1_level_1


### Count dataset modality by picture name

In [None]:
df_dataset_mod = pd.DataFrame(df_main.groupby(['Picture'])['Dataset'].nunique())
df_dataset_mod.rename(columns={'Dataset':'Dataset_mod'}, inplace=True)
len(df_dataset_mod[df_dataset_mod['Dataset_mod'] > 1])

0

> No file with the same name in the different datasets

### Count target class modality by picture name

In [None]:
df_target_mod = pd.DataFrame(df_main.groupby(['Picture'])['Target'].nunique())
df_target_mod.rename(columns={'Target':'Target_mod'}, inplace=True)
df_target_mod[df_target_mod['Target_mod'] > 1]

Unnamed: 0_level_0,Target_mod
Picture,Unnamed: 1_level_1


In [None]:
df_target_mod = df_target_mod.reset_index(drop=False)

In [None]:
pd.DataFrame(df_target_mod.groupby(['Target_mod'])['Picture'].count())

Unnamed: 0_level_0,Picture
Target_mod,Unnamed: 1_level_1


 > many files with the same name in the different target classes   
 > for instance, 156 files have the same name and appear in 24 different target class

In [None]:
df_main[df_main['Picture'] == 'r0_0.jpg'][['Picture', 'Target', 'Dataset']]

Unnamed: 0,Picture,Target,Dataset


In [None]:
pict = Image.open(df_main['FullFileName'].iloc[20])
plt.imshow(pict)
plt.show()

"pict = Image.open(df_main['FullFileName'].iloc[20])\nplt.imshow(pict)\nplt.show()"

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[40])
plt.imshow(pict)
plt.show()

"pict = Image.open(df_main['FullFileName'].iloc[40])\nplt.imshow(pict)\nplt.show()"

 > File name format: r?_image_index.jpg (e.g. r0_31.jpg or r1_12.jpg)  
 > "r?" stands for rotation axis (first one is r0)

### Distinguish rotation axis and index

In [None]:
# Laurence: supprimer les FutureWarning
df_main["Rotation"], df_main["Index"] = df_main["Picture"].str.split("_", 1).str
df_main["Rotation"] = df_main["Rotation"].str.replace('r','')
df_main["Index"] = df_main["Index"].str.replace('.jpg','')
df_main

  


ValueError: ignored

In [None]:
pd.DataFrame(df_main["Rotation"].unique())

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '0'].head(1).index[0]])
plt.imshow(pict)
plt.show()

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '1'].head(1).index[0]])
plt.imshow(pict)
plt.show()

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '2'].head(1).index[0]])
plt.imshow(pict)
plt.show()

 > 0 - queue top or down > rotation around the z-axis  
 > 1 - queue behind or ahead > rotation around the x-axis  
 > 2 - queue left or right > rotation around the y-axis  

### Target class count distribution

In [None]:
def distribution(df_in, dataset):
    
    df = df_in.copy()
    
    if dataset in ['Training', 'Test', 'Validation']:
        df.drop(df[df['Dataset'] != dataset].index, inplace=True)
    elif dataset != '*':
        print("dataset argument should be 'Training', 'Test', 'Validation' or '*'")
        return -1
        
    df_distrib = pd.DataFrame(df.groupby(['Target'])['Picture'].count())
    df_distrib.reset_index(drop=False, inplace=True)
    df_distrib.rename(columns={'Picture':'Picture count', 'Target':'Class'}, inplace=True)
    df_distrib = df_distrib.sort_values(by='Class', ascending=False)    

    if len(df_distrib) == 0: return -1
    
    df_distrib.plot.barh(x='Class', y='Picture count', figsize=(12, 10))    
    
    return 1

In [None]:
ret = distribution(df_main, 'Training')
ret = distribution(df_main, 'Test')
ret = distribution(df_main, 'Validation')
ret = distribution(df_main, '*')

### Target class filesize average distribution  
Logitech C920 camera and dedicated algorithm which extract the fruit from the background

In [None]:
pd.DataFrame(df_main.groupby('Target')['FileSize (in KB)'].mean())

In [None]:
pd.DataFrame(df_main.groupby('Dataset')['FileSize (in KB)'].mean())

In [None]:
df_main['FileSize (in KB)'].mean(), df_main['FileSize (in KB)'].sum()/1024**2

## Get class information <a class="anchor" id="sub2_3"></a>

In [None]:
path = '../fruits-360-original-size/Meta/'

df_class_add = pd.DataFrame(columns = ['PathName', 'Target', 'TxtName'])
df_meta = pd.DataFrame(columns = ['Flag', 'Value'])

for file in glob.iglob(path+'**/info.txt', recursive = True):
    
    df_meta_add = pd.read_csv(file, sep="=", names=['Flag', 'Value'])
        
    df_class_add.loc[0] = file.split('\\')   
        
    df_meta = pd.concat([df_class_add.join(df_meta_add, how='cross'), df_meta])

del df_class_add, df_meta_add

df_meta.drop(columns=['PathName', 'TxtName'], inplace=True)
df_meta = df_meta.sort_values(['Target', 'Flag'], ascending=True)
df_meta.reset_index(drop=True, inplace=True)
df_meta

In [None]:
df_meta['Flag'].nunique()

In [None]:
pd.DataFrame(df_meta['Flag'].unique(), columns=['Flag']).sort_values(by='Flag')

## Target label encoding  <a class="anchor" id="sub2_4"></a>

In [None]:
df_main, df_target_mapping = MyMod.encode_LabelEncoder(df_main, 'Target')
df_main.head(5)

In [None]:
df_meta, df_target_mapping = MyMod.encode_LabelEncoder(df_meta, 'Target')
df_meta.head(5)

In [None]:
df_target_mapping

# Preparation of the local sampled picture set <a class="anchor" id="chapter3"></a>

## Create a local sampled picture set <a class="anchor" id="sub3_1"></a>

In [None]:
target_prec = ""
for file in glob.iglob(LOCAL_SRC_PATH+'**/*.jpg', recursive = True):
    
    lst = file.split('\\')
    
    # Limit number of pictures per class(GET_PICTURES_NB_PER_CLASS)
    if lst[2] == target_prec: 
        i += 1
    else:
        i = 0
    target_prec = lst[2]
    
    if (lst[1] == 'Training' and i < 2*GET_PICTURES_NB_PER_CLASS) or \
                    (lst[1] != 'Training' and i < GET_PICTURES_NB_PER_CLASS):     
                
        # create the destination repository if necessary
        if not os.path.exists(LOCAL_DEST_PATH+lst[1]+"/"+lst[2]):
             os.makedirs(LOCAL_DEST_PATH+lst[1]+"/"+lst[2])

        # copy the file to the destination repository
        shutil.copyfile(LOCAL_SRC_PATH+lst[1]+"/"+lst[2]+"/"+lst[3], LOCAL_DEST_PATH+lst[1]+"/"+lst[2]+"/"+lst[3])

## Get picture information from local sampled picture set <a class="anchor" id="sub3_2"></a>

In [None]:
df_main = rep_2_picture_info(GOOGLE_DEST_PATH)
df_main

# Create and configure a Spark Session <a class="anchor" id="chapter4"></a> 
- builder()     generator to create the session
- master()      master name (yarn, mesos or local[number of cores to use]. Number of partitions for distributed objects.
- appName()     name the application
- getOrCreate() create a new session or return the existing one
- config()      configure session
    - spark.sql.repl.eagerEval.enabled: PySpark DataFrame quick assessment in Jupyter  
    - spark.sql.repl.eagerEval.maxNumRows: number of lines to show      
    - spark.sql.execution.arrow.pyspark.enabled: to use Arrow optimiser for Spark to Pandas DataFrame conversions (toPandas or createDataFrame)  

In [122]:
# Instantiate SparkSession
spark = SparkSession.builder\
        .master("local[*]")\
        .appName('P8_03')\
        .getOrCreate()

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.maxNumRows', 5)
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)

#spark.conf.set('java.io.tmpdir', 'C:\Users\Public\AppData\Local\Temp\')
#java.library.path

# Pandas UDFs on large records (e.g., very large images) can run into Out Of Memory (OOM) errors.
# If you hit such errors in the cell below, try reducing the Arrow batch size via `maxRecordsPerBatch`.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

In [123]:
# Get the SparkContext
sc = spark.sparkContext

In [124]:
# User web interface
spark

# Load data <a class="anchor" id="chapter5"></a> 

## Load pictures <a class="anchor" id="sub5_1"></a> 

### Get pictures in a Spark DataFrame in "image" format

In [125]:
pictures_df = spark.read.format('image').load(GOOGLE_DEST_PATH+"*/*/*.jpg", inferschema=True)
pictures_df.count()

193

In [126]:
pictures_df.select('image.origin').show(5, False, True)

-RECORD 0---------------------------------------------------------------------------------
 origin | file:/content/drive/MyDrive/fruits-360-sample/Test/apple_hit_1/r0_107.jpg       
-RECORD 1---------------------------------------------------------------------------------
 origin | file:/content/drive/MyDrive/fruits-360-sample/Training/apple_hit_1/r0_102.jpg   
-RECORD 2---------------------------------------------------------------------------------
 origin | file:/content/drive/MyDrive/fruits-360-sample/Test/apple_hit_1/r0_103.jpg       
-RECORD 3---------------------------------------------------------------------------------
 origin | file:/content/drive/MyDrive/fruits-360-sample/Validation/apple_hit_1/r0_101.jpg 
-RECORD 4---------------------------------------------------------------------------------
 origin | file:/content/drive/MyDrive/fruits-360-sample/Training/apple_hit_1/r0_100.jpg   
only showing top 5 rows



In [127]:
# Spark DataFrame scheme
pictures_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



### Get pictures in a Spark DataFrame in "binaryFile" format

In [128]:
pictures_df = spark.read.format("binaryFile") \
            .option("pathGlobFilter", "*.jpg") \
            .option("recursiveFileLookup", "true") \
            .load(GOOGLE_DEST_PATH+"*/*/*.jpg")

In [129]:
# Count sample size
pictures_df.count()

193

In [130]:
# Spark DataFrame visualisation
pictures_df.show(5)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/content/dri...|2022-03-21 14:12:59|123514|[FF D8 FF E0 00 1...|
|file:/content/dri...|2022-03-21 14:13:00|122525|[FF D8 FF E0 00 1...|
|file:/content/dri...|2022-03-21 14:12:59|121530|[FF D8 FF E0 00 1...|
|file:/content/dri...|2022-03-21 14:13:00|119805|[FF D8 FF E0 00 1...|
|file:/content/dri...|2022-03-21 14:13:00|119068|[FF D8 FF E0 00 1...|
+--------------------+-------------------+------+--------------------+
only showing top 5 rows



In [131]:
# Spark DataFrame scheme
pictures_df.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)



## Distinguish Target and DataSet <a class="anchor" id="sub5_2"></a> 

In [132]:
# Extract Target from path
pictures_df = pictures_df.withColumn('target', element_at(split(pictures_df['path'], "/"), -2))

In [133]:
# Extract DataSet from path
pictures_df = pictures_df.withColumn('dataset', element_at(split(pictures_df['path'], "/"), -3))

## Index Target <a class="anchor" id="sub5_3"></a> 

In [134]:
indexer = StringIndexer(inputCol="target", outputCol="target_code") 
pictures_df = indexer.fit(pictures_df).transform(pictures_df)
pictures_df.show(20)

+--------------------+-------------------+------+--------------------+---------------+----------+-----------+
|                path|   modificationTime|length|             content|         target|   dataset|target_code|
+--------------------+-------------------+------+--------------------+---------------+----------+-----------+
|file:/content/dri...|2022-03-21 14:12:59|123514|[FF D8 FF E0 00 1...|    apple_hit_1|      Test|        7.0|
|file:/content/dri...|2022-03-21 14:13:00|122525|[FF D8 FF E0 00 1...|    apple_hit_1|  Training|        7.0|
|file:/content/dri...|2022-03-21 14:12:59|121530|[FF D8 FF E0 00 1...|    apple_hit_1|      Test|        7.0|
|file:/content/dri...|2022-03-21 14:13:00|119805|[FF D8 FF E0 00 1...|    apple_hit_1|Validation|        7.0|
|file:/content/dri...|2022-03-21 14:13:00|119068|[FF D8 FF E0 00 1...|    apple_hit_1|  Training|        7.0|
|file:/content/dri...|2022-03-21 14:13:01| 96730|[FF D8 FF E0 00 1...|cabbage_white_1|Validation|       15.0|
|file:/con

In [135]:
pictures_df = pictures_df.withColumn("target_code",pictures_df.target_code.cast(IntegerType()))
pictures_df.show(20)

+--------------------+-------------------+------+--------------------+---------------+----------+-----------+
|                path|   modificationTime|length|             content|         target|   dataset|target_code|
+--------------------+-------------------+------+--------------------+---------------+----------+-----------+
|file:/content/dri...|2022-03-21 14:12:59|123514|[FF D8 FF E0 00 1...|    apple_hit_1|      Test|          7|
|file:/content/dri...|2022-03-21 14:13:00|122525|[FF D8 FF E0 00 1...|    apple_hit_1|  Training|          7|
|file:/content/dri...|2022-03-21 14:12:59|121530|[FF D8 FF E0 00 1...|    apple_hit_1|      Test|          7|
|file:/content/dri...|2022-03-21 14:13:00|119805|[FF D8 FF E0 00 1...|    apple_hit_1|Validation|          7|
|file:/content/dri...|2022-03-21 14:13:00|119068|[FF D8 FF E0 00 1...|    apple_hit_1|  Training|          7|
|file:/content/dri...|2022-03-21 14:13:01| 96730|[FF D8 FF E0 00 1...|cabbage_white_1|Validation|         15|
|file:/con

In [136]:
# Spark DataFrame scheme
pictures_df.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- target: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- target_code: integer (nullable = true)



# Features extraction <a class="anchor" id="chapter6"></a> 

1 - Instantiate a base model and load pre-trained weights into it  
2 - Run your new dataset through it and record the output of one/several layers from the base model   
3 - Use that output as input data for a new, smaller model   
advantage: you only run the base model once on your data, rather than once per epoch of training > a lot faster & cheaper  
issue: doesn't allow you to dynamically modify the input data of your new model during training, which is required when doing data augmentation  

## Instantiate a ResNet50 model with pre-trained weights  <a class="anchor" id="sub6_1"></a> 
"Residual Network" with 50 layers. Convolutional Neural Network

In [137]:
pre_model = ResNet50(
    # Weights pre-trained on ImageNet: resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 
    # in cache directory (~/.keras/models)
    weights='imagenet',                                 
    input_shape=(IMAGE_RESIZE, IMAGE_RESIZE, 3),
    # Do not include the ImageNet fully-connected layer at the top of the network
    include_top=True,
    classes=1000)
print(len(pre_model.layers))

177


In [None]:
pre_model.summary()

In [138]:
pre_model = ResNet50(
    # Weights pre-trained on ImageNet: resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 
    # in cache directory (~/.keras/models)
    weights='imagenet',                                 
    input_shape=(IMAGE_RESIZE, IMAGE_RESIZE, 3),
    # Do not include the ImageNet fully-connected layer at the top of the network
    include_top=False,
    classes=24)
print(len(pre_model.layers))

175


In [None]:
pre_model.summary()

In [None]:
pre_model.layers[0], pre_model.layers[1], pre_model.layers[19]

(<keras.engine.input_layer.InputLayer at 0x7f721526a890>,
 <keras.layers.convolutional.ZeroPadding2D at 0x7f721526a710>,
 <keras.layers.convolutional.Conv2D at 0x7f721525e5d0>)

In [139]:
# Broadcast the model weights in the SparkContext
bc_model_weights = sc.broadcast(pre_model.get_weights())

## Functions  <a class="anchor" id="sub6_2"></a> 
https://docs.databricks.com/_static/notebooks/deep-learning/deep-learning-transfer-learning-keras.html

In [140]:
def model_fn():
    """
    Returns a ResNet50 model with top layer removed and broadcasted pretrained weights.
    """
    
    model = ResNet50(
        weights=None,                                 
        input_shape=(IMAGE_RESIZE, IMAGE_RESIZE, 3),
        include_top=False,         # Do not include the ImageNet fully-connected layer at the top of the network
        classes=24)

    # set model weights to previously broadcasted weights
    model.set_weights(bc_model_weights.value)
    
    return model

### Preprocess one image

In [141]:
def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.
    """
    # Redimension of the picture
    img = Image.open(io.BytesIO(content)).resize([IMAGE_RESIZE, IMAGE_RESIZE])
    
    # Converts the PIL Image instance to a Numpy array
    arr = img_to_array(img)
    
    # Preprocesses a tensor or Numpy array encoding a batch of images
    return preprocess_input(arr)

### Featurize a pd.Series of images

In [142]:
def featurize_series(model, content_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    # Join the sequence of arrays along the first axis. The stacked array has one more dimension than the input arrays
    input = np.stack(content_series.map(preprocess))
    
    # Compute predictions 
    preds = model.predict(input)

    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    
    return pd.Series(output)

###  Featurize all images
PandasUDFType.SCALAR_ITER used to amortize the cost of loading large models on workers  

PandasUDF User Defined Function  
New Pandas UDFs with Python type hints  

In [143]:
@pandas_udf('array<float>')
def featurize_udf(content_series_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).

    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches. This amortizes the overhead of loading big models.    
    pre_model = model_fn()
    
    for content_series in content_series_iter:
        
        # Yield returns a generator to the caller and the execution of the code starts only when the generator is iterated 
        yield featurize_series(pre_model, content_series)

## Features extraction  <a class="anchor" id="sub6_3"></a> 

In [144]:
# Pandas UDFs on large records (e.g., very large images) can run into Out Of Memory (OOM) errors.
# If you hit such errors in the cell below, try reducing the Arrow batch size via `maxRecordsPerBatch`.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

In [146]:
# Save starting time
time_start = time.time()

# Large model to the full dataset
# Dispatches dataframe over 16 partitions
features_df = pictures_df.repartition(16).select(col("dataset"), col('target'), col('target_code'), col("content"), featurize_udf("content").alias("X_features"))

# Compute time elapse
elapse_s = time.time()-time_start
elapse_m = int(elapse_s / 60)
print('Feature extraction done! Time elapsed: {} seconds ({} minutes)'.format(elapse_s, elapse_m))

Feature extraction done! Time elapsed: 0.07172608375549316 seconds (0 minutes)


In [147]:
features_df.persist()

dataset,target,target_code,content,X_features
Test,pear_3,21,[FF D8 FF E0 00 1...,"[0.0, 0.0, 0.0, 0..."
Training,cucumber_3,18,[FF D8 FF E0 00 1...,"[2.2741177, 0.0, ..."
Training,apple_red_3,11,[FF D8 FF E0 00 1...,"[0.0, 0.0, 0.0, 0..."
Training,apple_crimson_snow_1,2,[FF D8 FF E0 00 1...,"[0.0, 0.0, 0.0, 0..."
Training,apple_red_delicios_1,12,[FF D8 FF E0 00 1...,"[0.0, 0.0, 0.0, 0..."


In [148]:
features_df.show()

+----------+--------------------+-----------+--------------------+--------------------+
|   dataset|              target|target_code|             content|          X_features|
+----------+--------------------+-----------+--------------------+--------------------+
|      Test|              pear_3|         21|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|  Training|          cucumber_3|         18|[FF D8 FF E0 00 1...|[2.2741177, 0.0, ...|
|  Training|         apple_red_3|         11|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|  Training|apple_crimson_snow_1|          2|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|  Training|apple_red_delicios_1|         12|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|  Training|         apple_red_2|         10|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|Validation|      apple_golden_1|          3|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|  Training|          zucchini_1|         22|[FF D8 FF E0 00 1...|[0.0, 0.0, 0.0, 0...|
|  Training|      apple_golden_2

In [149]:
picture_cnt = features_df.count()
avg_elapse_s = elapse_s / picture_cnt

print('Number of pictures: {}'.format(picture_cnt))
print('Average time elapsed per picture: {} seconds'.format(avg_elapse_s))

Number of pictures: 193
Average time elapsed per picture: 0.0003716377396657677 seconds


In [150]:
features_df.show(1, False, True)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [151]:
# write spark dataframe to a Parquet file
features_df.write.mode("overwrite").parquet("features_data_parquet")

# Train a new model using pre-computed features <a class="anchor" id="chapter7"></a> 

Layers trainable attributes:
- weights : list of all weights variables of the layer  
- trainable_weights : list of those that are meant to be updated (via gradient descent) to minimize the loss during training  
- non_trainable_weights : list of those that aren't meant to be trained (updated by the model during the forward pass)  
- trainable : false moves all the layer's weights from trainable to non-trainable ("freezing" the layer)  
The only built-in layer that has non-trainable weights is the BatchNormalization layer  

1 - Instantiate a base model and load pre-trained weights into it  
2 - Freeze all layers in the base model by setting trainable = False  
3 - Create a new model on top of the output of one/several layers from the base model  
4 - Train your new model on your new dataset  (top layers to learn to turn the old features into predictions on a new dataset)   

Fine-tuning (optionnal): 
unfreeze the entire/partial model you obtained and re-training it on the new data with a very low learning rate  

## Prepare my new model <a class="anchor" id="sub7_1"></a> 

In [152]:
def add_layers_2_model(pretrained_model):
    """

    """    
    # Freeze all layers in the base model by setting trainable = False
    pretrained_model.trainable = False

    # Create a new model on top of the output of one layer from the model ------------------------------------------- 
    new_model = Sequential()

    new_model.add(pretrained_model)

    new_model.add(Flatten())
    new_model.add(Dense(512, activation='relu'))
    new_model.add(Dense(24, activation='softmax'))

    '''# Instantiates a Keras tensor
    inputs = keras.Input(shape=(IMAGE_RESIZE, IMAGE_RESIZE, 3))

    # Makes pretrained_model run in inference mode by passing training to False (necessary for fine-tuning)
    x = model(inputs, training=False)

    # Convert features of shape 'base_model.output_shape[1:]' to vectors
    # inputs a 4D tensor with shape (batch_size, rows, cols, channels)
    # returns a 2D tensor with shape (batch_size, channels)
    x = keras.layers.GlobalAveragePooling2D()(x)

    # A Dense classifier with 24 units et activation 'relu'
    outputs = keras.layers.Dense(24, activation='relu')(x)

    model = keras.Model(inputs, outputs)'''

    # Compile
    sgd = optimizers.SGD(learning_rate = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)
    OBJECTIVE_FUNCTION = 'categorical_crossentropy'
    LOSS_METRICS = ['accuracy']
    new_model.compile(optimizer=sgd, loss=OBJECTIVE_FUNCTION, metrics=LOSS_METRICS)
    
    return new_model

new_model = add_layers_2_model(pre_model)    

In [153]:
new_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50 (Functional)       (None, 7, 7, 2048)        23587712  
                                                                 
 flatten_1 (Flatten)         (None, 100352)            0         
                                                                 
 dense_2 (Dense)             (None, 512)               51380736  
                                                                 
 dense_3 (Dense)             (None, 24)                12312     
                                                                 
Total params: 74,980,760
Trainable params: 51,393,048
Non-trainable params: 23,587,712
_________________________________________________________________


Spark workers need to access the model and its weights.  
For moderately sized models (< 1GB in size), a good practice is to download the model to the Spark driver and 
then broadcast the weights to the workers.  
For large models (> 1GB), it is best to load the model weights from distributed storage to workers directly.  

## Train my new model on Training dataset <a class="anchor" id="sub7_3"></a> 

In [None]:
from pyspark.sql.functions import udf

def preprocess(content):
    # Redimension of the picture
    img = Image.open(io.BytesIO(content)).resize([IMAGE_RESIZE, IMAGE_RESIZE])
    
    # Converts the PIL Image instance to a Numpy array
    arr = img_to_array(img)
    
    # Preprocesses a tensor or Numpy array encoding a batch of images
    arr_temp = preprocess_input(arr)
    
    # Return a copy of the array data as a (nested) Python list. Data items are converted to the nearest compatible builtin Python type, via the item function.
    # Convert to a list whose values are Python primitives instead of numpy objects numpy.float32 (mismatched data types between Python and Spark)
    return arr_temp.astype(np.float).tolist()
    #return tf.convert_to_tensor(arr_temp, dtype=tf.float32)
    return tf.convert_to_tensor(arr_temp.astype(np.float).tolist())

preprocess_udf = udf(lambda content_series: preprocess(content_series), ArrayType(FloatType()))

preprocess_df = features_df.select('dataset', 'target', 'target_code', preprocess_udf('content').alias("model_input"))

preprocess_df.show()

+----------+--------------------+-----------+--------------------+
|   dataset|              target|target_code|         model_input|
+----------+--------------------+-----------+--------------------+
|  Training|          cucumber_3|       18.0|[null, null, null...|
|  Training|              pear_3|       21.0|[null, null, null...|
|      Test|apple_crimson_snow_1|        2.0|[null, null, null...|
|  Training|apple_red_delicios_1|       12.0|[null, null, null...|
|Validation|          zucchini_1|       22.0|[null, null, null...|
|Validation|apple_red_delicios_1|       12.0|[null, null, null...|
|  Training|apple_granny_smith_1|        6.0|[null, null, null...|
|Validation|         apple_red_2|       10.0|[null, null, null...|
|Validation|  apple_red_yellow_1|       13.0|[null, null, null...|
|  Training|      apple_golden_3|        5.0|[null, null, null...|
|  Training|  apple_red_yellow_1|       13.0|[null, null, null...|
|  Training|             apple_6|        0.0|[null, null, null

In [None]:
preprocess_df.show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
'''def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.
    """
    # Redimension of the picture
    img = Image.open(io.BytesIO(content)).resize([IMAGE_RESIZE, IMAGE_RESIZE])
    
    # Converts the PIL Image instance to a Numpy array
    arr = img_to_array(img)
    
    # Preprocesses a tensor or Numpy array encoding a batch of images
    return preprocess_input(arr)

def preprocess_series(content_series):
    """
    PreProcess a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    # Join the sequence of arrays along the first axis. The stacked array has one more dimension than the input arrays       
    return pd.Series(np.stack(content_series.map(preprocess)))

def preprocess_udf(content_series_iter):
    for content_series in content_series_iter:        
        # Yield returns a generator to the caller and the execution of the code starts only when the generator is iterated 
        yield preprocess_series(content_series)
        
preprocess_df = features_df.repartition(16).select(col("dataset"), col('target_code'), preprocess_udf("content"))

# TypeError: Invalid argument, not a string or column: <generator object preprocess_udf at 0x7f23dbee6d50> of type 
#<class 'generator'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.        
'''

In [None]:
train_df = preprocess_df[preprocess_df['dataset'] == 'Training']
test_df = preprocess_df[preprocess_df['dataset'] == 'Test']
validation_df = preprocess_df[preprocess_df['dataset'] == 'Validation']
test_df.show()

+-------+--------------------+-----------+--------------------+
|dataset|              target|target_code|         model_input|
+-------+--------------------+-----------+--------------------+
|   Test|apple_crimson_snow_1|        2.0|[null, null, null...|
|   Test|     cabbage_white_1|       15.0|[null, null, null...|
|   Test|          zucchini_1|       22.0|[null, null, null...|
|   Test|         apple_red_1|        9.0|[null, null, null...|
|   Test|    apple_braeburn_1|        1.0|[null, null, null...|
|   Test|apple_red_delicios_1|       12.0|[null, null, null...|
|   Test|              pear_1|       20.0|[null, null, null...|
|   Test|            carrot_1|       16.0|[null, null, null...|
|   Test|             apple_6|        0.0|[null, null, null...|
|   Test|         apple_hit_1|        7.0|[null, null, null...|
|   Test|              pear_3|       21.0|[null, null, null...|
|   Test|apple_crimson_snow_1|        2.0|[null, null, null...|
|   Test|          cucumber_1|       17.

In [None]:
X_train = np.stack(train_df.repartition(16).select(col("model_input")))
y_train = train_df.repartition(16).select(col("target_code"))

print(type(X_train))
print(type(y_train))

#cb_early_stopper = EarlyStopping(monitor = 'loss', patience = EARLY_STOP_PATIENCE)
#cb_checkpointer = ModelCheckpoint(filepath = 'best.hdf5', monitor = 'loss', save_best_only = True, mode = 'auto')

preds = new_model.fit(x=X_train, y=y_train, epochs=NUM_EPOCHS) #, callbacks=[cb_checkpointer, cb_early_stopper])

<class 'numpy.ndarray'>
<class 'pyspark.sql.dataframe.DataFrame'>


ValueError: ignored

## Predict fruit class with my new model on Test dataset <a class="anchor" id="sub7_4"></a> 

In [None]:
X_test = np.stack(test_df.repartition(16).select(col("model_input")))
y_test = test_df.repartition(16).select(col("target_code"))
    
y_pred = new_model.predict(X_test)

#Schema: Dataset|target|target_code|model_input|target_code_pred

ValueError: ignored

# Visualise classification results <a class="anchor" id="chapter8"></a> 

## Reduce dimension with PCA <a class="anchor" id="sub8_1"></a>

In [None]:
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector

In [None]:
def pca_transformation(df, n_components=10):
    
    """
    Applique un algorithme de PCA sur l'ensemble des images pour réduire la dimension de chaque image 
    du jeu de données.
    
    Paramètres:
    df(pyspark dataFrame): contient une colonne avec les données images
    n_components(int): nombre de dimensions à conserver
    """
    # Initilisation du temps de calcul
    start_time = time.time()

    # Les données images sont converties au format vecteur dense
    to_vector_udf = udf(lambda r: Vectors.dense(r), VectorUDT())
    df = df.withColumn('X_vectors', to_vector_udf('X_features'))

    # Entrainement de l'algorithme
    pca = PCA(k=n_components, inputCol='X_vectors', outputCol='X_vectors_pca')
    model_pca = pca.fit(df)

    # Transformation des images sur les k premières composantes
    df = model_pca.transform(df)
    
    # Affiche le temps de calcul
    print("Temps d'execution {:.2f} secondes".format(time.time() - start_time))

    return df

In [None]:
final_df = pca_transformation(test_df, n_components=10)

# write spark dataframe to a Parquet file
final_df.write.mode('overwrite').parquet("results_data_parquet")

## Draw predictions in the first main plan <a class="anchor" id="sub8_2"></a>

In [None]:
X_pca = final_df.select("X_vectors_pca")

fig = plt.figure(figsize=(10, 8))

plt.scatter(X_pca[:, 0], X_pca[:, 1], s=10, c=final_df.select("target_code_pred")+1) 
plt.colorbar()

plt.plot([X_pca[:, 0].min(), X_pca[:, 0].max()], [0, 0], color='grey', ls='-') 
plt.plot([0, 0], [X_pca[:, 1].min(), X_pca[:, 1].max()], color='grey', ls='-') 

plt.title("Projection of predictions on the first main plan")
plt.xlabel("PC1")
plt.ylabel("PC2")   
   
plt.show()

* [Go to Table des matières](#chapter0)

# End <a class="anchor" id="chapter100"></a> 