# Objective
Set-up a first big data architecture using AWS products (Mobile application with a fruit pictures classifier engine)

# Data
Link to upload data: https://www.kaggle.com/moltean/fruits

# Table of contents <a class="anchor" id="chapter0"></a> 
* [Imports and declarations](#chapter1)
    * [Import packages](#sub1_1)
    * [Declare constants](#sub1_2)
* [Exploration of the full dataset](#chapter2)
    * [Get picture information from local full dataset](#sub2_1)
    * [Explore picture information](#sub2_2)
    * [Get class information](#sub2_3) 
    * [Target label encoding](#sub2_4)
* [Preparation of the local sampled picture set](#chapter3)
    * [Create a local sampled picture set](#sub3_1)
    * [Get picture information from local sampled picture set](#sub3_2)
* [Transfer learning](#chapter4)
* [To the stars](#chapter5)
* [Old code](#chapter6)
    * [Cluster's descriptors](#sub3_2)
    * [Compute frequency histogram on clusters' descriptors](#sub3_3)
    * [Reduce dimension with PCA](#sub3_4)
* [Modelling](#chapter4)
    * [Train a KNN model](#sub4_1)
    * [Check learning curve](#sub4_2)
    * [Predict and compare prediction to reality on Test dataset](#sub4_3)
    * [Predict and compare prediction to reality on Validation dataset](#sub4_4)
* [Go to End](#chapter100)

# Imports and declarations <a class="anchor" id="chapter1"></a>

## Import packages <a class="anchor" id="sub1_1"></a>

In [1]:
import P8_02_module as MyMod

import numpy as np
import pandas as pd 

import os
from os import path
import glob
import shutil
import time

import matplotlib.pyplot as plt
from matplotlib.image import imread

from cv2 import cv2
import PIL
from PIL import Image, ImageDraw, ImageOps, ImageFilter

from sklearn import cluster
from sklearn import decomposition
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve
from sklearn.utils import shuffle

import findspark
from pyspark import SparkContext

import boto3

## Declare constants <a class="anchor" id="sub1_2"></a>

In [2]:
# Sample limitations
GET_PICTURES_NB_PER_CLASS = 2

# KMeans hyper-param
KMEANS_N_CLUSTERS = 90

# Local repositories
LOCAL_SRC_PATH = '../fruits-360-original-size/'
LOCAL_DEST_PATH = '../fruits-360-sample/'

IMAGE_RESIZE = 224

# S3 Bucket
BUCKET_NAME = "moncompartimentamoi"

# Exploration of the full dataset <a class="anchor" id="chapter2"></a>

## Get picture information from local full dataset <a class="anchor" id="sub2_1"></a>

In [None]:
def rep_2_picture_info(path):

    # Initiate Dataframe with Dataset names, Target class names and Picture names
    df = pd.DataFrame(columns = ['FullFileName', 'Dataset', 'Target', 'Picture', 'FileSize (in KB)']) 

    for file in glob.iglob(path+'**/*.jpg', recursive = True):

        lst = file.split('\\')

        # update DataFrame
        lst.append(os.path.getsize(file) / 1024)  # in KBytes    
        lst[0] = lst[0] + "/" + lst[1] + "/" + lst[2] + "/" + lst[3]
        df.loc[len(df)] = lst

    return df

df_main = rep_2_picture_info(LOCAL_SRC_PATH)
df_main

## Explore picture information <a class="anchor" id="sub2_2"></a>

### Assess volumes and modalities

In [None]:
df_main.describe()

 > 12 455 pictures, 24 target classes, 3 datasets  
 > 958 picture names mean that some pictures have the same name and are not classified in the same repository

### Count pictures by target class and dataset

In [None]:
pd.DataFrame(df_main.groupby(['Target', 'Dataset'])['Picture'].count())

### Count pictures by target class

In [None]:
pd.DataFrame(df_main.groupby(['Target'])['Picture'].count())

### Count dataset modality by picture name

In [None]:
df_dataset_mod = pd.DataFrame(df_main.groupby(['Picture'])['Dataset'].nunique())
df_dataset_mod.rename(columns={'Dataset':'Dataset_mod'}, inplace=True)
len(df_dataset_mod[df_dataset_mod['Dataset_mod'] > 1])

> No file with the same name in the different datasets

### Count target class modality by picture name

In [None]:
df_target_mod = pd.DataFrame(df_main.groupby(['Picture'])['Target'].nunique())
df_target_mod.rename(columns={'Target':'Target_mod'}, inplace=True)
df_target_mod[df_target_mod['Target_mod'] > 1]

In [None]:
df_target_mod = df_target_mod.reset_index(drop=False)

In [None]:
pd.DataFrame(df_target_mod.groupby(['Target_mod'])['Picture'].count())

 > many files with the same name in the different target classes   
 > for instance, 156 files have the same name and appear in 24 different target class

In [None]:
df_main[df_main['Picture'] == 'r0_0.jpg'][['Picture', 'Target', 'Dataset']]

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[20])
plt.imshow(pict)
plt.show()

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[40])
plt.imshow(pict)
plt.show()

 > File name format: r?_image_index.jpg (e.g. r0_31.jpg or r1_12.jpg)  
 > "r?" stands for rotation axis (first one is r0)

### Distinguish rotation axis and index

In [None]:
# Laurence: supprimler les FutureWarning
df_main["Rotation"], df_main["Index"] = df_main["Picture"].str.split("_", 1).str
df_main["Rotation"] = df_main["Rotation"].str.replace('r','')
df_main["Index"] = df_main["Index"].str.replace('.jpg','')
df_main

In [None]:
pd.DataFrame(df_main["Rotation"].unique())

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '0'].head(1).index[0]])
plt.imshow(pict)
plt.show()

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '1'].head(1).index[0]])
plt.imshow(pict)
plt.show()

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '2'].head(1).index[0]])
plt.imshow(pict)
plt.show()

 > 0 - queue top or down > rotation around the z-axis  
 > 1 - queue behind or ahead > rotation around the x-axis  
 > 2 - queue left or right > rotation around the y-axis  

### Target class count distribution

In [None]:
def distribution(df_in, dataset):
    
    df = df_in.copy()
    
    if dataset in ['Training', 'Test', 'Validation']:
        df.drop(df[df['Dataset'] != dataset].index, inplace=True)
    elif dataset != '*':
        print("dataset argument should be 'Training', 'Test', 'Validation' or '*'")
        return -1
        
    df_distrib = pd.DataFrame(df.groupby(['Target'])['Picture'].count())
    df_distrib.reset_index(drop=False, inplace=True)
    df_distrib.rename(columns={'Picture':'Picture count', 'Target':'Class'}, inplace=True)
    df_distrib = df_distrib.sort_values(by='Class', ascending=False)    

    if len(df_distrib) == 0: return -1
    
    df_distrib.plot.barh(x='Class', y='Picture count', figsize=(12, 10))    
    
    return 1

In [None]:
ret = distribution(df_main, 'Training')
ret = distribution(df_main, 'Test')
ret = distribution(df_main, 'Validation')
ret = distribution(df_main, '*')

### Target class filesize average distribution  
Logitech C920 camera and dedicated algorithm which extract the fruit from the background

In [None]:
pd.DataFrame(df_main.groupby('Target')['FileSize (in KB)'].mean())

In [None]:
pd.DataFrame(df_main.groupby('Dataset')['FileSize (in KB)'].mean())

In [None]:
df_main['FileSize (in KB)'].mean()

## Get class information <a class="anchor" id="sub2_3"></a>

In [None]:
path = '../fruits-360-original-size/Meta/'

df_class_add = pd.DataFrame(columns = ['PathName', 'Target', 'TxtName'])
df_meta = pd.DataFrame(columns = ['Flag', 'Value'])

for file in glob.iglob(path+'**/info.txt', recursive = True):
    
    df_meta_add = pd.read_csv(file, sep="=", names=['Flag', 'Value'])
        
    df_class_add.loc[0] = file.split('\\')   
        
    df_meta = pd.concat([df_class_add.join(df_meta_add, how='cross'), df_meta])

del df_class_add, df_meta_add

df_meta.drop(columns=['PathName', 'TxtName'], inplace=True)
df_meta = df_meta.sort_values(['Target', 'Flag'], ascending=True)
df_meta.reset_index(drop=True, inplace=True)
df_meta

In [None]:
df_meta['Flag'].nunique()

In [None]:
pd.DataFrame(df_meta['Flag'].unique(), columns=['Flag']).sort_values(by='Flag')

## Target label encoding  <a class="anchor" id="sub2_4"></a>

In [None]:
df_main, df_target_mapping = MyMod.encode_LabelEncoder(df_main, 'Target')
df_main.head(5)

In [None]:
df_meta, df_target_mapping = MyMod.encode_LabelEncoder(df_meta, 'Target')
df_meta.head(5)

In [None]:
df_target_mapping

# Preparation of the local sampled picture set <a class="anchor" id="chapter3"></a>

## Create a local sampled picture set <a class="anchor" id="sub3_1"></a>

In [None]:
target_prec = ""
for file in glob.iglob(LOCAL_SRC_PATH+'**/*.jpg', recursive = True):
    
    lst = file.split('\\')
    
    # Limit number of pictures per class(GET_PICTURES_NB_PER_CLASS)
    if lst[2] == target_prec: 
        i += 1
    else:
        i = 0
    target_prec = lst[2]
    
    if (lst[1] == 'Training' and i < 2*GET_PICTURES_NB_PER_CLASS) or \
                    (lst[1] != 'Training' and i < GET_PICTURES_NB_PER_CLASS):     
                
        # create the destination repository if necessary
        if not os.path.exists(LOCAL_DEST_PATH+lst[1]+"/"+lst[2]):
             os.makedirs(LOCAL_DEST_PATH+lst[1]+"/"+lst[2])

        # copy the file to the destination repository
        shutil.copyfile(LOCAL_SRC_PATH+lst[1]+"/"+lst[2]+"/"+lst[3], LOCAL_DEST_PATH+lst[1]+"/"+lst[2]+"/"+lst[3])

## Get picture information from local sampled picture set <a class="anchor" id="sub3_2"></a>

In [None]:
df_main = rep_2_picture_info(LOCAL_DEST_PATH)
df_main

# Transfer learning <a class="anchor" id="chapter4"></a> 
because my dataset has too little data to train a full-scale model from scratch  

Layers trainable attributes:
- weights : list of all weights variables of the layer  
- trainable_weights : list of those that are meant to be updated (via gradient descent) to minimize the loss during training  
- non_trainable_weights : list of those that aren't meant to be trained (updated by the model during the forward pass)  
- trainable : false moves all the layer's weights from trainable to non-trainable ("freezing" the layer)  
The only built-in layer that has non-trainable weights is the BatchNormalization layer  

Workflow 1 :  
1 - Instantiate a base model and load pre-trained weights into it  
2 - Freeze all layers in the base model by setting trainable = False  
3 - Create a new model on top of the output of one/several layers from the base model  
4 - Train your new model on your new dataset  (top layers to learn to turn the old features into predictions on a new dataset)   

Workflow 2 : feature extraction  
1 - Instantiate a base model and load pre-trained weights into it  
2 - Run your new dataset through it and record the output of one/several layers from the base model   
3 - Use that output as input data for a new, smaller model   
advantage: you only run the base model once on your data, rather than once per epoch of training > a lot faster & cheaper  
issue: doesn't allow you to dynamically modify the input data of your new model during training, which is required when doing data augmentation  

Fine-tuning (optionnal): 
unfreeze the entire/partial model you obtained and re-training it on the new data with a very low learning rate  

In [3]:
import tensorflow as tf
import keras
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras import optimizers
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
#from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
#from tensorflow.keras.preprocessing import image
#from tensorflow.keras.utils import image_dataset_from_directory

#import keras
##from tensorflow.keras import layers,Dense,Flatten
##from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.losses import BinaryCrossentropy
#from tensorflow.keras.metrics import BinaryAccuracy

In [4]:
BATCH_SIZE = 16
RESNET50_POOLING_AVERAGE = 'avg'
DENSE_LAYER_ACTIVATION = 'softmax'
OBJECTIVE_FUNCTION = 'categorical_crossentropy'
LOSS_METRICS = ['accuracy']

# EARLY_STOP_PATIENCE must be < NUM_EPOCHS
NUM_EPOCHS = 10
EARLY_STOP_PATIENCE = 3

In [5]:
# Instantiate a base model: ResNet50 with pre-trained weights
#"Residual Network" with 50 layers. Convolutional Neural Network

pretrained_model = ResNet50(
    # Weights pre-trained on ImageNet: resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 
    # in cache directory (~/.keras/models)
    weights='imagenet',                                 
    input_shape=(IMAGE_RESIZE, IMAGE_RESIZE, 3),
    # Do not include the ImageNet fully-connected layer at the top of the network
    include_top=False,
    classes=24)
# input_tensor=None, pooling=[None, 'avg'], classifier_activation=None

In [6]:
# Freeze all layers in the base model by setting trainable = False
pretrained_model.trainable = False

In [8]:
# Create a new model on top of the output of one layer from the base model
inputs = keras.Input(shape=(IMAGE_RESIZE, IMAGE_RESIZE, 3))

# Makes base_model run in inference mode by passing training to False (necessary for fine-tuning)
x = pretrained_model(inputs, training=False)

# Convert features of shape 'base_model.output_shape[1:]' to vectors
x = keras.layers.GlobalAveragePooling2D()(x)

# A Dense classifier with 65 units et activation 'softmax'
outputs = keras.layers.Dense(65, activation=DENSE_LAYER_ACTIVATION)(x)

model = keras.Model(inputs, outputs)

In [9]:
# Train the model on new data
sgd = optimizers.SGD(lr = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)
model.compile(optimizer=sgd, loss=OBJECTIVE_FUNCTION, metrics=LOSS_METRICS)

In [15]:
# Read Training data
train_datagen = ImageDataGenerator(
                    rescale=1. / 255,
                    shear_range=0.2,
                    zoom_range=0.2,
                    horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1. / 255)

train_generator = train_datagen.flow_from_directory(
                    LOCAL_DEST_PATH+'Training/',
                    target_size=(IMAGE_RESIZE, IMAGE_RESIZE),
                    batch_size=BATCH_SIZE,
                    class_mode='categorical')

validation_generator = test_datagen.flow_from_directory(
                    LOCAL_DEST_PATH+'Validation/',
                    target_size=(IMAGE_RESIZE, IMAGE_RESIZE),
                    batch_size=BATCH_SIZE,
                    class_mode='categorical')

print(type(train_generator))

Found 96 images belonging to 24 classes.
Found 48 images belonging to 24 classes.
<class 'keras.preprocessing.image.DirectoryIterator'>


In [14]:
cb_early_stopper = EarlyStopping(monitor = 'loss', patience = EARLY_STOP_PATIENCE)
cb_checkpointer = ModelCheckpoint(filepath = 'best.hdf5', monitor = 'loss', save_best_only = True, mode = 'auto')

model.fit(train_generator, 
          epochs=NUM_EPOCHS, 
          callbacks=[cb_checkpointer, cb_early_stopper],
          validation_data=validation_generator) 

#Detected at node 'categorical_crossentropy/softmax_cross_entropy_with_logits' defined at (most recent call last):

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'categorical_crossentropy/softmax_cross_entropy_with_logits' defined at (most recent call last):
    File "C:\ProgramData\Anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\ProgramData\Anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\BNPLEA~1\AppData\Local\Temp/ipykernel_12332/984358097.py", line 4, in <module>
      model.fit(train_generator, epochs=NUM_EPOCHS,
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
      return self.compiled_loss(
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 245, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 1789, in categorical_crossentropy
      return backend.categorical_crossentropy(
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\backend.py", line 5098, in categorical_crossentropy
      return tf.nn.softmax_cross_entropy_with_logits(
Node: 'categorical_crossentropy/softmax_cross_entropy_with_logits'
logits and labels must be broadcastable: logits_size=[16,65] labels_size=[16,24]
	 [[{{node categorical_crossentropy/softmax_cross_entropy_with_logits}}]] [Op:__inference_train_function_9674]

# To the stars <a class="anchor" id="chapter5"></a> 

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
#from pyspark.sql import Row
from pyspark.ml.image import ImageSchema
from pyspark.ml.linalg import DenseVector, VectorUDT

# Path to hadoop 
findspark.init("C:\spark\spark-3.2.1-bin-hadoop3.2")

# Instantiate SparkSession
spark = SparkSession \
            .builder \
            .appName("Python Spark Keypoints to RDD") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
#spark.createDataFrame(rdd)

# Instantiate a SparkContext
sc = SparkContext()
# Verify Spark version
print("Spark version:{}".format(sc.version))

# Instantiate S3 client
s3_client = boto3.client('s3', region_name='eu-west-3')
# Instantiate S3 resource
#s3_resource = boto3.resource('s3')
# Instantiate S3 bucket
#s3_bucket = s3_resource.Bucket(BUCKET_NAME)

Though SparkContext used to be an entry point prior to 2.0, it is not completely replaced with SparkSession.
Many features of SparkContext are still available and used in Spark 2.0 and later.  
You should also know that SparkSession internally creates SparkConfig and SparkContext with the configuration provided with SparkSession.

In [None]:
# Read pictures 1
#df = spark.read.format("image").load(LOCAL_DEST_PATH) 
#df.show()

# Read pictures 2 (ImageSchema.imageFields)
#img2vec = F.udf(lambda x: DenseVector(ImageSchema.toNDArray(x).flatten()), VectorUDT())
#df = df.withColumn('vecs', img2vec("image"))
#df.show()

In [None]:
# Transfer pictures from LOCAL_DEST_PATH local repository to S3 bucket
for file in glob.iglob(LOCAL_DEST_PATH+'**/*.jpg', recursive = True):
    
    lst = file.split('\\')
        
    # upload the image in the S3 bucket  
    s3_client.upload_file(lst[0], BUCKET_NAME, lst[1]+"/"+lst[2]+"/"+lst[3])

# Old code <a class="anchor" id="chapter6"></a> 

# Extration des descripteurs : jpg sur S3 > descripteurs

In [None]:
def read_image_from_s3(key):
    """Load image file from s3.

    Parameters
    ----------
    key : string           Path in s3

    Returns
    -------
    np array               Image array
    """   
    #object = s3_bucket.Object(key)
    #response = object.get()
    #file_stream = response['Body']    
    #im = Image.open(file_stream)
    
    return np.array(Image.open(s3_bucket.Object(key).get()['Body']))

In [None]:
# Create SIFT descriptor
sift = cv2.SIFT_create()
    
# Loop on .jpg pictures in AWS S3 Bucket
for obj in s3_bucket.objects.all():      # Ne prendre que les .jpg !!!!
    
    key = obj.key
    
    print(key) 
    
    if key.endswith('jpg'):        
        print(key)
        
        # Read picture from AWS S3 Bucket
        pict = np.array(Image.open(s3_bucket.Object(key).get()['Body']))

        # Compute key points and picture descriptors (descript: numpy array with one line by interest point, 128 columns)
        keypoints, descript = sift.detectAndCompute(pict, None)
        
        pd.DataFrame(descript)    

In [None]:
#s3_url = "s3a://moncompartimentamoi/Test/apple_6/*"
s3_url = "https://moncompartimentamoi.s3.eu-west-3.amazonaws.com/Test/apple_6/r0_103.jpg"

df = spark.read.format("image").load(s3_url)

print((df.count(), len(df.columns)))
print(df.printSchema())

df.select('image.nChannels', "image.width", "image.height", "image.data").show(truncate=True)

In [None]:
# Create PySpark RDD (Resilient Distributed Dataset)  from .jpg file
from pyspark.ml.image import ImageSchema

test = ImageSchema.readImages(s3_bucket) 
#"Training/apple_6/")

In [None]:
def load_picture(path):
    test = ImageSchema.toNDArray(path)   
    # TypeError: image argument should be pyspark.sql.types.Row; however, it got [<class 'str'>].
    
    #test = ImageSchema.readImages(path) 
    # AttributeError: '_ImageSchema' object has no attribute 'readImages'
    
    return test

print(load_picture("Training/apple_6/"))

## Clusters' descriptors <a class="anchor" id="sub3_2"></a>

In [None]:
# Save starting time
time_start = time.time()

# Create KMeans clustering model
kmeans = cluster.KMeans(n_clusters=KMEANS_N_CLUSTERS, random_state=42) 

# Train and predict using KMeans clustering model
df_kpdesc_training = pd.concat([df_kpdesc_training, \
    pd.DataFrame(kmeans.fit_predict(df_kpdesc_training[df_kpdesc_training.columns[1:]].values), \
                 columns=['Desc_cluster'])], axis=1)

# Compute time elapse
elapse_s = time.time()-time_start
elapse_m = int(elapse_s / 60)
print('KMeans {} clusters done! Time elapsed: {} seconds ({} minutes)'.format(KMEANS_N_CLUSTERS, elapse_s, elapse_m))

# Number of iterations run et Coordinates of cluster centers
print("Case {} clusters: Converge after {} iterations"\
      .format(kmeans.cluster_centers_.shape[0], kmeans.n_iter_)) 

print()
print("Descriptor dataframe shape : ", df_kpdesc_training.shape)

df_kpdesc_training

## Compute frequency histogram on clusters' descriptors <a class="anchor" id="sub3_3"></a>
samples: pictures x features: clusters

In [None]:
def histo_freq(df_kpdesc_training):
    
    # Use index to count
    df_kpdesc_training.reset_index(drop=False, inplace=True)
    df_kpdesc_training = df_kpdesc_training.pivot_table('index', index='FullFileName', columns='Desc_cluster', \
                                                        aggfunc='count', fill_value=0, margins=True)
    # Normalise: total for a picture is one
    for c in df_kpdesc_training.columns[:-1]:
        df_kpdesc_training[c] = df_kpdesc_training[c] / df_kpdesc_training['All']

    # Drop unusefull information
    df_kpdesc_training.drop(index='All', inplace=True)
    df_kpdesc_training.drop(columns='All', inplace=True)
    return df_kpdesc_training
    
df_kpdesc_training = histo_freq(df_kpdesc_training)    

## Reduce dimension with PCA <a class="anchor" id="sub3_4"></a>

In [None]:
PCA_N_COMPONENTS = 0.90

# Create PCA 
pca = decomposition.PCA()

# Fit PCA
pca.fit(df_kpdesc_training.values)

# Draw explained variance absolute and cumulated
df_eboulis = MyMod.graph_eboulis_valeurspropres(pca, (18, 18), True)

print("{} clusters explain {}% of the variance"\
           .format(df_eboulis[df_eboulis['explained_variance_ratio_cum'] > PCA_N_COMPONENTS]['rang'].min(), \
                   PCA_N_COMPONENTS * 100))

In [None]:
# Create PCA 
pca = decomposition.PCA(n_components=PCA_N_COMPONENTS)

# Fit Transform PCA
pict_features = pca.fit_transform(df_kpdesc_training.values)

print()
print("Matrix dimensions (pictures, visual words) : {}".format(pict_features.shape)) 

# Get PC coordinates in cluster space
df_contrib_PC = pd.DataFrame(pca.components_, columns=df_kpdesc_training.columns) 
df_contrib_PC.shape 

# Get the cluster best represented for each PC
#df_contrib_PC_t = df_contrib_PC.transpose()
lst_contrib = []
for i in range(pca.n_components_):     
    lst_contrib.append(df_contrib_PC.transpose()[i].idxmax(axis=0))
    
# Keep only the cluster best represented
df_kpdesc_training = df_kpdesc_training[lst_contrib]
del lst_contrib

# Unduplicate identical columns
df_kpdesc_training = df_kpdesc_training.T.groupby(level=0).first().T

df_kpdesc_training.head(5)

# Modelling <a class="anchor" id="chapter4"></a> 

## Train a KNN model <a class="anchor" id="sub4_1"></a>

In [None]:
# Create KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# Train KNN model
X_train = df_kpdesc_training.values
y_train = df_main_training['Target_encoded'].values
knn.fit(X_train, y_train)

## Check learning curve <a class="anchor" id="sub4_2"></a>

In [None]:
train_sizes_abs, train_scores, test_scores = learning_curve(knn, X_train, y_train, 
                                            cv=5, scoring='neg_median_absolute_error',
                                            train_sizes=np.linspace(0.1, 1, 5), 
                                            random_state=42)
plot = plt.figure(figsize=(12, 8))
plot = plt.plot(train_sizes_abs, train_scores.mean(axis=1), label='train score')
plot = plt.plot(train_sizes_abs, test_scores.mean(axis=1), label='validation score')
plot = plt.legend()

## Predict and compare prediction to reality on Test dataset <a class="anchor" id="sub4_3"></a>

In [None]:
def predict_class(df, kmeans_model, pca_col_lst, knn_model):
    
    # Save starting time
    time_start = time.time()

    # Extract descriptors
    df_kpdesc = desc_extraction(df)

    # Predict clusters' descriptors with KMEANS
    df_kpdesc = pd.concat([df_kpdesc, \
        pd.DataFrame(kmeans.predict(df_kpdesc[df_kpdesc.columns[1:]].values), columns=['Desc_cluster'])], axis=1)

    # Compute histogram for main clusters
    df_kpdesc = histo_freq(df_kpdesc)    
    df_kpdesc = df_kpdesc[pca_col_lst]

    # Predict class with trained KNN
    df = pd.concat([df, pd.DataFrame(knn.predict(df_kpdesc.values), columns=['Predict'])], axis=1)

    # Compute time elapse
    elapse_s = time.time()-time_start
    elapse_m = int(elapse_s / 60)
    print('Test predictions done! Time elapsed: {} seconds ({} minutes)'.format(elapse_s, elapse_m))
    
    # Assess result ARI
    ari = metrics.adjusted_rand_score(df['Target_encoded'].values, df['Predict'].values)
    print('Test predictions done! Adjusted Rand Index: {}'.format(ari))
    
    return df

# Select Test dataset
df_main_test = df_main[df_main['Dataset'] == 'Test']

# Predict on Test dataset
df_main_test = predict_class(df_main_test, kmeans, df_kpdesc_training.columns, knn)
df_main_test.head(5)

## Predict and compare prediction to reality on Validation dataset <a class="anchor" id="sub4_4"></a>

In [None]:
# Select Validation dataset
df_main_validation = df_main[df_main['Dataset'] == 'Validation']

# Predict on Validation dataset
df_main_validation = predict_class(df_main_validation, kmeans, df_kpdesc_training.columns, knn)
df_main_validation.head(5)

* [Go to Table des matières](#chapter0)

# End <a class="anchor" id="chapter100"></a> 

In [None]:
'''df_main: FullFileName, Dataset, Target, Picture, FileSize (in KB), Rotation, Index, Target_encoded
df_meta: Target, Flag, Value, Target_encoded
df_target_mapping: Target_encoded, Target

df_main_training, df_main_test, df_main_validation: 
        FullFileName, Dataset, Target, Picture, FileSize (in KB), Rotation, Index, Target_encoded, Predict
df_kpdesc_training: FullFileName, Desc_cluster'''