# Mise en place du notebook

## Import des modules

In [4]:
import warnings
from IPython.core.display import HTML
import numpy as np
import pandas as pd
import os
import random
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.models import Model
from pyspark.conf import SparkConf
from pyspark import SparkContext
import boto3
from time import time
from datetime import datetime

## Gestion des paramètres d'affichage de Jupyter

In [None]:
# NOTE(review): `plt` is not imported in the imports cell visible above — on a fresh kernel this
# line raises NameError unless matplotlib.pyplot has been imported as `plt` somewhere else.
plt.style.use('seaborn')

# silence FutureWarning and UserWarning messages
warnings.filterwarnings(category=FutureWarning, action='ignore')
warnings.simplefilter(category=UserWarning, action='ignore')

# display settings for pandas dataframes
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200
pd.set_option("colheader_justify", "left")

# display settings for numpy arrays
# (author's note: 230 on the desktop setup, 165 on the laptop setup)
np.set_printoptions(linewidth=165)

# let large outputs use the full width of the screen
HTML("<style>.container { width:100% }</style>")

## Gestion des chemins

In [7]:
# working directory of the notebook; later cells build the 'input' and 'logs' paths from it
dirpath = os.getcwd()
print("Working directory is :", dirpath)

Working directory is : /home/jim/projet8


In [6]:
# Fruit photo directories available locally, sorted by name
input_root = os.path.join(dirpath, 'input')
list_input_dir = os.listdir(input_root)
list_input_dir.sort()
print("Répertoires de photos de fruits en local :", list_input_dir)

Répertoires de photos de fruits en local : ['Apple Braeburn', 'Apricot', 'Avocado', 'Banana', 'Beetroot', 'Blueberry', 'Cactus fruit', 'Cantaloupe 1', 'Carambula', 'Cauliflower']


In [7]:
# make sure the 'logs/' directory exists under the working directory (no error if it already does)
os.makedirs(os.path.join(dirpath, 'logs/'), exist_ok=True)

## VGG-16

In [21]:
# instantiate VGG16 with its default arguments
vgg16_full = VGG16()
# drop the output layer: the feature extractor ends at the second-to-last layer
model_vgg16 = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

### Fonction de feature extraction AVEC Spark (en local)

In [19]:
def VGG16_extracteur_spark(path, nom_image, model, bucket_name='oc-projet-8'):
    """Turn one image file into its VGG16 feature vector.

    Args:
        path: local root directory of the images, or the sentinel string 'S3' to fetch the
            image from the S3 bucket `bucket_name` (key 'input/<nom_image>').
        nom_image: image path relative to the root (example: 'Apple Braeburn/r_173_100.jpg').
        model: object whose ``predict`` is applied to the preprocessed image
            (model_vgg16 or the value of bc_model_vgg16 in this notebook).
        bucket_name: S3 bucket read when path == 'S3'. Defaults to the bucket name that was
            previously hard-coded, so existing three-argument calls behave as before.

    Returns:
        list: the extracted features as floats (4096 values according to the original
        docstring), followed by nom_image as the last element.
    """
    target_size = (224, 224)
    if path == 'S3':
        import tempfile  # local import: only the S3 branch needs it

        # NOTE: creating the S3 resource for every image is slow
        # (author's TODO: a single instance per executor, e.g. through a broadcast?)
        bucket = boto3.resource('s3').Bucket(bucket_name)
        # Download into a private temporary directory: Spark runs several tasks at the same
        # time, and a shared fixed path such as '/tmp/img' lets one task load the image that
        # another task has just written.
        with tempfile.TemporaryDirectory() as tmp_dir:
            local_copy = os.path.join(tmp_dir, os.path.basename(nom_image) or 'img')
            bucket.download_file('input/' + nom_image, local_copy)
            # decode to an array before the temporary directory is removed
            image = img_to_array(load_img(local_copy, target_size=target_size))
    else:
        # image stored locally: load it for keras and convert the pixels to a numpy array
        image = img_to_array(load_img(os.path.join(path, nom_image), target_size=target_size))
    # add the leading batch dimension: (1, height, width, channels)
    image = image.reshape((1,) + image.shape)
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get extracted features
    im_features = model.predict(image)
    # convert to list and append nom_image
    im_features = im_features[0].tolist()
    im_features.append(nom_image)
    return im_features

## Stockage des data en local

On sélectionne 10 répertoires de données (1 pour les 10 premiers fruits de l'alphabet).

### Inventaire des images chargées en local

In [20]:
# Number of images available locally in each fruit directory
for fruit_dir in list_input_dir:
    n_images = len(os.listdir(os.path.join(dirpath, 'input', fruit_dir)))
    print(f"Nb images de '{fruit_dir}' :", n_images)

Nb images de 'Apple Braeburn' : 492
Nb images de 'Apricot' : 492
Nb images de 'Avocado' : 427
Nb images de 'Banana' : 490
Nb images de 'Beetroot' : 450
Nb images de 'Blueberry' : 462
Nb images de 'Cactus fruit' : 490
Nb images de 'Cantaloupe 1' : 492
Nb images de 'Carambula' : 490
Nb images de 'Cauliflower' : 702


### Liste des images : megabatch_img

In [17]:
# Sorted list of relative image paths ('<fruit directory>/<file name>') over all fruit directories
megabatch_img = sorted(
    fruit_dir + '/' + file_name
    for fruit_dir in list_input_dir
    for file_name in os.listdir(os.path.join(dirpath, 'input', fruit_dir))
)

print(len(megabatch_img), "images dans megabatch_img :")

4987 images dans megabatch_img :


## Stockage des data sur S3

On sélectionne 10 répertoires de données (1 pour les 10 premiers fruits de l'alphabet) depuis le répertoire local que l'on exporte vers un bucket S3.

In [32]:
# connect to the S3 bucket
# NOTE(review): this cell uses bucket 'projet8-oc' whereas VGG16_extracteur_spark hard-codes
# 'oc-projet-8' — confirm which bucket name is the current one.
s3_bucket = boto3.resource('s3').Bucket('projet8-oc')

In [46]:
# inventory of the objects whose key starts with "input" in the bucket
all_fruits = s3_bucket.objects.filter(Prefix="input")
# count the objects by iterating over the collection once
print("Nb de fichiers dans le bucket :", sum(1 for _ in all_fruits))
print("\nFichiers dans le bucket :")
# listing of every key, left disabled:
# for obj in all_fruits:
#     print(obj.key)

Nb de fichiers dans le bucket : 4988

Fichiers dans le bucket :


In [18]:
# Rebuild megabatch_img from the S3 listing: object keys without their "input/" prefix.
# str.lstrip("input/") removes any leading run of the characters i, n, p, u, t and '/',
# not the literal prefix (a key like 'input/tomato/1.jpg' would lose its leading 't'),
# so the prefix is sliced off explicitly instead.
S3_INPUT_PREFIX = "input/"
megabatch_img = []
for obj in all_fruits:
    if not obj.key.startswith(S3_INPUT_PREFIX):
        continue
    relative_key = obj.key[len(S3_INPUT_PREFIX):]
    # Skip folder placeholders such as the bare "input/" key — presumably the non-relevant
    # first object that the original cell removed with `del megabatch_img[0]` (TODO confirm).
    if relative_key and not relative_key.endswith("/"):
        megabatch_img.append(relative_key)

# Réductions de dimension en local avec Spark

## Mise au point de l'extraction de features avec VGG16

### Démarrage d'un SparkContext

In [162]:
N_WORKERS = 8
master_url = f'local[{N_WORKERS}]'
conf = SparkConf().setMaster(master_url).setAppName(f'Spark ({N_WORKERS} workers) sur Jupyter')

sc = SparkContext(conf=conf)

# broadcast the image root directory and the feature-extraction model
inputpath = os.path.join(dirpath, 'input')
bc_inputpath = sc.broadcast(inputpath)
bc_model_vgg16 = sc.broadcast(model_vgg16)

INFO:tensorflow:Assets written to: ram://2a3a9458-06c8-48b7-a4a4-75ae10e1bb94/assets


In [163]:
# display the configuration of the running SparkContext as (key, value) pairs
sc.getConf().getAll()

[('spark.app.name', 'Spark (3 workers) sur Jupyter'),
 ('spark.driver.host', '192.168.1.24'),
 ('spark.rdd.compress', 'True'),
 ('spark.master', 'local[3]'),
 ('spark.driver.port', '41181'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.app.startTime', '1642344132895'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1642344132925'),
 ('spark.ui.showConsoleProgress', 'true')]

### Tests comparatifs d'extraction en variant le nb de workers / images 

In [1]:
# without broadcast
# uses the former extraction function (VGG16_extracteur_spark_old)
# NOTE(review): VGG16_extracteur_spark_old is not defined anywhere in the visible notebook —
# this cell cannot be re-run as is.
t0 = time()

N_IMAGES = 64

resultat = sc.parallelize(megabatch_img[:N_IMAGES]) \
    .map(lambda img: os.path.join(dirpath, 'input', img)) \
    .map(lambda img_path: VGG16_extracteur_spark_old(img_path)) \
    .collect()

t_delta = time() - t0

In [31]:
# elapsed time measured around the previous Spark job, in seconds
t_delta

23.28361678123474

Quelques remarques. Toutes choses égales par ailleurs :  
    - le premier run après un chargement de nouveau spark context est plus long que les runs suivants.  
    - les temps indiqués ci-dessous sont les temps mesurés pour la durée d'exécution de la cellule Jupyter avec la nbextension ExecuteTime (ExecuteTime retourne 6s de plus que les temps de Spark Web UI pour 4 workers).

__1 worker :__  
1 worker et 1 image : 2 runs : 10.5 / 10.9s   
1 worker et 64 images : 2 runs : 20.8 / 21.2s  
1 worker et 2048 images : le job plante à chaque fois (3 tests) après environ 345s : "java.lang.OutOfMemoryError: Java heap space" (solution ? lorsque Java est démarré il faut certainement lui donner un heap plus grand).    
En résumé, avec 1 worker :  il faut 10.5s d'overhead puis 10.5s pour 64 images.  
Durant les jobs longs, la RAM (32Go au total) monte de 15% (initial) à 26% d'utilisation, les CPU sont à 60%. 

__2 workers :__  
2 workers et 1 image : 11.0 / 10.6s  
2 workers et 4 images : 11.6 / 11.2  
2 workers et 16 images : 12.7 / 13.1  
2 workers et 64 images : 20.0 / 19.8  
2 workers et 128 images : 28.7 / 29.3  
2 workers et 2048 images : 300 / 304s  
En résumé, avec 2 workers :  il faut 11s d'overhead puis 9s pour 64 images.  
Durant les jobs longs, la RAM (32Go au total) monte de 15% (initial) à 35% d'utilisation, les CPU sont à 75%.  
  
__4 workers :__  
4 workers et 1 image : 14.4 / 12.8 / 13.3s  
4 workers et 16 images : 15.9 / 14.3  
4 workers et 64 images : 20.9 / 21.3  
4 workers et 128 images : 28.9 / 28.9  
4 workers et 256 images : 46.1 / 44.5  
4 workers et 512 images : 76.3 / 77.2
4 workers et 2048 images :  277 / 270  
En résumé, avec 4 workers :  il faut 13s d'overhead puis 8s pour 64 images.  
Durant les jobs longs, la RAM (32Go au total) monte de 15% (initial) à 60% d'utilisation (pic à 80%), les CPU sont à 95%.  

__8 workers :__  
J'ai fait un premier test avec 8 workers, mais mes applis ouvertes et le système (carte réseau !) ont été bousillées par le job Spark. Le problème vient de l'insuffisance de la RAM (solution : augmenter le swap ?).  
Les tests suivants ont été réalisés avec Jupyter fermé (uniquement avec le kernel ipython ouvert), et le monitoring s'est fait avec time.time()
8 workers et 1 image : 20.7 / 19.0s (CPU, RAM et swap à 100%!)
8 workers et 64 images : plantage "ERROR Executor" "org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)", c'est sans doute dû à un manque de RAM, mais ce n'est pas dit explicitement dans les logs. Cependant Java n'a pas planté (on peut lancer des runs subséquents).  
En augmentant le swap de mon système (de 2Go à 16Go), la RAM tient, mais ça plante sur une "java.lang.OutOfMemoryError: Java heap space".  
En passant conf.set("spark.driver.memory", "4g") et conf.set("spark.executor.memory", "4g") : a priori ça n'a pas d'effet sur "heap committed" de la JVM, et on se retrouve encore avec une "java.lang.OutOfMemoryError: Java heap space".  
Différents essais pour augmenter la heap de java : non concluant.  
Autres solutions envisagées : rendre le GC de Python plus agressif.

### Utilisation de broadcast (données chargées en local)

In [24]:
N_IMAGES = 64

t0 = time()

# the image root and the model are both read from their broadcast variables inside the task
images_rdd = sc.parallelize(megabatch_img[:N_IMAGES])
features_rdd = images_rdd.map(
    lambda img: VGG16_extracteur_spark(bc_inputpath.value, img, bc_model_vgg16.value)
)
resultat = features_rdd.collect()

t_delta = time() - t0

Temps de calcul :  
  
| SETUP                        | SANS BROADCAST |            AVEC BROADCAST  | 
| ---------------------------- | ------------- | ----------------------------  |
| 1 workers / 64 images     | 21s |  16.6 / 11.0 / 10.7s / 15.2 / 10.6 / 11.2 / 10.6 / 10.6s  |
| 2 workers / 1 images     | 10.7s | 0.2 / 0.2 / 0.2 / 2.4 / 2.4 / 0.2 / 0.2 / 2.1 / 0.2s  |
| 2 workers / 16 images     |12.9s |  8.5 / 6.5 / 2.5 / 6.3 / 2.2 / 4.6s  |
| 2 workers / 64 images     | 19.9s | 13.5 / 9.0 /  9.7s  | 
| 2 workers / 128 images     | 29.0s | 18.1 / 18.4s  |
| 2 workers / 2048 images     | 300s | planté (heap space) / 291s / 292s  | 
| 4 workers / 64 images     | 21s | 16.5 / 8.5 / 8.3 / 8.5s  | 
| 8 workers / 64 images     |  | 8.3 / 8.4 / 8.8s  | 
  
En résumé, avec 2 workers :  il faut 0s d'overhead puis 9s pour 64 images.  
Durant les jobs longs, la RAM (32Go au total) monte de ?% (initial) à 22% d'utilisation, les CPU sont à 75%.  

1 worker : CPUs à 65%  
2 workers : CPUs à 75%  
4 workers : CPUs à 95%  
8 workers : CPUs à 100%


Le boost en temps d'overhead est spectaculaire ! Mais les résultats sont assez fluctuants, en particulier lorsque N_IMAGES est petit.   
Typiquement, quand on fait le calcul d'un même setup plusieurs fois d'affilée, le premier run (parfois le second) est plus lent.  
Au global, le gain est faible sur un grand nombre d'images.  
  


### Les données sont téléchargées depuis S3

In [133]:
# logger
# NOTE(review): timestamp() and logger_s3() are only defined further down, in the script cell
# copied to extraction_features_v1.py; on a fresh kernel this cell fails unless that cell has
# been run first.
logfile = "Log for job started " + timestamp() + ".txt"
logger_s3(s3_bucket, logfile, "job starts")

In [8]:
N_IMAGES = 16

# same extraction as the local run, except each image is fetched from S3 ('S3' sentinel path)
images_rdd = sc.parallelize(megabatch_img[:N_IMAGES])
resultat = images_rdd.map(
    lambda img: VGG16_extracteur_spark('S3', img, bc_model_vgg16.value)
).collect()

Temps de calcul :  
  
| SETUP                        |         INPUT DATA EN LOCAL  | INPUT DATA SUR S3 |
| ---------------------------- |  ----------------------------  |-------------  |
| 1 worker / 64 images     | 13s  | 30.6 / 26.3 / 27.4 / 30.0 / 26.6 / 25.9 / 30.2 s|
| 2 workers / 1 images     |  0.2s  | 0.5 / 0.4 / 0.4s |
| 2 workers / 16 images     |  2.2s  | 4.2 / 8.2 / 4.2 / 4.3s | 
| 2 workers / 64 images     | 9.0s  | 16.6 / 16.4 s|
| 2 workers / 128 images     | 18.2s  | 31.0 / 31.9 s |
| 2 workers / 2048 images     | 291s / 292s  | 515s |
| 4 workers / 64 images     | 8.5s  | 11.2 / 11.1 / 10.9 s|
| 8 workers / 64 images     |  8.5s | 9.5 / 9.3 / 9.0 / 9.2  s|
  
En résumé, avec 2 workers :  il faut 0s d'overhead puis 16s pour 64 images.  
Durant les jobs longs, la RAM (32Go au total) monte de ?% (initial) à 38% d'utilisation.
Le temps de calcul augmente de 80% par rapport au chargement local de la data.  

1 worker : CPUs à 30%.  
2 workers : CPUs à 50%.  
4 workers : CPUs à 75%.  
8 workers : CPUs à 95%

In [135]:
# logger: append the end-of-job entry to the same log file as the "job starts" entry
logger_s3(s3_bucket, logfile, "job has finished")

### Arrêt du SparkContext

In [11]:
# stop the SparkContext used for the local tests above
sc.stop()

## Conversion du RDD post-extraction en dataframe spark

Le but est de récupérer le RDD issu de l'extraction, avant le collect(), et le transformer en dataframe Spark.  

### Avec spark.createDataFrame(rdd)

Il y a deux options pour démarrer une SparkSession :  
- à partir d'un SparkContext.  
- from scratch.  
Seule la deuxième a fonctionné sans bug.  

In [26]:
from pyspark.sql import SparkSession

# SparkSession created "from scratch" (not from an existing SparkContext)
# NOTE(review): "spark.some.config.option" / "some-value" looks like a placeholder setting — confirm it is needed
spark = (
    SparkSession.builder
    .master("local")
    .appName("Extraction VGG16")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [27]:
# display the SparkSession
spark

###### Calculs

In [13]:
# VGG16_extracteur_spark takes (path, nom_image, model): pass the image root and the model
# explicitly instead of a single pre-joined path (the one-argument call raises TypeError).
# The image root is derived from dirpath rather than a hard-coded absolute path.
input_dir = os.path.join(dirpath, 'input')
# Broadcast the model on this session's SparkContext: the earlier broadcast belongs to the
# SparkContext that has been stopped above.
bc_model_session = spark.sparkContext.broadcast(model_vgg16)
rdd = spark.sparkContext.parallelize(megabatch_img[:16]) \
    .map(lambda img: VGG16_extracteur_spark(input_dir, img, bc_model_session.value))
rdd.collect();

In [None]:
# build a Spark DataFrame from the RDD of extracted feature lists
sdf = spark.createDataFrame(rdd)
# sdf.show()   # left commented out because it would otherwise print thousands of lines

Ca fonctionne.  

### Par un calcul tout-en-un avec .toDF(), ça fonctionne mais c'est très lent

In [55]:
# A SparkSession must be instantiated, otherwise this does not work!
# spark.sparkContext then "inherits" from the SparkSession.
# Use this way of getting a Spark context: spark.sparkContext

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Projet 8').getOrCreate()

In [14]:
# VGG16_extracteur_spark takes (path, nom_image, model): pass the image root and the model
# explicitly instead of a single pre-joined path (the one-argument call raises TypeError).
# The image root is derived from dirpath rather than a hard-coded absolute path.
input_dir = os.path.join(dirpath, 'input')
# broadcast the model on the SparkContext owned by the current SparkSession
bc_model_todf = spark.sparkContext.broadcast(model_vgg16)
sdf = spark.sparkContext.parallelize(megabatch_img[:4]) \
    .map(lambda img: VGG16_extracteur_spark(input_dir, img, bc_model_todf.value)) \
    .toDF()

En remplaçant .collect() par .toDF(), le temps de calcul triple => on préférera travailler avec .collect() qu'avec .toDF()

### Conclusion

spark.createDataFrame(rdd) : ça fonctionne bien tant qu'on reste dans les clous mais on génère de  trop nombreux "java.lang.OutOfMemoryError: Java heap space".  
A éviter.  

Avec rdd.toDF() : ça fonctionne mais c'est très lent.  

## Lancement du job complet depuis la CLI avec la commande spark-submit

Le code de la cellule suivante est copié dans le fichier "extraction_features_v1.py"  

Durée du job spark avec la commande "spark-submit --master __local[4]__ extraction_features_v1.py" :   
 4 images : 9.7 / 9.5 / 9.6s  (durée totale du script Python : 19 / 19s)  
16 images : 11.5 / 11.9 / 11.5s  
64 images : 20.2 / 19.3s
128 images : 29.8 / 26.9s / 31.6 / 28.6s (durée totale du script Python : 42 / 39s)  
=> Il y a un overhead de 10s pour le démarrage et l'export des données.  
Les CPUs turbinent à 85%.  

Durée du job spark avec la commande "spark-submit --master __local[8]__ extraction_features_v1.py" :   
128 images : 30.4 / 32.2s (durée totale du script Python : 39 / 42s)  
La RAM monte à 100% (32Go) et le swap prend le relai (5% de 16Go).  
Les CPUs turbinent à 95%.  

In [None]:
# imports modules
import numpy as np
import pandas as pd
import os
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.models import Model
from pyspark import SparkContext
import boto3
from datetime import datetime
from time import time

def timestamp():
    """Return the current local date and time formatted as 'YYYY-MM-DD--HH-MM-SS'."""
    now = datetime.now()
    return f"{now:%Y-%m-%d--%H-%M-%S}"

def logger_s3(s3_bucket, logfile, log, log_to_shell=True):
    """Append a timestamped entry to a local log file, then upload that file to S3.

    Args:
        s3_bucket: object exposing ``upload_file(local_path, key)`` (a boto3 Bucket in this script).
        logfile (str): log file name; written locally under ./logs/ and uploaded under the
            key 'logs/<logfile>'.
        log (str): message to record.
        log_to_shell (bool): if True, also print the entry to stdout.

    Returns:
        None
    """
    time_log = timestamp() + " : " + log + "\n"
    if log_to_shell:
        print("*"*100 + "\n", time_log)
    # create ./logs/ on demand so that the first call does not depend on a prior setup step
    os.makedirs("./logs/", exist_ok=True)
    local_path = "./logs/" + logfile
    # explicit UTF-8 so that accented messages are written the same way on every platform
    with open(local_path, 'a', encoding='utf-8') as f:
        f.write(time_log)
    # the whole file is re-uploaded after every entry
    s3_bucket.upload_file(local_path, 'logs/' + logfile)

# feature extraction function
def VGG16_extracteur_spark(path, nom_image, model, bucket_name='oc-projet-8'):
    """Turn one image file into its VGG16 feature vector.

    Args:
        path: local root directory of the images, or the sentinel string 'S3' to fetch the
            image from the S3 bucket `bucket_name` (key 'input/<nom_image>').
        nom_image: image path relative to the root (example: 'Apple Braeburn/r_173_100.jpg').
        model: object whose ``predict`` is applied to the preprocessed image
            (model_vgg16 or the value of bc_model_vgg16 in this script).
        bucket_name: S3 bucket read when path == 'S3'. Defaults to the bucket name that was
            previously hard-coded, so existing three-argument calls behave as before.

    Returns:
        list: the extracted features as floats (4096 values according to the original
        docstring), followed by nom_image as the last element.
    """
    target_size = (224, 224)
    if path == 'S3':
        import tempfile  # local import: only the S3 branch needs it

        # the executors need their own Bucket instance, distinct from the driver's one
        s3_bucket_vgg16 = boto3.resource('s3').Bucket(bucket_name)
        # Download into a private temporary directory: Spark runs several tasks at the same
        # time, and a shared fixed path such as '/tmp/img' lets one task load the image that
        # another task has just written.
        with tempfile.TemporaryDirectory() as tmp_dir:
            local_copy = os.path.join(tmp_dir, os.path.basename(nom_image) or 'img')
            s3_bucket_vgg16.download_file('input/' + nom_image, local_copy)
            # decode to an array before the temporary directory is removed
            image = img_to_array(load_img(local_copy, target_size=target_size))
    else:
        # image stored locally: load it for keras and convert the pixels to a numpy array
        image = img_to_array(load_img(os.path.join(path, nom_image), target_size=target_size))
    # add the leading batch dimension: (1, height, width, channels)
    image = image.reshape((1,) + image.shape)
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get extracted features
    im_features = model.predict(image)
    # convert to list and append nom_image
    im_features = im_features[0].tolist()
    im_features.append(nom_image)
    return im_features

# if __name__ == '__main__':  # (author's open question: does the script work under this guard?)

# the driver needs its own Bucket instance, distinct from the ones created on the executors
s3_bucket = boto3.resource('s3').Bucket('oc-projet-8')

# start logging
logs_dir = os.path.join(os.getcwd(), 'logs/')
os.makedirs(logs_dir, exist_ok=True)
logfile = "Log for job started " + timestamp() + ".txt"
logger_s3(s3_bucket, logfile, "logging starts")

# instantiate VGG16, then keep everything up to its second-to-last layer (output layer removed)
vgg16_full = VGG16()
model_vgg16 = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

# Spark context: no SparkConf here, the configuration is handled by spark-submit
sc = SparkContext()

# log the settings of the Spark context that was just started
spark_settings = str(sc.getConf().getAll())
logger_s3(s3_bucket, logfile, "Configuration de Spark :" + spark_settings)

# broadcasting
bc_model_vgg16 = sc.broadcast(model_vgg16)




# list of images used for the test: 128 'Apple Braeburn' file names in lexicographic order
# (numbers 0, 1, 20, 10 to 19 and 100 to 214 — the same names as the former literal list)
batch_img = sorted(
    f'Apple Braeburn/{number}_100.jpg'
    for number in [0, 1, 20, *range(10, 20), *range(100, 215)]
)

# logging
logger_s3(s3_bucket, logfile, "spark job starts")
t0 = time()

# Spark job: extract the features of the first N_IMAGES test images, read from S3
N_IMAGES = 128
images_rdd = sc.parallelize(batch_img[:N_IMAGES])
features_rdd = images_rdd.map(lambda img: VGG16_extracteur_spark('S3', img, bc_model_vgg16.value))
resultat = features_rdd.collect()

job_duration = round(time() - t0, 1)
logger_s3(s3_bucket, logfile, f"spark job has ended (duration of {job_duration}s) - output to S3 starts")

# output: one row per image, 4096 feature columns followed by the image path
feature_columns = [f'dim_{i}' for i in range(4096)]
df_output = pd.DataFrame(resultat)
df_output.columns = feature_columns + ['path']
s3_bucket = boto3.resource('s3').Bucket('oc-projet-8')
file_name = f'df_output_{len(df_output)}_fruits_-_spark_en_local.csv'
local_path = os.path.join(os.getcwd(), file_name)
# write the CSV locally first, then upload it under the 'output/' prefix
df_output.to_csv(path_or_buf=local_path)
s3_bucket.upload_file(local_path, 'output/' + file_name)

# end logging
logger_s3(s3_bucket, logfile, "output to S3 has ended")

# Réductions de dimension dans le cloud avec Spark et AWS EMR

## Démarrage du service EMR

###### Configuration finale retenue

Zone : eu-west-1  

**Software Configuration**  
EMR release : 6.5.0.
Cocher uniquement : Hadoop, Hive, Spark (JupyterEnterpriseGateway et Livy si notebook EMR). Ne pas cocher TensorFlow.     
  
**Hardware**  
On prend tous les paramètres par défaut.  
Instance type : m5.xlarge (m5 : general purpose, 4CPU, 16Go de RAM) (EBS : 2*32Go)  
Number of instances : 3 (1 master and 2 core nodes) pour les tests ; 7 pour le cluster du test de scalabilité.  

**General Options**  
On prend tous les paramètres par défaut.   
  
**Security Options**  
Choisir une clé préalablement créée.    
Utiliser la config par défaut. 
(Une fois le cluster démarré, ouvrir le port 22 via la console AWS si besoin de SSH.)  
  
**Bootstrap actions**  
Custom action : importer un script .sh depuis S3 (script pour le "pip install").  
  
    
Note : sur mon compte, j'ai une limitation à < 32vCPUs. Ca correspond à 7 instances xlarge.  


###### Débuggage du démarrage

Point le plus important : **choisir un EMR 5.34.0 plutôt que 6.5.0.** pour le démarrage d'une nouvelle config.  Les EMR 6 plantent facilement.  
Préférer eu-west-1 (Irlande) à eu-west-3 (Paris).  

## Liste des modules Python installés

Lancer le script PySpark suivant sur l'EMR et récupérer le stdout.

In [20]:
import subprocess
import sys

# pip does not offer a supported in-process API (pip._internal may change or break between
# releases): list the installed packages by running `python -m pip list` with the interpreter
# that executes this script, and echo the listing to stdout.
pip_listing = subprocess.run(
    [sys.executable, '-m', 'pip', 'list'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
)
print(pip_listing.stdout)

from pyspark import SparkContext
sc = SparkContext()   # otherwise unused: per the author, EMR only accepts the job if a SparkContext is created

### EMR 6.5.0

In [None]:
Résultat des modules installés sur le cluster avant bootstrapping :  
**********************************************
absl-py                    0.15.0
astor                      file-astor-VERSION
astunparse                 1.6.3
aws-cfn-bootstrap          2.0
beautifulsoup4             4.9.3
bleach                     4.1.0
boto                       2.49.0
botocore                   1.23.39
cachetools                 4.1.1
certifi                    2021.10.8
chardet                    4.0.0
charset-normalizer         2.0.7
click                      8.0.3
cryptography               3.2.1
docutils                   0.14
flatbuffers                2.0
funcsigs                   1.0.2
gast                       0.5.2
google-auth                1.23.0
google-auth-oauthlib       0.4.2
google-pasta               0.2.0
grpcio                     1.41.0
h5py                       2.10.0
html5lib                   1.1
idna                       2.10
importlib-metadata         2.0.0
jmespath                   0.10.0
joblib                     1.1.0
Keras-Applications         1.0.8
Keras-Preprocessing        1.1.2
lockfile                   0.11.0
lxml                       4.6.3
Markdown                   3.3.4
mysqlclient                1.4.2
nltk                       3.6.5
nose                       1.3.4
numpy                      1.21.5
oauthlib                   3.1.0
opt-einsum                 2.3.2
packaging                  21.0
pip                        20.2.2
protobuf                   3.19.0
py-dateutil                2.2
pyasn1                     0.4.8
pyasn1-modules             0.2.8
pystache                   0.5.4
python-daemon              2.2.3
python-dateutil            2.8.2
python37-sagemaker-pyspark 1.4.1
pytz                       2021.3
PyYAML                     5.4.1
regex                      2021.10.21
requests                   2.26.0
requests-oauthlib          1.3.0
rsa                        4.6
s3transfer                 0.5.0
setuptools                 44.0.0
simplejson                 3.2.0
six                        1.13.0
tensorboard                2.4.1
tensorboard-plugin-wit     1.8.0
tensorflow                 2.4.1
tensorflow-estimator       2.4.0
termcolor                  1.1.0
tqdm                       4.62.3
typing-extensions          3.10.0.2
urllib3                    1.26.7
webencodings               0.5.1
Werkzeug                   1.0.1
wheel                      0.35.1
windmill                   1.6
wrapt                      1.12.1
zipp                       3.4.0

## Adaptation (débuggage) du code qui tourne en local mais pas sur EMR

#### Modules manquants

boto3, pandas, keras, PIL ne sont pas installés  ->   solution :   "sudo python3 -m pip install boto3 pandas keras"   dans le bootstrap

#### Version de pandas

Downgrade de pandas: "sudo python3 -m pip install pandas==1.2.5" dans le bootstrap

#### Mémoire insuffisante du conteneur du driver

L'instruction problématique est    "bc_model_vgg16 = sc.broadcast(model_vgg16)"    qui fait planter le Driver Program.

Solution retenue : passer la mémoire du conteneur du driver de 2 à 10Go via l'option :  
spark-submit --driver-memory 10g  

#### TypeError: can't pickle weakref objects

Les versions pré 2.7 de TensorFlow ont des problèmes notoires de sérialisation des modèles keras.  
  
Solution retenue :  install de TF v2.7.0 sur le cluster. 

## Scripts pour les configurations retenues

### Création du cluster depuis AWS CLI

```shell
aws emr create-cluster --applications Name=Hadoop Name=Hive Name=Spark Name=Ganglia --ec2-attributes '{"KeyName":"cle_irelande","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-00bbdbcf537bf57b5","EmrManagedSlaveSecurityGroup":"sg-08a05f6eec6c610ac","EmrManagedMasterSecurityGroup":"sg-09d0c339df01417fd"}' --release-label emr-6.5.0 --log-uri 's3n://aws-logs-327946743066-eu-west-1/elasticmapreduce/' --instance-groups '[{"InstanceCount":6,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"Core - 2"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"Master - 1"}]' --auto-scaling-role EMR_AutoScaling_DefaultRole --bootstrap-actions '[{"Path":"s3://projet8-oc/scripts-pyspark/emr_boostrap_v5.sh","Name":"Custom action"}]' --ebs-root-volume-size 100 --service-role EMR_DefaultRole --enable-debugging --auto-termination-policy '{"IdleTimeout":14400}' --name 'Cluster pour Projet 8' --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region eu-west-1
```

### Bootstrap

```shell
#!/bin/bash
sudo python3 -m pip install boto3 pandas==1.2.5 tensorflow==2.7.0 Pillow==8.4.0
```

### Lancement d'un job PySpark depuis la CLI du node master

```shell
spark-submit --deploy-mode cluster --driver-memory 10g s3://projet8-oc/scripts-pyspark/extraction_features_v5.1.py
```

## Exécutions du job spark

Pour tirer profit au maximum du service EMR (historique des jobs,gestion des logs, monitoring Spark, Ganglia), on préférera utiliser la soumission par la méthode "steps" du service EMR, plutôt que de faire des "spark-submit" via la console.  

### Jobs de tuning des paramètres sur mini-clusters à 3 nodes (manager + driver + 1 exécuteur)

Paramètres par défaut du cluster à 3 machines m5.xlarge :  

| FLAG | PROPERTY | VALUE | COMMENTAIRE
| - | - | -| -
| --deploy-mode | spark.submit.deployMode | cluster  
| | yarn.nodemanager.resource.memory-mb | 12288   (ça fait 12Gb)  | c'est la valeur par node worker
| | yarn.nodemanager.resource.cpu-vcores | 4  | c'est la valeur par node worker
| --driver-cores | | ??? | on peut mettre à 4 tranquille si le driver n'est pas sur le node master
| --driver-memory | spark.driver.memory | 2G  | totalement insuffisant pour instancier VGG16
| --num-executors | spark.executor.instances |  1 vraisemblablement | vu qu'un node worker est pris par le driver il ne reste qu'un node pour l'exécuteur
| --executor-memory | spark.executor.memory | 9486M | c'est la valeur par exécuteur  
| --executor-cores | spark.executor.cores | 4 | c'est la valeur par exécuteur
| | spark.dynamicAllocation.enabled | true
  
---
AWS n'est pas user-friendly quand on veut passer le --deploy-mode client (càd avoir le driver sur le node master, et donc économiser un node (et diminuer le coût du cluster)), donc dans le cadre d'un POC, je ne teste pas et je reste en mode cluster.    




---
| CLUSTER | APP ID | SPARK-SUBMIT | ACTION | N_IMAGES | DUREE JOB SPARK (logger) | COMMENTAIRE 
| - | - | - | - | - | - | - | 
| 20b | 5/6 | --deploy-mode cluster --driver-memory 10g | extraction_features_v3.1.1.py | 1 | 14.1 / 12.9s | 4 tasks (3 vides). 1 seul exécuteur à 4 cores (tous sur le même node)
| 20b | 7/8 | --deploy-mode cluster --driver-memory 10g | extraction_features_v3.1.64.py | 64 | 29.9 / 31.1 s | 4 tasks. 1 seul exécuteur à 4 cores (tous sur le même node)
| 20b | 3/4/9 | --deploy-mode cluster --driver-memory 10g | extraction_features_v3.1.py | 128 | 43.3 / 42.7 / 43.8s | 4 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 10 | --deploy-mode cluster --driver-memory 10g --num-executors 1 | extraction_features_v3.1.py | 128 | 42.4s | 4 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 11 | --deploy-mode cluster --driver-memory 10g --num-executors 2 | extraction_features_v3.1.py | 128 | 47.3s | 4 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 12 | --deploy-mode cluster --driver-memory 10g --num-executors 3 | extraction_features_v3.1.py | 128 | 47.3s | 4 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 13 | --deploy-mode cluster --driver-memory 10g --num-executors 4 | extraction_features_v3.1.py | 128 | 47.1s | 4 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 15 | --deploy-mode cluster --driver-memory 10g | extraction_features_v4.1.py (8 conteneurs) | 128 | 43.5s | 8 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 16 | --deploy-mode cluster --driver-memory 10g --num-executors 2 | extraction_features_v4.1.py (8 conteneurs) | 128 | 43.0s | 8 tasks. 1 seul exécuteur à 4 cores (tous sur le même node) 
| 20b | 17/20 | --deploy-mode cluster --driver-memory 10g --num-executors 2 --executor-cores 2 --executor-memory 4743M | extraction_features_v4.1.py (8 conteneurs) | 128 | 47.0 / 44.3s | 8 tasks. 2 exécuteurs à 2 cores (tous sur le même node) 
| 20b | 18/19/21 | --deploy-mode cluster --driver-memory 10g --num-executors 4 --executor-cores 1 --executor-memory 2400M | extraction_features_v4.1.py (8 conteneurs) | 128 | (62.2s (1 task failed) / 55.9 / 57.2s) | 8 tasks. 4 exécuteurs à 1 core (tous sur le même node)  

Sur le cluster 20b, 128 images sont automatiquement partitionnées en 4 conteneurs.  
On a testé comparativement une partition en 8 conteneurs => pas de gain de temps significatif si le job se passe bien. Mais calcul plus rapide (conteneur plus petit) si une tâche plante.  

### Jobs de tuning des paramètres sur mini-clusters à 4 nodes (manager + driver + 2 exécuteurs)

---
| CLUSTER | APP ID | SPARK-SUBMIT | ACTION | N_IMAGES | DUREE JOB SPARK (logger) | COMMENTAIRE 
| - | - | - | - | - | - | - | 
| 21 | 36 | --deploy-mode cluster --driver-memory 10g | extraction_features_v3.1.1.py | 1 | 13.6s | 8 tasks (7 vides). 2 exécuteurs à 4 cores (sur 2 nodes)
| 21 | 4 | --deploy-mode cluster --driver-memory 10g | extraction_features_v3.1.64.py | 64 | 24.2 s | 8 tasks. 2 exécuteurs à 4 cores (sur 2 nodes)
| 21 | 5/6 | --deploy-mode cluster --driver-memory 10g | extraction_features_v3.1.py | 128 | 31.5 / 31.2s | 8 tasks. 2 exécuteurs à 4 cores (sur 2 nodes)
| 21 | 7/8/9/10/11/12/13 | --deploy-mode cluster --driver-memory 10g --num-executors 4 --executor-cores 2 --executor-memory 4743M | extraction_features_v3.1.py | 128 | 41.6 (1 failed) / 40.9 (1 failed) / 52.0 (1 failed) / 49.4 (1 failed) / 32.5 (0 failed) / 32.0 (0 failed) / 46.4 (2 failed) | 8 tasks. 4 exécuteurs à 2 cores (sur 2 nodes)



### Jobs finaux sur le cluster de POC à 7 nodes (manager + driver + 5 exécuteurs)

AWS limite mon compte à moins de 32 vCPUs. Je peux donc lancer au mieux 7 nodes à 4 vCPUs (soit 28 vCPUs).  

| CLUSTER | APP ID | SPARK-SUBMIT | ACTION | N_IMAGES | DUREE JOB SPARK (logger) | COMMENTAIRE 
| - | - | - | - | - | - | - | 
| 22b | 3 | --deploy-mode cluster --driver-memory 10g | extraction_features_v5.0.py | 491 | 40.8s | 20 tasks. 5 exécuteurs à 4 cores. Job fait en 78.4s sur mon PC en local (avec data sur S3).
| 22b | 4/6 | --deploy-mode cluster --driver-memory 10g | extraction_features_v5.1.py | 4987 | 224 / 224s | 20 tasks. 5 exécuteurs à 4 cores.



#### Combien d'images par heure CPU d'exécuteur ?

4987-491=4496 images extraites en 224-41=183 secondes, avec 20 vCPUs m5.  
**En 1h, un vCPU de m5 va extraire 4400 images**

In [8]:
# Throughput in images per vCPU-hour, from the difference between the two final
# runs (4987 vs 491 images, 224 s vs 41 s) on 20 executor vCPUs.
extra_images = 4987 - 491
extra_seconds = 224 - 41
extra_images / extra_seconds / 20 * 3600

4422.295081967212

#### Quel coût pour 1 million d'images ?

Hypothèses :  
- 0.2€ / h pour une instance EC2 m5.xlarge.
- On ne tient pas compte du coût du service EMR, ni de l'EBS, ni des accès S3 (coûts comparativement faibles).  
- On ne tient pas compte du coût du master node, ni du driver node (coûts comparativement faibles si le cluster comporte beaucoup de nœuds).  

In [9]:
# Cost estimate for 1 million images.
# The throughput is recomputed here instead of being read from IPython's `_`
# (previous cell output), so the cell gives the same result whatever cell was
# executed just before it.
images_per_vcpu_hour = (4987 - 491) / (224 - 41) / 20 * 3600
VCPUS_PER_INSTANCE = 4              # m5.xlarge
PRICE_PER_INSTANCE_HOUR_EUR = 0.2   # hypothesis stated above
print(f"Coût pour 1 million d'images = {round(1e6 / images_per_vcpu_hour / VCPUS_PER_INSTANCE * PRICE_PER_INSTANCE_HOUR_EUR)} €")

Coût pour 1 million d'images = 11 €


## Script PySpark retenu

Correspond au fichier "extraction_features_v5.1.py".  

In [15]:
# bucket choice depends on which AWS region is available:
BUCKET_NAME = "projet8-oc"   # eu-west-1

# imports modules
import os
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model
from pyspark import SparkContext
import boto3
from datetime import datetime
from time import time
import pandas as pd

def timestamp():
    """Return the current date and time formatted as 'YYYY-MM-DD--HH-MM-SS'."""
    now = datetime.now()
    return f"{now:%Y-%m-%d--%H-%M-%S}"

def logger_s3(s3_bucket, logfile, log, log_to_shell=True):
    """Append a timestamped entry to a local log file, then upload that file to S3.

    The local file lives under ./logs/ (the directory must already exist) and is
    uploaded in full, under the 'logs/' key prefix, on every call.

    Args:
        s3_bucket: object exposing ``upload_file(local_path, key)``
            (a boto3 Bucket in this script).
        logfile (str): name of the log file.
        log (str): message to record.
        log_to_shell (bool): when True, the entry is also printed to stdout,
            preceded by a line of 100 asterisks.

    Returns:
        None.
    """
    entry = " : ".join([timestamp(), log]) + "\n"
    if log_to_shell:
        print("*" * 100 + "\n", entry)
    local_log_path = "./logs/" + logfile
    with open(local_log_path, 'a') as handle:
        handle.write(entry)
    s3_bucket.upload_file(local_log_path, 'logs/' + logfile)

# fonction d'extraction de feature
def VGG16_extracteur_spark(path, nom_image, model):
    """Turn one image file into its feature vector.

    Args:
        path: local directory containing the images, or the literal 'S3' to
            fetch the image from the 'input/' prefix of bucket BUCKET_NAME.
        nom_image: image path relative to `path`
            (example: 'Apple Braeburn/r_173_100.jpg').
        model: model whose predict() output is used as features
            (model_vgg16 or the value of bc_model_vgg16).

    Returns:
        list: the model output for the image converted to a Python list,
        followed by nom_image (4096 VGG16 dimensions + nom_image = 4097 items
        with the truncated VGG16 built in this script).
    """
    import tempfile  # local import: the script's import block is left untouched

    fichier_temporaire = None
    if path == 'S3':
        # a separate Bucket instance is needed on the driver and on the executors
        s3_bucket_vgg16 = boto3.resource('s3').Bucket(BUCKET_NAME)
        # One private temp file per call: a fixed path such as '/tmp/img' is
        # shared by every task running on the same node, so concurrent tasks
        # could overwrite each other's download before it is read.
        descripteur, fichier_temporaire = tempfile.mkstemp(prefix='img_')
        os.close(descripteur)
        path_nom_image = fichier_temporaire
    else:
        path_nom_image = os.path.join(path, nom_image)

    try:
        if fichier_temporaire is not None:
            s3_bucket_vgg16.download_file('input/' + nom_image, fichier_temporaire)
        # load the image (Pillow is called by TF)
        image = load_img(path_nom_image, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
    finally:
        # the pixels are in memory (or the call failed): drop the temp file
        if fichier_temporaire is not None:
            os.remove(fichier_temporaire)

    # reshape data for the model: add a leading batch axis of size 1
    image = image.reshape((1,) + image.shape)
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get extracted features
    im_features = model.predict(image)
    # convert to list and add nom_image
    im_features = im_features[0].tolist()
    im_features.append(nom_image)
    return im_features

# a separate Bucket instance is needed on the driver and on the executors
s3_bucket = boto3.resource('s3').Bucket(BUCKET_NAME)

# start logging
os.makedirs(os.path.join(os.getcwd(), 'logs/'), exist_ok=True)
logfile = "Log for job started " + timestamp() + ".txt"
logger_s3(s3_bucket, logfile, "logging starts")

# list of the images stored under "input/": megabatch_img
INPUT_PREFIX = "input/"
megabatch_img = []
for obj in s3_bucket.objects.filter(Prefix=INPUT_PREFIX):
    # Slice the prefix off: str.lstrip("input/") strips any leading characters
    # among i/n/p/u/t// and would mangle names such as "pineapple/1.jpg".
    nom_image = obj.key[len(INPUT_PREFIX):]
    # Skip folder placeholder keys ("input/" itself, or any key ending with "/")
    # instead of assuming the first listed key is the one to drop.
    if nom_image and not nom_image.endswith("/"):
        megabatch_img.append(nom_image)

# load model VGG16
model_vgg16 = VGG16()
# remove the output layer
model_vgg16 = Model(inputs=model_vgg16.inputs, outputs=model_vgg16.layers[-2].output)

# Spark context: no SparkConf here, the configuration is passed via spark-submit
sc = SparkContext()

# log the configuration of the Spark context that was started
logger_s3(s3_bucket, logfile, "Configuration de Spark :" + str(sc.getConf().getAll()))

# broadcasting
bc_model_vgg16 = sc.broadcast(model_vgg16)

# logging
logger_s3(s3_bucket, logfile, "spark job starts")
t0 = time()

# spark job: one feature vector per image, collected on the driver
resultat = sc.parallelize(megabatch_img) \
    .map(lambda img: VGG16_extracteur_spark('S3', img, bc_model_vgg16.value)) \
    .collect()

# logging
logger_s3(s3_bucket, logfile, f"spark job has ended (duration of {round(time() - t0, 1)}s) - output to S3 starts")

# output: one row per image, 4096 feature columns + the image path
df_output = pd.DataFrame(resultat)
df_output.columns = [f'dim_{i}' for i in range(4096)] + ['path']
file_name = f'df_output_{len(df_output)}_fruits.csv'
local_path = os.path.join(os.getcwd(), file_name)
df_output.to_csv(path_or_buf=local_path)
s3_bucket.upload_file(local_path, 'output/' + file_name)

# end logging
logger_s3(s3_bucket, logfile, "output to S3 has ended")

# Validation du .csv issu de S3

Il s'agit de montrer que le .csv final du projet 8 est bien celui attendu.

In [54]:
# bucket choice depends on which AWS region is available:
BUCKET_NAME = "projet8-oc"   # eu-west-1
# (alternative bucket in eu-west-3: "oc-projet-8")

NOM_CSV_FINAL = 'df_output_4987_fruits.csv'

# connect to the bucket
s3_bucket = boto3.resource('s3').Bucket(BUCKET_NAME)

# download the file into a local tmp/ directory
tmp_dir = os.path.join(dirpath, 'tmp/')
os.makedirs(tmp_dir, exist_ok=True)
csv_local_path = os.path.join(dirpath, f'tmp/{NOM_CSV_FINAL}')
s3_bucket.download_file(f'output/{NOM_CSV_FINAL}', csv_local_path)

# display the csv loaded as a pandas DataFrame
pd.read_csv(csv_local_path)

Unnamed: 0.1,Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,dim_11,dim_12,dim_13,dim_14,dim_15,dim_16,dim_17,dim_18,dim_19,dim_20,dim_21,dim_22,dim_23,dim_24,dim_25,dim_26,dim_27,dim_28,dim_29,dim_30,dim_31,dim_32,dim_33,dim_34,dim_35,dim_36,dim_37,dim_38,dim_39,dim_40,dim_41,dim_42,dim_43,dim_44,dim_45,dim_46,dim_47,dim_48,dim_49,dim_50,dim_51,dim_52,dim_53,dim_54,dim_55,dim_56,dim_57,dim_58,dim_59,dim_60,dim_61,dim_62,dim_63,dim_64,dim_65,dim_66,dim_67,dim_68,dim_69,dim_70,dim_71,dim_72,dim_73,dim_74,dim_75,dim_76,dim_77,dim_78,dim_79,dim_80,dim_81,dim_82,dim_83,dim_84,dim_85,dim_86,dim_87,dim_88,dim_89,dim_90,dim_91,dim_92,dim_93,dim_94,dim_95,dim_96,dim_97,dim_98,...,dim_3997,dim_3998,dim_3999,dim_4000,dim_4001,dim_4002,dim_4003,dim_4004,dim_4005,dim_4006,dim_4007,dim_4008,dim_4009,dim_4010,dim_4011,dim_4012,dim_4013,dim_4014,dim_4015,dim_4016,dim_4017,dim_4018,dim_4019,dim_4020,dim_4021,dim_4022,dim_4023,dim_4024,dim_4025,dim_4026,dim_4027,dim_4028,dim_4029,dim_4030,dim_4031,dim_4032,dim_4033,dim_4034,dim_4035,dim_4036,dim_4037,dim_4038,dim_4039,dim_4040,dim_4041,dim_4042,dim_4043,dim_4044,dim_4045,dim_4046,dim_4047,dim_4048,dim_4049,dim_4050,dim_4051,dim_4052,dim_4053,dim_4054,dim_4055,dim_4056,dim_4057,dim_4058,dim_4059,dim_4060,dim_4061,dim_4062,dim_4063,dim_4064,dim_4065,dim_4066,dim_4067,dim_4068,dim_4069,dim_4070,dim_4071,dim_4072,dim_4073,dim_4074,dim_4075,dim_4076,dim_4077,dim_4078,dim_4079,dim_4080,dim_4081,dim_4082,dim_4083,dim_4084,dim_4085,dim_4086,dim_4087,dim_4088,dim_4089,dim_4090,dim_4091,dim_4092,dim_4093,dim_4094,dim_4095,path
0,0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,7.707440,0.0,0.0,0.0,0.206265,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.816617,0.0,0.000000,0.00000,0.0,4.550093,3.883253,2.026253,0.466197,0.0,0.0,0.0,0.0,0.000000,0.000000,2.362326,8.676317,0.0,2.63152,0.0,0.408177,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,2.013056,0.000000,4.894269,0.0,0.0,0.000000,0.000000,2.791145,0.556839,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.115640,0.0,0.0,0.0,0.046922,0.0,0.0,0.0,0.0,3.428858,0.000000,0.0,3.226360,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,2.884595,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,7.052612,0.244668,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,3.617070,0.0,2.405461,0.0,0.0,0.0,2.156476,1.976058,0.000000,5.916898,0.631697,0.0,0.0,0.000000,0.0,3.189957,0.000000,0.0,0.000000,0.000000,1.533861,0.0,3.382356,0.000000,0.0,8.087275,4.379069,1.486008,0.0,0.0,1.597372,4.910885,0.0,0.0,0.0,0.0,2.001863,0.0,5.759784,0.0,0.000000,1.496690,0.389178,0.0,1.216436,0.0,0.0,0.0,4.726130,0.0,2.063914,0.368924,0.000000,0.0,1.06749,2.895292,0.0,0.000000,0.0,0.000000,0.0,1.353368,0.0,0.896136,0.0,0.000000,0.0,1.189074,0.940704,0.000000,3.395598,0.258413,1.812144,0.0,Apple Braeburn/0_100.jpg
1,1,0.0,0.000000,0.000000,1.860842,0.000000,0.0,6.156754,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.806051,0.000000,0.217776,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,1.398381,0.00000,0.0,3.199073,1.177544,3.198128,1.349004,0.0,0.0,0.0,0.0,0.000000,0.000000,2.409419,6.927777,0.0,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,2.793935,0.000000,5.099165,0.0,0.0,0.643208,0.000000,1.985425,0.000000,0.0,0.174190,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.027436,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.725480,0.000000,0.0,1.276844,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,2.622023,0.0,0.453474,1.895137,0.000000,0.929289,0.0,0.0,0.0,0.000000,0.0,0.787319,0.0,0.0,0.0,4.740569,0.327348,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,3.757420,0.0,3.416483,0.0,0.0,0.0,1.224865,0.139760,0.000000,4.121590,0.486960,0.0,0.0,0.418387,0.0,0.000000,0.000000,0.0,0.000000,0.508060,0.000000,0.0,1.320193,0.000000,0.0,5.536569,2.623333,2.039082,0.0,0.0,1.725121,2.202492,0.0,0.0,0.0,0.0,0.483147,0.0,5.102569,0.0,0.000000,0.000000,0.000000,0.0,0.794049,0.0,0.0,0.0,2.813943,0.0,2.177521,1.616907,0.000000,0.0,0.00000,0.712588,0.0,0.000000,0.0,0.000000,0.0,1.635242,0.0,0.000000,0.0,0.500092,0.0,0.000000,0.772667,0.000000,5.774760,0.000000,1.141268,0.0,Apple Braeburn/100_100.jpg
2,2,0.0,0.000000,0.000000,2.629139,0.000000,0.0,5.248846,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.821227,0.000000,0.470393,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,1.575772,0.00000,0.0,2.734585,2.083382,3.096009,1.044707,0.0,0.0,0.0,0.0,0.000000,0.000000,2.450672,6.867932,0.0,0.00000,0.0,0.720028,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,3.570728,0.000000,5.327176,0.0,0.0,0.422214,0.000000,1.460718,0.000000,0.0,0.454584,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.032613,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.882505,0.000000,0.0,1.121397,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.019309,2.230622,0.0,0.459137,1.854128,0.000000,1.266425,0.0,0.0,0.0,0.000000,0.0,0.788662,0.0,0.0,0.0,4.825982,0.850352,0.000000,0.000000,0.000000,0.000000,0.0,0.228986,0.000000,3.696232,0.0,3.451251,0.0,0.0,0.0,0.954569,0.000000,0.000000,3.664170,1.234787,0.0,0.0,0.774301,0.0,0.000000,0.000000,0.0,0.000000,0.517520,0.000000,0.0,1.273251,0.000000,0.0,5.159644,2.693308,1.937661,0.0,0.0,1.400888,2.242710,0.0,0.0,0.0,0.0,1.213556,0.0,3.905993,0.0,0.000000,0.000000,0.256908,0.0,0.917110,0.0,0.0,0.0,2.967221,0.0,2.394893,1.705273,0.000000,0.0,0.00000,1.155093,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.682496,0.0,0.000000,0.576677,0.000000,5.988881,0.117657,0.927160,0.0,Apple Braeburn/101_100.jpg
3,3,0.0,0.000000,0.000000,2.782533,0.000000,0.0,5.570585,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,2.308554,0.000000,0.344396,0.0,0.897336,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,1.298252,0.00000,0.0,1.868551,1.540524,3.848393,0.716403,0.0,0.0,0.0,0.0,0.258332,0.000000,1.963872,6.002176,0.0,0.00000,0.0,0.391242,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,4.169444,0.000000,4.912257,0.0,0.0,0.000000,0.000000,2.681197,0.000000,0.0,0.578973,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.455425,0.0,0.0,0.0,0.009620,0.0,0.0,0.0,0.0,0.640726,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.537483,1.699488,0.0,0.000000,2.227765,0.000000,1.132675,0.0,0.0,0.0,0.000000,0.0,1.063481,0.0,0.0,0.0,5.014203,1.084627,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,4.053185,0.0,2.721104,0.0,0.0,0.0,0.895413,0.006683,0.000000,3.477646,1.030381,0.0,0.0,0.788542,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.673434,0.000000,0.0,5.156811,1.980933,1.820636,0.0,0.0,1.677066,2.324290,0.0,0.0,0.0,0.0,1.168684,0.0,3.820113,0.0,0.000000,0.000000,0.200013,0.0,0.755543,0.0,0.0,0.0,2.435991,0.0,2.290510,1.964813,0.000000,0.0,0.00000,1.101207,0.0,0.000000,0.0,0.000000,0.0,0.511846,0.0,0.141531,0.0,0.341045,0.0,0.000000,0.834506,0.404473,5.902613,0.293886,0.592740,0.0,Apple Braeburn/102_100.jpg
4,4,0.0,0.000000,0.000000,2.707351,0.000000,0.0,5.719396,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,2.686819,0.000000,0.000000,0.0,0.310528,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,1.683410,0.00000,0.0,2.213417,1.675967,3.641191,1.417987,0.0,0.0,0.0,0.0,0.364581,0.000000,2.438608,6.367381,0.0,0.00000,0.0,0.211699,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,3.849742,0.000000,5.821661,0.0,0.0,0.103918,0.000000,1.633499,0.000000,0.0,1.112808,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.408233,0.0,0.0,0.0,0.126704,0.0,0.0,0.0,0.0,0.462471,0.000000,0.0,0.135078,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.226224,2.269430,0.0,0.366624,2.042130,0.000000,0.789015,0.0,0.0,0.0,0.000000,0.0,0.394314,0.0,0.0,0.0,5.123999,0.827606,0.000000,0.000000,0.000000,0.000000,0.0,0.078276,0.000000,4.439198,0.0,3.277715,0.0,0.0,0.0,1.438671,0.537503,0.000000,3.619257,1.066671,0.0,0.0,1.274524,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.040659,0.000000,0.0,5.085872,2.331032,1.847747,0.0,0.0,1.463872,2.566049,0.0,0.0,0.0,0.0,1.056402,0.0,3.194069,0.0,0.000000,0.000000,0.000000,0.0,1.185935,0.0,0.0,0.0,2.945217,0.0,2.764750,1.673033,0.000000,0.0,0.00000,1.138748,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.394632,0.000000,6.038689,0.000000,0.846206,0.0,Apple Braeburn/103_100.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4982,4982,0.0,1.135720,0.075085,4.077478,0.349203,0.0,2.672222,0.0,0.0,0.0,0.000000,0.0,0.675173,0.661218,0.000000,0.000000,0.069069,1.899345,0.0,0.000000,0.0,0.0,0.0,0.064773,3.553955,0.000000,1.330065,0.777377,0.512536,1.782941,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,7.506771,0.000000,0.108315,0.000000,0.0,0.0,0.0,0.0,0.000000,7.132561,0.000000,4.587662,0.0,0.00000,0.0,0.000000,0.669433,1.416281,6.038474,0.0,1.592174,0.0,0.0,5.734343,1.849386,1.967583,0.0,0.0,3.197520,0.907054,0.000000,2.768105,0.0,3.412900,0.0,0.0,0.379476,0.0,0.0,0.0,1.464730,0.0,0.649024,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.061285,1.716263,0.0,0.000000,2.832537,0.0,0.000000,0.0,0.0,0.0,...,2.922280,4.433890,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,12.371401,0.000000,3.951100,4.855664,3.956212,0.585304,0.0,0.974294,4.365596,0.000000,0.0,0.443223,0.0,0.0,0.0,4.077805,1.164714,6.377956,0.000000,0.000000,0.0,0.0,0.546994,0.0,1.404378,5.021410,0.0,0.000000,0.356728,6.170701,0.0,5.830205,1.826298,0.0,2.633169,3.005190,0.000000,0.0,0.0,0.000000,4.465318,0.0,0.0,0.0,0.0,0.000000,0.0,2.267470,0.0,0.000000,1.352523,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,4.065494,0.0,0.000000,0.0,0.244029,0.0,0.000000,0.0,4.682040,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.211337,5.613244,0.0,Cauliflower/r_96_100.jpg
4983,4983,0.0,1.677208,0.000000,3.082828,0.752789,0.0,1.391753,0.0,0.0,0.0,0.000000,0.0,1.569207,2.071315,0.000000,0.000000,0.000000,1.622995,0.0,0.000000,0.0,0.0,0.0,0.000000,3.659100,0.289655,0.343175,1.292939,0.432749,1.489007,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,7.738481,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,7.474264,0.000000,3.443918,0.0,0.00000,0.0,0.000000,0.511685,1.595558,6.216537,0.0,2.263904,0.0,0.0,5.621294,2.171186,2.376347,0.0,0.0,3.558488,0.694918,0.000000,2.278404,0.0,3.239506,0.0,0.0,0.000000,0.0,0.0,0.0,1.150541,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.479288,1.166091,0.0,0.000000,2.833055,0.0,0.000000,0.0,0.0,0.0,...,3.745840,4.574128,0.0,0.000000,0.336018,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,12.294601,0.000000,4.566935,4.517511,4.310040,0.926612,0.0,1.206479,4.864810,0.000000,0.0,0.483527,0.0,0.0,0.0,4.334904,1.242146,7.069700,0.000000,0.021508,0.0,0.0,0.557093,0.0,1.581484,5.556756,0.0,0.061029,0.000000,5.943589,0.0,5.576692,1.575464,0.0,3.625865,2.749603,0.000000,0.0,0.0,0.000000,4.974060,0.0,0.0,0.0,0.0,0.000000,0.0,2.622629,0.0,0.000000,2.586429,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,2.448918,0.0,0.000000,0.0,0.020859,0.0,0.000000,0.0,4.585892,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.050244,0.842635,6.106942,0.0,Cauliflower/r_97_100.jpg
4984,4984,0.0,1.384819,0.000000,3.600247,1.202142,0.0,1.516171,0.0,0.0,0.0,0.000000,0.0,1.651822,0.927463,0.184091,0.000000,0.000000,2.120744,0.0,0.000000,0.0,0.0,0.0,0.000000,3.459838,0.000000,0.000000,0.481766,0.242049,1.583516,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,6.577733,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,7.456744,0.000000,3.854496,0.0,0.00000,0.0,0.000000,0.934040,1.063455,6.014441,0.0,1.693111,0.0,0.0,6.347840,1.807941,2.823799,0.0,0.0,3.189009,1.151786,0.000000,2.329881,0.0,3.623349,0.0,0.0,0.000000,0.0,0.0,0.0,1.535123,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.097861,0.313926,0.0,0.000000,2.703346,0.0,0.000000,0.0,0.0,0.0,...,2.765606,4.952980,0.0,0.000000,0.550600,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,13.593579,0.000000,3.655057,4.582375,3.974208,0.000000,0.0,0.303905,5.256853,0.000000,0.0,0.000000,0.0,0.0,0.0,3.565377,0.952323,6.442844,0.000000,0.000000,0.0,0.0,0.000000,0.0,1.192557,4.753700,0.0,0.000000,0.000000,6.155531,0.0,5.907612,1.807116,0.0,2.414168,3.152674,0.000000,0.0,0.0,0.000000,4.199926,0.0,0.0,0.0,0.0,0.000000,0.0,2.634205,0.0,0.000000,2.561550,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,4.259901,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,5.132056,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.748169,5.636731,0.0,Cauliflower/r_98_100.jpg
4985,4985,0.0,1.220244,0.117396,3.198638,0.643887,0.0,1.600079,0.0,0.0,0.0,0.000000,0.0,1.129378,1.332653,0.000000,0.000000,0.000000,1.679977,0.0,0.000000,0.0,0.0,0.0,0.000000,3.427257,0.000000,0.661659,0.372514,1.296367,2.451314,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,7.016243,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,8.314436,0.000000,3.402091,0.0,0.00000,0.0,0.000000,0.931786,1.669306,6.422529,0.0,1.515875,0.0,0.0,5.919014,2.311400,2.382618,0.0,0.0,3.453100,1.440666,0.000000,2.158053,0.0,3.640436,0.0,0.0,0.347761,0.0,0.0,0.0,1.323434,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.866562,0.715301,0.0,0.000000,1.969208,0.0,0.000000,0.0,0.0,0.0,...,3.366948,4.915794,0.0,0.000000,0.224540,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,12.324314,0.000000,3.912486,4.776536,3.854800,0.102583,0.0,1.274272,4.533993,0.000000,0.0,0.000000,0.0,0.0,0.0,3.692292,1.273191,6.427769,0.000000,0.055849,0.0,0.0,0.561004,0.0,1.034724,4.437870,0.0,0.000000,0.000000,7.008589,0.0,5.656385,1.638303,0.0,2.657717,3.091187,0.000000,0.0,0.0,0.000000,4.457010,0.0,0.0,0.0,0.0,0.000000,0.0,2.978399,0.0,0.000000,2.095099,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,3.702222,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,5.362126,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.130129,0.503035,5.950613,0.0,Cauliflower/r_99_100.jpg
