# Objective
Set-up a first big data architecture using AWS products (Mobile application with a fruit pictures classifier engine)

# Data
Link to upload data: https://www.kaggle.com/moltean/fruits

# Table of contents <a class="anchor" id="chapter0"></a> 
* [Preparation](#chapter1)
    * [Import packages](#sub1_1)
    * [Declare constants](#sub1_2)
    * [Get picture information](#sub1_3)
    * [Explore picture information](#sub1_4)
    * [Get class information](#sub1_5) 
    * [Target label encoding](#sub1_6)
* [Feature engineering](#chapter3)
    * [Generate descriptors on Training dataset](#sub3_1)
    * [Cluster's descriptors](#sub3_2)
    * [Compute frequency histogram on clusters' descriptors](#sub3_3)
    * [Reduce dimension with PCA](#sub3_4)
* [Modelling](#chapter4)
    * [Train a KNN model](#sub4_1)
    * [Check learning curve](#sub4_2)
    * [Predict and compare prediction to reality on Test dataset](#sub4_3)
    * [Predict and compare prediction to reality on Validation dataset](#sub4_4)
* [Go to End](#chapter100)

# Preparation <a class="anchor" id="chapter1"></a>

## Import packages <a class="anchor" id="sub1_1"></a>

In [1]:
import P8_02_module as MyMod

import numpy as np
import pandas as pd 

import os
import glob
import time

import matplotlib.pyplot as plt
from matplotlib.image import imread

from cv2 import cv2
import PIL
from PIL import Image, ImageDraw, ImageOps, ImageFilter

from sklearn import cluster
from sklearn import decomposition
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve
from sklearn.utils import shuffle

import findspark

from pyspark import SparkContext

import boto3

## Declare constants <a class="anchor" id="sub1_2"></a>

In [2]:
# Sample volume limitations
#GET_PICTURES_SAMPLE_SIZE = 50
GET_PICTURES_NB_PER_CLASS = 2
#TRAINING_SAMPLE_SIZE = 5000
#TEST_SAMPLE_SIZE = 5000
#VALIDATION_SAMPLE_SIZE = 5000

# KMeans hyper-param
KMEANS_N_CLUSTERS = 90

# Pictures repository
#PERSO_REP = "C:\\Users\\BNP Leasing\\3D Objects\\Soutenance P8\\fruits-perso\\"
#PROJECT_REP = "C:\\Users\\BNP Leasing\\3D Objects\\Soutenance P8\\fruits-360-original-size\\"

BUCKET_NAME = "moncompartimentamoi"

## Get picture information <a class="anchor" id="sub1_3"></a>

In [3]:
# For AWS deb________
# Laurence: à commenter
findspark.init("C:\spark\spark-3.2.1-bin-hadoop3.2")

# Instantiate a SparkContext
sc = SparkContext()

print(sc.version)

# Instantiate S3 client
s3_client = boto3.client('s3', region_name='eu-west-3')

s3_resource = boto3.resource('s3')

s3_bucket = s3_resource.Bucket(BUCKET_NAME)

# For AWS fin________

3.2.1


In [4]:
# Build Dataframe and transfer pictures from local repository to AWS S3

path = '../fruits-360-original-size/'

# Get Dataset names, Target class names and Picture names
df_main = pd.DataFrame(columns = ['FullFileName', 'Dataset', 'Target', 'Picture', 'FileSize (in KB)']) 
target_prec = ""

for file in glob.iglob(path+'**/*.jpg', recursive = True):
    
    lst = file.split('\\')
    
    # Limit number of pictures per class(GET_PICTURES_NB_PER_CLASS)
    if lst[2] == target_prec: 
        i += 1
    else:
        i = 0
    target_prec = lst[2]
    
    if (lst[1] == 'Training' and i < 2*GET_PICTURES_NB_PER_CLASS) or \
                    (lst[1] != 'Training' and i < GET_PICTURES_NB_PER_CLASS):     
        
        # update DataFrame
        lst.append(os.path.getsize(file) / 1024)  # in KBytes    
        lst[0] = lst[0] + "/" + lst[1] + "/" + lst[2] + "/" + lst[3]
        df_main.loc[len(df_main)] = lst
                
        # For AWS deb________
        # upload the image in the AWS S3 bucket  
        s3_client.upload_file(lst[0], BUCKET_NAME, lst[1]+"/"+lst[2]+"/"+lst[3])
        # For AWS fin________
        
df_main.head(20)

Unnamed: 0,FullFileName,Dataset,Target,Picture,FileSize (in KB)
0,../fruits-360-original-size/Test/apple_6/r0_10...,Test,apple_6,r0_103.jpg,15.641602
1,../fruits-360-original-size/Test/apple_6/r0_10...,Test,apple_6,r0_107.jpg,15.830078
2,../fruits-360-original-size/Test/apple_braebur...,Test,apple_braeburn_1,r0_103.jpg,37.476562
3,../fruits-360-original-size/Test/apple_braebur...,Test,apple_braeburn_1,r0_107.jpg,37.428711
4,../fruits-360-original-size/Test/apple_crimson...,Test,apple_crimson_snow_1,r0_103.jpg,59.035156
5,../fruits-360-original-size/Test/apple_crimson...,Test,apple_crimson_snow_1,r0_107.jpg,58.818359
6,../fruits-360-original-size/Test/apple_golden_...,Test,apple_golden_1,r0_103.jpg,39.104492
7,../fruits-360-original-size/Test/apple_golden_...,Test,apple_golden_1,r0_107.jpg,40.859375
8,../fruits-360-original-size/Test/apple_golden_...,Test,apple_golden_2,r0_103.jpg,32.066406
9,../fruits-360-original-size/Test/apple_golden_...,Test,apple_golden_2,r0_107.jpg,32.089844


In [5]:
# Build full file names
#Laurence: df_main['FullFileName'] = df_main['FullFileName']+'/'+df_main['Dataset']+'/'+df_main['Target']+'/'+df_main['Picture']  
df_main

Unnamed: 0,FullFileName,Dataset,Target,Picture,FileSize (in KB)
0,../fruits-360-original-size/Test/apple_6/r0_10...,Test,apple_6,r0_103.jpg,15.641602
1,../fruits-360-original-size/Test/apple_6/r0_10...,Test,apple_6,r0_107.jpg,15.830078
2,../fruits-360-original-size/Test/apple_braebur...,Test,apple_braeburn_1,r0_103.jpg,37.476562
3,../fruits-360-original-size/Test/apple_braebur...,Test,apple_braeburn_1,r0_107.jpg,37.428711
4,../fruits-360-original-size/Test/apple_crimson...,Test,apple_crimson_snow_1,r0_103.jpg,59.035156
...,...,...,...,...,...
187,../fruits-360-original-size/Validation/pear_3/...,Validation,pear_3,r0_101.jpg,71.865234
188,../fruits-360-original-size/Validation/zucchin...,Validation,zucchini_1,r0_1.jpg,43.950195
189,../fruits-360-original-size/Validation/zucchin...,Validation,zucchini_1,r0_101.jpg,50.963867
190,../fruits-360-original-size/Validation/zucchin...,Validation,zucchini_dark_1,r0_1.jpg,33.743164


## Explore picture information <a class="anchor" id="sub1_4"></a>

### Assess volumes and modalities

In [None]:
df_main.describe()

 > 12 455 pictures, 24 target classes, 3 datasets  
 > 958 picture names mean that some pictures have the same name and are not classified in the same repository

### Count pictures by target class and dataset

In [None]:
pd.DataFrame(df_main.groupby(['Target', 'Dataset'])['Picture'].count())

### Count pictures by target class

In [None]:
pd.DataFrame(df_main.groupby(['Target'])['Picture'].count())

### Count dataset modality by picture name

In [None]:
df_dataset_mod = pd.DataFrame(df_main.groupby(['Picture'])['Dataset'].nunique())
df_dataset_mod.rename(columns={'Dataset':'Dataset_mod'}, inplace=True)
len(df_dataset_mod[df_dataset_mod['Dataset_mod'] > 1])

> No file with the same name in the different datasets

### Count target class modality by picture name

In [None]:
df_target_mod = pd.DataFrame(df_main.groupby(['Picture'])['Target'].nunique())
df_target_mod.rename(columns={'Target':'Target_mod'}, inplace=True)
df_target_mod[df_target_mod['Target_mod'] > 1]

In [None]:
df_target_mod = df_target_mod.reset_index(drop=False)

In [None]:
pd.DataFrame(df_target_mod.groupby(['Target_mod'])['Picture'].count())

 > many files with the same name in the different target classes   
 > for instance, 156 files have the same name and appear in 24 different target class

In [None]:
df_main[df_main['Picture'] == 'r0_0.jpg'][['Picture', 'Target', 'Dataset']]

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[20])
plt.imshow(pict)
plt.show()

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[40])
plt.imshow(pict)
plt.show()

 > File name format: r?_image_index.jpg (e.g. r0_31.jpg or r1_12.jpg)  
 > "r?" stands for rotation axis (first one is r0)

### Distinguish rotation axis and index

In [None]:
# Laurence: supprimler les FutureWarning
df_main["Rotation"], df_main["Index"] = df_main["Picture"].str.split("_", 1).str
df_main["Rotation"] = df_main["Rotation"].str.replace('r','')
df_main["Index"] = df_main["Index"].str.replace('.jpg','')
df_main

In [None]:
pd.DataFrame(df_main["Rotation"].unique())

In [None]:
pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '0'].head(1).index[0]])
plt.imshow(pict)
plt.show()

In [None]:
'''pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '1'].head(1).index[0]])
plt.imshow(pict)
plt.show()'''

In [None]:
'''pict = Image.open(df_main['FullFileName'].iloc[df_main[df_main['Rotation'] == '2'].head(1).index[0]])
plt.imshow(pict)
plt.show()'''

 > 0 - queue top or down > rotation around the z-axis  
 > 1 - queue behind or ahead > rotation around the x-axis  
 > 2 - queue left or right > rotation around the y-axis  

### Target class count distribution

In [None]:
def distribution(df_in, dataset):
    
    df = df_in.copy()
    
    if dataset in ['Training', 'Test', 'Validation']:
        df.drop(df[df['Dataset'] != dataset].index, inplace=True)
    elif dataset != '*':
        print("dataset argument should be 'Training', 'Test', 'Validation' or '*'")
        return -1
        
    df_distrib = pd.DataFrame(df.groupby(['Target'])['Picture'].count())
    df_distrib.reset_index(drop=False, inplace=True)
    df_distrib.rename(columns={'Picture':'Picture count', 'Target':'Class'}, inplace=True)
    df_distrib = df_distrib.sort_values(by='Class', ascending=False)    

    if len(df_distrib) == 0: return -1
    
    df_distrib.plot.barh(x='Class', y='Picture count', figsize=(12, 10))    
    
    return 1

In [None]:
ret = distribution(df_main, 'Training')
ret = distribution(df_main, 'Test')
ret = distribution(df_main, 'Validation')
ret = distribution(df_main, '*')

### Target class filesize average distribution  
Logitech C920 camera and dedicated algorithm which extract the fruit from the background

In [None]:
pd.DataFrame(df_main.groupby('Target')['FileSize (in KB)'].mean())

In [None]:
pd.DataFrame(df_main.groupby('Dataset')['FileSize (in KB)'].mean())

In [None]:
df_main['FileSize (in KB)'].mean()

## Get class information <a class="anchor" id="sub1_5"></a>

In [None]:
path = '../fruits-360-original-size/Meta/'

df_class_add = pd.DataFrame(columns = ['PathName', 'Target', 'TxtName'])
df_meta = pd.DataFrame(columns = ['Flag', 'Value'])

for file in glob.iglob(path+'**/info.txt', recursive = True):
    
    df_meta_add = pd.read_csv(file, sep="=", names=['Flag', 'Value'])
        
    df_class_add.loc[0] = file.split('\\')   
        
    df_meta = pd.concat([df_class_add.join(df_meta_add, how='cross'), df_meta])

del df_class_add, df_meta_add

df_meta.drop(columns=['PathName', 'TxtName'], inplace=True)
df_meta = df_meta.sort_values(['Target', 'Flag'], ascending=True)
df_meta.reset_index(drop=True, inplace=True)
df_meta

In [None]:
df_meta['Flag'].nunique()

In [None]:
pd.DataFrame(df_meta['Flag'].unique(), columns=['Flag']).sort_values(by='Flag')

## Target label encoding  <a class="anchor" id="sub1_6"></a>

In [None]:
df_main, df_target_mapping = MyMod.encode_LabelEncoder(df_main, 'Target')
df_main.head(5)

In [None]:
df_meta, df_target_mapping = MyMod.encode_LabelEncoder(df_meta, 'Target')
df_meta.head(5)

In [None]:
df_target_mapping

# Feature engineering <a class="anchor" id="chapter3"></a> 

## Generate descriptors on Training dataset <a class="anchor" id="sub3_1"></a>

In [None]:
'''def dataset_reduction(df, SAMPLE_SIZE):
    # Initiate index
    ind_sample = np.arange(len(df))
    # Shuffle
    ind_sample = shuffle(ind_sample, random_state=42)
    # Select SAMPLE_SIZE first index  
    ind_sample = ind_sample[:SAMPLE_SIZE]
    
    # Check balanced dataset over 'Target', 'Rotation' features
    # Prepare resulting dataframe
    df_ret = df.iloc[ind_sample] 
    # Reset index
    df_ret.reset_index(drop=True, inplace=True)
    
    return df_ret'''

In [None]:
df_main_training = df_main[df_main['Dataset'] == 'Training']
#df_main_training = dataset_reduction(df_main_training, TRAINING_SAMPLE_SIZE)
df_main_training

In [None]:
def desc_extraction(df_main_training):
    
    # Create SIFT descriptor
    sift = cv2.SIFT_create()

    # Initiate the resulting dataframe for key points descriptors
    df_kpdesc_training = pd.DataFrame()

    for pict_num in range(len(df_main_training)):

        # Print treatment progress each 100 pictures
        if pict_num%100 == 0 : print(pict_num)

        # Open the picture
        pict = Image.open(df_main_training['FullFileName'].iloc[pict_num])

        # Compute key points and picture descriptors
        # descript: numpy array with one line by interest point, 128 columns
        keypoints, descript = sift.detectAndCompute(np.array(pict), None)

        # Enrich df_kpdesc_training with FullFileName
        df_pict_name = df_main_training[df_main_training.index == pict_num][['FullFileName']]        
        df_kpdesc = df_pict_name.merge(pd.DataFrame(descript), how='cross')

        # Concatenate new samples to the resulting dataframe
        df_kpdesc_training = pd.concat([df_kpdesc_training, df_kpdesc])

    # Reset indexation
    df_kpdesc_training.reset_index(drop=True, inplace=True)
    
    return df_kpdesc_training

In [None]:
# Save starting time
time_start=time.time()

df_kpdesc_training = desc_extraction(df_main_training)

# Compute time elapse
elapse_s = round(time.time()-time_start, 0)
elapse_m = round(elapse_s / 60, 2)
print()
print('Time elapse with SIFT descriptor : {} seconds ({} minutes)'.format(elapse_s, elapse_m))

print()
print("Descriptor dataframe shape : ", df_kpdesc_training.shape)

df_kpdesc_training.head(5)

# Extration des descripteurs : jpg sur S3 > descripteurs

#Though SparkContext used to be an entry point prior to 2.0, it is not completely replaced with SparkSession,
#many features of SparkContext are still available and used in Spark 2.0 and later. 
#You should also know that SparkSession internally creates SparkConfig and SparkContext with the configuration 
#provided with SparkSession.

In [11]:
#Fait plus haut

#BUCKET_NAME = "moncompartimentamoi"

#sc = SparkContext()
#print(sc.version)

#s3_client = boto3.client('s3', region_name='eu-west-3')
#s3_client.upload_file(lst[0], BUCKET_NAME, lst[1]+"/"+lst[2]+"/"+lst[3])

#s3_resource = boto3.resource('s3')
#s3_bucket = s3_resource.Bucket(BUCKET_NAME)

In [25]:
def read_image_from_s3(key):
    """Load image file from s3.

    Parameters
    ----------
    key : string           Path in s3

    Returns
    -------
    np array               Image array
    """   
    #object = s3_bucket.Object(key)
    #response = object.get()
    #file_stream = response['Body']    
    #im = Image.open(file_stream)
    
    return np.array(Image.open(s3_bucket.Object(key).get()['Body']))

"reprendre là"  
Chargement et copie d’objets à l’aide d’un chargement partitionné?  
Multithreading or multiprocessing with sessions?  
How to read image file from S3 bucket directly into memory?  

In [54]:
# Create SIFT descriptor
sift = cv2.SIFT_create()
    
# Loop on .jpg pictures in AWS S3 Bucket
for obj in s3_bucket.objects.all():      # Ne prendre que les .jpg !!!!
    
    key = obj.key
    
    print(key) 
    
    if key.endswith('jpg'):        
        print(key)
        
        # Read picture from AWS S3 Bucket
        pict = np.array(Image.open(s3_bucket.Object(key).get()['Body']))

        # Compute key points and picture descriptors (descript: numpy array with one line by interest point, 128 columns)
        keypoints, descript = sift.detectAndCompute(pict, None)
        
        pd.DataFrame(descript)    

MonPrefixe/MonRapport/20220301-20220401/20220308T142121Z/MonRapport-00001.csv.zip
MonPrefixe/MonRapport/20220301-20220401/20220308T142121Z/MonRapport-Manifest.json
MonPrefixe/MonRapport/20220301-20220401/MonRapport-Manifest.json
Test/apple_6/r0_103.jpg
Test/apple_6/r0_103.jpg
Test/apple_6/r0_107.jpg
Test/apple_6/r0_107.jpg
Test/apple_braeburn_1/r0_103.jpg
Test/apple_braeburn_1/r0_103.jpg
Test/apple_braeburn_1/r0_107.jpg
Test/apple_braeburn_1/r0_107.jpg
Test/apple_crimson_snow_1/r0_103.jpg
Test/apple_crimson_snow_1/r0_103.jpg
Test/apple_crimson_snow_1/r0_107.jpg
Test/apple_crimson_snow_1/r0_107.jpg
Test/apple_golden_1/r0_103.jpg
Test/apple_golden_1/r0_103.jpg
Test/apple_golden_1/r0_107.jpg
Test/apple_golden_1/r0_107.jpg
Test/apple_golden_2/r0_103.jpg
Test/apple_golden_2/r0_103.jpg
Test/apple_golden_2/r0_107.jpg
Test/apple_golden_2/r0_107.jpg
Test/apple_golden_3/r0_103.jpg
Test/apple_golden_3/r0_103.jpg
Test/apple_golden_3/r0_107.jpg
Test/apple_golden_3/r0_107.jpg
Test/apple_granny_smith

Training/eggplant_violet_1/r0_0.jpg
Training/eggplant_violet_1/r0_0.jpg
Training/eggplant_violet_1/r0_10.jpg
Training/eggplant_violet_1/r0_10.jpg
Training/eggplant_violet_1/r0_100.jpg
Training/eggplant_violet_1/r0_100.jpg
Training/eggplant_violet_1/r0_102.jpg
Training/eggplant_violet_1/r0_102.jpg
Training/pear_1/r0_0.jpg
Training/pear_1/r0_0.jpg
Training/pear_1/r0_10.jpg
Training/pear_1/r0_10.jpg
Training/pear_1/r0_100.jpg
Training/pear_1/r0_100.jpg
Training/pear_1/r0_102.jpg
Training/pear_1/r0_102.jpg
Training/pear_3/r0_0.jpg
Training/pear_3/r0_0.jpg
Training/pear_3/r0_10.jpg
Training/pear_3/r0_10.jpg
Training/pear_3/r0_100.jpg
Training/pear_3/r0_100.jpg
Training/pear_3/r0_102.jpg
Training/pear_3/r0_102.jpg
Training/zucchini_1/r0_0.jpg
Training/zucchini_1/r0_0.jpg
Training/zucchini_1/r0_10.jpg
Training/zucchini_1/r0_10.jpg
Training/zucchini_1/r0_100.jpg
Training/zucchini_1/r0_100.jpg
Training/zucchini_1/r0_102.jpg
Training/zucchini_1/r0_102.jpg
Training/zucchini_dark_1/r0_0.jpg
Traini

In [None]:
from pyspark.sql import SparkSession

# Instantiate SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark create RDD example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [16]:
#s3_url = "s3a://moncompartimentamoi/Test/apple_6/*"
s3_url = "https://moncompartimentamoi.s3.eu-west-3.amazonaws.com/Test/apple_6/r0_103.jpg"

df = spark.read.format("image").load(s3_url)

print((df.count(), len(df.columns)))
print(df.printSchema())

df.select('image.nChannels', "image.width", "image.height", "image.data").show(truncate=True)

Py4JJavaError: An error occurred while calling o34.load.
: java.lang.UnsupportedOperationException
	at org.apache.hadoop.fs.http.AbstractHttpFileSystem.listStatus(AbstractHttpFileSystem.java:94)
	at org.apache.hadoop.fs.http.HttpsFileSystem.listStatus(HttpsFileSystem.java:23)
	at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225)
	at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85)
	at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69)
	at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:158)
	at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:131)
	at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:94)
	at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:66)
	at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:565)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:409)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:188)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)


In [14]:
# Create PySpark RDD (Resilient Distributed Dataset)  from .jpg file
from pyspark.ml.image import ImageSchema

test = ImageSchema.readImages(s3_bucket) 
#"Training/apple_6/")

AttributeError: '_ImageSchema' object has no attribute 'readImages'

In [10]:
def load_picture(path):
    test = ImageSchema.toNDArray(path)   # TypeError: image argument should be pyspark.sql.types.Row; however, it got [<class 'str'>].
    #test = ImageSchema.readImages(path) # AttributeError: '_ImageSchema' object has no attribute 'readImages'
    return test

print(load_picture("Training/apple_6/"))

TypeError: image argument should be pyspark.sql.types.Row; however, it got [<class 'str'>].

In [None]:
#df = spark.read.format('com.databricks.spark.csv').\
#            options(header='true', inferschema='true').\
#            load("/home/feng/Spark/Code/data/Advertising.csv", header=True)

In [None]:
#from pyspark.sql import Row

#def load_dataframe(path):
#    rdd = sc.textFile(path)\
#        .map(lambda line: line.split())\
#        .map(lambda words: Row(label=words[0], words=words[1:]))
#    return spark.createDataFrame(rdd)

## Clusters' descriptors <a class="anchor" id="sub3_2"></a>

In [None]:
# Save starting time
time_start = time.time()

# Create KMeans clustering model
kmeans = cluster.KMeans(n_clusters=KMEANS_N_CLUSTERS, random_state=42) 

# Train and predict using KMeans clustering model
df_kpdesc_training = pd.concat([df_kpdesc_training, \
    pd.DataFrame(kmeans.fit_predict(df_kpdesc_training[df_kpdesc_training.columns[1:]].values), \
                 columns=['Desc_cluster'])], axis=1)

# Compute time elapse
elapse_s = time.time()-time_start
elapse_m = int(elapse_s / 60)
print('KMeans {} clusters done! Time elapsed: {} seconds ({} minutes)'.format(KMEANS_N_CLUSTERS, elapse_s, elapse_m))

# Number of iterations run et Coordinates of cluster centers
print("Case {} clusters: Converge after {} iterations"\
      .format(kmeans.cluster_centers_.shape[0], kmeans.n_iter_)) 

print()
print("Descriptor dataframe shape : ", df_kpdesc_training.shape)

df_kpdesc_training

## Compute frequency histogram on clusters' descriptors <a class="anchor" id="sub3_3"></a>
samples: pictures x features: clusters

In [None]:
def histo_freq(df_kpdesc_training):
    
    # Use index to count
    df_kpdesc_training.reset_index(drop=False, inplace=True)
    df_kpdesc_training = df_kpdesc_training.pivot_table('index', index='FullFileName', columns='Desc_cluster', \
                                                        aggfunc='count', fill_value=0, margins=True)
    # Normalise: total for a picture is one
    for c in df_kpdesc_training.columns[:-1]:
        df_kpdesc_training[c] = df_kpdesc_training[c] / df_kpdesc_training['All']

    # Drop unusefull information
    df_kpdesc_training.drop(index='All', inplace=True)
    df_kpdesc_training.drop(columns='All', inplace=True)
    return df_kpdesc_training
    
df_kpdesc_training = histo_freq(df_kpdesc_training)    

## Reduce dimension with PCA <a class="anchor" id="sub3_4"></a>

In [None]:
PCA_N_COMPONENTS = 0.90

# Create PCA 
pca = decomposition.PCA()

# Fit PCA
pca.fit(df_kpdesc_training.values)

# Draw explained variance absolute and cumulated
df_eboulis = MyMod.graph_eboulis_valeurspropres(pca, (18, 18), True)

print("{} clusters explain {}% of the variance"\
           .format(df_eboulis[df_eboulis['explained_variance_ratio_cum'] > PCA_N_COMPONENTS]['rang'].min(), \
                   PCA_N_COMPONENTS * 100))

In [None]:
# Create PCA 
pca = decomposition.PCA(n_components=PCA_N_COMPONENTS)

# Fit Transform PCA
pict_features = pca.fit_transform(df_kpdesc_training.values)

print()
print("Matrix dimensions (pictures, visual words) : {}".format(pict_features.shape)) 

# Get PC coordinates in cluster space
df_contrib_PC = pd.DataFrame(pca.components_, columns=df_kpdesc_training.columns) 
df_contrib_PC.shape 

# Get the cluster best represented for each PC
#df_contrib_PC_t = df_contrib_PC.transpose()
lst_contrib = []
for i in range(pca.n_components_):     
    lst_contrib.append(df_contrib_PC.transpose()[i].idxmax(axis=0))
    
# Keep only the cluster best represented
df_kpdesc_training = df_kpdesc_training[lst_contrib]
del lst_contrib

# Unduplicate identical columns
df_kpdesc_training = df_kpdesc_training.T.groupby(level=0).first().T

df_kpdesc_training.head(5)

In [None]:
# Laurence: sortie de la réduction de dimension (une matrice écrite sur un fichier CSV ou autre)???

# Modelling <a class="anchor" id="chapter4"></a> 

## Train a KNN model <a class="anchor" id="sub4_1"></a>

In [None]:
# Create KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# Train KNN model
X_train = df_kpdesc_training.values
y_train = df_main_training['Target_encoded'].values
knn.fit(X_train, y_train)

## Check learning curve <a class="anchor" id="sub4_2"></a>

In [None]:
train_sizes_abs, train_scores, test_scores = learning_curve(knn, X_train, y_train, 
                                            cv=5, scoring='neg_median_absolute_error',
                                            train_sizes=np.linspace(0.1, 1, 5), 
                                            random_state=42)
plot = plt.figure(figsize=(12, 8))
plot = plt.plot(train_sizes_abs, train_scores.mean(axis=1), label='train score')
plot = plt.plot(train_sizes_abs, test_scores.mean(axis=1), label='validation score')
plot = plt.legend()

## Predict and compare prediction to reality on Test dataset <a class="anchor" id="sub4_3"></a>

In [None]:
def predict_class(df, kmeans_model, pca_col_lst, knn_model):
    
    # Save starting time
    time_start = time.time()

    # Extract descriptors
    df_kpdesc = desc_extraction(df)

    # Predict clusters' descriptors with KMEANS
    df_kpdesc = pd.concat([df_kpdesc, \
        pd.DataFrame(kmeans.predict(df_kpdesc[df_kpdesc.columns[1:]].values), columns=['Desc_cluster'])], axis=1)

    # Compute histogram for main clusters
    df_kpdesc = histo_freq(df_kpdesc)    
    df_kpdesc = df_kpdesc[pca_col_lst]

    # Predict class with trained KNN
    df = pd.concat([df, pd.DataFrame(knn.predict(df_kpdesc.values), columns=['Predict'])], axis=1)

    # Compute time elapse
    elapse_s = time.time()-time_start
    elapse_m = int(elapse_s / 60)
    print('Test predictions done! Time elapsed: {} seconds ({} minutes)'.format(elapse_s, elapse_m))
    
    # Assess result ARI
    ari = metrics.adjusted_rand_score(df['Target_encoded'].values, df['Predict'].values)
    print('Test predictions done! Adjusted Rand Index: {}'.format(ari))
    
    return df

# Select Test dataset
df_main_test = df_main[df_main['Dataset'] == 'Test']
df_main_test = dataset_reduction(df_main_test, TEST_SAMPLE_SIZE)

# Predict on Test dataset
df_main_test = predict_class(df_main_test, kmeans, df_kpdesc_training.columns, knn)
df_main_test.head(5)

## Predict and compare prediction to reality on Validation dataset <a class="anchor" id="sub4_4"></a>

In [None]:
# Select Validation dataset
df_main_validation = df_main[df_main['Dataset'] == 'Validation']
df_main_validation = dataset_reduction(df_main_validation, VALIDATION_SAMPLE_SIZE)

# Predict on Validation dataset
df_main_validation = predict_class(df_main_validation, kmeans, df_kpdesc_training.columns, knn)
df_main_validation.head(5)

* [Go to Table des matières](#chapter0)

# End <a class="anchor" id="chapter100"></a> 

In [None]:
'''df_main: FullFileName, Dataset, Target, Picture, FileSize (in KB), Rotation, Index, Target_encoded
df_meta: Target, Flag, Value, Target_encoded
df_target_mapping: Target_encoded, Target

df_main_training, df_main_test, df_main_validation: 
        FullFileName, Dataset, Target, Picture, FileSize (in KB), Rotation, Index, Target_encoded, Predict
df_kpdesc_training: FullFileName, Desc_cluster'''

In [None]:
df_target_mapping

In [None]:
df_meta