In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA, StandardScaler
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split
from PIL import Image
import numpy as np
import io
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model

2024-05-28 22:09:21.761437: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Stop whatever SparkContext is currently reachable so the session created
# below starts from a clean state with its own settings.
running_context = SparkSession.builder.getOrCreate().sparkContext
if running_context:
    running_context.stop()

# Local-mode session; parquet files are written in the legacy format.
session_builder = (
    SparkSession.builder
    .appName('test15-spark')
    .master('local')
    .config("spark.sql.parquet.writeLegacyFormat", 'true')
)
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 22:09:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Input and output folders are resolved relative to the current working directory.
PATH = os.getcwd()
PATH_Data = f'{PATH}/data/Test1'
PATH_Result = f'{PATH}/data/results'

# Echo the three locations, one per line, with aligned labels.
print('\n'.join([
    'PATH:        ' + PATH,
    'PATH_Data:   ' + PATH_Data,
    'PATH_Result: ' + PATH_Result,
]))

PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test1
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


In [6]:
# Read every *.jpg found under PATH_Data (sub-folders included) as binary rows.
df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(PATH_Data)
)
# The label is the second-to-last '/'-separated segment of the file path,
# i.e. the name of the folder that directly contains the image.
df = df.withColumn('label', element_at(split(col('path'), '/'), -2))

In [7]:
# Preview the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+--------------------+-------------------+------+--------------------+--------------+
|                path|   modificationTime|length|             content|         label|
+--------------------+-------------------+------+--------------------+--------------+
|file:/Users/gaeld...|2021-09-12 19:25:42|  5656|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5627|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5613|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5602|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5599|[FF D

In [8]:
def process_image(content, size=(224, 224)):
    """Decode an encoded image into a flat grayscale pixel vector.

    Args:
        content: Encoded image bytes (anything ``io.BytesIO`` accepts), or
            ``None``.
        size: ``(width, height)`` the image is resized to before it is
            flattened. Defaults to ``(224, 224)``.

    Returns:
        A dense vector holding one value per pixel of the resized 8-bit
        grayscale image, in row-major order, or ``None`` when ``content`` is
        ``None`` so that a null input row yields a null output.
    """
    if content is None:
        return None
    # The context manager releases the decoder's resources once the grayscale,
    # resized copy has been produced.
    with Image.open(io.BytesIO(content)) as source_image:
        gray = source_image.convert('L').resize(tuple(size))
    # Hand the NumPy array to Vectors.dense directly instead of first building
    # a large Python list.
    return Vectors.dense(np.asarray(gray, dtype=np.float64).ravel())

# Expose process_image as a Spark UDF returning ML vectors, then derive the
# "features" column from the raw image bytes.
process_image_udf = udf(f=process_image, returnType=VectorUDT())
df_final = df.withColumn("features", process_image_udf(col("content")))

df_final.show(n=20, truncate=True)

                                                                                

+--------------------+-------------------+------+--------------------+--------------+--------------------+
|                path|   modificationTime|length|             content|         label|            features|
+--------------------+-------------------+------+--------------------+--------------+--------------------+
|file:/Users/gaeld...|2021-09-12 19:25:42|  5656|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5627|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5613|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:2

In [14]:
from pyspark.ml.feature import VectorAssembler

# Assemble the "features" vector into a new "features2" column.
# Any "features2" left over from a previous run of this cell is dropped first
# (drop() ignores columns that do not exist); without this, re-running the cell
# fails because the assembler's output column already exists.
assembler = VectorAssembler(inputCols=['features'], outputCol='features2')
df_final = assembler.transform(df_final.drop('features2'))

                                                                                

In [15]:
# Preview the first 20 rows, with long cell values truncated.
df_final.show(n=20, truncate=True)

                                                                                

+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+
|                path|   modificationTime|length|             content|         label|            features|           features2|
+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+
|file:/Users/gaeld...|2021-09-12 19:25:42|  5656|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5627|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5613|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255....|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|[255.0,255.0,255...

In [16]:
# Print df_final's columns with their types and nullability as a tree.
df_final.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- features2: vector (nullable = true)



In [18]:
# Display the first row of the raw "content" column (the encoded image bytes).
df_final.select(col('content')).head()

Row(content=bytearray(b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x02\x01\x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xff\xc0\x00\x11\x08\x00d\x00d\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\x1a%&\'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz\x83\x84\x85\x86\x87\x88\x89\x8a\x92\x93\

In [20]:
import cv2

def image_to_vector(image_binary):
    """Decode an encoded image into a flat list of grayscale pixels scaled by 1/255.

    Args:
        image_binary: Encoded image bytes (any buffer ``np.frombuffer``
            accepts), or ``None``.

    Returns:
        A flat list of floats, each being a decoded pixel value divided by
        255.0, or ``None`` when ``image_binary`` is ``None`` so that a null
        input row yields a null output.

    Raises:
        ValueError: If OpenCV cannot decode the bytes as an image.
    """
    if image_binary is None:
        return None
    # View the encoded bytes as a uint8 NumPy array for OpenCV.
    encoded = np.frombuffer(image_binary, np.uint8)
    # Decode as single-channel grayscale (use cv2.IMREAD_COLOR for colour images).
    image = cv2.imdecode(encoded, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # imdecode returns None instead of raising when the data is not a
        # readable image; fail with an explicit message rather than an
        # AttributeError on the next line.
        raise ValueError("image_to_vector: could not decode image bytes")
    # Flatten to 1-D and normalise the pixel values by dividing by 255.
    return (image.flatten() / 255.0).tolist()

In [24]:
from pyspark.sql.types import ArrayType, FloatType

# Spark UDF wrapper around image_to_vector; the result is declared as array<float>.
image_to_vector_udf = udf(
    f=image_to_vector,
    returnType=ArrayType(elementType=FloatType()),
)

In [25]:
def list_to_vector(image_list):
    """Wrap a list of numbers as a dense ML vector.

    Args:
        image_list: Sequence of numeric values, or ``None``.

    Returns:
        ``Vectors.dense(image_list)``, or ``None`` when ``image_list`` is
        ``None`` so that a null input row yields a null output instead of
        being passed to ``Vectors.dense``.
    """
    if image_list is None:
        return None
    return Vectors.dense(image_list)

# Spark UDF wrapper around list_to_vector, returning an ML vector column.
list_to_vector_udf = udf(f=list_to_vector, returnType=VectorUDT())

In [28]:
# Build 'image_vector' straight from the raw bytes: decode each image to a list
# of normalised pixels, then wrap that list as an ML vector. Deriving it from
# 'content' rather than from a pre-existing 'image_vector' column keeps this
# cell runnable on a fresh kernel and safe to re-run.
df = df.withColumn(
    'image_vector',
    list_to_vector_udf(image_to_vector_udf(col('content'))),
)

In [29]:
from pyspark.ml.feature import VectorAssembler

# Assemble the 'image_vector' column into the 'features' column of df_test.
assembler = VectorAssembler(outputCol='features', inputCols=['image_vector'])
df_test = assembler.transform(dataset=df)

                                                                                

In [30]:
# Preview the first 20 rows, with long cell values truncated.
df_test.show(n=20, truncate=True)

                                                                                

+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+
|                path|   modificationTime|length|             content|         label|        image_vector|            features|
+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+
|file:/Users/gaeld...|2021-09-12 19:25:42|  5656|[FF D8 FF E0 00 1...|Apple Braeburn|[1.0,1.0,1.0,1.0,...|[1.0,1.0,1.0,1.0,...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5627|[FF D8 FF E0 00 1...|Apple Braeburn|[1.0,1.0,1.0,1.0,...|[1.0,1.0,1.0,1.0,...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5613|[FF D8 FF E0 00 1...|Apple Braeburn|[1.0,1.0,1.0,1.0,...|[1.0,1.0,1.0,1.0,...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|[1.0,1.0,1.0,1.0,...|[1.0,1.0,1.0,1.0,...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|Apple Braeburn|[1.0,1.0,1.0,1.0,..