In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA, StandardScaler
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split
from PIL import Image
import numpy as np
import io
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model

2024-05-28 18:44:53.772048: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, MapType, FloatType
from pyspark.ml.linalg import Vectors, VectorUDT
from PIL import Image
import numpy as np
import io
import os

if SparkSession.builder.getOrCreate().sparkContext:
    SparkSession.builder.getOrCreate().sparkContext.stop()

spark = (SparkSession
             .builder
             .appName('test12-spark')
             .master('local')
             .config("spark.sql.parquet.writeLegacyFormat", 'true')
             .getOrCreate()
)

PATH = os.getcwd()
PATH_Data = os.path.join(PATH, 'data', 'Test2')
PATH_Result = os.path.join(PATH, 'data', 'results')
print(f'PATH:        {PATH}\nPATH_Data:   {PATH_Data}\nPATH_Result: {PATH_Result}')

df = spark.read.format("binaryFile") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(PATH_Data)

def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L')
    return np.array(img).flatten().tolist()

process_image_udf = udf(process_image, ArrayType(IntegerType()))
df_final = df.withColumn("features", process_image_udf(df.content))

def group_sparse_vector(features):
    grouped_indices = {}
    for index, value in enumerate(features):
        if value not in grouped_indices:
            grouped_indices[value] = []
        grouped_indices[value].append(index)
    return grouped_indices

group_sparse_vector_udf = udf(group_sparse_vector, MapType(FloatType(), ArrayType(IntegerType())))

df_X = df_final.withColumn("grouped_sparse_features", group_sparse_vector_udf(df_final["features"]))

df_X.show()


PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test2
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


                                                                                

+-----------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [3]:
df_X.show()

+--------------------+--------------------+------+--------------------+--------------------+-----------------------+
|                path|    modificationTime|length|             content|            features|grouped_sparse_features|
+--------------------+--------------------+------+--------------------+--------------------+-----------------------+
|file:/Users/gaeld...|2024-05-28 15:55:...|  4800|[FF D8 FF E0 00 1...|[255, 255, 255, 2...|   {0.0 -> [6991], 0...|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4792|[FF D8 FF E0 00 1...|[255, 255, 255, 2...|   {0.0 -> [7090], 0...|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4773|[FF D8 FF E0 00 1...|[255, 255, 255, 2...|   {0.0 -> [6591, 72...|
+--------------------+--------------------+------+--------------------+--------------------+-----------------------+



In [7]:
df_X.select('grouped_sparse_features').first()

                                                                                

Row(grouped_sparse_features={0.0: [0, 1, 2, 3, 4, 5, 6, 14, 15, 17, 18, 22, 27, 31, 32, 34, 36, 67, 68, 70, 71, 72, 75, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 113, 114, 115, 124, 136, 137, 138, 146, 158, 160, 162, 163, 167, 169, 170, 171, 172, 173, 176, 178, 179, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 233, 234, 235, 239, 241, 242, 248, 253, 257, 265, 268, 271, 274, 276, 281, 282, 283, 284, 285, 286, 287, 288, 289, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 322, 337, 341, 344, 362, 368, 369, 372, 375, 378, 380, 381, 382, 383, 384, 385, 386, 387, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 425, 426, 435, 464, 466, 467, 472, 477, 479, 480, 48

In [6]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.sql.types import VectorUDT

# Fonction pour convertir les données groupées en vecteur sparse
def to_sparse_vector(grouped_features, length=10000):
    indices = []
    values = []
    for value, idx_list in grouped_features.items():
        indices.extend(idx_list)
        values.extend([value] * len(idx_list))
    return Vectors.sparse(length, indices, values)

# UDF pour convertir les données groupées en vecteur sparse
to_sparse_vector_udf = udf(lambda grouped_features: to_sparse_vector(grouped_features), VectorUDT())

# Ajouter la colonne de vecteurs sparses
df_X = df_X.withColumn("sparse_vector", to_sparse_vector_udf(df_X["grouped_sparse_features"]))

# Vérifier le schéma pour s'assurer que la colonne "sparse_vector" a été ajoutée correctement
df_X.printSchema()


ImportError: cannot import name 'VectorUDT' from 'pyspark.sql.types' (/Applications/anaconda3/envs/test-spark/lib/python3.10/site-packages/pyspark/sql/types.py)

In [2]:
if SparkSession.builder.getOrCreate().sparkContext:
    SparkSession.builder.getOrCreate().sparkContext.stop()

spark = (SparkSession
             .builder
             .appName('test12-spark')
             .master('local')
             .config("spark.sql.parquet.writeLegacyFormat", 'true')
             .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 18:44:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/28 18:45:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/05/28 18:45:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
PATH = os.getcwd()
PATH_Data = PATH+'/data/Test2'
PATH_Result = PATH+'/data/results'
print('PATH:        '+\
      PATH+'\nPATH_Data:   '+\
      PATH_Data+'\nPATH_Result: '+PATH_Result)

PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test2
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


In [4]:
sc = spark.sparkContext
spark

In [5]:
df = spark.read.format("binaryFile") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(PATH_Data)

In [6]:
df = df.withColumn('label', element_at(split(df['path'], '/'),-2))
print(df.printSchema())
df.show()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)

None
+--------------------+--------------------+------+--------------------+--------------+
|                path|    modificationTime|length|             content|         label|
+--------------------+--------------------+------+--------------------+--------------+
|file:/Users/gaeld...|2024-05-28 15:55:...|  4800|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4792|[FF D8 FF E0 00 1...|Apple Braeburn|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4773|[FF D8 FF E0 00 1...|Apple Braeburn|
+--------------------+--------------------+------+--------------------+--------------+



In [7]:
from pyspark.sql.types import ArrayType, IntegerType

def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L')
    return np.array(img).flatten().tolist()

process_image_udf = udf(process_image, ArrayType(IntegerType()))
df_final = df.withColumn("features", process_image_udf(df.content))

df_final.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------+--------------------+--------------+--------------------+
|                path|    modificationTime|length|             content|         label|            features|
+--------------------+--------------------+------+--------------------+--------------+--------------------+
|file:/Users/gaeld...|2024-05-28 15:55:...|  4800|[FF D8 FF E0 00 1...|Apple Braeburn|[255, 255, 255, 2...|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4792|[FF D8 FF E0 00 1...|Apple Braeburn|[255, 255, 255, 2...|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4773|[FF D8 FF E0 00 1...|Apple Braeburn|[255, 255, 255, 2...|
+--------------------+--------------------+------+--------------------+--------------+--------------------+



                                                                                

In [8]:
df_final.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [9]:
df_final.select('features').first()

Row(features=[255, 255, 255, 255, 255, 255, 255, 254, 254, 254, 254, 254, 254, 254, 255, 255, 254, 255, 255, 254, 254, 254, 255, 254, 254, 254, 254, 255, 254, 254, 254, 255, 255, 253, 255, 254, 255, 253, 253, 254, 253, 253, 253, 253, 253, 253, 253, 252, 254, 254, 254, 254, 253, 252, 254, 254, 254, 254, 254, 254, 249, 254, 252, 254, 254, 254, 254, 255, 255, 253, 255, 255, 255, 253, 254, 255, 249, 254, 254, 253, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 254, 254, 254, 254, 255, 255, 255, 254, 254, 254, 254, 254, 254, 254, 253, 255, 254, 254, 254, 254, 254, 254, 254, 252, 254, 253, 254, 255, 255, 255, 253, 254, 252, 254, 254, 252, 252, 255, 254, 254, 254, 254, 254, 254, 254, 254, 253, 252, 254, 255, 254, 255, 252, 255, 255, 254, 254, 254, 255, 254, 255, 255, 255, 255, 255, 254, 253, 255, 253, 255, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2

In [10]:
def sparse_vector(features):
    return Vectors.sparse(features)

# Enregistrer la fonction comme UDF
sparse_vector_udf = udf(sparse_vector, VectorUDT())
df_X = df_final.withColumn("dense_features", sparse_vector_udf(df_final["features"]))
df_X.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- dense_features: vector (nullable = true)



In [12]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

def sparse_vector(features):
    # Assurez-vous que 'features' est sous une forme appropriée pour Vectors.sparse
    return Vectors.sparse(len(features), list(enumerate(features)))

sparse_vector_udf = udf(sparse_vector, VectorUDT())

df_X = df_final.withColumn("dense_features", sparse_vector_udf(df_final["features"]))

df_X.printSchema()
df_X.show()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- dense_features: vector (nullable = true)



[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------+--------------------+--------------+--------------------+--------------------+
|                path|    modificationTime|length|             content|         label|            features|      dense_features|
+--------------------+--------------------+------+--------------------+--------------+--------------------+--------------------+
|file:/Users/gaeld...|2024-05-28 15:55:...|  4800|[FF D8 FF E0 00 1...|Apple Braeburn|[255, 255, 255, 2...|(10000,[0,1,2,3,4...|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4792|[FF D8 FF E0 00 1...|Apple Braeburn|[255, 255, 255, 2...|(10000,[0,1,2,3,4...|
|file:/Users/gaeld...|2024-05-28 15:55:...|  4773|[FF D8 FF E0 00 1...|Apple Braeburn|[255, 255, 255, 2...|(10000,[0,1,2,3,4...|
+--------------------+--------------------+------+--------------------+--------------+--------------------+--------------------+



                                                                                

In [13]:
df_X.select('dense_features').first()

Row(dense_features=SparseVector(10000, {0: 255.0, 1: 255.0, 2: 255.0, 3: 255.0, 4: 255.0, 5: 255.0, 6: 255.0, 7: 254.0, 8: 254.0, 9: 254.0, 10: 254.0, 11: 254.0, 12: 254.0, 13: 254.0, 14: 255.0, 15: 255.0, 16: 254.0, 17: 255.0, 18: 255.0, 19: 254.0, 20: 254.0, 21: 254.0, 22: 255.0, 23: 254.0, 24: 254.0, 25: 254.0, 26: 254.0, 27: 255.0, 28: 254.0, 29: 254.0, 30: 254.0, 31: 255.0, 32: 255.0, 33: 253.0, 34: 255.0, 35: 254.0, 36: 255.0, 37: 253.0, 38: 253.0, 39: 254.0, 40: 253.0, 41: 253.0, 42: 253.0, 43: 253.0, 44: 253.0, 45: 253.0, 46: 253.0, 47: 252.0, 48: 254.0, 49: 254.0, 50: 254.0, 51: 254.0, 52: 253.0, 53: 252.0, 54: 254.0, 55: 254.0, 56: 254.0, 57: 254.0, 58: 254.0, 59: 254.0, 60: 249.0, 61: 254.0, 62: 252.0, 63: 254.0, 64: 254.0, 65: 254.0, 66: 254.0, 67: 255.0, 68: 255.0, 69: 253.0, 70: 255.0, 71: 255.0, 72: 255.0, 73: 253.0, 74: 254.0, 75: 255.0, 76: 249.0, 77: 254.0, 78: 254.0, 79: 253.0, 80: 254.0, 81: 254.0, 82: 254.0, 83: 255.0, 84: 255.0, 85: 255.0, 86: 255.0, 87: 255.0, 88

In [14]:
pca = PCA(k=20, inputCol="dense_features", outputCol="pcaFeatures")
model = pca.fit(df_X)

24/05/28 18:48:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/28 18:48:12 ERROR Executor: Exception in task 0.0 in stage 10.0 (TID 10)
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$2594/0x00000008010ef840.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.drain(O

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 51317)
Traceback (most recent call last):
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/site-packages/pyspark/a

ConnectionRefusedError: [Errno 61] Connection refused

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Applications/anaconda3/envs/test-spark/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
