In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA, StandardScaler
from PIL import Image
import numpy as np
import io
import os
import pandas as pd

In [3]:
if SparkSession.builder.getOrCreate().sparkContext:
    SparkSession.builder.getOrCreate().sparkContext.stop()

spark = (SparkSession
             .builder
             .appName('test9-spark-PCA')
             .master('local')
             .config("spark.sql.parquet.writeLegacyFormat", 'true')
             .getOrCreate()
)

In [4]:
PATH = os.getcwd()
PATH_Data = PATH+'/data/Test1'
PATH_Result = PATH+'/data/results'
print('PATH:        '+\
      PATH+'\nPATH_Data:   '+\
      PATH_Data+'\nPATH_Result: '+PATH_Result)

PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test1
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


In [5]:
sc = spark.sparkContext
spark

In [6]:
df = spark.read.format("binaryFile") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(PATH_Data)

In [7]:
df.show()

                                                                                

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/Users/gaeld...|2021-09-12 19:25:42|  5656|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5627|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5613|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5602|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5599|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5597|[FF D8 FF E0 00 1...|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5594|[FF D8 FF E0 00 1...|
|file:

In [None]:
def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L').resize([224, 224])
    return Vectors.dense(np.array(img).flatten().tolist())

In [None]:
def featurize_series(content_series):
    input = np.stack(content_series.map(process_image))
    preds = model.predict(input)
    pca = PCA(k=20)
    model = pca.fit(input)
    preds = model.transform(input)
    output = [p.flatten() for p in preds]
    return pd.Series(output)

In [None]:
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split

@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    for content_series in content_series_iter:
        yield featurize_series(content_series)

In [None]:
features_df = df.repartition(20).select(col("path"),
                                            col("label"),
                                            featurize_udf("content").alias("features")
                                           )

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
from PIL import Image
import numpy as np
import io
import os

# Initialiser la session Spark avec plus de mémoire JVM
spark = (SparkSession
             .builder
             .appName('test9-spark-PCA')
             .master('local')
             .config("spark.sql.parquet.writeLegacyFormat", 'true')
             .config("spark.executor.memory", "4g")  # Augmenter la mémoire de l'exécuteur
             .config("spark.driver.memory", "4g")  # Augmenter la mémoire du pilote
             .config("spark.memory.offHeap.enabled", True)
             .config("spark.memory.offHeap.size", "4g")  # Augmenter la mémoire off-heap
             .getOrCreate()
)

PATH = os.getcwd()
PATH_Data = os.path.join(PATH, 'data', 'Test1')
PATH_Result = os.path.join(PATH, 'data', 'results')
print(f'PATH:        {PATH}\nPATH_Data:   {PATH_Data}\nPATH_Result: {PATH_Result}')

# Chargement des fichiers d'images
df = spark.read.format("binaryFile") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(PATH_Data)

# Fonction pour traiter une image et la convertir en vecteur (taille réduite)
def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L').resize((112, 112))  # Taille réduite
    return np.array(img).flatten().tolist()

# UDF pour convertir le contenu binaire de l'image en vecteur
process_image_udf = udf(lambda content: process_image(content), ArrayType(FloatType()))

# Ajouter la colonne 'features' avec les vecteurs des images
df = df.withColumn('features', process_image_udf(col('content')))

# Convertir les listes de float en vecteurs
def array_to_vector(array):
    return Vectors.dense(array)

array_to_vector_udf = udf(array_to_vector, VectorUDT())

df = df.withColumn('features_vec', array_to_vector_udf(col('features')))

# Standardisation des données
scaler = StandardScaler(inputCol="features_vec", outputCol="scaledFeatures", withMean=True, withStd=True)
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

# Application de la PCA
pca = PCA(k=20, inputCol="scaledFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(df)
df = pca_model.transform(df)

# Sélectionner les colonnes finales
df = df.select("path", "pcaFeatures")

# Afficher le DataFrame résultant
df.show()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 14:26:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test1
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


24/05/28 14:26:24 WARN RowMatrix: 12544 columns will require at least 1258 megabytes of memory!
24/05/28 14:26:33 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/28 14:28:00 ERROR Utils: Uncaught exception in thread task-result-getter-0
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.util.io.ChunkedByteBuffer$.$anonfun$fromFile$1(ChunkedByteBuffer.scala:243)
	at org.apache.spark.util.io.ChunkedByteBuffer$.$anonfun$fromFile$1$adapted(ChunkedByteBuffer.scala:243)
	at org.apache.spark.util.io.ChunkedByteBuffer$$$Lambda$3721/0x00000008017db840.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at or

ConnectionRefusedError: [Errno 61] Connection refused

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA, StandardScaler
from PIL import Image
import numpy as np
import io
import os
import pandas as pd

# Initialiser la session Spark
spark = (SparkSession
             .builder
             .appName('test9-spark-PCA')
             .master('local')
             .config("spark.sql.parquet.writeLegacyFormat", 'true')
             .getOrCreate()
)

PATH = os.getcwd()
PATH_Data = os.path.join(PATH, 'data', 'Test1')
PATH_Result = os.path.join(PATH, 'data', 'results')
print(f'PATH:        {PATH}\nPATH_Data:   {PATH_Data}\nPATH_Result: {PATH_Result}')

# Chargement des fichiers d'images
df = spark.read.format("binaryFile") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(PATH_Data)

# Fonction pour traiter une image et la convertir en vecteur
def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L').resize((224, 224))
    return np.array(img).flatten().tolist()

# Fonction pour appliquer la PCA à une série de contenus d'image
def featurize_series(content_series):
    input_data = np.stack(content_series.apply(process_image))
    # Standardisation des données
    scaler = StandardScaler(with_mean=True, with_std=True)
    input_data_scaled = scaler.fit_transform(input_data)
    # Application de la PCA
    pca = PCA(n_components=20)
    transformed_data = pca.fit_transform(input_data_scaled)
    output = [Vectors.dense(p) for p in transformed_data]
    return pd.Series(output)

# Définition de la UDF pour transformer les images en features
@pandas_udf(ArrayType(VectorUDT()), PandasUDFType.SCALAR)
def featurize_udf(content_series):
    return featurize_series(content_series)

# Application de la transformation
features_df = df.repartition(20).select(col("path"),
                                        featurize_udf("content").alias("features"))

# Afficher le DataFrame résultant
features_df.show()


PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test1
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


24/05/28 14:11:19 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 34) 1]
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 52, in featurize_udf
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 41, in featurize_series
  File "/usr/local/opt/apache-spark/libexec/python/lib/pyspark.zip/pyspark/__init__.py", line 139, in wrapper
    return func(self, **kwargs)
TypeError: StandardScaler.__init__() got an unexpected keyword argument 'with_mean'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:118)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterato

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 52, in featurize_udf
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 41, in featurize_series
  File "/usr/local/opt/apache-spark/libexec/python/lib/pyspark.zip/pyspark/__init__.py", line 139, in wrapper
    return func(self, **kwargs)
TypeError: StandardScaler.__init__() got an unexpected keyword argument 'with_mean'


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA, StandardScaler
from PIL import Image
import numpy as np
import io
import os
import pandas as pd

# Initialiser la session Spark
spark = (SparkSession
             .builder
             .appName('test9-spark-PCA')
             .master('local')
             .config("spark.sql.parquet.writeLegacyFormat", 'true')
             .getOrCreate()
)

PATH = os.getcwd()
PATH_Data = os.path.join(PATH, 'data', 'Test1')
PATH_Result = os.path.join(PATH, 'data', 'results')
print(f'PATH:        {PATH}\nPATH_Data:   {PATH_Data}\nPATH_Result: {PATH_Result}')

# Chargement des fichiers d'images
df = spark.read.format("binaryFile") \
  .option("pathGlobFilter", "*.jpg") \
  .option("recursiveFileLookup", "true") \
  .load(PATH_Data)

# Fonction pour traiter une image et la convertir en vecteur
def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L').resize((224, 224))
    return np.array(img).flatten().tolist()

# Fonction pour appliquer la PCA à une série de contenus d'image
def featurize_series(content_series):
    input_data = np.stack(content_series.apply(process_image))
    # Standardisation des données
    scaler = StandardScaler(with_mean=True, with_std=True)
    input_data_scaled = scaler.fit_transform(input_data)
    # Application de la PCA
    pca = PCA(n_components=20)
    transformed_data = pca.fit_transform(input_data_scaled)
    output = [Vectors.dense(p) for p in transformed_data]
    return pd.Series(output)

# Définition de la UDF pour transformer les images en features
@pandas_udf(ArrayType(VectorUDT()), PandasUDFType.SCALAR)
def featurize_udf(content_series):
    return featurize_series(content_series)

# Application de la transformation
features_df = df.repartition(20).select(col("path"),
                                        featurize_udf("content").alias("features"))

# Afficher le DataFrame résultant
features_df.show()


PATH:        /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train
PATH_Data:   /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/Test1
PATH_Result: /Users/gaeldelescluse/Documents/OpenClassRooms/2.Projets/Projet11/ai-cloud-computing-spark/train/data/results


24/05/28 14:11:19 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 34) 1]
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 52, in featurize_udf
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 41, in featurize_series
  File "/usr/local/opt/apache-spark/libexec/python/lib/pyspark.zip/pyspark/__init__.py", line 139, in wrapper
    return func(self, **kwargs)
TypeError: StandardScaler.__init__() got an unexpected keyword argument 'with_mean'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:118)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterato

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 52, in featurize_udf
  File "/var/folders/kl/3nc79ycx61v13jbqlw61nnfw0000gn/T/ipykernel_87369/1641043716.py", line 41, in featurize_series
  File "/usr/local/opt/apache-spark/libexec/python/lib/pyspark.zip/pyspark/__init__.py", line 139, in wrapper
    return func(self, **kwargs)
TypeError: StandardScaler.__init__() got an unexpected keyword argument 'with_mean'


In [7]:
def process_image(content):
    img = Image.open(io.BytesIO(content)).convert('L').resize([224, 224])
    return Vectors.dense(np.array(img).flatten().tolist())

process_image_udf = udf(process_image, VectorUDT())
df_final = df.withColumn("features", process_image_udf(df.content))

df_final.show()

                                                                                

+--------------------+-------------------+------+--------------------+--------------------+
|                path|   modificationTime|length|             content|            features|
+--------------------+-------------------+------+--------------------+--------------------+
|file:/Users/gaeld...|2021-09-12 19:25:42|  5656|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5627|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5613|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5611|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5606|[FF D8 FF E0 00 1...|[255.0,255.0,255....|
|file:/Users/gaeld...|2021-09-12 19:25:42|  5602|[FF D8 FF E0 00 1...|[255.0,255

In [8]:
first_row = df_final.select("features").first()
first_row

                                                                                

Row(features=DenseVector([255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 254.0, 254.0, 255.0, 255.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 255.0, 255.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 254.0, 252.0, 250.0, 251.0, 253.0, 254.0, 254.0, 254.0, 252.0, 251.0, 253.0, 255.0, 255.0, 254.0, 253.0, 251.0, 251.0, 254.0, 255.0, 255.0, 255.0, 252.0, 249.0, 250.0, 253.0, 254.0, 253.0, 252.0, 253.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 241.0, 212.0, 167.0, 134.0, 127.0, 128.0, 125.0, 121.0, 117.0, 111.0, 109.0, 113.0, 117.0, 119.0, 121.0, 124.0, 128.0, 131.0, 133.0, 147.0, 174.0, 202.0, 237.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 2

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaler_model = scaler.fit(df_final)
df_scaled = scaler_model.transform(df_final)

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

def subsample_vector(vector, new_size):
    indices = np.random.choice(len(vector), new_size, replace=False)
    return vector[indices].tolist()

new_size = 1000

# Définir l'UDT (User Defined Function) pour Spark
subsample_vector_udf = udf(lambda vec: subsample_vector(vec, new_size), ArrayType(FloatType()))

# Appliquer la fonction UDF sur la colonne 'features'
df_subsampled = df_final.withColumn("subsampled_features", subsample_vector_udf("features"))

In [10]:
pca = PCA(k=20, inputCol="subsampled_features", outputCol="pca_features")
model = pca.fit(df_subsampled)
result = model.transform(df_subsampled)

IllegalArgumentException: requirement failed: Column subsampled_features must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.ArrayType:array<float>.