In [1]:
import os
import cv2
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

# Vérifiez que Spark et Java sont correctement configurés
import findspark
findspark.init()

# Build (or reuse) the Spark session shared by the rest of the notebook.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists; presumably the memory settings below only take effect when a new JVM
# is started — confirm before relying on them after a re-run.
spark = (
    SparkSession.builder
    .appName("PCA Image Processing")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Load every decodable image of a folder as a flat grayscale pixel list
def load_images_from_folder(folder):
    """Read all decodable images in ``folder`` as flattened grayscale pixel lists.

    Entries are visited in sorted filename order so the returned list (and the
    row order of anything built from it) is identical on every run;
    ``os.listdir`` on its own returns names in arbitrary order.

    Args:
        folder: Path of the directory to scan (its direct entries only).

    Returns:
        A list with one item per entry that ``cv2.imread`` could decode. Each
        item is the grayscale image flattened to a 1-D Python list of pixel
        values. Entries for which ``cv2.imread`` returns ``None`` are skipped.
    """
    images = []
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports an unreadable entry by returning None.
            continue
        images.append(img.flatten().tolist())
    return images

# Folder holding the images to analyse
folder_path = './data/Test2/Apple Braeburn'
images = load_images_from_folder(folder_path)

# Distribute the pixel lists and wrap each one as a single-column row holding a dense vector
images_rdd = spark.sparkContext.parallelize(images)
images_df = spark.createDataFrame(
    images_rdd.map(lambda pixels: (Vectors.dense(pixels),)),
    ["features"],
)

# Element-wise mean over all images, broadcast so the centering function can read it
mean_image = np.asarray(images).mean(axis=0)
mean_image_broadcast = spark.sparkContext.broadcast(mean_image)

def center_image(image):
    """Subtract the broadcast mean image from ``image``.

    Args:
        image: Sequence of pixel values that ``np.array`` can convert.

    Returns:
        The element-wise difference ``image - mean`` as a plain Python list.
    """
    mean_vector = mean_image_broadcast.value
    pixels = np.array(image)
    centered = pixels - mean_vector
    return centered.tolist()

# Expose the centering function as a Spark UDF that returns an array of doubles
center_image_udf = udf(center_image, returnType=ArrayType(DoubleType()))
centered_images_df = images_df.withColumn(
    "centered_features",
    center_image_udf(images_df["features"]),
)

# Turn the centered arrays back into dense vectors in a "features" column for PCA
centered_images_rdd = centered_images_df.rdd.map(
    lambda row: (Vectors.dense(row["centered_features"]),)
)
centered_images_df = spark.createDataFrame(centered_images_rdd, ["features"])

# Apply PCA with a fixed number of components
# NOTE(review): the output recorded for this cell ends in
# java.lang.OutOfMemoryError raised while calling fit. Presumably the cost grows
# with the number of pixels per image, since Spark's PCA works on a
# features-by-features covariance matrix — confirm against the Spark MLlib docs
# and consider downscaling the images before fitting.
pca = PCAml(k=10, inputCol="features", outputCol="pca_features")
model = pca.fit(centered_images_df)
result = model.transform(centered_images_df).select("pca_features")

# Cumulative explained variance of the fitted components
explained_variance = model.explainedVariance.cumsum().tolist()

# Smallest number of components whose cumulative explained variance reaches 95%.
# np.argmax returns 0 for an all-False mask, which would silently select a
# single component when the threshold is never reached, so that case is
# handled explicitly by keeping every fitted component.
reaches_95 = np.array(explained_variance) >= 0.95
if reaches_95.any():
    num_components_95 = int(np.argmax(reaches_95)) + 1
else:
    num_components_95 = len(explained_variance)
    print(
        "Warning: the fitted components explain only "
        f"{explained_variance[-1]:.1%} of the variance (< 95%); "
        f"keeping all {num_components_95} components."
    )

# Reduce the data to num_components_95 dimensions
# (this fits PCA a second time on the same DataFrame)
pca_95 = PCAml(k=num_components_95, inputCol="features", outputCol="reduced_features")
model_95 = pca_95.fit(centered_images_df)
reduced_images_df = model_95.transform(centered_images_df).select("reduced_features")

# Show the results
reduced_images_df.show()

# Stop the SparkSession
spark.stop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 16:01:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/28 16:01:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/28 16:02:21 ERROR Utils: Uncaught exception in thread task-result-getter-0
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.lang.reflect.Array.newArray(Native Method)
	at java.base/java.lang.reflect.Array.newInstance(Array.java:78)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2098)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1675)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStre

Py4JError: An error occurred while calling o105.fit

[Stage 8:>                                                          (0 + 2) / 2]