In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [3]:
spark = SparkSession.builder \
    .appName("DataCo - Pr√©diction Retards") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "200") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()


In [4]:
feature_numeric_cols = [
    "Days for shipment (scheduled)",
    "Benefit per order",
    "Sales per customer",
    "Order Item Discount",
    "Order Item Discount Rate",
    "Order Item Product Price",
    "Order Item Profit Ratio",
    "Order Item Quantity",
    "Sales",
    "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type",
    "Shipping Mode",
    "Market",
    "Customer Segment",
    "Order State",
    "Order Region",
    "Category Name"
]


cible = ["Late_delivery_risk"]  # Variable cible

In [5]:
from pyspark.sql.functions import col

base_cols = feature_numeric_cols + feature_categorical_cols + cible

df = (
    spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv("../data/DataCoSupplyChainDataset.csv")
        .filter(col("Delivery Status") != "Shipping canceled")
        .select(*base_cols)
)

df.explain(True)


== Parsed Logical Plan ==
'Project ['Days for shipment (scheduled), 'Benefit per order, 'Sales per customer, 'Order Item Discount, 'Order Item Discount Rate, 'Order Item Product Price, 'Order Item Profit Ratio, 'Order Item Quantity, 'Sales, 'Order Profit Per Order, 'Type, 'Shipping Mode, 'Market, 'Customer Segment, 'Order State, 'Order Region, 'Category Name, 'Late_delivery_risk]
+- Filter NOT (Delivery Status#22 = Shipping canceled)
   +- Relation [Type#17,Days for shipping (real)#18,Days for shipment (scheduled)#19,Benefit per order#20,Sales per customer#21,Delivery Status#22,Late_delivery_risk#23,Category Id#24,Category Name#25,Customer City#26,Customer Country#27,Customer Email#28,Customer Fname#29,Customer Id#30,Customer Lname#31,Customer Password#32,Customer Segment#33,Customer State#34,Customer Street#35,Customer Zipcode#36,Department Id#37,Department Name#38,Latitude#39,Longitude#40,Market#41,... 28 more fields] csv

== Analyzed Logical Plan ==
Days for shipment (scheduled): in

In [17]:
df.printSchema()



root
 |-- Days for shipment (scheduled): integer (nullable = true)
 |-- Benefit per order: double (nullable = true)
 |-- Sales per customer: double (nullable = true)
 |-- Order Item Discount: double (nullable = true)
 |-- Order Item Discount Rate: double (nullable = true)
 |-- Order Item Product Price: double (nullable = true)
 |-- Order Item Profit Ratio: double (nullable = true)
 |-- Order Item Quantity: integer (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Order Profit Per Order: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Shipping Mode: string (nullable = true)
 |-- Market: string (nullable = true)
 |-- Customer Segment: string (nullable = true)
 |-- Order State: string (nullable = true)
 |-- Order Region: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Late_delivery_risk: integer (nullable = true)



In [6]:
df_verifier_equilibre = df.groupBy("Late_delivery_risk").count()
df_verifier_equilibre.show()

+------------------+-----+
|Late_delivery_risk|count|
+------------------+-----+
|                 1|98977|
|                 0|73788|
+------------------+-----+



In [8]:
from pyspark.sql.functions import col, when, count
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# VOS FEATURES
feature_numeric_cols = [
    "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
    "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
    "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type", "Shipping Mode", "Market", "Customer Segment",
    "Order State", "Order Region", "Category Name"
]

cible = ["Late_delivery_risk"]

# Charger les donn√©es
base_cols = feature_numeric_cols + feature_categorical_cols + cible

df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/DataCoSupplyChainDataset.csv")
    .filter(col("Delivery Status") != "Shipping canceled")
    .select(*base_cols)
)


# 1. GESTION DES VALEURS MANQUANTES
print("=== V√©rification des valeurs manquantes ===")
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

df_clean = df.dropna()
print(f"Lignes avant: {df.count()}, Lignes apr√®s: {df_clean.count()}")


# 2. ENCODAGE DES VARIABLES CAT√âGORIELLES
print("\n=== Encodage des variables cat√©gorielles ===")
indexers = []
for col_name in feature_categorical_cols:
    indexer = StringIndexer(
        inputCol=col_name, 
        outputCol=col_name + "_indexed", 
        handleInvalid="keep"
    )
    indexers.append(indexer)

pipeline_encoding = Pipeline(stages=indexers)
model_encoding = pipeline_encoding.fit(df_clean)
df_encoded = model_encoding.transform(df_clean)

encoded_cols = [c + "_indexed" for c in feature_categorical_cols]


# 3. NORMALISATION DES FEATURES NUM√âRIQUES
print("\n=== Normalisation des features num√©riques ===")
assembler_num = VectorAssembler(
    inputCols=feature_numeric_cols, 
    outputCol="numeric_features",
    handleInvalid="keep"
)
df_assembled = assembler_num.transform(df_encoded)

scaler = StandardScaler(
    inputCol="numeric_features", 
    outputCol="numeric_features_scaled",
    withMean=True, 
    withStd=True
)
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)


# 4. PR√âPARATION FINALE POUR LE MOD√àLE
print("\n=== Assembly final ===")
all_feature_cols = ["numeric_features_scaled"] + encoded_cols
assembler_final = VectorAssembler(
    inputCols=all_feature_cols, 
    outputCol="features",
    handleInvalid="keep"
)
df_final = assembler_final.transform(df_scaled)

# Dataset pr√™t
df_ready = df_final.select("features", col("Late_delivery_risk").alias("label"))

print("\n=== Dataset pr√™t pour le mod√®le ===")
print(f"Nombre total de lignes: {df_ready.count()}")
df_ready.show(5, truncate=False)

# Distribution finale
print("\n=== Distribution de la variable cible ===")
df_ready.groupBy("label").count().show()

=== V√©rification des valeurs manquantes ===
+-----------------------------+-----------------+------------------+-------------------+------------------------+------------------------+-----------------------+-------------------+-----+----------------------+----+-------------+------+----------------+-----------+------------+-------------+------------------+
|Days for shipment (scheduled)|Benefit per order|Sales per customer|Order Item Discount|Order Item Discount Rate|Order Item Product Price|Order Item Profit Ratio|Order Item Quantity|Sales|Order Profit Per Order|Type|Shipping Mode|Market|Customer Segment|Order State|Order Region|Category Name|Late_delivery_risk|
+-----------------------------+-----------------+------------------+-------------------+------------------------+------------------------+-----------------------+-------------------+-----+----------------------+----+-------------+------+----------------+-----------+------------+-------------+------------------+
|               

## Pipline

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# VOS FEATURES
feature_numeric_cols = [
    "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
    "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
    "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type", "Shipping Mode", "Market", "Customer Segment",
    "Order State", "Order Region", "Category Name"
]

cible = ["Late_delivery_risk"]

# Charger les donn√©es
base_cols = feature_numeric_cols + feature_categorical_cols + cible

df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/DataCoSupplyChainDataset.csv")
    .filter(col("Delivery Status") != "Shipping canceled")
    .select(*base_cols)
)

# Nettoyer les donn√©es
df_clean = df.dropna()
print(f"Dataset nettoy√©: {df_clean.count()} lignes")

# V√©rifier le nombre de valeurs uniques par colonne cat√©gorielle
print("\n=== Nombre de valeurs uniques par variable cat√©gorielle ===")
for col_name in feature_categorical_cols:
    nb_unique = df_clean.select(col_name).distinct().count()
    print(f"{col_name}: {nb_unique} valeurs uniques")


# ====================================================================
# √âTAPES DE PR√âTRAITEMENT
# ====================================================================

# 1. Encodage des variables cat√©gorielles avec StringIndexer
indexers = []
for col_name in feature_categorical_cols:
    indexer = StringIndexer(
        inputCol=col_name, 
        outputCol=col_name + "_indexed", 
        handleInvalid="keep"
    )
    indexers.append(indexer)

indexed_cols = [c + "_indexed" for c in feature_categorical_cols]

# 2. OneHotEncoding pour les mod√®les qui en ont besoin (Neural Network)
encoders = []
for col_name in feature_categorical_cols:
    encoder = OneHotEncoder(
        inputCol=col_name + "_indexed",
        outputCol=col_name + "_encoded"
    )
    encoders.append(encoder)

encoded_cols = [c + "_encoded" for c in feature_categorical_cols]

# 3. Assembly des features num√©riques
assembler_num = VectorAssembler(
    inputCols=feature_numeric_cols, 
    outputCol="numeric_features",
    handleInvalid="keep"
)

# 4. Normalisation
scaler = StandardScaler(
    inputCol="numeric_features", 
    outputCol="numeric_features_scaled",
    withMean=True, 
    withStd=True
)

# 5. Assembly final pour Random Forest et GBT (avec indexed)
assembler_final_trees = VectorAssembler(
    inputCols=["numeric_features_scaled"] + indexed_cols, 
    outputCol="features",
    handleInvalid="keep"
)

# 6. Assembly final pour Neural Network (avec OneHot)
assembler_final_mlp = VectorAssembler(
    inputCols=["numeric_features_scaled"] + encoded_cols, 
    outputCol="features",
    handleInvalid="keep"
)


# ====================================================================
# MOD√àLES NON-LIN√âAIRES
# ====================================================================

# MOD√àLE 1 : RANDOM FOREST
# SOLUTION : Augmenter maxBins pour g√©rer plus de cat√©gories
rf = RandomForestClassifier(
    featuresCol="features", 
    labelCol="Late_delivery_risk",
    numTrees=100,
    maxDepth=10,
    maxBins=2000,  # ‚Üê AUGMENT√â pour g√©rer Order State (1081 valeurs)
    minInstancesPerNode=1,
    seed=42
)

# MOD√àLE 2 : GRADIENT BOOSTING TREES
gbt = GBTClassifier(
    featuresCol="features", 
    labelCol="Late_delivery_risk",
    maxIter=100,
    maxDepth=5,
    maxBins=2000,  # ‚Üê AUGMENT√â √©galement
    stepSize=0.1,
    seed=42
)

# MOD√àLE 3 : NEURAL NETWORK
# Pour MLP, on a besoin de compter les features apr√®s OneHotEncoding
# On va le calculer dynamiquement apr√®s le preprocessing
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="Late_delivery_risk",
    maxIter=100,
    blockSize=128,
    seed=42
)


# ====================================================================
# SPLIT TRAIN/TEST
# ====================================================================

train_data, test_data = df_clean.randomSplit([0.8, 0.2], seed=42)

print(f"\nDonn√©es d'entra√Ænement: {train_data.count()} lignes")
print(f"Donn√©es de test: {test_data.count()} lignes")


# ====================================================================
# FONCTION POUR ENTRA√éNER ET √âVALUER UN MOD√àLE
# ====================================================================

def entrainer_et_evaluer(model, nom_modele, use_onehot=False):
    print(f"\n{'='*60}")
    print(f"MOD√àLE : {nom_modele}")
    print(f"{'='*60}")
    
    # Choisir le bon assembler
    if use_onehot:
        assembler_final = assembler_final_mlp
        stages = [*indexers, *encoders, assembler_num, scaler, assembler_final, model]
    else:
        assembler_final = assembler_final_trees
        stages = [*indexers, assembler_num, scaler, assembler_final, model]
    
    # Cr√©er le pipeline
    pipeline = Pipeline(stages=stages)
    
    # Entra√Æner
    print(f"‚è≥ Entra√Ænement en cours...")
    model_trained = pipeline.fit(train_data)
    print(f"‚úÖ Entra√Ænement termin√© !")
    
    # Pr√©dictions
    predictions_train = model_trained.transform(train_data)
    predictions_test = model_trained.transform(test_data)
    
    # Evaluateurs
    evaluator_auc = BinaryClassificationEvaluator(
        labelCol="Late_delivery_risk",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC"
    )
    
    evaluator_accuracy = MulticlassClassificationEvaluator(
        labelCol="Late_delivery_risk",
        predictionCol="prediction",
        metricName="accuracy"
    )
    
    evaluator_f1 = MulticlassClassificationEvaluator(
        labelCol="Late_delivery_risk",
        predictionCol="prediction",
        metricName="f1"
    )
    
    evaluator_precision = MulticlassClassificationEvaluator(
        labelCol="Late_delivery_risk",
        predictionCol="prediction",
        metricName="weightedPrecision"
    )
    
    evaluator_recall = MulticlassClassificationEvaluator(
        labelCol="Late_delivery_risk",
        predictionCol="prediction",
        metricName="weightedRecall"
    )
    
    # Calculer les m√©triques
    auc_train = evaluator_auc.evaluate(predictions_train)
    auc_test = evaluator_auc.evaluate(predictions_test)
    
    accuracy_train = evaluator_accuracy.evaluate(predictions_train)
    accuracy_test = evaluator_accuracy.evaluate(predictions_test)
    
    f1_train = evaluator_f1.evaluate(predictions_train)
    f1_test = evaluator_f1.evaluate(predictions_test)
    
    precision_test = evaluator_precision.evaluate(predictions_test)
    recall_test = evaluator_recall.evaluate(predictions_test)
    
    # Afficher les r√©sultats
    print(f"\nüìä R√âSULTATS - {nom_modele}")
    print(f"{'-'*60}")
    print(f"TRAIN | AUC: {auc_train:.4f} | Accuracy: {accuracy_train:.4f} | F1: {f1_train:.4f}")
    print(f"TEST  | AUC: {auc_test:.4f} | Accuracy: {accuracy_test:.4f} | F1: {f1_test:.4f}")
    print(f"TEST  | Precision: {precision_test:.4f} | Recall: {recall_test:.4f}")
    
    # Matrice de confusion
    print(f"\nüìà Matrice de confusion - {nom_modele}")
    confusion_matrix = predictions_test.groupBy("Late_delivery_risk", "prediction").count()
    confusion_matrix.orderBy("Late_delivery_risk", "prediction").show()
    
    return {
        'modele': nom_modele,
        'auc_test': auc_test,
        'accuracy_test': accuracy_test,
        'f1_test': f1_test,
        'precision_test': precision_test,
        'recall_test': recall_test,
        'pipeline': model_trained
    }


# ====================================================================
# ENTRA√éNER ET COMPARER TOUS LES MOD√àLES NON-LIN√âAIRES
# ====================================================================

resultats = []

# Random Forest
resultats.append(entrainer_et_evaluer(rf, "Random Forest", use_onehot=False))

# Gradient Boosting
resultats.append(entrainer_et_evaluer(gbt, "Gradient Boosting Trees", use_onehot=False))

# Neural Network (avec OneHot encoding)
# D'abord, configurer les layers correctement
# On va le faire apr√®s avoir compt√© les features
print("\n‚è≥ Configuration du Neural Network...")

# Cr√©er un pipeline temporaire pour compter les features
temp_pipeline = Pipeline(stages=[*indexers, *encoders, assembler_num, scaler, assembler_final_mlp])
temp_model = temp_pipeline.fit(train_data.limit(100))  # Juste pour compter
temp_data = temp_model.transform(train_data.limit(1))
nb_features = temp_data.select("features").first()[0].size

print(f"Nombre de features apr√®s preprocessing: {nb_features}")

# Configurer le MLP avec le bon nombre de features
layers = [nb_features, 128, 64, 2]  # 2 classes
mlp.setLayers(layers)

resultats.append(entrainer_et_evaluer(mlp, "Neural Network (MLP)", use_onehot=True))


# ====================================================================
# COMPARAISON FINALE
# ====================================================================

print(f"\n{'='*80}")
print("COMPARAISON DES MOD√àLES NON-LIN√âAIRES")
print(f"{'='*80}")
print(f"{'Mod√®le':<30} {'AUC':<10} {'Accuracy':<10} {'F1-Score':<10} {'Precision':<10} {'Recall':<10}")
print(f"{'-'*80}")

for r in resultats:
    print(f"{r['modele']:<30} {r['auc_test']:<10.4f} {r['accuracy_test']:<10.4f} {r['f1_test']:<10.4f} {r['precision_test']:<10.4f} {r['recall_test']:<10.4f}")

# Trouver le meilleur mod√®le
meilleur = max(resultats, key=lambda x: x['f1_test'])
print(f"\nüèÜ MEILLEUR MOD√àLE : {meilleur['modele']} (F1-Score: {meilleur['f1_test']:.4f})")
print(f"{'='*80}")

Dataset nettoy√©: 172765 lignes

=== Nombre de valeurs uniques par variable cat√©gorielle ===
Type: 4 valeurs uniques
Shipping Mode: 4 valeurs uniques
Market: 5 valeurs uniques
Customer Segment: 3 valeurs uniques
Order State: 1083 valeurs uniques
Order Region: 23 valeurs uniques
Category Name: 50 valeurs uniques

Donn√©es d'entra√Ænement: 138318 lignes
Donn√©es de test: 34447 lignes

MOD√àLE : Random Forest
‚è≥ Entra√Ænement en cours...


Py4JJavaError: An error occurred while calling o1578.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 20 in stage 259.0 failed 1 times, most recent failure: Lost task 20.0 in stage 259.0 (TID 3162) (DESKTOP-7M3HR4V executor driver): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:653)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:649)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda/0x00000177d26e4560.apply(Unknown Source)
	at scala.Array$.tabulate(Array.scala:441)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:649)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda/0x00000177d2617100.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:866)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:866)
	at org.apache.spark.rdd.RDD$$Lambda/0x00000177d1f32f70.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:107)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda/0x00000177d1e972a0.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.runWith(Thread.java:1596)
	at java.base/java.lang.Thread.run(Thread.java:1583)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2935)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2935)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2927)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1295)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3207)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3141)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3130)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1009)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2484)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2505)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2524)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2549)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1057)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1056)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:740)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:739)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:665)
	at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:210)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:304)
	at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:168)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:226)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:226)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:139)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:47)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:79)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:653)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:649)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda/0x00000177d26e4560.apply(Unknown Source)
	at scala.Array$.tabulate(Array.scala:441)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:649)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda/0x00000177d2617100.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:866)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:866)
	at org.apache.spark.rdd.RDD$$Lambda/0x00000177d1f32f70.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:107)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda/0x00000177d1e972a0.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.runWith(Thread.java:1596)
	... 1 more


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\elkho\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\elkho\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] Une connexion existante a d√ª √™tre ferm√©e par l‚Äôh√¥te distant

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\elkho\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\elkho\AppData\Local\Programs\Python\Python311\Lib\site-package

ConnectionRefusedError: [WinError 10061] Aucune connexion n‚Äôa pu √™tre √©tablie car l‚Äôordinateur cible l‚Äôa express√©ment refus√©e