In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, rand
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.types import DoubleType
import pandas as pd

In [107]:
# √âTAPE 1 : R√©cup√©ration des donn√©es depuis MongoDB
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
db = client["clients_bancaires"]
collection = db["df_clean_collection"]

# exemple r√©cup√©rer seulement 5 documents sans le champ _id
docs = list(collection.find({}, {"_id": 0}).limit(5))



In [108]:

# R√©cup√©rer les documents sans le champ _id
data = list(collection.find({}, {"_id": 0}))
df_pandas = pd.DataFrame(data)

In [109]:
# Nettoyer les donn√©es dans Pandas
df_pandas = df_pandas.drop([ 'CustomerId', 'Surname'], axis=1, errors='ignore')



In [110]:
# Extraire GeographyVec
def extract_geography(vec):
    france = 0.0
    germany = 0.0
    if vec is not None and len(vec) == 3:
        # vec = (taille, indices, valeurs)
        indices = vec[1]
        values = vec[2]
        for idx, val in zip(indices, values):
            if idx == 0:
                france = float(val)
            elif idx == 1:
                germany = float(val)
    return pd.Series([france, germany])

df_pandas[['Geography_France', 'Geography_Germany']] = df_pandas['GeographyVec'].apply(extract_geography)
df_pandas = df_pandas.drop('GeographyVec', axis=1)


In [89]:
df_pandas.head(10)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,GenderIndex,Geography_France,Geography_Germany
0,619,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,0,1.0,0.0,0.0
2,502,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,43,2,125510.82,1,1,1,79084.1,0,1.0,0.0,0.0
5,645,44,8,113755.78,2,1,0,149756.71,1,0.0,0.0,0.0
6,822,50,7,0.0,2,1,1,10062.8,0,0.0,0.0,0.0
7,376,29,4,115046.74,4,1,0,119346.88,1,1.0,0.0,0.0
8,501,44,4,142051.07,2,0,1,74940.5,0,0.0,0.0,0.0
9,684,27,2,134603.88,1,1,1,71725.73,0,0.0,0.0,0.0


In [90]:
import os
from pyspark.sql import SparkSession

# üö®üö®üö® ÿ£ŸàŸÑ ÿ≠ÿßÿ¨ÿ©: ŸÑÿßÿ≤ŸÖ ÿ™ÿπŸàÿ∂ ŸáÿßÿØ ÿßŸÑÿ≥ÿ∑ÿ± ÿ®ÿßŸÑŸÖÿ≥ÿßÿ± ÿßŸÑÿµÿ≠Ÿäÿ≠ ÿØŸäÿßŸÑ ÿßŸÑŸÄ venv ÿßŸÑÿ¨ÿØŸäÿØ üö®üö®üö®
# (ŸÖÿ´ŸÑÿß: C:\Users\elkho\SparkProjects\breif6_...\venv\Scripts\python.exe)
NEW_PYTHON_EXECUTABLE = r'C:\Users\elkho\SparkProjects\breif6_PredictiondelAttritionClientBancaire\venv\Scripts\python.exe'

# 1. ŸÉŸÜÿ≠ÿØÿØŸàÿß ŸÑŸÄ Spark ÿ®ÿßŸÑÿ∂ÿ®ÿ∑ ÿßŸÑÿ®ÿßŸäÿ´ŸàŸÜ ÿßŸÑŸÑŸä ÿÆÿßÿµŸà ŸäÿÆÿØŸÖ ÿ®ŸäŸá
os.environ['PYSPARK_PYTHON'] = NEW_PYTHON_EXECUTABLE
os.environ['PYSPARK_DRIVER_PYTHON'] = NEW_PYTHON_EXECUTABLE

# 2. ŸÉŸÜÿ≤ŸäÿØŸà ÿßŸÑŸÉŸàŸÜŸÅŸäÿ∫Ÿàÿ±ÿßÿ≥ŸäŸàŸÜ ÿØŸäÿßŸÑ ÿßŸÑÿ¥ÿ®ŸÉÿ© Ÿà ÿßŸÑŸÄ Timeout
spark = SparkSession.builder \
    .config("spark.driver.extraJavaOptions", "-Dfile.encoding=UTF-8")\
    .config("spark.driver.host", "127.0.0.1") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.python.worker.timeout", "600") \
    .config("spark.driver.port", "50000") \
    .config("spark.blockManager.port", "50001") \
    .appName("MonProjetSpark") \
    .getOrCreate()

print("SparkSession is running...")

SparkSession is running...


In [1]:
# Avant de cr√©er une nouvelle session, arr√™ter l'ancienne
from pyspark.sql import SparkSession

# Arr√™ter toute session existante
SparkSession.builder.getOrCreate().stop()

# Ou forcer l'arr√™t
import time
try:
    spark.stop()
    time.sleep(2)  # Attendre que les ports se lib√®rent
except:
    pass

# Cr√©er une nouvelle session
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Bancaire pipeline") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "false") \
    .getOrCreate()

In [2]:
# ========================================
# Cr√©er Spark Session


spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Bancaire pipeline") \
    .master("local[*]") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "false") \
    .getOrCreate()

In [96]:
df_pandas_sample = df_pandas.head(5000)
df_pandas.to_csv('../data/df_pandas.csv', index=False)



In [3]:
df_spark = spark.read.csv('../data/df_pandas.csv', header=True, inferSchema=True)
df_spark.show(5)

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-----------+----------------+-----------------+
|CreditScore|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|GenderIndex|Geography_France|Geography_Germany|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-----------+----------------+-----------------+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|        1.0|             0.0|              0.0|
|        608| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|        1.0|             0.0|              0.0|
|        502| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|        1.0|             0.0|              0.0|
|        699| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|        1.0|             0.0|           

In [6]:
# Convertir les colonnes en types num√©riques appropri√©s
numeric_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
                   'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'GenderIndex']
for col_name in numeric_columns:
    df_spark = df_spark.withColumn(col_name, col(col_name).cast(DoubleType()))


In [99]:
df_spark.printSchema()

root
 |-- CreditScore: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: double (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: double (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- IsActiveMember: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: double (nullable = true)
 |-- GenderIndex: double (nullable = true)
 |-- Geography_France: double (nullable = true)
 |-- Geography_Germany: double (nullable = true)



In [7]:
print("Donn√©es r√©cup√©r√©es depuis MongoDB:")
print(f"Nombre total de lignes: {df_spark.count()}")

Donn√©es r√©cup√©r√©es depuis MongoDB:
Nombre total de lignes: 10000


In [8]:
# V√©rifier et g√©rer le d√©s√©quilibre de classes
# Compter combien de clients ont Exited=1 et Exited=0

df_spark.groupBy("Exited").count().show()


+------+-----+
|Exited|count|
+------+-----+
|   0.0| 7963|
|   1.0| 2037|
+------+-----+



In [9]:
# le dataset est d√©s√©quilibr√©, on peut appliquer un sur√©chantillonnage de la classe minoritaire
nombre_exited_1 = df_spark.filter(col("Exited") == 1).count()
nombre_exited_0 = df_spark.filter(col("Exited") == 0).count()

min_count = min(nombre_exited_0, nombre_exited_1)

df_classe_0 = df_spark.filter(col("Exited") == 0).sample(False, min_count / nombre_exited_0, seed=42)
df_classe_1 = df_spark.filter(col("Exited") == 1).sample(False, min_count / nombre_exited_1, seed=42)

df_equilibre = df_classe_0.union(df_classe_1)



In [10]:
# verifierer l'√©quilibre
df_equilibre.groupBy("Exited").count().show()

+------+-----+
|Exited|count|
+------+-----+
|   0.0| 2119|
|   1.0| 2037|
+------+-----+



Ici, j‚Äôai perdu beaucoup d‚Äôinformations avec l‚Äôundersampling,
alors je passe √† une autre m√©thode : j‚Äôai appliqu√© under et over en m√™me temps.

In [11]:
from pyspark.sql.functions import col

# Comptage du nombre d'observations dans chaque classe
nombre_restants = df_spark.filter(col("Exited") == 0).count()  # Clients rest√©s
nombre_sortis = df_spark.filter(col("Exited") == 1).count()    # Clients sortis

# D√©finition du ratio cible d'√©quilibrage (ex : 0.4 = 40% de chaque classe)
ratio_cible = 0.4
total_cible = nombre_restants + nombre_sortis
cible_sortis = int(total_cible * ratio_cible)
cible_restants = cible_sortis  # √©quilibre parfait

# √âchantillonnage des deux classes selon le ratio
classe_restants = df_spark.filter(col("Exited") == 0).sample(False, cible_restants / nombre_restants, seed=42)
classe_sortis = df_spark.filter(col("Exited") == 1).sample(True, cible_sortis / nombre_sortis, seed=42)

# Union des deux classes pour cr√©er un DataFrame √©quilibr√©
df_equilibre = classe_restants.union(classe_sortis)

# V√©rification de l'√©quilibre obtenu
df_equilibre.groupBy("Exited").count().show()


+------+-----+
|Exited|count|
+------+-----+
|   0.0| 4062|
|   1.0| 4019|
+------+-----+



In [12]:
# ============================================
# √âTAPE 6 : D√©finir les colonnes features
# ============================================# Toutes les colonnes num√©riques (features)
colonnes_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
    "GenderIndex",    
    "Geography_France",
    "Geography_Germany" 
]

# Colonne cible
colonne_cible = "Exited"

#Assembler toutes les features dans un vecteur


# Cr√©er le VectorAssembler
assembler = VectorAssembler(
    inputCols=colonnes_features,
    outputCol="features_raw"
)

# Appliquer l'assembler
df_avec_features = assembler.transform(df_equilibre)

# Afficher le r√©sultat
print("Features assembl√©es (premiers exemples):")
df_avec_features.select(colonnes_features + ["features_raw"]).show(5, truncate=False)



Features assembl√©es (premiers exemples):
+-----------+----+------+-------+-------------+---------+--------------+---------------+-----------+----------------+-----------------+------------------------------------------------------+
|CreditScore|Age |Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|GenderIndex|Geography_France|Geography_Germany|features_raw                                          |
+-----------+----+------+-------+-------------+---------+--------------+---------------+-----------+----------------+-----------------+------------------------------------------------------+
|822.0      |50.0|7.0   |0.0    |2.0          |1.0      |1.0           |10062.8        |0.0        |0.0             |0.0              |[822.0,50.0,7.0,0.0,2.0,1.0,1.0,10062.8,0.0,0.0,0.0]  |
|497.0      |24.0|3.0   |0.0    |2.0          |1.0      |0.0           |76390.01       |0.0        |0.0             |0.0              |(11,[0,1,2,4,5,7],[497.0,24.0,3.0,2.0,1.0,76390.01])  |
|63

In [14]:
#  Normaliser les features avec StandardScaler      # Cr√©er le StandardScaler
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,  # Normaliser avec √©cart-type
    withMean=True  # Centrer sur la moyenne
)
# Entra√Æner le scaler sur les donn√©es
scaler_model = scaler.fit(df_avec_features)

# Appliquer la normalisation
df_final = scaler_model.transform(df_avec_features)

# Afficher le r√©sultat
print("Features normalis√©es (premiers exemples):")
df_final.select("features_raw", "features").show(5, truncate=False)


Features normalis√©es (premiers exemples):
+------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features_raw                                          |features                                                                                                                                                                                  |
+------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[822.0,50.0,7.0,0.0,2.0,1.0,1.0,10062.8,0.0,0.0,0.0]  |[1.7554119083328914,0.8301437864465926,0.6765702918234291,-1.3500498587716774,0.7531503687480621,0.6568705569879226,1.097616608426526,-1.581212567159642,-1.0020439117270

In [None]:
# par convention en change le nom de exited to label
df_final = df_final.withColumnRenamed("Exited", "label")

In [24]:
# S√©parer les donn√©es en train (80%) et test (20%)
train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=42)

print(f"jeu d'entra√Ænement: {train_data.count()}   " )
print(f"jeu de test: {test_data.count()} ")

# V√©rifier la distribution dans chaque set
print("Distribution dans train_data:")
train_data.groupBy("Exited").count().show()

print("Distribution dans test_data:")
test_data.groupBy("Exited").count().show()


jeu d'entra√Ænement: 6535   
jeu de test: 1546 
Distribution dans train_data:
+------+-----+
|Exited|count|
+------+-----+
|   0.0| 3308|
|   1.0| 3227|
+------+-----+

Distribution dans test_data:
+------+-----+
|Exited|count|
+------+-----+
|   0.0|  754|
|   1.0|  792|
+------+-----+



In [25]:
# √âTAPE 6.8 : Choisir et cr√©er le mod√®le MLlib
# Teste 3 mod√®les MLlib :
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    GBTClassifier
)

## PIPLELINE

In [26]:
# ============================================
# PIPELINE 
# Pr√©diction de l'attrition bancaire
# ============================================

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler, StandardScaler

# ============================================
# √âTAPE 6 : CONSTRUCTION DU PIPELINE
# ============================================

print("=" * 60)
print("√âTAPE 6 : Construction du Pipeline ML")
print("=" * 60)

# 1. D√©finir les colonnes features (d√©j√† fait dans votre code)
colonnes_features = [
    "CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
    "HasCrCard", "IsActiveMember", "EstimatedSalary", "GenderIndex",
    "Geography_France", "Geography_Germany"
]

# 2. Cr√©er les √©tapes du pipeline (transformations)
# √âtape 1 : Assembler les features en un seul vecteur
assembler = VectorAssembler(
    inputCols=colonnes_features,
    outputCol="features_raw"  # Vecteur brut avant normalisation
)

# √âtape 2 : Normaliser les features (StandardScaler)
scaler = StandardScaler(
    inputCol="features_raw",   # Entr√©e : vecteur brut
    outputCol="features",       # Sortie : vecteur normalis√©
    withStd=True,               # Diviser par l'√©cart-type
    withMean=True               # Centrer sur la moyenne
)

# 3. Renommer la colonne cible en "label" (obligatoire pour MLlib)
df_prepared = df_equilibre.withColumnRenamed("Exited", "label")

# 4. S√©parer les donn√©es : 80% train, 20% test
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# IMPORTANT : Cacher les donn√©es car elles seront utilis√©es plusieurs fois
train_data.cache()
test_data.cache()

print(f"‚úÖ Donn√©es d'entra√Ænement : {train_data.count()} lignes")
print(f"‚úÖ Donn√©es de test : {test_data.count()} lignes")

# V√©rifier la distribution des classes
print("\nüìä Distribution dans train_data:")
train_data.groupBy("label").count().show()

print("üìä Distribution dans test_data:")
test_data.groupBy("label").count().show()

√âTAPE 6 : Construction du Pipeline ML
‚úÖ Donn√©es d'entra√Ænement : 6535 lignes
‚úÖ Donn√©es de test : 1546 lignes

üìä Distribution dans train_data:
+-----+-----+
|label|count|
+-----+-----+
|  0.0| 3308|
|  1.0| 3227|
+-----+-----+

üìä Distribution dans test_data:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  754|
|  1.0|  792|
+-----+-----+



## ENTRAINER 

In [31]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# ============================================
# √âTAPE 7 : ENTRA√éNEMENT ET VALIDATION CROIS√âE
# ============================================

print("\n" + "=" * 60)
print("√âTAPE 7 : Entra√Ænement des Mod√®les")
print("=" * 60)

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)
gbt = GBTClassifier(featuresCol="features", labelCol="label", seed=42)

# Liste des mod√®les avec leurs configurations
modeles = [
    {
        "nom": "Logistic Regression",
        "modele": LogisticRegression(featuresCol="features", labelCol="label", maxIter=10),
        "param_grid": ParamGridBuilder()
            .addGrid(LogisticRegression().regParam, [0.01, 0.1])  # ‚ö†Ô∏è Correction ici
            .addGrid(LogisticRegression().elasticNetParam, [0.0, 0.5])
            .build()
    },
    {
        "nom": "Random Forest",
        "modele": RandomForestClassifier(featuresCol="features", labelCol="label", seed=42),
        "param_grid": ParamGridBuilder()
            .addGrid(RandomForestClassifier().numTrees, [50, 100])  # ‚ö†Ô∏è Correction ici
            .addGrid(RandomForestClassifier().maxDepth, [5, 10])
            .build()
    },
    {
        "nom": "Gradient Boosted Trees",
        "modele": GBTClassifier(featuresCol="features", labelCol="label", seed=42),
        "param_grid": ParamGridBuilder()
            .addGrid(GBTClassifier().maxIter, [50, 100])  # ‚ö†Ô∏è Correction ici
            .addGrid(GBTClassifier().maxDepth, [3, 5])
            .build()
    }
]

# √âvaluateur commun pour tous les mod√®les
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"  # M√©trique principale
)

# Dictionnaire pour stocker les mod√®les entra√Æn√©s
modeles_entraines = {}  # ‚ö†Ô∏è Correction orthographe

# Boucle d'entra√Ænement pour chaque mod√®le
for m in modeles:
    print(f"\nüîπ Mod√®le : {m['nom']}")
    
    # Cr√©er le pipeline complet (assemblage + normalisation + mod√®le)
    pipeline = Pipeline(stages=[assembler, scaler, m["modele"]])
    
    # Cr√©er le CrossValidator (validation crois√©e)
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=m["param_grid"],
        evaluator=evaluator,
        numFolds=3,  # 3-fold cross validation
        seed=42
    )
    
    # Entra√Æner le mod√®le
    print(f"‚è≥ Entra√Ænement en cours...")
    modele_entraine = cv.fit(train_data)
    print("‚úÖ Entra√Ænement termin√©!")
    
    # Sauvegarder le mod√®le entra√Æn√©
    modeles_entraines[m["nom"]] = modele_entraine

print("\n‚úÖ Tous les mod√®les ont √©t√© entra√Æn√©s avec succ√®s!")


√âTAPE 7 : Entra√Ænement des Mod√®les

üîπ Mod√®le : Logistic Regression
‚è≥ Entra√Ænement en cours...
‚úÖ Entra√Ænement termin√©!

üîπ Mod√®le : Random Forest
‚è≥ Entra√Ænement en cours...
‚úÖ Entra√Ænement termin√©!

üîπ Mod√®le : Gradient Boosted Trees
‚è≥ Entra√Ænement en cours...
‚úÖ Entra√Ænement termin√©!

‚úÖ Tous les mod√®les ont √©t√© entra√Æn√©s avec succ√®s!


In [32]:
# ============================================
# √âTAPE 8 : √âVALUATION DES MOD√àLES
# ============================================

print("\n" + "=" * 60)
print("√âTAPE 8 : √âvaluation des Mod√®les")
print("=" * 60)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fonction pour √©valuer un mod√®le
def evaluer_modele(nom_modele, modele, test_data):
    """
    Returns:
        dict: Dictionnaire avec toutes les m√©triques
    """
    print(f"\nüìä √âvaluation : {nom_modele}")
    print("-" * 50)
    
    # 1. FAIRE LES PR√âDICTIONS sur le test set
    predictions = modele.transform(test_data)
    
    # 2. CALCULER LES M√âTRIQUES
    
    # AUC-ROC (m√©trique principale pour classification binaire)
    evaluator_auc = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC"
    )
    auc = evaluator_auc.evaluate(predictions)
    
    # Accuracy (Pr√©cision globale)
    evaluator_acc = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy"
    )
    accuracy = evaluator_acc.evaluate(predictions)
    
    # Precision (Pr√©cision par classe)
    evaluator_precision = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="weightedPrecision"
    )
    precision = evaluator_precision.evaluate(predictions)
    
    # Recall (Rappel)
    evaluator_recall = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="weightedRecall"
    )
    recall = evaluator_recall.evaluate(predictions)
    
    # F1-Score (Moyenne harmonique de Precision et Recall)
    evaluator_f1 = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="f1"
    )
    f1 = evaluator_f1.evaluate(predictions)
    
    # 3. AFFICHER LES R√âSULTATS
    print(f"  AUC-ROC    : {auc:.4f}")
    print(f"  Accuracy   : {accuracy:.4f}")
    print(f"  Precision  : {precision:.4f}")
    print(f"  Recall     : {recall:.4f}")
    print(f"  F1-Score   : {f1:.4f}")
    
    # 4. MATRICE DE CONFUSION
    print("\n  üìã Matrice de Confusion:")
    confusion_matrix = predictions.groupBy("label", "prediction").count()
    confusion_matrix.orderBy("label", "prediction").show()
    
    # Retourner les m√©triques
    return {
        "nom": nom_modele,
        "auc": auc,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# ---------------------------------------------
# √âVALUER TOUS LES MOD√àLES
# ---------------------------------------------

resultats = []

for nom_modele, modele in modeles_entraines.items():
    metrics = evaluer_modele(nom_modele, modele, test_data)
    resultats.append(metrics)

# ---------------------------------------------
# COMPARAISON FINALE
# ---------------------------------------------

print("\n" + "=" * 60)
print("üìà R√âCAPITULATIF DES PERFORMANCES")
print("=" * 60)

# Afficher un tableau comparatif
print(f"\n{'Mod√®le':<30} {'AUC-ROC':<10} {'Accuracy':<10} {'F1-Score':<10}")
print("-" * 60)
for r in resultats:
    print(f"{r['nom']:<30} {r['auc']:<10.4f} {r['accuracy']:<10.4f} {r['f1']:<10.4f}")

# Trouver le meilleur mod√®le (bas√© sur AUC-ROC)
meilleur = max(resultats, key=lambda x: x['auc'])
print("\n" + "=" * 60)
print(f"üèÜ MEILLEUR MOD√àLE : {meilleur['nom']}")
print(f"   AUC-ROC = {meilleur['auc']:.4f}")
print("=" * 60)



√âTAPE 8 : √âvaluation des Mod√®les

üìä √âvaluation : Logistic Regression
--------------------------------------------------
  AUC-ROC    : 0.7542
  Accuracy   : 0.6889
  Precision  : 0.6911
  Recall     : 0.6889
  F1-Score   : 0.6886

  üìã Matrice de Confusion:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0|  548|
|  0.0|       1.0|  206|
|  1.0|       0.0|  275|
|  1.0|       1.0|  517|
+-----+----------+-----+


üìä √âvaluation : Random Forest
--------------------------------------------------
  AUC-ROC    : 0.8470
  Accuracy   : 0.7600
  Precision  : 0.7644
  Recall     : 0.7600
  F1-Score   : 0.7595

  üìã Matrice de Confusion:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0|  614|
|  0.0|       1.0|  140|
|  1.0|       0.0|  231|
|  1.0|       1.0|  561|
+-----+----------+-----+


üìä √âvaluation : Gradient Boosted Trees
--------------------------------------------------
  AUC-ROC  

In [37]:
# ============================================
# √âTAPE 9 : SAUVEGARDE DU MOD√àLE
# ============================================

print("\n" + "=" * 60)
print("√âTAPE 9 : Sauvegarde du Mod√®le")
print("=" * 60)

# R√©cup√©rer le meilleur mod√®le
best_model = modeles_entraines[meilleur['nom']]

# Chemin de sauvegarde
chemin_sauvegarde = "../models/best_model_attrition"

# Sauvegarder le mod√®le
best_model = best_model.bestModel
try:
    best_model.write().overwrite().save(chemin_sauvegarde)
    print(f"‚úÖ Mod√®le sauvegard√© dans : {chemin_sauvegarde}")
    print(f"   Mod√®le : {meilleur['nom']}")
    print(f"   Performance (AUC-ROC) : {meilleur['auc']:.4f}")
except Exception as e:
    print(f"‚ùå Erreur lors de la sauvegarde : {e}")


# Lib√©rer la m√©moire cache
print("\nüßπ Nettoyage de la m√©moire...")
train_data.unpersist()
test_data.unpersist()

print("\n" + "=" * 60)
print("‚úÖ PIPELINE TERMIN√â AVEC SUCC√àS!")
print("=" * 60)
print(f"\nüìä R√©sum√© final :")
print(f"   - {len(modeles_entraines)} mod√®les entra√Æn√©s")
print(f"   - Meilleur mod√®le : {meilleur['nom']}")
print(f"   - AUC-ROC : {meilleur['auc']:.4f}")
print(f"   - Mod√®le sauvegard√© : {chemin_sauvegarde}")



√âTAPE 9 : Sauvegarde du Mod√®le
‚ùå Erreur lors de la sauvegarde : An error occurred while calling o57469.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:789)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:298)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:314)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:1116)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:798)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:838)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:810)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:837)
	at o

In [39]:
import os

print("\n" + "=" * 60)
print("√âTAPE 9 : Sauvegarde du DataFrame final en CSV")
print("=" * 60)

# Cr√©er le dossier ../models s'il n'existe pas
chemin_dossier = "../models"
os.makedirs(chemin_dossier, exist_ok=True)

# Chemin complet du CSV
chemin_csv = os.path.join(chemin_dossier, "best_model_attrition.csv")

# Conversion en pandas et sauvegarde
try:
    df_final_pd = df_final.toPandas()
    df_final_pd.to_csv(chemin_csv, index=False)
    print(f"‚úÖ DataFrame sauvegard√© en CSV : {chemin_csv}")
except Exception as e:
    print(f"‚ùå Erreur lors de la sauvegarde CSV : {e}")

# Nettoyage m√©moire
print("\nüßπ Nettoyage de la m√©moire...")
train_data.unpersist()
test_data.unpersist()

print("\n" + "=" * 60)
print("‚úÖ PIPELINE TERMIN√â AVEC SUCC√àS!")
print("=" * 60)
print(f"\nüìä R√©sum√© final :")
print(f"   - {len(modeles_entraines)} mod√®les entra√Æn√©s")  # v√©rifie le nom exact du dictionnaire
print(f"   - Meilleur mod√®le : {meilleur['nom']}")
print(f"   - AUC-ROC : {meilleur['auc']:.4f}")
print(f"   - DataFrame sauvegard√© en CSV : {chemin_csv}")



√âTAPE 9 : Sauvegarde du DataFrame final en CSV
‚úÖ DataFrame sauvegard√© en CSV : ../models\best_model_attrition.csv

üßπ Nettoyage de la m√©moire...

‚úÖ PIPELINE TERMIN√â AVEC SUCC√àS!

üìä R√©sum√© final :
   - 3 mod√®les entra√Æn√©s
   - Meilleur mod√®le : Gradient Boosted Trees
   - AUC-ROC : 0.8652
   - DataFrame sauvegard√© en CSV : ../models\best_model_attrition.csv
