In [1]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [3]:
# 1️⃣ Initialiser Spark
spark = SparkSession.builder.appName("HiggsBosonClassification").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/08 16:58:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# 2️⃣ Charger les données (⚠️ Les fichiers sont compressés .gz)
train_path = "HIGGS-Train.csv"
test_path = "HIGGS-Test.csv"

In [5]:
train_df = spark.read.csv(train_path, header=False, inferSchema=True)
test_df = spark.read.csv(test_path, header=False, inferSchema=True)

                                                                                

In [6]:
train_df.show(5)


25/03/08 17:00:38 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---+------------------+-------------------+-------------------+------------------+-------------------+------------------+--------------------+-------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+--------------------+-------------------+-----------------+-------------------+--------------------+--------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|_c0|               _c1|                _c2|                _c3|               _c4|                _c5|               _c6|                 _c7|                _c8|               _c9|              _c10|               _c11|               _c12|              _c13|              _c14|                _c15|               _c16|             _c17|               _c18|                _c19|                _c20|             _c21|              _c22|           

In [7]:
# 3️⃣ Renommer les colonnes (Aucun header dans les fichiers CSV)
col_names = ["label"] + [f"feature_{i}" for i in range(1, 29)]
train_df = train_df.toDF(*col_names)
test_df = test_df.toDF(*col_names)

In [8]:
# 4️⃣ Assembler les caractéristiques en un seul vecteur
feature_cols = [f"feature_{i}" for i in range(1, 29)]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_df = assembler.transform(train_df).select("features", "label")
test_df = assembler.transform(test_df).select("features", "label")

In [9]:
# 5️⃣ Séparer le jeu d'entraînement (80%) et de validation (20%)
train_data, val_data = train_df.randomSplit([0.8, 0.2], seed=42)

In [10]:
# 6️⃣ Définir les modèles de classification
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=20)
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=50)


In [11]:
# 7️⃣ Définir l'évaluateur (ROC)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")


In [12]:
# 8️⃣ Définir la validation croisée et grille d’hyperparamètres pour Random Forest
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 50, 100]).build()
crossval = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)


In [14]:
# 9️⃣ Entraîner les modèles
rf_model = crossval.fit(train_data)
lr_model = lr.fit(train_data)
gbt_model = gbt.fit(train_data)

25/03/08 17:04:59 WARN CacheManager: Asked to cache already cached data.
25/03/08 17:04:59 WARN CacheManager: Asked to cache already cached data.
25/03/08 17:07:45 WARN MemoryStore: Not enough space to cache rdd_484_5 in memory! (computed 32.2 MiB so far)
25/03/08 17:07:45 WARN MemoryStore: Not enough space to cache rdd_484_2 in memory! (computed 32.2 MiB so far)
25/03/08 17:07:45 WARN BlockManager: Persisting block rdd_484_5 to disk instead.
25/03/08 17:07:45 WARN BlockManager: Persisting block rdd_484_2 to disk instead.
25/03/08 17:07:45 WARN MemoryStore: Not enough space to cache rdd_484_7 in memory! (computed 32.2 MiB so far)
25/03/08 17:07:45 WARN BlockManager: Persisting block rdd_484_7 to disk instead.
25/03/08 17:07:45 WARN MemoryStore: Not enough space to cache rdd_484_4 in memory! (computed 32.2 MiB so far)
25/03/08 17:07:45 WARN MemoryStore: Not enough space to cache rdd_484_1 in memory! (computed 32.2 MiB so far)
25/03/08 17:07:45 WARN BlockManager: Persisting block rdd_484

In [15]:
# 🔟 Évaluer les modèles sur le jeu de validation
rf_auc = evaluator.evaluate(rf_model.transform(val_data))
lr_auc = evaluator.evaluate(lr_model.transform(val_data))
gbt_auc = evaluator.evaluate(gbt_model.transform(val_data))

print(f"AUC (Random Forest) : {rf_auc:.4f}")
print(f"AUC (Logistic Regression) : {lr_auc:.4f}")
print(f"AUC (Gradient Boosting) : {gbt_auc:.4f}")

                                                                                

AUC (Random Forest) : 0.7427
AUC (Logistic Regression) : 0.6843
AUC (Gradient Boosting) : 0.7959


In [16]:
# 🔟 Enregistrer le meilleur modèle
best_model = rf_model.bestModel
best_model.write().overwrite().save("best_rf_model")

                                                                                

In [17]:
# 🚀 Tester le meilleur modèle sur le jeu de test
test_results = best_model.transform(test_df)
test_auc = evaluator.evaluate(test_results)
print(f"AUC sur le test (Meilleur Random Forest) : {test_auc:.4f}")




AUC sur le test (Meilleur Random Forest) : 0.7427


                                                                                

In [18]:
# 📌 Fermer Spark
spark.stop()