In [1]:
# Initialisation des librairies pyspark

# Initialisation de Spark
import pyspark

from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql.types import FloatType
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
#import pandas as pd

from pyspark import SparkConf, SparkContext, SQLContext

import findspark
findspark.init()

spark = SparkSession \
    .builder \
    .appName("MNIST Classifier") \
    .config('spark.sql.warehouse.dir', 'heart.csv') \
    .config('spark.executor.instances', 10) \
    .getOrCreate()


In [2]:
# Lecture du fichier mnist (partie train)


fileNameTrain = 'heart.csv'
mnist_train = spark.read.csv(fileNameTrain, header=True)

# Lecture du fichier mnist (partie test)
fileNameTest = 'heart.csv'
mnist_test = spark.read.csv(fileNameTest, header=True)

In [3]:
numeric_columns = ['ages', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
for col_name in numeric_columns:
    mnist_train = mnist_train.withColumn(col_name, col(col_name).cast(FloatType()))


#afficher le schema
mnist_train.printSchema()




root
 |-- ages: float (nullable = true)
 |-- sex: float (nullable = true)
 |-- cp: float (nullable = true)
 |-- trestbps: float (nullable = true)
 |-- chol: float (nullable = true)
 |-- fbs: float (nullable = true)
 |-- restecg: float (nullable = true)
 |-- thalach: float (nullable = true)
 |-- exang: float (nullable = true)
 |-- oldpeak: float (nullable = true)
 |-- slope: float (nullable = true)
 |-- ca: float (nullable = true)
 |-- thal: float (nullable = true)
 |-- target: float (nullable = true)



In [4]:
# Liste des colonnes à convertir en types numériques
numeric_columns_test = ['ages', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

# Convertir les colonnes en types numériques
for col_name in numeric_columns_test:
    mnist_test = mnist_test.withColumn(col_name, col(col_name).cast(FloatType()))

# Afficher le nouveau schéma
print("Schema of mnist_test after conversion:")
mnist_test.printSchema()

Schema of mnist_test after conversion:
root
 |-- ages: float (nullable = true)
 |-- sex: float (nullable = true)
 |-- cp: float (nullable = true)
 |-- trestbps: float (nullable = true)
 |-- chol: float (nullable = true)
 |-- fbs: float (nullable = true)
 |-- restecg: float (nullable = true)
 |-- thalach: float (nullable = true)
 |-- exang: float (nullable = true)
 |-- oldpeak: float (nullable = true)
 |-- slope: float (nullable = true)
 |-- ca: float (nullable = true)
 |-- thal: float (nullable = true)
 |-- target: float (nullable = true)



In [5]:

# Transform les colonnes ZxY to one column named "features"
# création des colonnes "label" et "features"
labeledPoints = Myassembler.transform(mnist_train)
labeledPointsTest = assembler1.transform(mnist_test)

print(labeledPoints)

NameError: name 'Myassembler' is not defined

In [None]:
from pyspark.ml.feature import VectorAssembler

# Liste des colonnes à assembler (toutes sauf "target")
input_cols = [col_name for col_name in mnist_train.columns if col_name != 'target']

# Création du VectorAssembler
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features"
)

# Transformation sur le jeu de données d'entraînement
labeledPoints = assembler.transform(mnist_train)

# Création des colonnes "label" et "features" pour le jeu de données d'entraînement
labeledPoints = labeledPoints.select([ 'target', 'features'])

# Transformation sur le jeu de données de test
labeledPointsTest = assembler.transform(mnist_test)

# Création des colonnes "label" et "features" pour le jeu de données de test
labeledPointsTest = labeledPointsTest.select([ 'target', 'features'])

# Afficher les résultats
print("Labeled Points (Training Set):")
labeledPoints.show(truncate=False)

print("Labeled Points Test (Test Set):")
labeledPointsTest.show(truncate=False)


Labeled Points (Training Set):
+------+---------------------------------------------------------------------------+
|target|features                                                                   |
+------+---------------------------------------------------------------------------+
|0.0   |[52.0,1.0,0.0,125.0,212.0,0.0,1.0,168.0,0.0,1.0,2.0,2.0,3.0]               |
|0.0   |[53.0,1.0,0.0,140.0,203.0,1.0,0.0,155.0,1.0,3.0999999046325684,0.0,0.0,3.0]|
|0.0   |[70.0,1.0,0.0,145.0,174.0,0.0,1.0,125.0,1.0,2.5999999046325684,0.0,0.0,3.0]|
|0.0   |[61.0,1.0,0.0,148.0,203.0,0.0,1.0,161.0,0.0,0.0,2.0,1.0,3.0]               |
|0.0   |[62.0,0.0,0.0,138.0,294.0,1.0,1.0,106.0,0.0,1.899999976158142,1.0,3.0,2.0] |
|1.0   |(13,[0,3,4,7,9,10,12],[58.0,100.0,248.0,122.0,1.0,1.0,2.0])                |
|0.0   |[58.0,1.0,0.0,114.0,318.0,0.0,2.0,140.0,0.0,4.400000095367432,0.0,3.0,1.0] |
|0.0   |[55.0,1.0,0.0,160.0,289.0,0.0,0.0,145.0,1.0,0.800000011920929,1.0,1.0,3.0] |
|0.0   |[46.0,1.0,0.0,120.0,249.0,

In [None]:
labeledPoints.printSchema()
labeledPoints.show()

root
 |-- target: float (nullable = true)
 |-- features: vector (nullable = true)

+------+--------------------+
|target|            features|
+------+--------------------+
|   0.0|[52.0,1.0,0.0,125...|
|   0.0|[53.0,1.0,0.0,140...|
|   0.0|[70.0,1.0,0.0,145...|
|   0.0|[61.0,1.0,0.0,148...|
|   0.0|[62.0,0.0,0.0,138...|
|   1.0|(13,[0,3,4,7,9,10...|
|   0.0|[58.0,1.0,0.0,114...|
|   0.0|[55.0,1.0,0.0,160...|
|   0.0|[46.0,1.0,0.0,120...|
|   0.0|[54.0,1.0,0.0,122...|
|   1.0|[71.0,0.0,0.0,112...|
|   0.0|[43.0,0.0,0.0,132...|
|   1.0|[34.0,0.0,1.0,118...|
|   0.0|[51.0,1.0,0.0,140...|
|   0.0|[52.0,1.0,0.0,128...|
|   1.0|[34.0,0.0,1.0,118...|
|   1.0|[51.0,0.0,2.0,140...|
|   0.0|[54.0,1.0,0.0,124...|
|   1.0|[50.0,0.0,1.0,120...|
|   1.0|[58.0,1.0,2.0,140...|
+------+--------------------+
only showing top 20 rows



In [None]:
from pyspark.sql.functions import when

# Liste des colonnes à assembler (toutes sauf "target")
input_cols = [col_name for col_name in mnist_train.columns if col_name != 'target']

# Création du VectorAssembler
assembler1 = VectorAssembler(
    inputCols=input_cols,
    outputCol="features"
)

# Transformation sur le jeu de données d'entraînement
labeledPoints = assembler1.transform(mnist_train)

# Création de la colonne "labelIndex"
labeledPoints = labeledPoints.withColumn("labelIndex", when(col("target") == 0.0, 0).otherwise(1))

# Création des colonnes "target", "features" et "labelIndex" pour le jeu de données d'entraînement
labeledPoints = labeledPoints.select(['target', 'features', 'labelIndex'])

# Transformation sur le jeu de données de test
labeledPointsTest = assembler1.transform(mnist_test)

# Création de la colonne "labelIndex" pour le jeu de données de test
labeledPointsTest = labeledPointsTest.withColumn("labelIndex", when(col("target") == 0.0, 0).otherwise(1))

# Création des colonnes "target", "features" et "labelIndex" pour le jeu de données de test
labeledPointsTest = labeledPointsTest.select(['target', 'features', 'labelIndex'])

# Afficher les résultats
print("Labeled Points (Training Set):")
labeledPoints.show(truncate=False)

print("Labeled Points Test (Test Set):")
labeledPointsTest.show(truncate=False)


Labeled Points (Training Set):
+------+---------------------------------------------------------------------------+----------+
|target|features                                                                   |labelIndex|
+------+---------------------------------------------------------------------------+----------+
|0.0   |[52.0,1.0,0.0,125.0,212.0,0.0,1.0,168.0,0.0,1.0,2.0,2.0,3.0]               |0         |
|0.0   |[53.0,1.0,0.0,140.0,203.0,1.0,0.0,155.0,1.0,3.0999999046325684,0.0,0.0,3.0]|0         |
|0.0   |[70.0,1.0,0.0,145.0,174.0,0.0,1.0,125.0,1.0,2.5999999046325684,0.0,0.0,3.0]|0         |
|0.0   |[61.0,1.0,0.0,148.0,203.0,0.0,1.0,161.0,0.0,0.0,2.0,1.0,3.0]               |0         |
|0.0   |[62.0,0.0,0.0,138.0,294.0,1.0,1.0,106.0,0.0,1.899999976158142,1.0,3.0,2.0] |0         |
|1.0   |(13,[0,3,4,7,9,10,12],[58.0,100.0,248.0,122.0,1.0,1.0,2.0])                |1         |
|0.0   |[58.0,1.0,0.0,114.0,318.0,0.0,2.0,140.0,0.0,4.400000095367432,0.0,3.0,1.0] |0         |
|0.0   |[

In [None]:
print(labeledPoints.head())

Row(target=0.0, features=DenseVector([52.0, 1.0, 0.0, 125.0, 212.0, 0.0, 1.0, 168.0, 0.0, 1.0, 2.0, 2.0, 3.0]), labelIndex=0)


In [None]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline

# Définition des classifieurs
classifiers = [
    RandomForestClassifier(labelCol="labelIndex", featuresCol="features", impurity='gini', maxBins=32),
    LogisticRegression(labelCol="labelIndex", featuresCol="features", maxIter=10, regParam=0.01)
]

# Utilisation d'un classifieur spécifique (Logistic Regression dans cet exemple)
classifier1 = LogisticRegression(labelCol="labelIndex", featuresCol="features", maxIter=10, regParam=0.01)

# Utilisation de la pipeline pour entraîner le classifieur
pipeline1 = Pipeline(stages=[classifier1])



model1= pipeline1.fit(labeledPoints)


# Utilisation du modèle pour prédire sur le jeu de données de test
predictions1 = model1.transform(labeledPointsTest)


# Affichage des résultats
predictions1.select("target", "prediction", "probability").show(truncate=False)



+------+----------+------------------------------------------+
|target|prediction|probability                               |
+------+----------+------------------------------------------+
|0.0   |0.0       |[0.7570777301568339,0.24292226984316612]  |
|0.0   |0.0       |[0.975711511757866,0.024288488242134032]  |
|0.0   |0.0       |[0.9768891656887262,0.02311083431127381]  |
|0.0   |0.0       |[0.62681713490892,0.37318286509108]       |
|0.0   |0.0       |[0.9163792011587731,0.08362079884122686]  |
|1.0   |1.0       |[0.28851267623393156,0.7114873237660684]  |
|0.0   |0.0       |[0.96979114842567,0.030208851574329953]   |
|0.0   |0.0       |[0.9720746843238539,0.02792531567614609]  |
|0.0   |0.0       |[0.6457657271198721,0.3542342728801279]   |
|0.0   |0.0       |[0.9901497950145989,0.009850204985401123] |
|1.0   |1.0       |[0.23961863871046962,0.7603813612895304]  |
|0.0   |0.0       |[0.9146923909438363,0.08530760905616375]  |
|1.0   |1.0       |[0.014250650793847394,0.985749349206

In [None]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline

# Définition des classifieurs
classifiers = [
    RandomForestClassifier(labelCol="labelIndex", featuresCol="features", impurity='gini', maxBins=32),
    LogisticRegression(labelCol="labelIndex", featuresCol="features", maxIter=10, regParam=0.01)
]
classifier2 = RandomForestClassifier(labelCol="labelIndex", featuresCol="features", impurity='gini', maxBins=32)
pipeline2 =Pipeline(stages=[classifier2])
model2= pipeline2.fit(labeledPoints)
predictions2 = model2.transform(labeledPointsTest)
predictions2.select("target", "prediction", "probability").show(truncate=False)

+------+----------+-----------------------------------------+
|target|prediction|probability                              |
+------+----------+-----------------------------------------+
|0.0   |0.0       |[0.8367688273975743,0.16323117260242576] |
|0.0   |0.0       |[0.9468543319767099,0.05314566802329009] |
|0.0   |0.0       |[0.9475229280513927,0.052477071948607255]|
|0.0   |0.0       |[0.848547292445587,0.15145270755441292]  |
|0.0   |0.0       |[0.8791873831854505,0.1208126168145495]  |
|1.0   |1.0       |[0.22536442736017714,0.7746355726398229] |
|0.0   |0.0       |[0.8899907851219911,0.11000921487800883] |
|0.0   |0.0       |[0.9652332316660865,0.03476676833391347] |
|0.0   |0.0       |[0.7467508774390418,0.2532491225609582]  |
|0.0   |0.0       |[0.958123861173131,0.041876138826868956] |
|1.0   |1.0       |[0.27006108254875727,0.7299389174512427] |
|0.0   |0.0       |[0.9660862701604153,0.033913729839584794]|
|1.0   |1.0       |[0.053693303490198206,0.9463066965098017]|
|0.0   |

In [None]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Définition des classifieurs
gbt = GBTClassifier(labelCol="labelIndex", featuresCol="features", maxDepth=5, maxBins=32, maxIter=10)

# Création du pipeline avec XGBoost
pipeline_xgboost = Pipeline(stages=[
    gbt
])

# Entraînement du modèle
model_xgboost = pipeline_xgboost.fit(labeledPoints)

# Prédictions sur l'ensemble de test
predictions_xgboost = model_xgboost.transform(labeledPointsTest)

predictionsAndLabels_xgboost=predictions_xgboost.select("target", "prediction", "probability")
# Affichage des résultats
predictionsAndLabels_xgboost.show(truncate=False)


+------+----------+-----------------------------------------+
|target|prediction|probability                              |
+------+----------+-----------------------------------------+
|0.0   |0.0       |[0.9327732103219774,0.06722678967802265] |
|0.0   |0.0       |[0.9337480394876319,0.06625196051236815] |
|0.0   |0.0       |[0.9337480394876319,0.06625196051236815] |
|0.0   |0.0       |[0.9151667804874724,0.08483321951252765] |
|0.0   |0.0       |[0.9153216824483899,0.08467831755161015] |
|1.0   |1.0       |[0.08813671429720109,0.911863285702799]  |
|0.0   |0.0       |[0.926436855501563,0.07356314449843704]  |
|0.0   |0.0       |[0.9392217445976794,0.060778255402320625]|
|0.0   |0.0       |[0.8849706282043883,0.11502937179561168] |
|0.0   |0.0       |[0.9334096983039707,0.06659030169602931] |
|1.0   |1.0       |[0.08813671429720109,0.911863285702799]  |
|0.0   |0.0       |[0.9424718489617971,0.05752815103820286] |
|1.0   |1.0       |[0.06181442813281981,0.9381855718671802] |
|0.0   |

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Évaluateur d'accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_accuracy.evaluate(predictionAndLabels_xgboost)
print("Test set accuracy = " + str(accuracy))

# Évaluateur de F1
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(predictionAndLabels_xgboost)
print("Test set f1 = " + str(f1))

# Évaluateur de weightedPrecision
evaluator_precision = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictionAndLabels_xgboost)
print("Test set weightedPrecision = " + str(precision))

# Évaluateur de weightedRecall
evaluator_recall = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictionAndLabels_xgboost)
print("Test set weightedRecall = " + str(recall))


Test set accuracy = 0.9902439024390244
Test set f1 = 0.9902427499215545
Test set weightedPrecision = 0.9902717929409139
Test set weightedRecall = 0.9902439024390244


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline


# Sélection des colonnes pertinentes pour l'évaluation
predictionAndLabels_1 = predictions1.select("labelIndex", "prediction")

# Évaluation
evaluator_lr = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
accuracy_lr = evaluator_lr.evaluate(predictionAndLabels_1)

print("RandomForest Test set accuracy = " + str(accuracy_lr))


# Évaluateur de weightedPrecision
evaluator_precision = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictionAndLabels_1)
print("Test set weightedPrecision = " + str(precision))


RandomForest Test set accuracy = 0.848780487804878
Test set weightedPrecision = 0.8533439088328948


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline


# Sélection des colonnes pertinentes pour l'évaluation
predictionAndLabels_2 = predictions2.select("labelIndex", "prediction")

# Évaluation
evaluator_rf = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
accuracy_rf = evaluator_rf.evaluate(predictionAndLabels_2)

print("RandomForest Test set accuracy = " + str(accuracy_rf))


# Évaluateur de weightedPrecision
evaluator_precision = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictionAndLabels_2)
print("Test set weightedPrecision = " + str(precision))


RandomForest Test set accuracy = 0.9160975609756098
Test set weightedPrecision = 0.9182429238424474


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Charger le fichier CSV dans un DataFrame pandas
file_path = 'heart.csv'
dataset = pd.read_csv(file_path)

# Exemple : Histogramme de l'âge en fonction de la maladie cardiaque
plt.figure(figsize=(10, 6))
sns.histplot(data=dataset, x='age', hue='target', bins=20, kde=True, palette='husl')
plt.title('Répartition de l\'âge en fonction de la maladie cardiaque')
plt.xlabel('Âge')
plt.ylabel('Fréquence')
plt.show()
