In [0]:
%spark
import org.apache.hadoop.hbase.HBaseConfiguration

// HBase client configuration: ZooKeeper quorum host, client port and parent znode.
val conf = HBaseConfiguration.create()
Seq(
  "hbase.zookeeper.quorum" -> "zookeeper",
  "hbase.zookeeper.property.clientPort" -> "2181",
  "zookeeper.znode.parent" -> "/hbase"
).foreach { case (key, value) => conf.set(key, value) }


In [1]:
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// Schema of the DataFrame built from the HBase table; every column is declared nullable.
val schema = new StructType()
  .add("rowkey", StringType, nullable = true)
  .add("city", StringType, nullable = true)
  .add("temperature", DoubleType, nullable = true)
  .add("humidity", IntegerType, nullable = true)
  .add("pressure", IntegerType, nullable = true)
  .add("wind_speed", DoubleType, nullable = true)
  .add("weather_main", StringType, nullable = true)
  .add("timestamp", StringType, nullable = true)

// Read from HBase: scan table "weather_data", restricted to the listed qualifiers of family "cf".
val scanColumns = Seq("city", "temperature", "humidity", "pressure", "wind_speed", "weather_main")
  .map(qualifier => s"cf:$qualifier")
  .mkString(" ")

conf.set(TableInputFormat.INPUT_TABLE, "weather_data")
conf.set(TableInputFormat.SCAN_COLUMNS, scanColumns)

// Each record is a (row key, Result) pair produced by TableInputFormat.
val hbaseRDD =
  sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

// Convert every HBase Result into a Row laid out like `schema`.
//
// All helpers live inside the mapPartitions closure so that nothing from the notebook scope is
// captured, and the family/qualifier names are turned into bytes once per partition instead of
// once per row.
//
// Cell values are stored as strings. A missing cell, or a cell whose text cannot be parsed as the
// expected number, falls back to the column default ("" / 0.0 / 0) instead of failing the job
// with a NumberFormatException.
val rowsRDD = hbaseRDD.mapPartitions { results =>
  val family = Bytes.toBytes("cf")
  val cityCol = Bytes.toBytes("city")
  val temperatureCol = Bytes.toBytes("temperature")
  val humidityCol = Bytes.toBytes("humidity")
  val pressureCol = Bytes.toBytes("pressure")
  val windSpeedCol = Bytes.toBytes("wind_speed")
  val weatherMainCol = Bytes.toBytes("weather_main")

  // Value of cf:<qualifier> decoded as text, or None when the cell is absent.
  def text(result: Result, qualifier: Array[Byte]): Option[String] =
    Option(result.getValue(family, qualifier)).map(bytes => Bytes.toString(bytes))

  // Value of cf:<qualifier> parsed with `parse`; `default` when absent or unparsable.
  def number[A](result: Result, qualifier: Array[Byte], default: A)(parse: String => A): A =
    text(result, qualifier)
      .flatMap(raw => scala.util.Try(parse(raw.trim)).toOption)
      .getOrElse(default)

  results.map { case (_, result) =>
    val rowkey = Bytes.toString(result.getRow)
    Row(
      rowkey,
      text(result, cityCol).getOrElse(""),
      number(result, temperatureCol, 0.0)(_.toDouble),
      number(result, humidityCol, 0)(_.toInt),
      number(result, pressureCol, 0)(_.toInt),
      number(result, windSpeedCol, 0.0)(_.toDouble),
      text(result, weatherMainCol).getOrElse(""),
      // "timestamp" is a copy of the row key.
      // NOTE(review): presumably the row key encodes the observation time - confirm.
      rowkey
    )
  }
}

val weatherDF = spark.createDataFrame(rowsRDD, schema)

// Show the first 10 rows
weatherDF.show(10)


In [2]:
%spark
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions._

// 1. Map the categorical weather description to a numeric index.
//    handleInvalid = "keep" sends labels that were not seen during fit to one extra index
//    instead of failing the transform, so the saved pipeline stays usable on new data.
val indexer = new StringIndexer()
  .setInputCol("weather_main")
  .setOutputCol("weather_index")
  .setHandleInvalid("keep")

// 2. Index the city names as well. This is a plain index, not a one-hot encoding, and
//    "city_index" is currently NOT part of the assembled feature vector below.
val cityIndexer = new StringIndexer()
  .setInputCol("city")
  .setOutputCol("city_index")
  .setHandleInvalid("keep")

// 3. Assemble the model inputs into a single "features" vector column.
val assembler = new VectorAssembler()
  .setInputCols(Array("humidity", "pressure", "wind_speed", "weather_index"))
  .setOutputCol("features")

// 4. Preprocessing pipeline: both indexers, then the assembler.
val pipeline = new Pipeline()
  .setStages(Array(indexer, cityIndexer, assembler))

val model = pipeline.fit(weatherDF)
val preparedDF = model.transform(weatherDF)

// Show the prepared data
preparedDF.select("temperature", "humidity", "pressure", "wind_speed", "weather_main", "features").show(10)

In [3]:
%spark
// Train/test split (80% / 20%, fixed seed).
val Array(trainingData, testData) = preparedDF.randomSplit(Array(0.8, 0.2), seed = 42)

// Both splits are reused by the counts below and by every model fit / evaluation in later cells.
// Caching them avoids re-scanning HBase and re-running the preprocessing for each of those
// actions, and keeps the materialised split contents fixed across them.
trainingData.cache()
testData.cache()

println(s"Training: ${trainingData.count()} rows")
println(s"Test: ${testData.count()} rows")

In [4]:
%spark
import org.apache.spark.ml.regression.LinearRegression

// Linear regression predicting "temperature" from "features", with elastic-net regularisation
// (regParam 0.3, elasticNetParam 0.8) and at most 10 iterations.
val lr = new LinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("temperature")
  .setElasticNetParam(0.8)
  .setRegParam(0.3)
  .setMaxIter(10)

val lrModel = lr.fit(trainingData)

// Report the fitted parameters.
Seq("coefficients" -> lrModel.coefficients, "intercept" -> lrModel.intercept)
  .foreach { case (name, value) => println(s"Linear Regression $name: $value") }

// Score the held-out test set and preview a few predictions.
val lrPredictions = lrModel.transform(testData)
lrPredictions
  .select("prediction", "temperature", "features")
  .show(numRows = 5)

In [5]:
%spark
import org.apache.spark.ml.regression.RandomForestRegressor

// Random forest regressor (10 trees) predicting "temperature" from "features".
val rf = new RandomForestRegressor()
  .setNumTrees(10)
  .setFeaturesCol("features")
  .setLabelCol("temperature")

// Fit on the training split, then score the test split.
val rfModel = rf.fit(trainingData)
val rfPredictions = rfModel.transform(testData)

In [6]:
%spark
import org.apache.spark.ml.regression.GBTRegressor

// Gradient-boosted trees regressor (10 boosting iterations) predicting "temperature" from "features".
val gbt = new GBTRegressor()
  .setMaxIter(10)
  .setFeaturesCol("features")
  .setLabelCol("temperature")

// Fit on the training split, then score the test split.
val gbtModel = gbt.fit(trainingData)
val gbtPredictions = gbtModel.transform(testData)

In [7]:
%spark
import org.apache.spark.ml.evaluation.RegressionEvaluator

// RMSE evaluator comparing the "prediction" column against the "temperature" label.
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setPredictionCol("prediction")
  .setLabelCol("temperature")

// RMSE of each model on the test set
val lrRmse = evaluator.evaluate(lrPredictions)
val rfRmse = evaluator.evaluate(rfPredictions)
val gbtRmse = evaluator.evaluate(gbtPredictions) // requires the GBT cell to have been run

// Print the three scores with two decimals.
Seq("Linear Regression" -> lrRmse, "Random Forest" -> rfRmse, "GBT" -> gbtRmse)
  .foreach { case (label, rmse) => println(f"$label RMSE: $rmse%.2f") }

In [8]:
%spark
// Select the model with the lowest test RMSE.
// Ties favour Linear Regression first, then Random Forest; GBT is chosen whenever neither of
// the other two is at least as good as both of its rivals.
val lrIsBest = lrRmse <= rfRmse && lrRmse <= gbtRmse
val rfIsBest = rfRmse <= lrRmse && rfRmse <= gbtRmse

val bestModel = (lrIsBest, rfIsBest) match {
  case (true, _) =>
    println("Meilleur modèle: Linear Regression")
    lrModel
  case (_, true) =>
    println("Meilleur modèle: Random Forest")
    rfModel
  case _ =>
    println("Meilleur modèle: GBT")
    gbtModel
}

// Persist the winning model, replacing any previously saved one.
bestModel.write.overwrite().save("/models/weather_prediction_model")

// Persist the fitted preprocessing pipeline alongside it.
model.write.overwrite().save("/models/data_preprocessing_pipeline")

println("Modèle sauvegardé avec succès!")

In [9]:
%spark
// ------------------------------
// 1. Linear Regression
// ------------------------------
val lrResults = lrPredictions
  .select("city", "temperature", "prediction")
  .withColumnRenamed("prediction", "lr_prediction")

lrResults.show(15)

// ------------------------------
// 2. Random Forest
// ------------------------------
val rfResults = rfPredictions
  .select("city", "temperature", "prediction")
  .withColumnRenamed("prediction", "rf_prediction")

rfResults.show(15)

// ------------------------------
// 3. Gradient Boosted Trees
// ------------------------------
val gbtResults = gbtPredictions
  .select("city", "temperature", "prediction")
  .withColumnRenamed("prediction", "gbt_prediction")

gbtResults.show(15)

// ------------------------------
// 4. Merge all results for comparison
// ------------------------------
// The three prediction sets are joined on "rowkey" (the HBase row key, one per source row).
// Joining on (city, temperature) is not safe: that pair is not unique and one side is a
// floating-point value, so repeated pairs multiply rows and mix predictions that belong to
// different observations.
val combinedResults = lrPredictions
  .select("rowkey", "city", "temperature", "prediction")
  .withColumnRenamed("prediction", "lr_prediction")
  .join(
    rfPredictions.select("rowkey", "prediction").withColumnRenamed("prediction", "rf_prediction"),
    Seq("rowkey")
  )
  .join(
    gbtPredictions.select("rowkey", "prediction").withColumnRenamed("prediction", "gbt_prediction"),
    Seq("rowkey")
  )
  .select("city", "temperature", "lr_prediction", "rf_prediction", "gbt_prediction")

combinedResults.show(15)


In [10]:
%spark
// Each result set is keyed by "rowkey" (the HBase row key, one per source row).
// De-duplicating on (city, temperature) would drop genuine observations that happen to share a
// city and temperature, and would keep an arbitrary row independently for each model, so the
// three predictions compared later could belong to different inputs.
// dropDuplicates("rowkey") is only a guard that keeps the join key unique.

// ------------------------------
// 1. Linear Regression - one row per observation
// ------------------------------
val lrResultsUnique = lrPredictions
  .withColumnRenamed("prediction", "lr_prediction")
  .select("rowkey", "city", "temperature", "lr_prediction")
  .dropDuplicates("rowkey")

z.show(lrResultsUnique)

// ------------------------------
// 2. Random Forest - one row per observation
// ------------------------------
val rfResultsUnique = rfPredictions
  .withColumnRenamed("prediction", "rf_prediction")
  .select("rowkey", "city", "temperature", "rf_prediction")
  .dropDuplicates("rowkey")

z.show(rfResultsUnique)

// ------------------------------
// 3. Gradient Boosted Trees - one row per observation
// ------------------------------
val gbtResultsUnique = gbtPredictions
  .withColumnRenamed("prediction", "gbt_prediction")
  .select("rowkey", "city", "temperature", "gbt_prediction")
  .dropDuplicates("rowkey")

z.show(gbtResultsUnique)

// ------------------------------
// 4. Merge the three result sets for comparison
// ------------------------------
// Only the key and the prediction are taken from the right-hand sides; city and temperature
// come from the Linear Regression side. The result keeps "rowkey" as an extra column.
val combinedResultsUnique = lrResultsUnique
  .join(rfResultsUnique.select("rowkey", "rf_prediction"), Seq("rowkey"))
  .join(gbtResultsUnique.select("rowkey", "gbt_prediction"), Seq("rowkey"))

z.show(combinedResultsUnique)


In [11]:
%spark
// Columns kept for visualisation: the actual temperature next to the three model predictions.
val vizColumns = Seq("city", "temperature", "lr_prediction", "rf_prediction", "gbt_prediction")
val vizDF = combinedResultsUnique.select(vizColumns.head, vizColumns.tail: _*)
vizDF.show(20)


In [12]:
%spark
import org.apache.spark.sql.functions._

// Column-name prefixes of the three models being compared.
val modelPrefixes = Seq("lr", "rf", "gbt")

// 1. Absolute error of each model: |temperature - <prefix>_prediction|, stored as <prefix>_error.
val resultsWithErrors = modelPrefixes.foldLeft(combinedResultsUnique) { (df, prefix) =>
  df.withColumn(s"${prefix}_error", abs(col("temperature") - col(s"${prefix}_prediction")))
}

// 2. Per-city aggregate (convenient for charts): mean actual temperature and the mean absolute
//    error of each model.
val errorAverages = modelPrefixes.map(prefix => avg(s"${prefix}_error").alias(s"avg_${prefix}_error"))

val aggregatedByCity = resultsWithErrors
  .groupBy("city")
  .agg(avg("temperature").alias("avg_actual_temp"), errorAverages: _*)

// Display the table for verification
z.show(aggregatedByCity)

In [13]:
%spark
// Display the per-city aggregate through Zeppelin's graphical DataFrame viewer
z.show(aggregatedByCity)

In [14]:
%spark
// Render the per-city errors as tab-separated text for Zeppelin's %table display.
// One line per city, ordered by ascending Linear Regression error, errors with two decimals.
// The `f` interpolator replaces the deprecated `value.formatted("%.2f")` and prints the same text.
val tableOutput = aggregatedByCity
  .orderBy("avg_lr_error")
  .collect()
  .map { row =>
    val city = row.getAs[String]("city")
    val lrError = row.getAs[Double]("avg_lr_error")
    val rfError = row.getAs[Double]("avg_rf_error")
    val gbtError = row.getAs[Double]("avg_gbt_error")
    f"$city\t$lrError%.2f\t$rfError%.2f\t$gbtError%.2f"
  }
  .mkString("\n")

// Emit the header row followed by the data rows
print("%table Ville\tErreur LR\tErreur RF\tErreur GBT\n" + tableOutput)