In [0]:
%scala

import org.apache.spark.ml.feature.{Imputer, OneHotEncoderEstimator, StandardScaler, StringIndexer, VectorAssembler}
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.functions.{lit, mean, rand, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer
import scala.math.sqrt
import scala.util.Random.shuffle

In [0]:
%scala

/**
 * Returns the SparkSession used by the helpers in this notebook.
 *
 * The builder requests a "local" master and the application name "Projet GD";
 * `getOrCreate()` is used, so repeated calls go through the same builder settings.
 */
def getSS: SparkSession =
  SparkSession
    .builder()
    .master("local")
    .appName("Projet GD")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()

/** Typed record used by the Dataset-based variants: one label (`labels`) and its feature values (`features`). */
case class GenericDSRow(labels: Double, features: Seq[Double])

In [0]:
%scala

/**
 * Gradient, with respect to `w`, of the squared error `(w . x - y)^2` for one sample.
 *
 * @param y observed label
 * @param w current weights
 * @param x feature values of the sample
 * @return `x` scaled by `2 * (w . x - y)`
 */
def grad(y: Double, w: Array[Double], x: Array[Double]): Array[Double] = {
  val residual = prod_scal(w, x) - y
  prod_by_scal(x, 2 * residual)
}

/** Element-wise difference `x - y`; the result is as long as the shorter of the two arrays. */
def subtr(x: Array[Double], y: Array[Double]) =
  x.zip(y).map { case (left, right) => left - right }

/**
 * Dot product of `x` and `y`, pairing elements up to the length of the shorter array.
 *
 * Folding from 0.0 (instead of `reduce`) makes the product of empty vectors 0.0
 * rather than an `UnsupportedOperationException`.
 */
def prod_scal(x: Array[Double], y: Array[Double]): Double =
  x.zip(y).foldLeft(0.0) { case (acc, (a, b)) => acc + a * b }

/** Multiplies every element of `x` by the scalar `y`. */
def prod_by_scal(x: Array[Double], y: Double) = x.map(_ * y)

/** Element-wise sum `x + y`; the result is as long as the shorter of the two arrays. */
def sum(x: Array[Double], y: Array[Double]) =
  Array.tabulate(math.min(x.length, y.length))(i => x(i) + y(i))

/** Element-wise product of `x` and `y`; the result is as long as the shorter of the two arrays. */
def mul(x: Array[Double], y: Array[Double]) =
  x.zip(y).map { case (left, right) => left * right }

/** Element-wise quotient `x / y`; the result is as long as the shorter of the two arrays. */
def div(x: Array[Double], y: Array[Double]) =
  Array.tabulate(math.min(x.length, y.length))(i => x(i) / y(i))

/** Square root of every element of `x`. */
def root(x: Array[Double]) = x.map(sqrt)

/**
 * Converts typed records into Spark rows carrying the schema derived from [[GenericDSRow]].
 *
 * @param miniers records to convert
 * @return one row per record, in the same order
 */
def toArrayDeRow(miniers: Array[GenericDSRow]) = {
    val rowSchema = Encoders.product[GenericDSRow].schema
    miniers.map { record =>
      // Field values in declaration order: labels, then features.
      val row: Row = new GenericRowWithSchema(record.productIterator.toArray, rowSchema)
      row
    }
 }

/**
 * Extracts (label, features) pairs from rows, reading column 0 as a Double
 * and column 1 as a sequence of Doubles.
 */
def arrayOfRowToArray(l: Array[Row]): Array[(Double, Array[Double])] =
  l.map(row => (row.getDouble(0), row.getSeq[Double](1).toArray))

In [0]:
%scala

/**
 * Mini-batch gradient descent on an in-memory array of (label, features) samples.
 *
 * Every local epoch reshuffles the sample indices, cuts them into groups of
 * `tailleBatch`, and for each group subtracts `nu` times the mean gradient
 * from the weights.
 *
 * @param W_b         starting weights
 * @param nu          learning rate
 * @param epochsLocal number of passes over `r`
 * @param r           samples as (label, features)
 * @param tailleBatch mini-batch size
 * @return weights after the last pass
 */
def MBGD_local_SansSpark(W_b: Array[Double], nu: Double, epochsLocal: Int, r: Array[(Double, Array[Double])], tailleBatch: Int): Array[Double] = {
    val sampleCount = r.length
    var weights = W_b
    (1 to epochsLocal).foreach { _ =>
      val shuffledIndices = shuffle(0 to sampleCount - 1)
      shuffledIndices.grouped(tailleBatch).foreach { indices =>
        val gradientSum = indices
          .map { idx =>
            val (label, features) = r(idx)
            grad(label, weights, features)
          }
          .reduce(sum(_, _))
        // nu / batch size turns the summed gradient into a learning-rate-scaled mean.
        val step = prod_by_scal(gradientSum, nu / indices.length)
        weights = subtr(weights, step)
      }
    }
    weights
  }

/**
 * Parallel mini-batch gradient descent over a DataFrame, by averaging per-partition weights.
 *
 * For each global epoch the current weights are broadcast, every partition runs
 * [[MBGD_local]] on its own rows and emits exactly one weight vector, and the
 * collected vectors are averaged to form the next weights.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global (broadcast / average) rounds
 * @param epochsLocal number of local epochs run inside each partition per round
 * @param df          rows read positionally by [[MBGD_local]] (column 0 label, column 1 features)
 * @param tailleBatch mini-batch size
 * @return averaged weights after the last round
 */
def MBGD_parallel_DF(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                       df: DataFrame, tailleBatch: Int): Array[Double] = {
    val spark = getSS
    import spark.implicits._

    var W = w
    for (_ <- 1 to epochs) {
      // Use the session's own context rather than a notebook-global `sc`.
      val W_b = spark.sparkContext.broadcast(W)
      val locals = df.mapPartitions { iterator =>
        Iterator.single(MBGD_local(W_b.value, nu, epochsLocal, iterator.toArray, tailleBatch))
      }.collect()
      // One result per partition, so the number of results is the averaging divisor.
      val resultCount = locals.length
      W = locals.map(local => local.map(_ / resultCount)).reduce(sum(_, _))
    }
    W
 }

/**
 * Mini-batch gradient descent on an array of Spark rows.
 *
 * Each row is read positionally: column 0 as the label (Double) and column 1 as
 * the feature sequence. Every local epoch reshuffles the row indices, cuts them
 * into groups of `tailleBatch`, and for each group subtracts `nu` times the mean
 * gradient from the weights.
 *
 * @param W_b         starting weights
 * @param nu          learning rate
 * @param epochsLocal number of passes over `r`
 * @param r           rows to train on
 * @param tailleBatch mini-batch size
 * @return weights after the last pass
 */
def MBGD_local(W_b: Array[Double], nu: Double, epochsLocal: Int,
                 r: Array[Row], tailleBatch: Int): Array[Double] = {
    val rowCount = r.length
    var weights = W_b

    (1 to epochsLocal).foreach { _ =>
      val shuffledIndices = shuffle(0 to rowCount - 1)
      shuffledIndices.grouped(tailleBatch).foreach { indices =>
        val gradientSum = indices
          .map { idx =>
            val row = r(idx)
            grad(row.getDouble(0), weights, row.getSeq(1).toArray[Double])
          }
          .reduce(sum(_, _))
        val step = prod_by_scal(gradientSum, nu / indices.length)
        weights = subtr(weights, step)
      }
    }
    weights
  }

/**
 * Parallel mini-batch gradient descent over an RDD, by averaging per-partition weights.
 *
 * Each global epoch broadcasts the current weights, runs [[MBGD_local_SansSpark]]
 * on every partition (materialised with `glom`), divides each local result by the
 * partition count and sums them.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global rounds
 * @param epochsLocal number of local epochs per partition per round
 * @param r           samples as (label, features)
 * @param tailleBatch mini-batch size
 * @return averaged weights after the last round
 */
def MBGD_parallel_RDD(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int, r: RDD[(Double, Array[Double])], tailleBatch: Int): Array[Double] = {
    val partitionCount = r.getNumPartitions
    val partitions = r.glom()
    var weights = w
    for (_ <- 1 to epochs) {
      val broadcastWeights = sc.broadcast(weights)
      val localWeights = partitions.map { samples =>
        MBGD_local_SansSpark(broadcastWeights.value, nu, epochsLocal, samples, tailleBatch)
      }
      weights = localWeights
        .map(local => local.map(_ / partitionCount))
        .reduce(sum(_, _))
    }
    weights
  }

/**
 * Parallel mini-batch gradient descent over a typed Dataset, by averaging per-partition weights.
 *
 * For each global epoch the current weights are broadcast, every partition converts
 * its records to rows with [[toArrayDeRow]], runs [[MBGD_local]] and emits exactly one
 * weight vector; the collected vectors are averaged to form the next weights.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global (broadcast / average) rounds
 * @param epochsLocal number of local epochs run inside each partition per round
 * @param ds          training records
 * @param tailleBatch mini-batch size
 * @return averaged weights after the last round
 */
def MBGD_parallel_DS(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                       ds: Dataset[GenericDSRow], tailleBatch: Int): Array[Double] = {
    val spark = getSS
    import spark.implicits._

    var W = w
    for (_ <- 1 to epochs) {
      // Use the session's own context rather than a notebook-global `sc`.
      val W_b = spark.sparkContext.broadcast(W)
      val locals = ds.mapPartitions { iterator => // Iterator[GenericDSRow]
        val rows = toArrayDeRow(iterator.toArray)
        Iterator.single(MBGD_local(W_b.value, nu, epochsLocal, rows, tailleBatch))
      }.collect()
      // One result per partition, so the number of results is the averaging divisor.
      val resultCount = locals.length
      W = locals.map(local => local.map(_ / resultCount)).reduce(sum(_, _))
    }
    W
  }

In [0]:
%scala

  /**
   * Mini-batch gradient descent with momentum on an in-memory array of samples.
   *
   * For every mini-batch the velocity becomes `beta * velocity + nu * meanGradient`
   * and is then subtracted from the weights.
   *
   * @param W_b         starting weights
   * @param nu          learning rate
   * @param epochsLocal number of passes over `r`
   * @param r           samples as (label, features)
   * @param tailleBatch mini-batch size
   * @param M_b         starting velocity
   * @param beta        momentum coefficient
   * @return final (weights, velocity)
   */
  def MOM_MBGD_local(W_b: Array[Double], nu: Double, epochsLocal: Int,
                     r: Array[(Double, Array[Double])], tailleBatch: Int, M_b: Array[Double],
                     beta: Double): (Array[Double], Array[Double]) = {
    val sampleCount = r.length
    var weights = W_b
    var velocity = M_b
    (1 to epochsLocal).foreach { _ =>
      val shuffledIndices = shuffle(0 to sampleCount - 1)
      shuffledIndices.grouped(tailleBatch).foreach { indices =>
        val gradientSum = indices
          .map { idx =>
            val (label, features) = r(idx)
            grad(label, weights, features)
          }
          .reduce(sum(_, _))
        val scaledGradient = prod_by_scal(gradientSum, nu / indices.length)
        velocity = sum(prod_by_scal(velocity, beta), scaledGradient)
        weights = subtr(weights, velocity)
      }
    }
    (weights, velocity)
  }

 /**
  * Parallel momentum gradient descent over an RDD, by averaging per-partition results.
  *
  * Each global epoch broadcasts the current weights and velocity, runs
  * [[MOM_MBGD_local]] on every partition, and averages the returned weights and
  * velocities across partitions.
  *
  * The per-partition results are collected once per epoch: running two separate
  * actions would execute the (randomly shuffled) local training twice, so the
  * averaged weights and the averaged velocity would come from different runs.
  * The velocity starts at zero rather than at the initial weights.
  *
  * @param w           initial weights
  * @param nu          learning rate
  * @param epochs      number of global rounds
  * @param epochsLocal number of local epochs per partition per round
  * @param r           samples as (label, features)
  * @param tailleBatch mini-batch size
  * @param beta        momentum coefficient
  * @return averaged weights after the last round
  */
 def MOM_MBGD_parallel_RDD(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                            r: RDD[(Double, Array[Double])], tailleBatch: Int, beta: Double): Array[Double] = {
    val numPartitions = r.getNumPartitions
    val parts = r.glom()
    val context = r.sparkContext
    var W = w
    var M = Array.fill(w.length)(0.0)
    for (_ <- 1 to epochs) {
      val W_b = context.broadcast(W)
      val M_b = context.broadcast(M)
      val locals = parts
        .map(x => MOM_MBGD_local(W_b.value, nu, epochsLocal, x, tailleBatch, M_b.value, beta))
        .collect()
      W = locals.map { case (wLocal, _) => wLocal.map(_ / numPartitions) }.reduce(sum(_, _))
      M = locals.map { case (_, mLocal) => mLocal.map(_ / numPartitions) }.reduce(sum(_, _))
    }
    W
  }

/**
 * Parallel momentum gradient descent over a DataFrame, by averaging per-partition results.
 *
 * Each global epoch broadcasts the current weights and velocity; every partition
 * converts its rows with [[arrayOfRowToArray]], runs [[MOM_MBGD_local]] and emits
 * exactly one (weights, velocity) pair. The pairs are collected once and averaged.
 *
 * Collecting once matters: two `collect()` calls would execute the randomly
 * shuffled local training twice, so weights and velocity would come from
 * different runs. The velocity starts at zero rather than at the initial weights.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global rounds
 * @param epochsLocal number of local epochs per partition per round
 * @param df          rows read positionally (column 0 label, column 1 features)
 * @param tailleBatch mini-batch size
 * @param beta        momentum coefficient
 * @return averaged weights after the last round
 */
def MOM_MBGD_parallel_DF(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                         df: DataFrame, tailleBatch: Int, beta: Double): Array[Double] = {
  val spark = getSS
  import spark.implicits._

  var W = w
  var M = Array.fill(w.length)(0.0)
  for (_ <- 1 to epochs) {
    val W_b = spark.sparkContext.broadcast(W)
    val M_b = spark.sparkContext.broadcast(M)
    val locals = df.mapPartitions { iterator =>
      Iterator.single(MOM_MBGD_local(W_b.value, nu, epochsLocal,
        arrayOfRowToArray(iterator.toArray), tailleBatch, M_b.value, beta))
    }.collect()
    // One result per partition, so the number of results is the averaging divisor.
    val resultCount = locals.length
    W = locals.map { case (wLocal, _) => wLocal.map(_ / resultCount) }.reduce(sum(_, _))
    M = locals.map { case (_, mLocal) => mLocal.map(_ / resultCount) }.reduce(sum(_, _))
  }
  W
}

/**
 * Parallel momentum gradient descent over a typed Dataset, by averaging per-partition results.
 *
 * Each global epoch broadcasts the current weights and velocity; every partition
 * turns its records into (label, features) pairs, runs [[MOM_MBGD_local]] and emits
 * exactly one (weights, velocity) pair. The pairs are collected once and averaged.
 *
 * Collecting once matters: two `collect()` calls would execute the randomly
 * shuffled local training twice, so weights and velocity would come from
 * different runs. The velocity starts at zero rather than at the initial weights.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global rounds
 * @param epochsLocal number of local epochs per partition per round
 * @param ds          training records
 * @param tailleBatch mini-batch size
 * @param beta        momentum coefficient
 * @return averaged weights after the last round
 */
def MOM_MBGD_parallel_DS(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                           ds: Dataset[GenericDSRow], tailleBatch: Int, beta: Double): Array[Double] = {
    val spark = getSS
    import spark.implicits._

    var W = w
    var M = Array.fill(w.length)(0.0)
    for (_ <- 1 to epochs) {
      val W_b = spark.sparkContext.broadcast(W)
      val M_b = spark.sparkContext.broadcast(M)
      val locals = ds.mapPartitions { iterator => // Iterator[GenericDSRow]
        val samples = arrayOfRowToArray(toArrayDeRow(iterator.toArray))
        Iterator.single(MOM_MBGD_local(W_b.value, nu, epochsLocal,
          samples, tailleBatch, M_b.value, beta))
      }.collect()
      // One result per partition, so the number of results is the averaging divisor.
      val resultCount = locals.length
      W = locals.map { case (wLocal, _) => wLocal.map(_ / resultCount) }.reduce(sum(_, _))
      M = locals.map { case (_, mLocal) => mLocal.map(_ / resultCount) }.reduce(sum(_, _))
    }
    W
  }

In [0]:
%scala

/**
 * Adagrad-style mini-batch gradient descent on an in-memory array of samples.
 *
 * For every mini-batch the mean gradient `g` is computed, `g * g` is added to the
 * accumulator, and the weights move by `nu * g / (sqrt(accumulator) + eps)`.
 *
 * @param W_b         starting weights
 * @param nu          learning rate
 * @param epochsLocal number of passes over `r`
 * @param r           samples as (label, features)
 * @param tailleBatch mini-batch size
 * @param S_b         starting squared-gradient accumulator
 * @param eps         constant added to the square root in the denominator
 * @return final (weights, accumulator)
 */
def ADA_MBGD_local(W_b: Array[Double], nu: Double, epochsLocal: Int, r: Array[(Double, Array[Double])], tailleBatch: Int, S_b: Array[Double], eps: Double): (Array[Double], Array[Double]) = {
    val sampleCount = r.length
    var weights = W_b
    var accumulator = S_b
    (1 to epochsLocal).foreach { _ =>
      val shuffledIndices = shuffle(0 to sampleCount - 1)
      shuffledIndices.grouped(tailleBatch).foreach { indices =>
        val meanGradient = indices
          .map { idx =>
            val (label, features) = r(idx)
            grad(label, weights, features)
          }
          .reduce(sum(_, _))
          .map(component => component / indices.length)
        accumulator = sum(accumulator, mul(meanGradient, meanGradient))
        val numerator = prod_by_scal(meanGradient, nu)
        val denominator = root(accumulator).map(value => value + eps)
        weights = subtr(weights, div(numerator, denominator))
      }
    }
    (weights, accumulator)
  }

/**
 * Parallel Adagrad-style gradient descent over an RDD, by averaging per-partition results.
 *
 * Each global epoch broadcasts the current weights and squared-gradient accumulator,
 * runs [[ADA_MBGD_local]] on every partition, and averages the returned weights and
 * accumulators across partitions.
 *
 * The per-partition results are collected once per epoch: running two separate
 * actions would execute the (randomly shuffled) local training twice, so the
 * averaged weights and accumulator would come from different runs. The accumulator
 * starts at zero rather than at the initial weights, whose negative entries would
 * produce NaN under the square root.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global rounds
 * @param epochsLocal number of local epochs per partition per round
 * @param r           samples as (label, features)
 * @param tailleBatch mini-batch size
 * @param eps         constant added to the square root in the denominator
 * @return averaged weights after the last round
 */
def ADA_MBGD_parallel_RDD(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                            r: RDD[(Double, Array[Double])], tailleBatch: Int, eps: Double): Array[Double] = {
    val numPartitions = r.getNumPartitions
    val parts = r.glom()
    val context = r.sparkContext
    var W = w
    var S = Array.fill(w.length)(0.0)
    for (_ <- 1 to epochs) {
      val W_b = context.broadcast(W)
      val S_b = context.broadcast(S)
      val locals = parts
        .map(x => ADA_MBGD_local(W_b.value, nu, epochsLocal, x, tailleBatch, S_b.value, eps))
        .collect()
      W = locals.map { case (wLocal, _) => wLocal.map(_ / numPartitions) }.reduce(sum(_, _))
      S = locals.map { case (_, sLocal) => sLocal.map(_ / numPartitions) }.reduce(sum(_, _))
    }
    W
 }
/**
 * Parallel Adagrad-style gradient descent over a DataFrame, by averaging per-partition results.
 *
 * Each global epoch broadcasts the current weights and squared-gradient accumulator;
 * every partition converts its rows with [[arrayOfRowToArray]], runs [[ADA_MBGD_local]]
 * and emits exactly one (weights, accumulator) pair. The pairs are collected once and averaged.
 *
 * Collecting once matters: two `collect()` calls would execute the randomly
 * shuffled local training twice, so weights and accumulator would come from
 * different runs. The accumulator starts at zero rather than at the initial
 * weights, whose negative entries would produce NaN under the square root.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global rounds
 * @param epochsLocal number of local epochs per partition per round
 * @param df          rows read positionally (column 0 label, column 1 features)
 * @param tailleBatch mini-batch size
 * @param eps         constant added to the square root in the denominator
 * @return averaged weights after the last round
 */
def ADA_MBGD_parallel_DF(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                           df: DataFrame, tailleBatch: Int, eps: Double): Array[Double] = {
    val spark = getSS
    import spark.implicits._

    var W = w
    var S = Array.fill(w.length)(0.0)
    for (_ <- 1 to epochs) {
      val W_b = spark.sparkContext.broadcast(W)
      val S_b = spark.sparkContext.broadcast(S)
      val locals = df.mapPartitions { iterator =>
        Iterator.single(ADA_MBGD_local(W_b.value, nu, epochsLocal,
          arrayOfRowToArray(iterator.toArray), tailleBatch, S_b.value, eps))
      }.collect()
      // One result per partition, so the number of results is the averaging divisor.
      val resultCount = locals.length
      W = locals.map { case (wLocal, _) => wLocal.map(_ / resultCount) }.reduce(sum(_, _))
      S = locals.map { case (_, sLocal) => sLocal.map(_ / resultCount) }.reduce(sum(_, _))
    }
    W
  }

/**
 * Parallel Adagrad-style gradient descent over a typed Dataset, by averaging per-partition results.
 *
 * Each global epoch broadcasts the current weights and squared-gradient accumulator;
 * every partition turns its records into (label, features) pairs, runs
 * [[ADA_MBGD_local]] and emits exactly one (weights, accumulator) pair. The pairs
 * are collected once and averaged.
 *
 * Collecting once matters: two `collect()` calls would execute the randomly
 * shuffled local training twice, so weights and accumulator would come from
 * different runs. The accumulator starts at zero rather than at the initial
 * weights, whose negative entries would produce NaN under the square root.
 *
 * @param w           initial weights
 * @param nu          learning rate
 * @param epochs      number of global rounds
 * @param epochsLocal number of local epochs per partition per round
 * @param ds          training records
 * @param tailleBatch mini-batch size
 * @param eps         constant added to the square root in the denominator
 * @return averaged weights after the last round
 */
def ADA_MBGD_parallel_DS(w: Array[Double], nu: Double, epochs: Int, epochsLocal: Int,
                           ds: Dataset[GenericDSRow], tailleBatch: Int, eps: Double): Array[Double] = {
    val spark = getSS
    import spark.implicits._

    var W = w
    var S = Array.fill(w.length)(0.0)
    for (_ <- 1 to epochs) {
      val W_b = spark.sparkContext.broadcast(W)
      val S_b = spark.sparkContext.broadcast(S)
      val locals = ds.mapPartitions { iterator => // Iterator[GenericDSRow]
        val samples = arrayOfRowToArray(toArrayDeRow(iterator.toArray))
        Iterator.single(ADA_MBGD_local(W_b.value, nu, epochsLocal,
          samples, tailleBatch, S_b.value, eps))
      }.collect()
      // One result per partition, so the number of results is the averaging divisor.
      val resultCount = locals.length
      W = locals.map { case (wLocal, _) => wLocal.map(_ / resultCount) }.reduce(sum(_, _))
      S = locals.map { case (_, sLocal) => sLocal.map(_ / resultCount) }.reduce(sum(_, _))
    }
    W
  }

In [0]:
%scala

/**
 * Loads a CSV file and turns it into train / test DataFrames with two columns:
 * "labels" and "features" (an array of Doubles rounded to 4 decimals).
 *
 * Steps, in order: read the CSV (header, inferred schema); drop the "date" and
 * "_c0" columns; add a constant "bias" column of 1.0; show the frame; impute
 * missing values with the column mean; split 80/20 with seed 0; assemble every
 * column except `label` into a feature vector; scale the features with a
 * StandardScaler fitted on the training split only; convert the scaled vector
 * to an array.
 *
 * NOTE(review): the constant "bias" column has zero standard deviation; confirm
 * how StandardScaler treats such a column, since the bias term may not survive scaling.
 *
 * @param pathFile    path of the CSV file
 * @param label       name of the column to use as label
 * @param repartition when non-zero, both outputs are passed through `coalesce(repartition)`
 * @return (train, test)
 */
def preprocess(pathFile: String, label: String, repartition: Int = 0): (DataFrame, DataFrame) = {
    val spark = getSS

    // Load CSV into DataFrame
    val df = spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load(pathFile)

    // Drop unused columns, then add the bias column
    val dfWithBias = df.drop("date", "_c0").withColumn("bias", lit(1.0))
    dfWithBias.show()

    // Fill null values with the column mean (imputed in place: output names == input names)
    val imputer = new Imputer()
      .setInputCols(dfWithBias.columns)
      .setOutputCols(dfWithBias.columns)
      .setStrategy("mean")
    val dfCleaned = imputer.fit(dfWithBias).transform(dfWithBias)

    // Split into train and test, then assemble every non-label column into "features"
    val assembler = new VectorAssembler()
      .setInputCols(dfCleaned.drop(label).columns)
      .setOutputCol("features")
    val Array(trainRaw, testRaw) = dfCleaned.randomSplit(Array(0.8, 0.2), seed = 0)
    val train = assembler.transform(trainRaw)
    val test = assembler.transform(testRaw)

    // Normalize each feature to have unit standard deviation.
    // The model is fitted once on the training split and reused for the test split.
    val scalerModel = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)
      .fit(train)

    // Cast to the general Vector type: the assembled vector is not guaranteed to be
    // dense, and a DenseVector cast would fail on a sparse one.
    val toArr: Any => Array[Double] = _.asInstanceOf[org.apache.spark.ml.linalg.Vector].toArray
      .map(x => BigDecimal(x)
        .setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble)
    val toArrUdf = udf(toArr)

    // Keeps only the label and the array-typed features, under their final names.
    def toLabelsAndFeatures(scaled: DataFrame): DataFrame =
      scaled
        .withColumn("features_arr", toArrUdf(scaled("scaledFeatures")))
        .select(label, "features_arr")
        .withColumnRenamed(label, "labels")
        .withColumnRenamed("features_arr", "features")

    val df_train = toLabelsAndFeatures(scalerModel.transform(train))
    val df_test = toLabelsAndFeatures(scalerModel.transform(test))

    if (repartition != 0) (df_train.coalesce(repartition), df_test.coalesce(repartition))
    else (df_train, df_test)
  }

/**
 * Preprocesses the mining dataset and exports the train and test splits as CSV.
 *
 * The "features" array is flattened into one column per element (indices 0 to 22)
 * next to "labels"; the results are written to "dtrain.csv" and "dtest.csv".
 */
def preProcessEtExportCSV(): Unit = {
    val spark = getSS
    import spark.implicits._

    // Number of array elements exported as individual columns.
    val featureCount = 23

    val (df_train_, df_test_) =
      preprocess("/FileStore/tables/mining_dataset.csv", "% Silica Concentrate", 18)

    // Same projection for both splits: labels followed by features(0) .. features(22).
    def flattenFeatures(df: DataFrame): DataFrame = {
      val named = df.toDF("labels", "features")
      val exportColumns = $"labels" +: (0 until featureCount).map(i => $"features"(i))
      named.select(exportColumns: _*)
    }

    exportToCSV(flattenFeatures(df_train_), "dtrain.csv")
    exportToCSV(flattenFeatures(df_test_), "dtest.csv")
  }

  /**
   * Writes `df` to `path` as CSV with a header row, after coalescing it to a single partition.
   *
   * @param df   data to export
   * @param path output location passed to the CSV writer
   */
  def exportToCSV(df: DataFrame, path: String): Unit =
    df.coalesce(1).write.option("header", "true").csv(path)

/** Shows, for every column of `df_train`, the number of null values as `<column>_nulls_count`. */
def valeursNulles(df_train: DataFrame) = {
  import org.apache.spark.sql.functions._
  val nullCounters = df_train.columns.map { name =>
    count(when(col(name).isNull, true)).as(s"${name}_nulls_count")
  }
  df_train.select(nullCounters: _*).show(10)
}

/**
 * Predicts with the weights produced by a gradient descent.
 *
 * @param df_test frame with a "features" column holding a sequence of Doubles
 * @param w_gd    weights
 * @return single-column frame "predictions" holding the dot product of each feature row with `w_gd`
 */
def predict(df_test: DataFrame, w_gd: Array[Double]): DataFrame = {
  val spark = getSS
  import spark.implicits._
  df_test
    .select("features")
    .map(row => prod_scal(row.getSeq[Double](0).toArray, w_gd))
    .toDF("predictions")
}

/**
 * R2 score of gradient-descent predictions (linear problem): `1 - ss_res / ss_tot`.
 *
 * Labels and predictions are paired by zipping the two underlying RDDs.
 *
 * @param pred    frame with a "predictions" column
 * @param df_test frame with a "labels" column
 */
def r2_score(pred: DataFrame, df_test: DataFrame): Double = {
  val spark = getSS
  import spark.implicits._

  val labels = df_test.select("labels")
  val labelMean = df_test.select(mean(df_test("labels"))).collect()(0).getDouble(0)

  // Residual sum of squares between labels and predictions.
  val residualSumOfSquares = labels.rdd
    .zip(pred.select("predictions").rdd)
    .map { case (actual, predicted) =>
      val residual = actual.getDouble(0) - predicted.getDouble(0)
      residual * residual
    }
    .reduce(_ + _)

  // Total sum of squares around the label mean.
  val totalSumOfSquares = labels
    .map { row =>
      val deviation = row.getDouble(0) - labelMean
      deviation * deviation
    }
    .reduce(_ + _)

  1 - residualSumOfSquares / totalSumOfSquares
}

/**
 * Mean squared error of gradient-descent predictions.
 *
 * Labels and predictions are paired by zipping the two underlying RDDs; the summed
 * squared differences are divided by the row count of `df_test`.
 *
 * The label mean is not needed for this metric, so it is no longer computed
 * (it cost an extra Spark job) nor printed.
 *
 * @param pred    frame with a "predictions" column
 * @param df_test frame with a "labels" column
 */
def mse(pred: DataFrame, df_test: DataFrame): Double = {
  val squaredErrorSum = df_test.select("labels").rdd
    .zip(pred.select("predictions").rdd)
    .map { case (actual, predicted) =>
      val residual = actual.getDouble(0) - predicted.getDouble(0)
      residual * residual
    }
    .reduce(_ + _)
  squaredErrorSum / df_test.count()
}

/**
 * Reads a CSV (header, inferred schema) back into a frame with two columns:
 * "labels" and "features" (an array of Doubles rounded to 4 decimals, built from
 * every column other than "labels").
 *
 * @param path location of the CSV
 */
def recupCsvEnDataFrame(path: String): DataFrame = {
    val spark = getSS
    val raw = spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load(path)

    val assembler = new VectorAssembler()
      .setInputCols(raw.drop("labels").columns)
      .setOutputCol("features_")
    val assembled = assembler.transform(raw).select("labels", "features_")

    // Cast to the general Vector type: the assembled vector is not guaranteed to be
    // dense, and a DenseVector cast would fail on a sparse one.
    val toArr: Any => Array[Double] = _.asInstanceOf[org.apache.spark.ml.linalg.Vector].toArray
      .map(x => BigDecimal(x)
        .setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble)
    val toArrUdf = udf(toArr)

    assembled
      .withColumn("features", toArrUdf(assembled("features_")))
      .drop("features_")
}

In [0]:
%scala

/**
 * Generates a synthetic regression RDD of (label, features).
 *
 * Three features are drawn as random integers in [0, 100) (as Doubles), a constant
 * 1.0 is appended as fourth feature, and the label is `5*f1 + 14*f2 + 0.5*f3 + 2`.
 *
 * @param taille      number of samples
 * @param repartition number of slices passed to `parallelize`; 0 uses the default
 */
def generateRDD(taille: Int, repartition: Int = 0): RDD[(Double, Array[Double])] = {
  val r = scala.util.Random

  val f1 = for (i <- 1 to taille) yield r.nextInt(100).toDouble
  val f2 = for (i <- 1 to taille) yield r.nextInt(100).toDouble
  val f3 = for (i <- 1 to taille) yield r.nextInt(100).toDouble
  val data = f1.indices.map { i =>
    val (a, b, c) = (f1(i), f2(i), f3(i))
    (5 * a + 14 * b + 0.5 * c + 2, Array(a, b, c, 1.0))
  }

  if (repartition != 0) sc.parallelize(data, repartition)
  else sc.parallelize(data)
}

/**
 * Generates a synthetic regression DataFrame with columns "labels" and "features".
 *
 * Three features are drawn as random integers in [0, 100) (as Doubles), a constant
 * 1.0 is appended as fourth feature, and the label is `5*f1 + 14*f2 + 0.5*f3 + 2`.
 *
 * @param taille      number of samples
 * @param repartition number of slices passed to `parallelize`; 0 uses the default
 */
def generateDataFrame(taille: Int, repartition: Int = 0): DataFrame = {
  val spark = getSS
  import spark.implicits._

  val rng = scala.util.Random
  // Draws one full feature column; the columns are generated one after another.
  def drawColumn(): IndexedSeq[Double] = (1 to taille).map(_ => rng.nextInt(100).toDouble)
  val first = drawColumn()
  val second = drawColumn()
  val third = drawColumn()

  val samples = first.indices.map { i =>
    val (a, b, c) = (first(i), second(i), third(i))
    (5 * a + 14 * b + 0.5 * c + 2, Array(a, b, c, 1.0))
  }

  val distributed =
    if (repartition != 0) sc.parallelize(samples, repartition)
    else sc.parallelize(samples)
  distributed.toDF("labels", "features")
}

/**
 * Generates a synthetic regression Dataset of [[GenericDSRow]].
 *
 * Three features are drawn as random integers in [0, 100) (as Doubles), a constant
 * 1.0 is appended as fourth feature, and the label is `5*f1 + 14*f2 + 0.5*f3 + 2`.
 *
 * @param taille      number of samples
 * @param repartition number of slices passed to `parallelize`; 0 uses the default
 */
def generateDataSet(taille: Int, repartition: Int = 0): Dataset[GenericDSRow] = {
  val spark = getSS
  import spark.implicits._

  val rng = scala.util.Random
  // Draws one full feature column; the columns are generated one after another.
  def drawColumn(): IndexedSeq[Double] = (1 to taille).map(_ => rng.nextInt(100).toDouble)
  val first = drawColumn()
  val second = drawColumn()
  val third = drawColumn()

  val samples = first.indices.map { i =>
    val (a, b, c) = (first(i), second(i), third(i))
    (5 * a + 14 * b + 0.5 * c + 2, Array(a, b, c, 1.0))
  }

  val distributed =
    if (repartition != 0) sc.parallelize(samples, repartition)
    else sc.parallelize(samples)
  distributed.toDF("labels", "features").as[GenericDSRow]
}

In [0]:
%scala

def gridSearchGeneriqueDataSetGenere(tailleMaxDatasetAGenerer: Int,
                                     listePartitions: Array[Int],
                                     listeTypeDataset: Array[Int], // RDD = 0, DF = 1, DS = 2
                                     listeDataSize: Array[Int],
                                     listeBatchSize: Array[Int],
                                     listeVariantesGD: Array[Int], // Standard = 0, Momentum = 1, Adagrad = 2
                                     listeLocalParallele: Array[Int], // 0 = local, 1 = parralèle
                                     listeEpochsGlobal: Array[Int],
                                     listeEpochsLocal: Array[Int],
                                     pathCSVExport : String): DataFrame = {

  var lignesCVS = ArrayBuffer[Row]()
  var mapLigneCSV = Map[String, Any]()

  for (dataSize <- listeDataSize) {
    println("###### datasize = " + dataSize)
    for (partitionsNumber <- listePartitions) {
      println("#### Partitions number = " + partitionsNumber)
      val rddG = generateRDD(tailleMaxDatasetAGenerer, partitionsNumber)
      val dfG = generateDataFrame(tailleMaxDatasetAGenerer, partitionsNumber)
      val dsG = generateDataSet(tailleMaxDatasetAGenerer, partitionsNumber)
      val datasetGArray = rddG.collect()
      for (epochGlobal <- listeEpochsGlobal) {
        println("## Epoch globale = " + epochGlobal)
        for (epochLocal <- listeEpochsLocal) {
          println("Epoch locale = " + epochLocal)


          for (batchSize <- listeBatchSize) {
            println("Batch size = " + batchSize)
            val spark = getSS
            import spark.implicits._
            val dfT = datasetGArray.toSeq.toDF("labels", "features")

            var w = Array(0.0, 0.0, 0.0, 0.0)
            var r = Row.empty

            for (localParallel <- listeLocalParallele) {
              for (varianteGD <- listeVariantesGD) {

                if (localParallel == 0) {
                  if (varianteGD == 0) {
                    //region Standard local
                    // Version standard
                    // Local

                    w = Array(0.0, 0.0, 0.0, 0.0)

                    val t0 = System.nanoTime()
                    w = MBGD_local_SansSpark(w, 0.00001, epochLocal, datasetGArray, batchSize)
                    val t1 = System.nanoTime()
                    val predStandardLocal = predict(dfT, w)
                    val scoreStandardLocal = r2_score(predStandardLocal, dfT)
                    //println("Score = " + scoreStandardLocal)
                    mapLigneCSV += ("epochGlobal" -> epochGlobal, "epochLocal" -> epochLocal,
                      "score" -> scoreStandardLocal, "batchSize" -> batchSize,
                      "partitionsNumber" -> partitionsNumber, "varianteGD" -> "Standard",
                      "localParallele" -> "Local", "dataSize" -> dataSize,
                      "typeDataset" -> "Array")
                    mapLigneCSV += ("tempsExecution" -> (t1 - t0) / 1000000000.0)
                    r = Row.fromSeq(Seq(mapLigneCSV("varianteGD"), mapLigneCSV("typeDataset"),
                      mapLigneCSV("localParallele"), mapLigneCSV("dataSize"),
                      mapLigneCSV("batchSize"), mapLigneCSV("partitionsNumber"),
                      mapLigneCSV("epochGlobal"), mapLigneCSV("epochLocal"), mapLigneCSV("score"),
                      mapLigneCSV("tempsExecution")))
                    //println(r mkString ", ")
                    lignesCVS += r
                    //endregion
                  }
                  //region Momentum local
                  // Version Momentum
                  // Local
                  if (varianteGD == 1) {
                    w = Array(0.0, 0.0, 0.0, 0.0)

                    val M_Mom = w
                    val t0ML = System.nanoTime()
                    w = MOM_MBGD_local(w, 0.00001, epochLocal, datasetGArray, batchSize, M_Mom, 0.90)._1
                    val t1ML = System.nanoTime()
                    val predMomentumLocal = predict(dfT, w)
                    val scoreMomentumLocal = r2_score(predMomentumLocal, dfT)
                    mapLigneCSV += ("epochGlobal" -> epochGlobal, "epochLocal" -> epochLocal,
                      "score" -> scoreMomentumLocal, "batchSize" -> batchSize,
                      "partitionsNumber" -> partitionsNumber, "varianteGD" -> "Momentum",
                      "localParallele" -> "Local", "dataSize" -> dataSize,
                      "typeDataset" -> "Array")
                    //println("Time = " + timeStandardLocal)
                    mapLigneCSV += ("tempsExecution" -> (t1ML - t0ML) / 1000000000.0)
                    r = Row.fromSeq(Seq(mapLigneCSV("varianteGD"), mapLigneCSV("typeDataset"),
                      mapLigneCSV("localParallele"), mapLigneCSV("dataSize"), mapLigneCSV("batchSize"),
                      mapLigneCSV("partitionsNumber"),
                      mapLigneCSV("epochGlobal"), mapLigneCSV("epochLocal"), mapLigneCSV("score"),
                      mapLigneCSV("tempsExecution")))
                    //println(r mkString ", ")
                    lignesCVS += r
                    //endregion

                  }
                  //region Adagrad local
                  // Version Adagrad
                  // Local
                  if (varianteGD == 2) {
                    w = Array(0.0, 0.0, 0.0, 0.0)

                    val M_Ada = w
                    val t0AL = System.nanoTime()
                    w = ADA_MBGD_local(w, 0.00001, epochLocal, datasetGArray, batchSize, M_Ada, 0.90)._1
                    val t1AL = System.nanoTime()
                    val predAdagradLocal = predict(dfT, w)
                    val scoreAdagradLocal = r2_score(predAdagradLocal, dfT)
                    //println("Score = " + scoreStandardLocal)
                    mapLigneCSV += ("epochGlobal" -> epochGlobal, "epochLocal" -> epochLocal,
                      "score" -> scoreAdagradLocal, "batchSize" -> batchSize,
                      "partitionsNumber" -> partitionsNumber, "varianteGD" -> "Adagrad",
                      "localParallele" -> "Local", "dataSize" -> dataSize,
                      "typeDataset" -> "Array")
                    mapLigneCSV += ("tempsExecution" -> (t1AL - t0AL) / 1000000000.0)
                    r = Row.fromSeq(Seq(mapLigneCSV("varianteGD"), mapLigneCSV("typeDataset"),
                      mapLigneCSV("localParallele"), mapLigneCSV("dataSize"), mapLigneCSV("batchSize"),
                      mapLigneCSV("partitionsNumber"),
                      mapLigneCSV("epochGlobal"), mapLigneCSV("epochLocal"), mapLigneCSV("score"),
                      mapLigneCSV("tempsExecution")))
                    //println(r mkString ", ")
                    lignesCVS += r
                    //endregion
                  }
                }
                else {
                  // Parallèle

                  for (typeDS <- listeTypeDataset) {
                    if (varianteGD == 0) {
                      //<editor-fold desc="Parallele standard">
                      // Standard

                      if (typeDS == 0) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PS = System.nanoTime()
                        w = MBGD_parallel_RDD(w, 0.00001, epochGlobal, epochLocal, rddG, batchSize)
                        val t1PS = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "RDD")
                        mapLigneCSV += ("tempsExecution" -> (t1PS - t0PS) / 1000000000.0)
                      }

                      if (typeDS == 1) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PS = System.nanoTime()
                        w = MBGD_parallel_DF(w, 0.00001, epochGlobal, epochLocal, dfG, batchSize)
                        val t1PS = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "DataFrame")
                        mapLigneCSV += ("tempsExecution" -> (t1PS - t0PS) / 1000000000.0)
                      }
                      if (typeDS == 2) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PS = System.nanoTime()
                        w = MBGD_parallel_DS(w, 0.00001, epochGlobal, epochLocal, dsG, batchSize)
                        val t1PS = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "Dataset")
                        mapLigneCSV += ("tempsExecution" -> (t1PS - t0PS) / 1000000000.0)
                      }
                      val predStandardParallele = predict(dfG, w)
                      val scoreStandardParallele = r2_score(predStandardParallele, dfG)
                      mapLigneCSV += ("epochGlobal" -> epochGlobal, "epochLocal" -> epochLocal,
                        "score" -> scoreStandardParallele, "batchSize" -> batchSize,
                        "partitionsNumber" -> partitionsNumber, "varianteGD" -> "Standard",
                        "localParallele" -> "Parallele", "dataSize" -> dataSize)
                      r = Row.fromSeq(Seq(mapLigneCSV("varianteGD"), mapLigneCSV("typeDataset"),
                        mapLigneCSV("localParallele"), mapLigneCSV("dataSize"), mapLigneCSV("batchSize"),
                        mapLigneCSV("partitionsNumber"),
                        mapLigneCSV("epochGlobal"), mapLigneCSV("epochLocal"), mapLigneCSV("score"),
                        mapLigneCSV("tempsExecution")))
                      //println(r mkString ", ")
                      lignesCVS += r
                      //</editor-fold>
                    }
                    //region Paralle Momentum
                    // Momentum
                    if (varianteGD == 1) {
                      if (typeDS == 0) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PM = System.nanoTime()
                        w = MOM_MBGD_parallel_RDD(w, 0.00001, epochGlobal, epochLocal,
                          rddG, batchSize, 0.90)
                        val t1PM = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "RDD")
                        mapLigneCSV += ("tempsExecution" -> (t1PM - t0PM) / 1000000000.0)
                      }

                      if (typeDS == 1) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PM = System.nanoTime()
                        w = MOM_MBGD_parallel_DF(w, 0.00001, epochGlobal, epochLocal,
                          dfG, batchSize, 0.90)
                        val t1PM = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "DataFrame")
                        mapLigneCSV += ("tempsExecution" -> (t1PM - t0PM) / 1000000000.0)
                      }
                      if (typeDS == 2) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PM = System.nanoTime()
                        w = MOM_MBGD_parallel_DS(w, 0.00001, epochGlobal, epochLocal,
                          dsG, batchSize, 0.90)
                        val t1PM = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "Dataset")
                        mapLigneCSV += ("tempsExecution" -> (t1PM - t0PM) / 1000000000.0)
                      }
                      val predParalleleMomentum = predict(dfG, w)
                      val scoreParalleleMomentum = r2_score(predParalleleMomentum, dfG)
                      mapLigneCSV += ("epochGlobal" -> epochGlobal, "epochLocal" -> epochLocal,
                        "score" -> scoreParalleleMomentum, "batchSize" -> batchSize,
                        "partitionsNumber" -> partitionsNumber, "varianteGD" -> "Momentum",
                        "localParallele" -> "Parallele", "dataSize" -> dataSize)
                      r = Row.fromSeq(Seq(mapLigneCSV("varianteGD"), mapLigneCSV("typeDataset"),
                        mapLigneCSV("localParallele"),
                        mapLigneCSV("dataSize"), mapLigneCSV("batchSize"), mapLigneCSV("partitionsNumber"),
                        mapLigneCSV("epochGlobal"), mapLigneCSV("epochLocal"), mapLigneCSV("score"),
                        mapLigneCSV("tempsExecution")))
                      //println(r mkString ", ")
                      lignesCVS += r
                      //endregion
                    }
                    if (varianteGD == 2) {
                      //region Parallele Adagrad
                      // Adagrad
                      if (typeDS == 0) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PA = System.nanoTime()
                        w = ADA_MBGD_parallel_RDD(w, 10, epochGlobal, epochLocal,
                          rddG, batchSize, 0.000000001)
                        val t1PA = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "RDD")
                        mapLigneCSV += ("tempsExecution" -> (t1PA - t0PA) / 1000000000.0)
                      }
                      if (typeDS == 1) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PA = System.nanoTime()
                        w = ADA_MBGD_parallel_DF(w, 10, epochGlobal, epochLocal,
                          dfG, batchSize, 0.000000001)
                        val t1PA = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "DataFrame")
                        mapLigneCSV += ("tempsExecution" -> (t1PA - t0PA) / 1000000000.0)
                      }
                      if (typeDS == 2) {
                        w = Array(0.0, 0.0, 0.0, 0.0)
                        val t0PA = System.nanoTime()
                        w = ADA_MBGD_parallel_DS(w, 10, epochGlobal, epochLocal,
                          dsG, batchSize, 0.000000001)
                        val t1PA = System.nanoTime()
                        mapLigneCSV += ("typeDataset" -> "Dataset")
                        mapLigneCSV += ("tempsExecution" -> (t1PA - t0PA) / 1000000000.0)
                      }
                      val predParalleleAdagrad = predict(dfG, w)
                      val scoreParalleleAdagrad = r2_score(predParalleleAdagrad, dfG)
                      mapLigneCSV += ("epochGlobal" -> epochGlobal, "epochLocal" -> epochLocal,
                        "score" -> scoreParalleleAdagrad, "batchSize" -> batchSize,
                        "partitionsNumber" -> partitionsNumber, "varianteGD" -> "Adagrad",
                        "localParallele" -> "Parallele", "dataSize" -> dataSize)
                      r = Row.fromSeq(Seq(mapLigneCSV("varianteGD"), mapLigneCSV("typeDataset"),
                        mapLigneCSV("localParallele"), mapLigneCSV("dataSize"), mapLigneCSV("batchSize"),
                        mapLigneCSV("partitionsNumber"),
                        mapLigneCSV("epochGlobal"), mapLigneCSV("epochLocal"), mapLigneCSV("score"),
                        mapLigneCSV("tempsExecution")))
                      //println(r mkString ", ")
                      lignesCVS += r
                      //endregion
                    }
                  }
                }
              }
            } // localParallele
          } // batchSize
        } // epochLocal
      } // epochGlobal
    } // partitionNumber
  } // dataSize

  val rdd = sc.makeRDD(lignesCVS)
  val schema = StructType(
    StructField("varianteGD", StringType, false) ::
      StructField("typeDataset", StringType, false) ::
      StructField("localParallele", StringType, false) ::
      StructField("dataSize", IntegerType, false) ::
      StructField("batchSize", IntegerType, false) ::
      StructField("partitionsNumber", IntegerType, false) ::
      StructField("epochGlobal", IntegerType, true) ::
      StructField("epochLocal", IntegerType, false) ::
      StructField("score", DoubleType, false) ::
      StructField("tempsExecutionMs", DoubleType, false) ::
      Nil)
  val df = getSS.createDataFrame(rdd, schema)
  df.show(false)
  exportToCSV(df, pathCSVExport)
  df

}

In [0]:
%scala

// Grid-search run on the generated dataset.
// Arguments are grouped by role: data generation and layout, what to train, how long to train,
// and where to export. The integer codes are interpreted inside
// gridSearchGeneriqueDataSetGenere (presumably 0/1/2 = RDD/DataFrame/Dataset for the dataset
// type and 0/1/2 = Standard/Momentum/Adagrad for the variant -- TODO confirm against its header).
gridSearchGeneriqueDataSetGenere(
    tailleMaxDatasetAGenerer = 10000,
    listeDataSize = Array(10000),
    listePartitions = Array(4, 8),
    listeLocalParallele = Array(1),
    listeTypeDataset = Array(0, 1, 2),
    listeVariantesGD = Array(0, 1, 2),
    listeEpochsGlobal = Array(1),
    listeEpochsLocal = Array(1),
    listeBatchSize = Array(1, 32),
    pathCSVExport = "grid_DSG.csv")

In [0]:
%scala

/**
  * Runs a grid search of mini-batch gradient-descent variants on a train/test CSV pair and
  * returns one result row per evaluated configuration.
  *
  * For every combination of the supplied lists, the selected trainer is started from a zero
  * weight vector of 23 coefficients, only the training call is timed, and the resulting weights
  * are scored with `r2_score` on the test data. The collected rows are shown, exported through
  * `exportToCSV` and returned.
  *
  * Loop nesting (outermost first): dataSize, partitionsNumber, epochGlobal, epochLocal,
  * batchSize, localParallel, varianteGD and, for parallel runs only, typeDS. Rows are appended
  * in that order.
  *
  * @param pathDatasetTrain    path handed to `recupCsvEnDataFrame` for the training data
  * @param pathDatasetTest     path handed to `recupCsvEnDataFrame` for the test data
  * @param listePartitions     partition counts passed to `coalesce` on both DataFrames
  * @param listeTypeDataset    dataset abstraction for parallel runs: 0 = RDD, 1 = DataFrame,
  *                            2 = Dataset; any other code is skipped
  * @param listeDataSize       values that are printed and copied into the `dataSize` column;
  *                            this function does not resize the data with them
  * @param listeBatchSize      mini-batch sizes forwarded to the trainers
  * @param listeVariantesGD    0 = Standard, 1 = Momentum, 2 = Adagrad; any other code is skipped
  * @param listeLocalParallele 0 = local run on the collected array; any other value = parallel run
  * @param listeEpochsGlobal   global epoch counts, forwarded to the parallel trainers and only
  *                            recorded for local runs
  * @param listeEpochsLocal    local epoch counts forwarded to every trainer
  * @param pathCVSExport       destination handed to `exportToCSV`
  * @return the DataFrame holding one row per evaluated configuration
  */
def gridSearchGeneriqueMinierDataSet(pathDatasetTrain: String = "/FileStore/tables/dtrain.csv",
                                       pathDatasetTest: String = "/FileStore/tables/dtest.csv",
                                       listePartitions: Array[Int],
                                       listeTypeDataset: Array[Int], // RDD = 0, DF = 1, DS = 2
                                       listeDataSize: Array[Int],
                                       listeBatchSize: Array[Int],
                                       listeVariantesGD: Array[Int], // Standard = 0, Momentum = 1, Adagrad = 2
                                       listeLocalParallele: Array[Int], // 0 = local, 1 = parallel
                                       listeEpochsGlobal: Array[Int],
                                       listeEpochsLocal: Array[Int],
                                       pathCVSExport: String
                                      ): DataFrame = {

    // One session and one implicits import serve the whole search (needed for `.as[...]` and `.toDF`).
    val spark = getSS
    import spark.implicits._

    val lignesCVS = ArrayBuffer[Row]()

    // Every run starts from its own zero vector so that no state leaks from one run to the next.
    def poidsInitiaux(): Array[Double] = Array.fill(23)(0.0)

    for (dataSize <- listeDataSize) {
      // NOTE(review): dataSize is only printed and recorded; the CSV data is used in full.
      println("###### datasize = " + dataSize)
      for (partitionsNumber <- listePartitions) {

        println("#### Partitions number = " + partitionsNumber)

        // NOTE(review): `coalesce` can only lower the partition count. If the CSV is loaded with
        // fewer partitions than requested, the recorded partitionsNumber overstates the real one;
        // `repartition` would be needed to guarantee the count -- confirm the intent.
        val dfTrain: DataFrame = recupCsvEnDataFrame(pathDatasetTrain).coalesce(partitionsNumber)
        val dfTest: DataFrame = recupCsvEnDataFrame(pathDatasetTest).coalesce(partitionsNumber)
        val dsTrain: Dataset[GenericDSRow] = dfTrain.as[GenericDSRow]
        val rddTrain = dfTrain.rdd.map(x => (x.getDouble(0), x.getSeq(1).toArray[Double]))
        val rddTest = dfTest.rdd.map(x => (x.getDouble(0), x.getSeq(1).toArray[Double]))
        val datasetTrainArray = rddTrain.collect()
        val datasetTestArray = rddTest.collect()

        // Local runs are scored on a DataFrame rebuilt from the collected test rows. It does not
        // depend on the inner loop variables, so it is built once per partition setting.
        val dfT = datasetTestArray.toSeq.toDF("labels", "features")

        for (epochGlobal <- listeEpochsGlobal) {
          println("## Epoch globale = " + epochGlobal)
          for (epochLocal <- listeEpochsLocal) {
            println("Epoch locale = " + epochLocal)

            for (batchSize <- listeBatchSize) {
              println("Batch size = " + batchSize)

              // Trains from a fresh zero vector, times only the training call, scores the weights
              // on `dfEval` and appends the result row. Each row is built from local values only,
              // so a run can never inherit the dataset type, timing or weights of a previous run.
              def enregistrer(variante: String, typeDonnees: String, mode: String, dfEval: DataFrame)
                             (entrainer: Array[Double] => Array[Double]): Unit = {
                val poidsDepart = poidsInitiaux()
                val debut = System.nanoTime()
                val poids = entrainer(poidsDepart)
                val fin = System.nanoTime()
                val score = r2_score(predict(dfEval, poids), dfEval)
                // Column order must match the schema declared at the end of the function.
                lignesCVS += Row.fromSeq(Seq[Any](variante, typeDonnees, mode, dataSize, batchSize,
                  partitionsNumber, epochGlobal, epochLocal, score,
                  (fin - debut) / 1000000000.0)) // nanoseconds -> seconds
              }

              for (localParallel <- listeLocalParallele; varianteGD <- listeVariantesGD) {
                if (localParallel == 0) {
                  // Local runs: train on the collected array, score on dfT.
                  varianteGD match {
                    case 0 =>
                      enregistrer("Standard", "Array", "Local", dfT) { w0 =>
                        MBGD_local_SansSpark(w0, 0.00001, epochLocal, datasetTrainArray, batchSize)
                      }
                    case 1 =>
                      // The accumulator is a separate zero vector so it never shares storage
                      // with the weights.
                      enregistrer("Momentum", "Array", "Local", dfT) { w0 =>
                        MOM_MBGD_local(w0, 0.00001, epochLocal, datasetTrainArray,
                          batchSize, poidsInitiaux(), 0.90)._1
                      }
                    case 2 =>
                      // NOTE(review): the local Adagrad run uses 0.00001 and 0.90 where the
                      // parallel Adagrad runs use 10 and 0.000000001 -- confirm this is intended.
                      enregistrer("Adagrad", "Array", "Local", dfT) { w0 =>
                        ADA_MBGD_local(w0, 0.00001, epochLocal,
                          datasetTrainArray, batchSize, poidsInitiaux(), 0.90)._1
                      }
                    case _ => () // unknown variant code: nothing to run
                  }
                } else {
                  // Parallel runs: train on the Spark-backed data, score on dfTest.
                  for (typeDS <- listeTypeDataset) {
                    (varianteGD, typeDS) match {
                      // Standard
                      case (0, 0) =>
                        enregistrer("Standard", "RDD", "Parallele", dfTest) { w0 =>
                          MBGD_parallel_RDD(w0, 0.00001, epochGlobal, epochLocal, rddTrain, batchSize)
                        }
                      case (0, 1) =>
                        enregistrer("Standard", "DataFrame", "Parallele", dfTest) { w0 =>
                          MBGD_parallel_DF(w0, 0.00001, epochGlobal, epochLocal, dfTrain, batchSize)
                        }
                      case (0, 2) =>
                        enregistrer("Standard", "Dataset", "Parallele", dfTest) { w0 =>
                          MBGD_parallel_DS(w0, 0.00001, epochGlobal, epochLocal, dsTrain, batchSize)
                        }
                      // Momentum
                      case (1, 0) =>
                        enregistrer("Momentum", "RDD", "Parallele", dfTest) { w0 =>
                          MOM_MBGD_parallel_RDD(w0, 0.00001, epochGlobal, epochLocal,
                            rddTrain, batchSize, 0.90)
                        }
                      case (1, 1) =>
                        enregistrer("Momentum", "DataFrame", "Parallele", dfTest) { w0 =>
                          MOM_MBGD_parallel_DF(w0, 0.00001, epochGlobal, epochLocal,
                            dfTrain, batchSize, 0.90)
                        }
                      case (1, 2) =>
                        enregistrer("Momentum", "Dataset", "Parallele", dfTest) { w0 =>
                          MOM_MBGD_parallel_DS(w0, 0.00001, epochGlobal, epochLocal,
                            dsTrain, batchSize, 0.90)
                        }
                      // Adagrad
                      case (2, 0) =>
                        enregistrer("Adagrad", "RDD", "Parallele", dfTest) { w0 =>
                          ADA_MBGD_parallel_RDD(w0, 10, epochGlobal, epochLocal,
                            rddTrain, batchSize, 0.000000001)
                        }
                      case (2, 1) =>
                        enregistrer("Adagrad", "DataFrame", "Parallele", dfTest) { w0 =>
                          ADA_MBGD_parallel_DF(w0, 10, epochGlobal, epochLocal,
                            dfTrain, batchSize, 0.000000001)
                        }
                      case (2, 2) =>
                        enregistrer("Adagrad", "Dataset", "Parallele", dfTest) { w0 =>
                          ADA_MBGD_parallel_DS(w0, 10, epochGlobal, epochLocal,
                            dsTrain, batchSize, 0.000000001)
                        }
                      // Unknown variant or dataset-type code: skip instead of emitting a row.
                      case _ => ()
                    }
                  }
                }
              } // localParallele x varianteGD
            } // batchSize
          } // epochLocal
        } // epochGlobal
      } // partitionNumber
    } // dataSize

    // `sc` is not defined in this function; it is expected to be in scope where this cell runs.
    val rdd = sc.makeRDD(lignesCVS)
    // NOTE(review): the last column is named "tempsExecutionMs" but the stored value is in
    // seconds (nanoTime difference / 1e9). The name is kept so existing CSV consumers still work.
    val schema = StructType(Seq(
      StructField("varianteGD", StringType, false),
      StructField("typeDataset", StringType, false),
      StructField("localParallele", StringType, false),
      StructField("dataSize", IntegerType, false),
      StructField("batchSize", IntegerType, false),
      StructField("partitionsNumber", IntegerType, false),
      StructField("epochGlobal", IntegerType, true),
      StructField("epochLocal", IntegerType, false),
      StructField("score", DoubleType, false),
      StructField("tempsExecutionMs", DoubleType, false)))
    val df = spark.createDataFrame(rdd, schema)
    df.show(false)
    exportToCSV(df, pathCVSExport)
    df

  }

In [0]:
%scala

// Grid-search run on the train/test CSV pair; both dataset paths keep their default values.
// listeLocalParallele = Array(1) selects the parallel branch only, so each variant
// (0 = Standard, 1 = Momentum, 2 = Adagrad) is run once per dataset type
// (0 = RDD, 1 = DataFrame, 2 = Dataset).
// The dataSize value is only printed and written to the result rows by the function.
gridSearchGeneriqueMinierDataSet(
  listeDataSize = Array(1000),
  listePartitions = Array(4),
  listeLocalParallele = Array(1),
  listeTypeDataset = Array(0, 1, 2),
  listeVariantesGD = Array(0, 1, 2),
  listeEpochsGlobal = Array(1),
  listeEpochsLocal = Array(1),
  listeBatchSize = Array(32),
  pathCVSExport = "grid_Minier.csv")