In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{RegexTokenizer,StopWordsRemover,
                                    StringIndexer,CountVectorizer,
                                    CountVectorizerModel,VectorAssembler,
                                   IDF,OneHotEncoderEstimator}

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

// import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
// import org.apache.spark.ml.evaluation
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import org.apache.spark.ml.param.ParamMap


val conf = new SparkConf().setAll(Map(
      "spark.scheduler.mode" -> "FIFO",
      "spark.speculation" -> "false",
      "spark.reducer.maxSizeInFlight" -> "48m",
      "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
      "spark.kryoserializer.buffer.max" -> "1g",
      "spark.shuffle.file.buffer" -> "32k",
      "spark.default.parallelism" -> "12",
      "spark.sql.shuffle.partitions" -> "12",
      "spark.driver.maxResultSize" -> "2g"
    ))

val spark = SparkSession
  .builder
  .config(conf)
  .appName("TP Spark : Trainer")
  .getOrCreate()


/*******************************************************************************
  *
  *       TP 3
  *
  *       - lire le fichier sauvegarder précédemment
  *       - construire les Stages du pipeline, puis les assembler
  *       - trouver les meilleurs hyperparamètres pour l'entraînement du pipeline avec une grid-search
  *       - Sauvegarder le pipeline entraîné
  *
  *       if problems with unimported modules => sbt plugins update
  *
  ********************************************************************************/

println("hello world ! from Trainer")

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.88:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1572451134780)
SparkSession available as 'spark'


hello world ! from Trainer


import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, StringIndexer, CountVectorizer, CountVectorizerModel, VectorAssembler, IDF, OneHotEncoderEstimator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@20ba0352
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@4c1ca7a


In [2]:
val df:DataFrame = spark.read.parquet("/home/jorge/Documents/Cours/Spark/RepoAdotTPs/data/prepared_trainingset/")

// df.select("project_id", "name", "desc", "goal").show(5)
// df.select("keywords", "final_status", "country2", "currency2").show(5)
// df.select("deadline2", "created_at2", "launched_at2", "days_campaign").show(5)
// df.select("hours_prepa", "text")

df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]


In [87]:
val tokenizer = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")
// val dfTokenized = tokenizer.transform(df)


val remover = new StopWordsRemover()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered")

// val dfsw = remover.transform(dfTokenized)
 

val cvModel: CountVectorizer = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("vect")
    .setMinDF(50)
    
    

// val dfv = cvModel.fit(dfsw).transform(dfsw)



val idf = new IDF()
    .setInputCol(cvModel.getOutputCol)
    .setOutputCol("tfidf")

// val idfModel = idf.fit(dfv)

// val rescaledData = idfModel.transform(dfv)
                                      

val indexerCountry = new StringIndexer()
  .setInputCol("country2")
  .setOutputCol("country_indexed")

val indexerCurrency = new StringIndexer()
  .setInputCol("currency2")
  .setOutputCol("currency_indexed")


// val indexedCountry = indexerCountry.fit(rescaledData).transform(rescaledData)
// val indexedCountryCurrency = indexerCurrency.fit(indexedCountry).transform(indexedCountry)


val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("country_indexed", "currency_indexed"))
  .setOutputCols(Array("country_onehot", "currency_onehot"))

// val model = encoder.fit(indexedCountryCurrency)
// val encoded = model.transform(indexedCountryCurrency)


val assembler = new VectorAssembler()
  .setInputCols(Array("tfidf","days_campaign","hours_prepa","goal","country_onehot","currency_onehot"))
  .setOutputCol("features")

val lr = new LogisticRegression()
  .setElasticNetParam(0.0)
  .setFitIntercept(true)
  .setFeaturesCol("features")
  .setLabelCol("final_status")
  .setStandardization(true)
  .setPredictionCol("predictions")
  .setRawPredictionCol("raw_predictions")
  .setThresholds(Array(0.7, 0.3))
  .setTol(1.0e-6)
  .setMaxIter(20)

// val transformed = assembler
//     .setHandleInvalid("skip")
//     .transform(encoded)
//     .drop("project_id","name","desc","goal","keywords",
//          "country2","currency2","deadline2","created_at2","launched_at2",
//          "days_campaign","hours_prepa","text","tokens","filtered","vect",
//          "country_indexed","currency_indexed","tfidf","days_campaign",
//           "hours_prepa","goal","country_onehot","currency_onehot")
    




tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_aa96344723f5
remover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_9cc38643efbf
cvModel: org.apache.spark.ml.feature.CountVectorizer = cntVec_fd09521beb46
idf: org.apache.spark.ml.feature.IDF = idf_41061e223cb2
indexerCountry: org.apache.spark.ml.feature.StringIndexer = strIdx_3483fbbf35e5
indexerCurrency: org.apache.spark.ml.feature.StringIndexer = strIdx_6ab476750a62
encoder: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_f0ff38978403
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_6420eefb8ffc
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_7240eeb9b1b1


In [4]:
// transformed.show(numRows=5,truncate=false)

val pipeline = new Pipeline()
  .setStages(Array(tokenizer, remover,cvModel,idf,indexerCountry,
                  indexerCurrency,encoder, assembler,lr ))

// Fit the pipeline to training documents.
val model = pipeline.fit(df)



pipeline: org.apache.spark.ml.Pipeline = pipeline_a690d506a23b
model: org.apache.spark.ml.PipelineModel = pipeline_a690d506a23b


In [5]:
model.transform(df).columns

model.transform(df).select("text","predictions","final_status").show(100)

+--------------------+-----------+------------+
|                text|predictions|final_status|
+--------------------+-----------+------------+
|american options ...|        1.0|           0|
|iheadbones bone c...|        0.0|           0|
|the fridge magazi...|        0.0|           0|
|support new men's...|        0.0|           0|
|can('t) a psychol...|        0.0|           0|
|fragmented fate e...|        0.0|           0|
|transport (suspen...|        0.0|           0|
|the secret life o...|        0.0|           0|
|cc survival decep...|        0.0|           0|
|the best protein ...|        0.0|           0|
|paradise falls pa...|        1.0|           0|
|the chalet woodsh...|        1.0|           1|
|vagabond mobile g...|        0.0|           0|
|southern shakespe...|        1.0|           1|
|leviathan: montau...|        1.0|           1|
|the candle tray h...|        0.0|           0|
|sun skin the miss...|        0.0|           0|
|7sonic debut stud...|        0.0|      

In [6]:
val Array(train,test) = df.randomSplit(Array[Double](0.9, 0.1))
val size = (train.count,test.count)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
size: (Long, Long) = (96910,10704)


In [7]:
val model2 = pipeline.fit(train)

model2: org.apache.spark.ml.PipelineModel = pipeline_a690d506a23b


In [8]:
val predictions = model2.transform(test)

predictions.select("final_status","predictions","probability").show(100)

+------------+-----------+--------------------+
|final_status|predictions|         probability|
+------------+-----------+--------------------+
|           0|        1.0|[0.32634362590507...|
|           0|        0.0|[0.99999999999840...|
|           0|        0.0|[0.99990862947358...|
|           0|        0.0|[0.99999997832714...|
|           1|        0.0|[0.99999829968507...|
|           0|        0.0|[0.99999997618484...|
|           0|        1.0|[6.07975854013273...|
|           0|        1.0|[0.07003446318569...|
|           1|        1.0|[1.14444280953689...|
|           0|        0.0|[0.99999999862510...|
|           1|        1.0|[1.88378357933403...|
|           0|        1.0|[0.43119273106945...|
|           0|        0.0|[0.99999999696719...|
|           1|        0.0|[0.99999999911144...|
|           0|        0.0|[0.99972014926533...|
|           1|        1.0|[4.72478290707708...|
|           0|        0.0|[0.99994963921231...|
|           0|        0.0|[0.98645639926

predictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [9]:
// // Clear the prediction threshold so the model will return probabilities
// model2.clearThreshold

// // Compute raw scores on the test set
// val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
//   val prediction = model2.predict(features)
//   (prediction, label)
// }

// // Instantiate metrics object
// val metrics = new BinaryClassificationMetrics(predictionAndLabels)

// // Precision by threshold
// val precision = metrics.precisionByThreshold
// precision.foreach { case (t, p) =>
//   println(s"Threshold: $t, Precision: $p")
// }

// // Recall by threshold
// val recall = metrics.recallByThreshold
// recall.foreach { case (t, r) =>
//   println(s"Threshold: $t, Recall: $r")
// }

// // Precision-Recall Curve
// val PRC = metrics.pr

// // F-measure
// val f1Score = metrics.fMeasureByThreshold
// f1Score.foreach { case (t, f) =>
//   println(s"Threshold: $t, F-score: $f, Beta = 1")
// }

// val beta = 0.5
// val fScore = metrics.fMeasureByThreshold(beta)
// f1Score.foreach { case (t, f) =>
//   println(s"Threshold: $t, F-score: $f, Beta = 0.5")
// }

// // AUPRC
// val auPRC = metrics.areaUnderPR
// println(s"Area under precision-recall curve = $auPRC")

// // Compute thresholds used in ROC and PR curves
// val thresholds = precision.map(_._1)

// // ROC Curve
// val roc = metrics.roc

// // AUROC
// val auROC = metrics.areaUnderROC
// println(s"Area under ROC = $auROC")

In [10]:
val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("final_status")
      .setPredictionCol("predictions")
      .setMetricName("f1")

val f1 = evaluator.evaluate(predictions)
println("Test set accuracy = " + f1)

Test set accuracy = 0.6205254720418814


evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_7a5033ea15df
f1: Double = 0.6205254720418814


In [11]:
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder,CrossValidatorModel,TrainValidationSplit}
import org.apache.spark.ml.param.ParamMap



import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, CrossValidatorModel, TrainValidationSplit}
import org.apache.spark.ml.param.ParamMap


In [12]:
val grid = new ParamGridBuilder()
    .addGrid(lr.regParam,Array(10e-8,10e-6,10e-4,10e-2))
    .build()

// val cv = new CrossValidator()
//   .setEstimator(pipeline)
//   .setEvaluator(evaluator)
//   .setEstimatorParamMaps(grid)
//   .setNumFolds(5)

// val cvModel = cv.fit(df)

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(grid)
  // 80% of the data will be used for training and the remaining 20% for validation.
  .setTrainRatio(0.7)

val model = trainValidationSplit.fit(train)


grid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_ba37d9389c1d-regParam: 1.0E-7
}, {
	logreg_ba37d9389c1d-regParam: 1.0E-5
}, {
	logreg_ba37d9389c1d-regParam: 0.001
}, {
	logreg_ba37d9389c1d-regParam: 0.1
})
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_4ec7686e2310
model: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_4ec7686e2310


In [13]:
// model.bestModel.params.
model.bestModel.transform(test).select("name","final_status","predictions","probability").show(100)

+--------------------+------------+-----------+--------------------+
|                name|final_status|predictions|         probability|
+--------------------+------------+-----------+--------------------+
|northern lights o...|           0|        0.0|[0.78392009081449...|
|happy birthday, m...|           0|        0.0|[0.75705606894479...|
|           plexibots|           0|        0.0|[0.70923059165288...|
|           sellsafer|           0|        0.0|[0.95349249745577...|
|arresting power: ...|           1|        0.0|[0.77689872101372...|
|web page to revol...|           0|        0.0|[0.93434627040621...|
|        inchicago.co|           0|        1.0|[0.47398301919751...|
|project tangled -...|           0|        0.0|[0.73872767245204...|
|day of the dead b...|           1|        1.0|[0.24081599891754...|
|j.calarese&co mar...|           0|        0.0|[0.91734089576189...|
|official elfquest...|           1|        1.0|[0.31152821301541...|
|"kol presents fir...|           0

In [14]:
evaluator.evaluate(model.bestModel.transform(test))

res6: Double = 0.6468747625444753


In [103]:
val grid2 = new ParamGridBuilder()
    .addGrid(lr.regParam,Array(10e-8,10e-6,10e-4,10e-2))
    .addGrid(cvModel.minDF,Array(55.0,75.0,95.0))
    .build()


// val cv = new CrossValidator()
//   .setEstimator(pipeline)
//   .setEvaluator(evaluator)
//   .setEstimatorParamMaps(grid)
//   .setNumFolds(5)

// val cvModel = cv.fit(df)

val trainValidationSplit2 = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(grid)
  // 80% of the data will be used for training and the remaining 20% for validation.
  .setTrainRatio(0.7)

val modelbis = trainValidationSplit2.fit(train)


grid2: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	cntVec_fd09521beb46-minDF: 55.0,
	logreg_7240eeb9b1b1-regParam: 1.0E-7
}, {
	cntVec_fd09521beb46-minDF: 75.0,
	logreg_7240eeb9b1b1-regParam: 1.0E-7
}, {
	cntVec_fd09521beb46-minDF: 95.0,
	logreg_7240eeb9b1b1-regParam: 1.0E-7
}, {
	cntVec_fd09521beb46-minDF: 55.0,
	logreg_7240eeb9b1b1-regParam: 1.0E-5
}, {
	cntVec_fd09521beb46-minDF: 75.0,
	logreg_7240eeb9b1b1-regParam: 1.0E-5
}, {
	cntVec_fd09521beb46-minDF: 95.0,
	logreg_7240eeb9b1b1-regParam: 1.0E-5
}, {
	cntVec_fd09521beb46-minDF: 55.0,
	logreg_7240eeb9b1b1-regParam: 0.001
}, {
	cntVec_fd09521beb46-minDF: 75.0,
	logreg_7240eeb9b1b1-regParam: 0.001
}, {
	cntVec_fd09521beb46-minDF: 95.0,
	logreg_7240eeb9b1b1-regParam: 0.001
}, {
	cntVec_fd09521beb46-minDF: 55.0,
	logreg_7240ee...

In [110]:
// modelbis.bestModel
evaluator.evaluate(modelbis.bestModel.transform(test))

res66: Double = 0.6500708947162648


In [111]:
evaluator.evaluate(model.bestModel.transform(test))

res67: Double = 0.6468747625444753
