In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{RegexTokenizer,StopWordsRemover,
                                    StringIndexer,CountVectorizer,
                                    CountVectorizerModel,VectorAssembler,
                                   IDF,OneHotEncoderEstimator}

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

// import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
// import org.apache.spark.ml.evaluation
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


val conf = new SparkConf().setAll(Map(
      "spark.scheduler.mode" -> "FIFO",
      "spark.speculation" -> "false",
      "spark.reducer.maxSizeInFlight" -> "48m",
      "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
      "spark.kryoserializer.buffer.max" -> "1g",
      "spark.shuffle.file.buffer" -> "32k",
      "spark.default.parallelism" -> "12",
      "spark.sql.shuffle.partitions" -> "12",
      "spark.driver.maxResultSize" -> "2g"
    ))

val spark = SparkSession
  .builder
  .config(conf)
  .appName("TP Spark : Trainer")
  .getOrCreate()


/*******************************************************************************
  *
  *       TP 3
  *
  *       - lire le fichier sauvegarder précédemment
  *       - construire les Stages du pipeline, puis les assembler
  *       - trouver les meilleurs hyperparamètres pour l'entraînement du pipeline avec une grid-search
  *       - Sauvegarder le pipeline entraîné
  *
  *       if problems with unimported modules => sbt plugins update
  *
  ********************************************************************************/

println("hello world ! from Trainer")

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.88:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1572432463337)
SparkSession available as 'spark'


hello world ! from Trainer


import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, StringIndexer, CountVectorizer, CountVectorizerModel, VectorAssembler, IDF, OneHotEncoderEstimator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@2579fcc2
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5f38b140


In [2]:
val df:DataFrame = spark.read.parquet("/home/jorge/Documents/Cours/Spark/RepoAdotTPs/data/prepared_trainingset/")

// df.select("project_id", "name", "desc", "goal").show(5)
// df.select("keywords", "final_status", "country2", "currency2").show(5)
// df.select("deadline2", "created_at2", "launched_at2", "days_campaign").show(5)
// df.select("hours_prepa", "text")

df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]


In [3]:
val tokenizer = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")
// val dfTokenized = tokenizer.transform(df)


val remover = new StopWordsRemover()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered")

// val dfsw = remover.transform(dfTokenized)
 

val cvModel: CountVectorizer = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("vect")
    
    

// val dfv = cvModel.fit(dfsw).transform(dfsw)



val idf = new IDF()
    .setInputCol(cvModel.getOutputCol)
    .setOutputCol("tfidf")

// val idfModel = idf.fit(dfv)

// val rescaledData = idfModel.transform(dfv)
                                      

val indexerCountry = new StringIndexer()
  .setInputCol("country2")
  .setOutputCol("country_indexed")

val indexerCurrency = new StringIndexer()
  .setInputCol("currency2")
  .setOutputCol("currency_indexed")


// val indexedCountry = indexerCountry.fit(rescaledData).transform(rescaledData)
// val indexedCountryCurrency = indexerCurrency.fit(indexedCountry).transform(indexedCountry)


val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("country_indexed", "currency_indexed"))
  .setOutputCols(Array("country_onehot", "currency_onehot"))

// val model = encoder.fit(indexedCountryCurrency)
// val encoded = model.transform(indexedCountryCurrency)


val assembler = new VectorAssembler()
  .setInputCols(Array("tfidf","days_campaign","hours_prepa","goal","country_onehot","currency_onehot"))
  .setOutputCol("features")

val lr = new LogisticRegression()
  .setElasticNetParam(0.0)
  .setFitIntercept(true)
  .setFeaturesCol("features")
  .setLabelCol("final_status")
  .setStandardization(true)
  .setPredictionCol("predictions")
  .setRawPredictionCol("raw_predictions")
  .setThresholds(Array(0.7, 0.3))
  .setTol(1.0e-6)
  .setMaxIter(20)

// val transformed = assembler
//     .setHandleInvalid("skip")
//     .transform(encoded)
//     .drop("project_id","name","desc","goal","keywords",
//          "country2","currency2","deadline2","created_at2","launched_at2",
//          "days_campaign","hours_prepa","text","tokens","filtered","vect",
//          "country_indexed","currency_indexed","tfidf","days_campaign",
//           "hours_prepa","goal","country_onehot","currency_onehot")
    




tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_90f95d9fbda1
remover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_6642ff7f2608
cvModel: org.apache.spark.ml.feature.CountVectorizer = cntVec_3d7b967aea14
idf: org.apache.spark.ml.feature.IDF = idf_e6e6e41434da
indexerCountry: org.apache.spark.ml.feature.StringIndexer = strIdx_14b00bde8a30
indexerCurrency: org.apache.spark.ml.feature.StringIndexer = strIdx_ef7429f34d0d
encoder: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_5f7bb18aabc5
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_97c613ba91fd
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_8beeec840755


In [4]:
// transformed.show(numRows=5,truncate=false)

val pipeline = new Pipeline()
  .setStages(Array(tokenizer, remover,cvModel,idf,indexerCountry,
                  indexerCurrency,encoder, assembler,lr ))

// Fit the pipeline to training documents.
val model = pipeline.fit(df)



pipeline: org.apache.spark.ml.Pipeline = pipeline_a3e0987b9dba
model: org.apache.spark.ml.PipelineModel = pipeline_a3e0987b9dba


In [5]:
model.transform(df).columns

model.transform(df).select("text","predictions","final_status").show(100)

+--------------------+-----------+------------+
|                text|predictions|final_status|
+--------------------+-----------+------------+
|american options ...|        1.0|           0|
|iheadbones bone c...|        0.0|           0|
|the fridge magazi...|        0.0|           0|
|support new men's...|        0.0|           0|
|can('t) a psychol...|        0.0|           0|
|fragmented fate e...|        0.0|           0|
|transport (suspen...|        0.0|           0|
|the secret life o...|        0.0|           0|
|cc survival decep...|        0.0|           0|
|the best protein ...|        0.0|           0|
|paradise falls pa...|        1.0|           0|
|the chalet woodsh...|        1.0|           1|
|vagabond mobile g...|        0.0|           0|
|southern shakespe...|        1.0|           1|
|leviathan: montau...|        1.0|           1|
|the candle tray h...|        0.0|           0|
|sun skin the miss...|        0.0|           0|
|7sonic debut stud...|        0.0|      

In [5]:
val Array(train,test) = df.randomSplit(Array[Double](0.9, 0.1))
val size = (train.count,test.count)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
size: (Long, Long) = (96845,10769)


In [6]:
val model2 = pipeline.fit(train)

model2: org.apache.spark.ml.PipelineModel = pipeline_a3e0987b9dba


In [8]:
val predictions = model2.transform(test)

predictions.select("final_status","predictions","probability").show(100)

+------------+-----------+--------------------+
|final_status|predictions|         probability|
+------------+-----------+--------------------+
|           0|        0.0|[0.99999987707389...|
|           1|        0.0|[0.99999999992283...|
|           0|        1.0|[0.01376248593450...|
|           0|        0.0|[0.99977858802339...|
|           0|        0.0|[0.99999968129451...|
|           1|        1.0|[0.09710237657632...|
|           0|        0.0|[1.0,2.0860542810...|
|           0|        0.0|[0.99999999997036...|
|           0|        0.0|[0.95399092757000...|
|           0|        0.0|[1.0,1.5001636745...|
|           1|        0.0|[0.79988997181747...|
|           0|        0.0|[0.99888812244579...|
|           0|        0.0|[0.99999970884972...|
|           1|        1.0|[8.26967814844707...|
|           0|        0.0|[0.76357898886251...|
|           0|        0.0|[0.99999999991236...|
|           0|        1.0|[0.17061237784837...|
|           1|        1.0|[0.24769895038

predictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [10]:
// // Clear the prediction threshold so the model will return probabilities
// model2.clearThreshold

// // Compute raw scores on the test set
// val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
//   val prediction = model2.predict(features)
//   (prediction, label)
// }

// // Instantiate metrics object
// val metrics = new BinaryClassificationMetrics(predictionAndLabels)

// // Precision by threshold
// val precision = metrics.precisionByThreshold
// precision.foreach { case (t, p) =>
//   println(s"Threshold: $t, Precision: $p")
// }

// // Recall by threshold
// val recall = metrics.recallByThreshold
// recall.foreach { case (t, r) =>
//   println(s"Threshold: $t, Recall: $r")
// }

// // Precision-Recall Curve
// val PRC = metrics.pr

// // F-measure
// val f1Score = metrics.fMeasureByThreshold
// f1Score.foreach { case (t, f) =>
//   println(s"Threshold: $t, F-score: $f, Beta = 1")
// }

// val beta = 0.5
// val fScore = metrics.fMeasureByThreshold(beta)
// f1Score.foreach { case (t, f) =>
//   println(s"Threshold: $t, F-score: $f, Beta = 0.5")
// }

// // AUPRC
// val auPRC = metrics.areaUnderPR
// println(s"Area under precision-recall curve = $auPRC")

// // Compute thresholds used in ROC and PR curves
// val thresholds = precision.map(_._1)

// // ROC Curve
// val roc = metrics.roc

// // AUROC
// val auROC = metrics.areaUnderROC
// println(s"Area under ROC = $auROC")

<console>: 53: error: not found: value LabeledPoint

In [9]:
val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("final_status")
      .setPredictionCol("predictions")
      .setMetricName("f1")
val f1 = evaluator.evaluate(predictions)
println("Test set accuracy = " + f1)

Test set accuracy = 0.624888227666162


evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_6bedc65f5757
f1: Double = 0.624888227666162


In [24]:
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder,CrossValidatorModel,TrainValidationSplit}
import org.apache.spark.ml.param.ParamMap



import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, CrossValidatorModel}
import org.apache.spark.ml.param.ParamMap


In [31]:
val grid = new ParamGridBuilder()
    .addGrid(lr.regParam,Array(10e-8,10e-6,10e-4,10e-2))
    .build()

// val cv = new CrossValidator()
//   .setEstimator(pipeline)
//   .setEvaluator(evaluator)
//   .setEstimatorParamMaps(grid)
//   .setNumFolds(5)

// val cvModel = cv.fit(df)

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(grid)
  // 80% of the data will be used for training and the remaining 20% for validation.
  .setTrainRatio(0.7)

val model = trainValidationSplit.fit(train)


grid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_8beeec840755-regParam: 1.0E-7
}, {
	logreg_8beeec840755-regParam: 1.0E-5
}, {
	logreg_8beeec840755-regParam: 0.001
}, {
	logreg_8beeec840755-regParam: 0.1
})
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_dc9f31a4ea30
model: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_dc9f31a4ea30


In [64]:
// model.bestModel.params.
model.bestModel.transform(test).select("name","final_status","predictions","probability").show(100)

+--------------------+------------+-----------+--------------------+
|                name|final_status|predictions|         probability|
+--------------------+------------+-----------+--------------------+
|             the uvu|           0|        0.0|[0.89219364023528...|
|have mic will tra...|           1|        0.0|[0.85525926772793...|
|owlboard fpga dev...|           0|        1.0|[0.40587133154625...|
|            darkarts|           0|        0.0|[0.85331445021782...|
|"""""""""""""""""...|           0|        0.0|[0.83254182259887...|
|please join me in...|           1|        1.0|[0.60509109830901...|
|magnifying glass ...|           0|        0.0|[0.99496553490080...|
| teacher in thailand|           0|        0.0|[0.87563744941621...|
|         qlashamusic|           0|        0.0|[0.78186603235272...|
|gourmet banana pu...|           0|        0.0|[0.99986967876772...|
|ghost -- a music ...|           1|        0.0|[0.70818885942979...|
|lf designs embroi...|           0

In [66]:
evaluator.evaluate(model.bestModel.transform(test))

res37: Double = 0.6504788520086969
