In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

val conf = {new SparkConf().setAll(Map("spark.scheduler.mode" -> "FIFO",
      "spark.speculation" -> "false",
      "spark.reducer.maxSizeInFlight" -> "48m",
      "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
      "spark.kryoserializer.buffer.max" -> "1g",
      "spark.shuffle.file.buffer" -> "32k",
      "spark.default.parallelism" -> "12",
      "spark.sql.shuffle.partitions" -> "12"
    ))}

    // Initialisation du SparkSession qui est le point d'entrée vers Spark SQL (donne accès aux dataframes, aux RDD,
    // création de tables temporaires, etc., et donc aux mécanismes de distribution des calculs)
val spark = {SparkSession
  .builder
  .config(conf)
  .appName("TP Spark : Preprocessor")
  .getOrCreate}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.88:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1575311985253)
SparkSession available as 'spark'


import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@4719d0b6
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@64a23c0c


In [108]:
val df:DataFrame = spark
      .read
      .option("header", true) 
      .option("inferSchema", "true") // pour inférer le type de chaque colonne (Int, String, etc.)
      .csv("/home/jorge/Documents/Git/spark_project_kickstarter_2019_2020/data/train_clean.csv")

println(s"Nombre de lignes : ${df.count}")
println(s"Nombre de colonnes : ${df.columns.length}")
println("\n")
println("Hello World ! from Preprocessor")
println("\n")

// val dfcasted:DataFrame = df
//     .withColumn("goal",$"goal".cast("Int"))
//     .withColumn("deadline",$"deadline".cast("Int"))
//     .withColumn("state_changed_at",$"state_changed_at".cast("Int"))
//     .withColumn("created_at",$"created_at".cast("Int"))
//     .withColumn("launched_at", $"launched_at".cast("Int"))
//     .withColumn("backers_count", $"backers_count".cast("Int"))
//     .withColumn("final_status", $"final_status".cast("Int"))

val cleaner = (s:String)=>{s.replaceAll(",","_")}
val udf_c = udf(cleaner)


val dfCasted: DataFrame = df
    .withColumn("goal", $"goal".cast("Int"))
    .withColumn("deadline" , $"deadline".cast("Int"))
    .withColumn("state_changed_at", $"state_changed_at".cast("Int"))
    .withColumn("created_at", $"created_at".cast("Int"))
    .withColumn("launched_at", $"launched_at".cast("Int"))
    .withColumn("backers_count", $"backers_count".cast("Int"))
    .withColumn("final_status", $"final_status".cast("Int"))
    .withColumn("name",udf_c($"name"))
    .dropDuplicates("deadline")
    .filter(!isnull($"state_changed_at"))
    .withColumn("country",when($"country" === "False",$"currency").otherwise($"country"))
    .filter(($"disable_communication"==="True") || ($"disable_communication"==="False"))
    .drop("disable_communication")
    .filter($"country" rlike ".{2}")
    .filter($"currency" rlike ".{3}")
    .drop("backers_count","state_changed_at")
    .withColumn("days_campaign",datediff(from_unixtime($"deadline"),from_unixtime($"launched_at")))
    .withColumn("hours_prepa",(($"launched_at"-$"created_at")/60).cast("Int"))
    .drop("launched_at","deadline","created_at")
    .withColumn("name",lower($"name"))
    .withColumn("desc",lower($"desc"))
    .withColumn("keywords",lower($"keywords"))
    .withColumn("text",concat($"name",lit(" "),$"desc",lit(" "),$"keywords"))
    .withColumn("days_campaign",when(isnull($"days_campaign"),-1).otherwise($"days_campaign"))
    .withColumn("hours_prepa",when(isnull($"hours_prepa"),-1).otherwise($"hours_prepa"))
    .withColumn("goal",when(isnull($"goal"),-1).otherwise($"goal"))
    .withColumn("country",when(isnull($"country")," ").otherwise($"country"))
    .withColumn("currency",when(isnull($"currency")," ").otherwise($"currency"))


// df3.write.parquet("/home/jorge/Documents/Git/spark_project_kickstarter_2019_2020/cleanData.parquet")
// df.select($"goal").filter(col("goal").isNull).show

// dfCasted.select("name","goal", "backers_count", "final_status").describe().show
// dfCasted.groupBy("deadline").count.orderBy($"count".desc).show(100)

dfCasted.select("country","currency").groupBy("currency").count.orderBy("count").show
// dfCasted.select("name","desc","goal","keywords","currency").filter(isnull($"country")).show(100)


Nombre de lignes : 108129
Nombre de colonnes : 14


Hello World ! from Preprocessor


+--------+-----+
|currency|count|
+--------+-----+
|     NOK|  113|
|     DKK|  190|
|     SEK|  230|
|     NZD|  348|
|     EUR|  792|
|     AUD| 1849|
|     CAD| 3515|
|     GBP| 8381|
|     USD|85515|
+--------+-----+



df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]
cleaner: String => String = <function1>
udf_c: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))
dfCasted: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 9 more fields]


In [78]:
dfCasted.columns
// dfCasted.groupBy("project_id").count.show()

res69: Array[String] = Array(project_id, name, desc, goal, keywords, disable_communication, country, currency, deadline, state_changed_at, created_at, launched_at, backers_count, final_status)


In [27]:
dfCasted.select("goal", "backers_count", "final_status").count

org.apache.spark.sql.AnalysisException:  cannot resolve '`backers_count`' given input columns: [hours_prepa, keywords, days_campaign, project_id, currency, goal, name, desc, final_status, country, text];;

In [35]:
val n = 5

dfCasted
    .select("name","goal", "backers_count", "final_status")
    .show(n)

dfCasted
    .select("country","keywords","disable_communication","currency")
    .show(n)

dfCasted
    .select("deadline","state_changed_at","created_at","launched_at")
    .show(n)

+--------------------+----+-------------+------------+
|                name|goal|backers_count|final_status|
+--------------------+----+-------------+------------+
| drawing for dollars|  20|            3|           1|
|Sponsor Dereck Bl...| 300|            2|           0|
|       Mr. Squiggles|  30|            0|           0|
|Help me write my ...| 500|           18|           1|
|Support casting m...|2000|            1|           0|
+--------------------+----+-------------+------------+
only showing top 5 rows

+-------+--------------------+---------------------+--------+
|country|            keywords|disable_communication|currency|
+-------+--------------------+---------------------+--------+
|     US| drawing-for-dollars|                False|     USD|
|     US|sponsor-dereck-bl...|                False|     USD|
|     US|        mr-squiggles|                False|     USD|
|     US|help-me-write-my-...|                False|     USD|
|     US|support-casting-m...|                

n: Int = 5


In [None]:
// dfCasted.groupBy("disable_communication").count.orderBy($"count".desc).show(100)
// dfCasted.groupBy("disable_communication").count.orderBy($"count".desc).show
// dfCasted.groupBy("country").count.orderBy($"count".desc).show(100)
// dfClean.groupBy("currency").count.orderBy($"count".desc).show(100)
// dfCasted.select("deadline").dropDuplicates.show()

// (dfCasted.select("deadline").dropDuplicates.count()
//  ,dfCasted.select("deadline").count())

// dfCasted.groupBy("state_changed_at").count.orderBy($"count".desc).show(100)
// dfCasted.groupBy("backers_count").count.orderBy($"count".desc).show(100)
// dfCasted.select("goal", "final_status").show(30)
// dfCasted.groupBy("country", "currency").count.orderBy($"count".desc).show(50)

// Cleaning à faire: 
// Only keep rows with "True" or "False" in disable_communication
// drop disable_communication
// drop rows where regex or different from the main countries (after
// trying to fill with currency stage later)

// same than above with the country culumn
// dropduplicates in id column --done
// filter rows where state_changed_at null
// infer country from / currency if country is null (befre dropping countries)
// US -> US , GB ->GB, CA->CA, AU-AU, NL->NL
// 

In [11]:
val dfClean:DataFrame = dfCasted
//     .dropDuplicates("deadline")
    .filter(!isnull($"state_changed_at"))
    .withColumn("country",when($"country" === "False",$"currency").otherwise($"country"))
    .filter(($"disable_communication"==="True") || ($"disable_communication"==="False"))
    .drop("disable_communication")
    .filter($"country" rlike ".{2}")
    .filter($"currency" rlike ".{3}")
    .drop("backers_count","state_changed_at")

// dfClean
//     .select("name","goal","final_status")
//     .show(n)

// dfClean
//     .select("country","keywords","currency")
//     .show(n)

// dfClean
//     .select("deadline","created_at","launched_at")
//     .show(n)

// df.filter($"country" === "False")
//   .groupBy("currency")
//   .count
//   .orderBy($"count".desc)
//   .show(50)

dfClean: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 9 more fields]


In [None]:
// Important:  col equivalent to $

In [None]:
// Solution proposée

// def cleanCountry(country: String, currency: String): String = {
//   if (country == "False")
//     currency
//   else
//     country
// }

// def cleanCurrency(currency: String): String = {
//   if (currency != null && currency.length != 3)
//     null
//   else
//     currency
// }

// val cleanCountryUdf = udf(cleanCountry _)
// val cleanCurrencyUdf = udf(cleanCurrency _)

// val dfCountry: DataFrame = dfNoFutur
//   .withColumn("country2", cleanCountryUdf($"country", $"currency"))
//   .withColumn("currency2", cleanCurrencyUdf($"currency"))
//   .drop("country", "currency")

// // ou encore, en utilisant sql.functions.when:
// dfNoFutur
//   .withColumn("country2", when($"country" === "False", $"currency").otherwise($"country"))
//   .withColumn("currency2", when($"country".isNotNull && length($"currency") =!= 3, null).otherwise($"currency"))
//   .drop("country", "currency")

In [16]:
val df2:DataFrame = dfClean
    .withColumn("days_campaign",datediff(from_unixtime($"deadline"),from_unixtime($"launched_at")))
    .withColumn("hours_prepa",(($"launched_at"-$"created_at")/60).cast("Int"))
    .drop("launched_at","deadline","created_at")
    .withColumn("name",lower($"name"))
    .withColumn("desc",lower($"desc"))
    .withColumn("keywords",lower($"keywords"))
    

val df3:DataFrame = df2
    .withColumn("text",concat($"name",lit(" "),$"desc",lit(" "),$"keywords"))
    .withColumn("days_campaign",when(isnull($"days_campaign"),-1).otherwise($"days_campaign"))
    .withColumn("hours_prepa",when(isnull($"hours_prepa"),-1).otherwise($"hours_prepa"))
    .withColumn("goal",when(isnull($"goal"),-1).otherwise($"goal"))
    .withColumn("country",when(isnull($"country")," ").otherwise($"country"))
    .withColumn("currency",when(isnull($"currency")," ").otherwise($"currency"))

// df3.columns

org.apache.spark.sql.AnalysisException:  cannot resolve '`deadline`' given input columns: [goal, currency, final_status, days_campaign, hours_prepa, desc, keywords, project_id, country, name, text];;

In [14]:
df3
    .select("name","goal","final_status")
    .show(n)

df3
    .select("country","keywords","currency")
    .show(n)


df3
    .select("days_campaign","hours_prepa","text")
    .show(n)

<console>: 36: error: not found: value df3

In [None]:
df3.write.parquet("/home/jorge/Documents/Git/spark_project_kickstarter_2019_2020/cleanData.parquet")

In [15]:
df3.show(5)

<console>: 28: error: not found: value df3

In [30]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF, OneHotEncoderEstimator, RegexTokenizer, StopWordsRemover, StringIndexer, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder, TrainValidationSplit}
// import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
// import org.apache.spark.ml.evaluation
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


import org.apache.spark.ml.param.ParamMap


val conf = new SparkConf().setAll(Map(
  "spark.scheduler.mode" -> "FIFO",
  "spark.speculation" -> "false",
  "spark.reducer.maxSizeInFlight" -> "48m",
  "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
  "spark.kryoserializer.buffer.max" -> "1g",
  "spark.shuffle.file.buffer" -> "32k",
  "spark.default.parallelism" -> "12",
  "spark.sql.shuffle.partitions" -> "12",
  "spark.driver.maxResultSize" -> "2g"
))

val spark = SparkSession
  .builder
  .config(conf)
  .appName("TP Spark : Trainer")
  .getOrCreate()


/*******************************************************************************
  *
  *       TP 3
  *
  *       - lire le fichier sauvegarder précédemment
  *       - construire les Stages du pipeline, puis les assembler
  *       - trouver les meilleurs hyperparamètres pour l'entraînement du pipeline avec une grid-search
  *       - Sauvegarder le pipeline entraîné
  *
  *       if problems with unimported modules => sbt plugins update
  *
  ********************************************************************************/
// val df:DataFrame = spark.read.parquet("prepared_trainingset/")
val df:DataFrame = spark.read.parquet("cleanData.parquet/")
    .filter(!isnull($"text"))
    .filter(!($"country" rlike "DE"))



val tokenizer = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")

val remover = new StopWordsRemover()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered")

val cvModel: CountVectorizer = new CountVectorizer()
  .setInputCol(remover.getOutputCol)
  .setOutputCol("vect")
  .setMinDF(50)

val idf = new IDF()
  .setInputCol(cvModel.getOutputCol)
  .setOutputCol("tfidf")

val indexerCountry = new StringIndexer()
  .setInputCol("country")
  .setOutputCol("country_indexed")

val indexerCurrency = new StringIndexer()
  .setInputCol("currency")
  .setOutputCol("currency_indexed")

val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("country_indexed", "currency_indexed"))
  .setOutputCols(Array("country_onehot", "currency_onehot"))

val assembler = new VectorAssembler()
  .setInputCols(Array("tfidf","days_campaign","hours_prepa","goal","country_onehot","currency_onehot"))
  .setOutputCol("features")

val lr = new LogisticRegression()
  .setElasticNetParam(0.0)
  .setFitIntercept(true)
  .setFeaturesCol("features")
  .setLabelCol("final_status")
  .setStandardization(true)
  .setPredictionCol("predictions")
  .setRawPredictionCol("raw_predictions")
//   .setThresholds(Array(0.7, 0.3))
//   .setTol(1.0e-6)
//   .setMaxIter(20)

val pipeline = new Pipeline()
  .setStages(Array(tokenizer,remover,cvModel,idf,indexerCountry, indexerCurrency,encoder, assembler,lr))


val Array(train,test) = df.randomSplit(Array[Double](0.9, 0.1))
val size = (train.count,test.count)


val model1 = pipeline.fit(train)
val predictions = model1.transform(test)

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("final_status")
  .setPredictionCol("predictions")
  .setMetricName("f1")

val f1 = evaluator.evaluate(predictions)
println("Test set accuracy for Model 1 = " + f1)


val grid = new ParamGridBuilder()
  .addGrid(lr.regParam,Array(10e-10,10e-8,10e-6,10e-4,10e-2))
  .addGrid(cvModel.minDF,Array(20.0,35.0,55.0,75.0,95.0))
  .build()

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(grid)
  // 80% of the data will be used for training and the remaining 20% for validation.
  .setTrainRatio(0.8)

val gridSearch = trainValidationSplit.fit(df)
val gridSearchBestModel = gridSearch.bestModel

val f1best = evaluator.evaluate(gridSearchBestModel.transform(test))


val bestPipelineModel = gridSearchBestModel.asInstanceOf[PipelineModel]
val stages = bestPipelineModel.stages
val cvStage = stages(2).asInstanceOf[CountVectorizerModel]
println("numFeatures = " + cvStage.getMinDF)
val lrStage = stages(8).asInstanceOf[LogisticRegressionModel]
println("regParam = " + lrStage.getRegParam)


//    predictions.groupBy("final_status", "predictions").count.show()
println("Train dataset size is : " + size._1)
println("Test dataset size is : " + size._2)
println("Test set accuracy for Model 1 = " + f1)
println("Test set accuracy for the best model of the Grid Search is = " + f1best)
println("regParam = " + lrStage.getRegParam)
println("minDF = " + cvStage.getMinDF)
//    println("Params for best model are : " + gridSearchBestModel.getParam(lr.getRegParam))




Test set accuracy for Model 1 = 0.6875604716253096
numFeatures = 35.0
regParam = 1.0E-9
Train dataset size is : 90932
Test dataset size is : 9992
Test set accuracy for Model 1 = 0.6875604716253096
Test set accuracy for the best model of the Grid Search is = 0.7303824882872976
regParam = 1.0E-9
minDF = 35.0


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF, OneHotEncoderEstimator, RegexTokenizer, StopWordsRemover, StringIndexer, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.ParamMap
conf: org.apache.spark.SparkConf = org.apache.spark.Spark...

In [29]:
gridSearch.

res21: String =
estimator: estimator for selection (current: pipeline_1b6c3ce11f36)
estimatorParamMaps: param maps for the estimator (current: [Lorg.apache.spark.ml.param.ParamMap;@5b06b566)
evaluator: evaluator used to select hyper-parameters that maximize the validated metric (current: mcEval_030e736e766e)
parallelism: the number of threads to use when running parallel algorithms (default: 1)
seed: random seed (default: -1772833110)
trainRatio: ratio between train...

In [13]:
val f1best = evaluator.evaluate(gridSearchBestModel.transform(test))
val bestPipelineModel = gridSearchBestModel.asInstanceOf[PipelineModel]
val stages = bestPipelineModel.stages
val cvStage = stages(2).asInstanceOf[CountVectorizerModel]
println("numFeatures = " + cvStage.getMinDF)
val lrStage = stages(8).asInstanceOf[LogisticRegressionModel]
println("regParam = " + lrStage.getRegParam)


//    predictions.groupBy("final_status", "predictions").count.show()
println("Train dataset size is : " + size._1)
println("Test dataset size is : " + size._2)
println("Test set accuracy for Model 1 = " + f1)
println("Test set accuracy for the best model of the Grid Search is = " + f1best)
println("regParam = " + lrStage.getRegParam)
println("minDF = " + cvStage.getMinDF)
//    println("Params for best model are : " + gridSearchBestModel.getParam(lr.getRegParam))

numFeatures = 35.0
regParam = 1.0E-9
Train dataset size is : 90930
Test dataset size is : 9994
Test set accuracy for Model 1 = 0.654367644345559
Test set accuracy for the best model of the Grid Search is = 0.7046142667490585
regParam = 1.0E-9
minDF = 35.0


f1best: Double = 0.7046142667490585
bestPipelineModel: org.apache.spark.ml.PipelineModel = pipeline_1b6c3ce11f36
stages: Array[org.apache.spark.ml.Transformer] = Array(regexTok_f49dd8aaa75b, stopWords_ea9073cb3311, cntVec_57221211936f, idf_78484bd92bbf, strIdx_c3a4316e21e6, strIdx_4f78098f12d2, oneHotEncoder_9d2d1fa42661, vecAssembler_45d2dc3e69c7, LogisticRegressionModel: uid = logreg_5696f06e88aa, numClasses = 2, numFeatures = 5536)
cvStage: org.apache.spark.ml.feature.CountVectorizerModel = cntVec_57221211936f
lrStage: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_5696f06e88aa, numClasses = 2, numFeatures = 5536


In [6]:
// val df:DataFrame = spark.read.parquet("prepared_trainingset/")
val df:DataFrame = spark.read.parquet("cleanData.parquet/")
    .filter(!isnull($"text"))
    .filter($"country" rlike "DE")

// predictions.select("predictions").show
df.select("country").groupBy("country").count.show

+-------+-----+
|country|count|
+-------+-----+
|     DE|    1|
+-------+-----+



df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 9 more fields]


In [139]:
df.withColumn("test", when(isnan($"goal"),1).otherwise(0)).groupBy("test").count.show

+----+------+
|test| count|
+----+------+
|   0|100933|
+----+------+



In [None]:
// df.select("country").map(line => (line.toString(),line.toString.length())).orderBy($"_2".desc).show(100)