In [8]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

val conf = {new SparkConf().setAll(Map("spark.scheduler.mode" -> "FIFO",
      "spark.speculation" -> "false",
      "spark.reducer.maxSizeInFlight" -> "48m",
      "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
      "spark.kryoserializer.buffer.max" -> "1g",
      "spark.shuffle.file.buffer" -> "32k",
      "spark.default.parallelism" -> "12",
      "spark.sql.shuffle.partitions" -> "12"
    ))}

    // Initialisation du SparkSession qui est le point d'entrée vers Spark SQL (donne accès aux dataframes, aux RDD,
    // création de tables temporaires, etc., et donc aux mécanismes de distribution des calculs)
val spark = {SparkSession
  .builder
  .config(conf)
  .appName("TP Spark : Preprocessor")
  .getOrCreate}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@4acd67d5
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@4ceef85a


In [11]:
val df:DataFrame = spark
      .read
      .option("header", true) 
      .option("inferSchema", "true") // pour inférer le type de chaque colonne (Int, String, etc.)
      .csv("/home/jorge/Documents/Cours/Spark/RepoAdotTPs/data/train_clean.csv")

println(s"Nombre de lignes : ${df.count}")
println(s"Nombre de colonnes : ${df.columns.length}")
println("\n")
println("Hello World ! from Preprocessor")
println("\n")

// val dfcasted:DataFrame = df
//     .withColumn("goal",$"goal".cast("Int"))
//     .withColumn("deadline",$"deadline".cast("Int"))
//     .withColumn("state_changed_at",$"state_changed_at".cast("Int"))
//     .withColumn("created_at",$"created_at".cast("Int"))
//     .withColumn("launched_at", $"launched_at".cast("Int"))
//     .withColumn("backers_count", $"backers_count".cast("Int"))
//     .withColumn("final_status", $"final_status".cast("Int"))

val dfCasted: DataFrame = df
    .withColumn("goal", $"goal".cast("Int"))
    .withColumn("deadline" , $"deadline".cast("Int"))
    .withColumn("state_changed_at", $"state_changed_at".cast("Int"))
    .withColumn("created_at", $"created_at".cast("Int"))
    .withColumn("launched_at", $"launched_at".cast("Int"))
    .withColumn("backers_count", $"backers_count".cast("Int"))
    .withColumn("final_status", $"final_status".cast("Int"))
    .dropDuplicates("deadline")
    .filter(!isnull($"state_changed_at"))
    .withColumn("country",when($"country" === "False",$"currency").otherwise($"country"))
    .filter(($"disable_communication"==="True") || ($"disable_communication"==="False"))
    .drop("disable_communication")
    .filter($"country" rlike ".{2}")
    .filter($"currency" rlike ".{3}")
    .drop("backers_count","state_changed_at")
    .withColumn("days_campaign",datediff(from_unixtime($"deadline"),from_unixtime($"launched_at")))
    .withColumn("hours_prepa",(($"launched_at"-$"created_at")/60).cast("Int"))
    .drop("launched_at","deadline","created_at")
    .withColumn("name",lower($"name"))
    .withColumn("desc",lower($"desc"))
    .withColumn("keywords",lower($"keywords"))
    .withColumn("text",concat($"name",lit(" "),$"desc",lit(" "),$"keywords"))
    .withColumn("days_campaign",when(isnull($"days_campaign"),-1).otherwise($"days_campaign"))
    .withColumn("hours_prepa",when(isnull($"hours_prepa"),-1).otherwise($"hours_prepa"))
    .withColumn("goal",when(isnull($"goal"),-1).otherwise($"goal"))
    .withColumn("country",when(isnull($"country")," ").otherwise($"country"))
    .withColumn("currency",when(isnull($"currency")," ").otherwise($"currency"))


// df3.write.parquet("/home/jorge/Documents/Git/spark_project_kickstarter_2019_2020/cleanData.parquet")
df.select($"goal").filter(col("goal").isNull).show



Nombre de lignes : 108129
Nombre de colonnes : 14


Hello World ! from Preprocessor


+----+
|goal|
+----+
|null|
+----+



df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]
dfCasted: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 9 more fields]


In [15]:
dfCasted
  .select("goal", "backers_count", "final_status")
  .describe()
  .show

<console>: 30: error: not found: value dfClean

In [None]:
val n = 5

dfCasted
    .select("name","goal", "backers_count", "final_status")
    .show(n)

dfCasted
    .select("country","keywords","disable_communication","currency")
    .show(n)

dfCasted
    .select("deadline","state_changed_at","created_at","launched_at")
    .show(n)

In [None]:
// dfCasted.groupBy("disable_communication").count.orderBy($"count".desc).show(100)
// dfCasted.groupBy("disable_communication").count.orderBy($"count".desc).show
// dfCasted.groupBy("country").count.orderBy($"count".desc).show(100)
// dfClean.groupBy("currency").count.orderBy($"count".desc).show(100)
// dfCasted.select("deadline").dropDuplicates.show()

// (dfCasted.select("deadline").dropDuplicates.count()
//  ,dfCasted.select("deadline").count())

// dfCasted.groupBy("state_changed_at").count.orderBy($"count".desc).show(100)
// dfCasted.groupBy("backers_count").count.orderBy($"count".desc).show(100)
// dfCasted.select("goal", "final_status").show(30)
// dfCasted.groupBy("country", "currency").count.orderBy($"count".desc).show(50)

// Cleaning à faire: 
// Only keep rows with "True" or "False" in disable_communication
// drop disable_communication
// drop rows where regex or different from the main countries (after
// trying to fill with currency stage later)

// same than above with the country culumn
// dropduplicates in id column --done
// filter rows where state_changed_at null
// infer country from / currency if country is null (befre dropping countries)
// US -> US , GB ->GB, CA->CA, AU-AU, NL->NL
// 

In [None]:
val dfClean:DataFrame = dfCasted
    .dropDuplicates("deadline")
    .filter(!isnull($"state_changed_at"))
    .withColumn("country",when($"country" === "False",$"currency").otherwise($"country"))
    .filter(($"disable_communication"==="True") || ($"disable_communication"==="False"))
    .drop("disable_communication")
    .filter($"country" rlike ".{2}")
    .filter($"currency" rlike ".{3}")
    .drop("backers_count","state_changed_at")

dfClean
    .select("name","goal","final_status")
    .show(n)

dfClean
    .select("country","keywords","currency")
    .show(n)

dfClean
    .select("deadline","created_at","launched_at")
    .show(n)

// df.filter($"country" === "False")
//   .groupBy("currency")
//   .count
//   .orderBy($"count".desc)
//   .show(50)

In [None]:
// Important:  col equivalent to $

In [None]:
// Solution proposée

// def cleanCountry(country: String, currency: String): String = {
//   if (country == "False")
//     currency
//   else
//     country
// }

// def cleanCurrency(currency: String): String = {
//   if (currency != null && currency.length != 3)
//     null
//   else
//     currency
// }

// val cleanCountryUdf = udf(cleanCountry _)
// val cleanCurrencyUdf = udf(cleanCurrency _)

// val dfCountry: DataFrame = dfNoFutur
//   .withColumn("country2", cleanCountryUdf($"country", $"currency"))
//   .withColumn("currency2", cleanCurrencyUdf($"currency"))
//   .drop("country", "currency")

// // ou encore, en utilisant sql.functions.when:
// dfNoFutur
//   .withColumn("country2", when($"country" === "False", $"currency").otherwise($"country"))
//   .withColumn("currency2", when($"country".isNotNull && length($"currency") =!= 3, null).otherwise($"currency"))
//   .drop("country", "currency")

In [None]:
val df2:DataFrame = dfClean
    .withColumn("days_campaign",datediff(from_unixtime($"deadline"),from_unixtime($"launched_at")))
    .withColumn("hours_prepa",(($"launched_at"-$"created_at")/60).cast("Int"))
    .drop("launched_at","deadline","created_at")
    .withColumn("name",lower($"name"))
    .withColumn("desc",lower($"desc"))
    .withColumn("keywords",lower($"keywords"))
    

val df3:DataFrame = df2
    .withColumn("text",concat($"name",lit(" "),$"desc",lit(" "),$"keywords"))
    .withColumn("days_campaign",when(isnull($"days_campaign"),-1).otherwise($"days_campaign"))
    .withColumn("hours_prepa",when(isnull($"hours_prepa"),-1).otherwise($"hours_prepa"))
    .withColumn("goal",when(isnull($"goal"),-1).otherwise($"goal"))
    .withColumn("country",when(isnull($"country")," ").otherwise($"country"))
    .withColumn("currency",when(isnull($"currency")," ").otherwise($"currency"))

df3.columns

In [None]:
df3
    .select("name","goal","final_status")
    .show(n)

df3
    .select("country","keywords","currency")
    .show(n)


df3
    .select("days_campaign","hours_prepa","text")
    .show(n)

In [None]:
df3.write.parquet("/home/jorge/Documents/Git/spark_project_kickstarter_2019_2020/cleanData.parquet")

In [None]:
// df.select("country").map(line => (line.toString(),line.toString.length())).orderBy($"_2".desc).show(100)