# NLP RECSYS Train Pipeline Stage2




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}


// Root directory for all dataset artifacts; sys.env throws if HOME is not set.
val dataDir = s"${sys.env("HOME")}/recsys2020"
// Name prefix of the stage-1 dataset used for training.
val dsName = "training1m"

In [2]:
// Load the stage-1 parquet output for the dataset selected by dsName.
val df = spark.read.parquet(s"$dataDir/${dsName}_stage1.parquet")

In [3]:
// Engagement types to predict; each one is trained against the label column "has_<name>".
val classNames = Array("retweet", "retweet_with_comment", "like", "reply")

// Names of the input feature columns: f_1 .. f_522.
val featureCols = (1 to 522).map(i => s"f_$i").toArray

// Packs the individual feature columns into the single "features" vector column.
val ass = new VectorAssembler()
  .setInputCols(featureCols)
  .setOutputCol("features")

// One independent binary classifier per engagement type. Every classifier reads the
// shared "features" column and writes its own prob_/pred_/predraw_ columns, so all of
// them can live in the same pipeline without clashing.
// RandomForestClassifier() should be faster if we run into problems
val classifiers = classNames.map { className =>
  new GBTClassifier()
    .setLabelCol("has_" + className)
    .setFeaturesCol("features")
    .setProbabilityCol("prob_" + className)
    .setPredictionCol("pred_" + className)
    .setRawPredictionCol("predraw_" + className)
    .setMaxIter(10)
    .setFeatureSubsetStrategy("auto")
}

// Assembler first, then all classifiers.
val pred_pipeline = new Pipeline().setStages(Array(ass) ++ classifiers)

In [4]:
// Kept only in case another cell still references it; the conversion below uses the
// built-in cast instead.
val udf_bool_to_int = udf[Integer, Boolean](x => if (x) 1 else 0)

// The classifiers are trained on the "has_<class>" label columns, which arrive as
// booleans. Cast each one to int (true -> 1, false -> 0) with the built-in Column.cast
// rather than a UDF, so the conversion stays visible to the query optimizer and the
// column list is derived from classNames instead of being repeated by hand.
val df_with_ints = classNames
  .map("has_" + _)
  .foldLeft(df) { (acc, labelCol) =>
    acc.withColumn(labelCol, col(labelCol).cast(IntegerType))
  }

// Fit the assembler and all per-class classifiers on the training data.
val fitted_pipeline = pred_pipeline.fit(df_with_ints)

In [5]:
// Persist the fitted stage-2 pipeline, replacing any previously saved copy.
fitted_pipeline
  .write
  .overwrite()
  .save(s"$dataDir/pipeline_stage2_v1")

In [11]:
// Picks element 1 of a probability vector (presumably the positive class - confirm
// against the label encoding). Typed against the general ml Vector so no unchecked
// downcast to DenseVector is needed.
val toArr: org.apache.spark.ml.linalg.Vector => Double = v => v(1)
val toArrUdf = udf(toArr)

val labelColumns = classNames.map("has_" + _)
val outputNames = classNames.map("out_" + _)

// Score the training frame, then add one scalar "out_<class>" column per classifier,
// taken from the matching "prob_<class>" vector column. A fold threads the DataFrame
// through the column additions instead of mutating a var inside a for/yield.
val res = classNames.foldLeft(fitted_pipeline.transform(df_with_ints)) { (acc, className) =>
  acc.withColumn("out_" + className, toArrUdf(col("prob_" + className)))
}

// only get relevant columns
res.selectExpr((Array("user_id", "tweet_id") ++ labelColumns ++ outputNames): _*).show()

+--------------------+--------------------+-----------+------------------------+--------+---------+-------------------+------------------------+-------------------+-------------------+
|             user_id|            tweet_id|has_retweet|has_retweet_with_comment|has_like|has_reply|        out_retweet|out_retweet_with_comment|           out_like|          out_reply|
+--------------------+--------------------+-----------+------------------------+--------+---------+-------------------+------------------------+-------------------+-------------------+
|D0EA9DDFE93EDA782...|FB6304C97F6CC05AF...|          0|                       0|       1|        0|0.19046776145930178|     0.07215823588693837| 0.3069462231381258| 0.0678028517380228|
|5CD17BD84873464C2...|8A025814338D4CAB5...|          0|                       0|       0|        0|0.10882982684772913|     0.06869664013285715|  0.532142904019473|0.10217446174063816|
|6B8C5EFC300170EF8...|8E8EC11B9492B6C09...|          0|                    

In [12]:
// Show the scored rows without truncating cell contents.
res.show(truncate = false)

+--------------------------------+--------------------------------+-------------+-------------+------------+------------+------------+-------------+------------+------------+-------------+------------+------------+-------------+------------+------------+-------------+------------+------------+-------------+------------+-------------+------------+-------------+------------+------------+------------+-------------+------------+------------+-------------+-------------+-------------+------------+-------------+------------+------------+-------------+-------------+------------+-------------+------------+-------------+-------------+-------------+------------+-------------+-------------+------------+-------------+------------+-------------+------------+------------+-------------+------------+-----------+------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+------------+-------------+-------------+------------+------------+------------+---

In [6]:
// Load the stage-1 parquet output of the validation set.
val val_df = spark.read.parquet(s"$dataDir/val_stage1.parquet")



In [7]:
// Score the validation set loaded in the previous cell. The original scored the
// training frame `df` and left `val_df` unused, so the exported predictions were for
// the training rows.
// NOTE(review): assumes val_df carries the same f_1..f_522 feature columns as df - confirm.
val finalDf = fitted_pipeline.transform(val_df)
finalDf.show()

In [8]:
// Picks element 1 of a probability vector (presumably the positive class - confirm
// against the label encoding). Typed against the general ml Vector so no unchecked
// downcast to DenseVector is needed.
val toArr: org.apache.spark.ml.linalg.Vector => Double = v => v(1)
val toArrUdf = udf(toArr)

val outputNames = classNames.map("out_" + _)

// Add one scalar "out_<class>" column per classifier, taken from the matching
// "prob_<class>" vector column. A fold threads the DataFrame through the column
// additions instead of mutating a var inside a for/yield.
val finalFinalDf = classNames.foldLeft(finalDf) { (acc, className) =>
  acc.withColumn("out_" + className, toArrUdf(col("prob_" + className)))
}

// only get relevant columns
finalFinalDf.selectExpr((Array("user_id", "tweet_id") ++ outputNames): _*).show()

In [9]:
// Write one headerless CSV per engagement type with the columns
// (tweet_id, user_id, out_<class>). SaveMode.Overwrite makes the cell re-runnable:
// without an explicit mode the write fails when the target path already exists.
// This matches the overwrite behaviour used when saving the fitted pipeline.
classNames.foreach { className =>
  finalFinalDf
    .select(col("tweet_id"), col("user_id"), col("out_" + className))
    .write
    .format("csv")
    .option("header", "false")
    .mode(SaveMode.Overwrite)
    .save(dataDir + "/" + className + ".csv")
}