# NLP RECSYS Run Pipeline Stage1




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}

import org.apache.spark.ml.linalg.Vectors


// Root directory for all inputs and outputs of this notebook, resolved under $HOME.
// sys.env("HOME") throws NoSuchElementException when HOME is not set.
val dataDir = sys.env("HOME") + "/recsys2020"
// Name of the dataset split; it is interpolated into the input and output paths below.
val dsName = "competition_test"

// Engagement classes handled by this notebook.
val classNames = Array("retweet", "retweet_with_comment", "like", "reply")
// One label column name per class: the class name prefixed with "has_".
val labelColumns = classNames.map("has_" + _)

In [2]:
// Load the fitted pipeline model and the stage-1 parquet for the selected dataset.
val pipeline = PipelineModel.load(dataDir + "/pipeline_stage2_v1")
val df = spark.read.parquet(dataDir + s"/${dsName}_stage1.parquet")

// Boolean -> 0/1 UDF. It is no longer used below (a native cast replaces it) but stays
// defined so that any other cell referring to it keeps working.
val udf_bool_to_int = udf[Integer, Boolean](x => if (x) 1 else 0)

// Convert every label column to an integer column in place. A native cast maps
// true -> 1, false -> 0 and null -> null without the cost of a Scala UDF, and
// iterating over labelColumns keeps this list in sync with classNames.
val df_with_ints = labelColumns.foldLeft(df) { (acc, label) =>
  acc.withColumn(label, col(label).cast(IntegerType))
}

// Turn the float-array "embeddings" column into an ML dense vector of doubles.
// NOTE(review): a null array would throw inside this UDF -- confirm the column is never null.
val convertUDF = udf((array: Seq[Float]) => Vectors.dense(array.map(_.toDouble).toArray))

// col(...) instead of the 'embeddings symbol literal: symbol literals are deprecated in
// newer Scala versions and need spark.implicits._, while col is already imported here.
val df_with_embeddings = df_with_ints
  .withColumn("embeddings", convertUDF(col("embeddings")))

// Apply all pipeline stages to the prepared frame.
val resDf = pipeline.transform(df_with_embeddings)

In [5]:
// Print the column names and types of the transformed frame for inspection.
resDf.printSchema()

In [3]:
// Keep only the identifier columns plus one scalar score per class.

// Reads element 1 of a "prob_<class>" vector (presumably the positive-class
// probability -- confirm against the classifiers in the pipeline).
// The input is typed as the ML Vector trait instead of Any + asInstanceOf[DenseVector],
// so sparse vectors work too and Spark knows the UDF's input type. The name is fully
// qualified because a bare `Vector` would resolve to scala.collection.immutable.Vector.
val toArr: org.apache.spark.ml.linalg.Vector => Double = probability => probability(1)
val toArrUdf = udf(toArr)

// Output column names, one per class: "out_<class>".
val outputNames = classNames.map("out_" + _)

// Add every "out_<class>" column by folding over the classes, rather than mutating a
// var from inside a for/yield.
val tmpDf = classNames.foldLeft(resDf) { (acc, className) =>
  acc.withColumn("out_" + className, toArrUdf(col("prob_" + className)))
}

// Select by column reference so the names are not parsed as SQL expressions.
val cleanDf = tmpDf.select((Array("user_id", "tweet_id") ++ outputNames).map(name => col(name)): _*)

In [4]:
// Write one headerless CSV output per class for submission, with the columns
// tweet_id, user_id, out_<class>, overwriting any previous output at the same path.
//
// cleanDf is persisted for the duration of the loop: without it, each of the writes
// re-executes the whole lineage (parquet read, embedding conversion, pipeline transform).
// The cache is released afterwards, even if a write fails.
cleanDf.persist()
try {
  classNames.foreach { className =>
    cleanDf
      .select(col("tweet_id"), col("user_id"), col("out_" + className))
      .write
      .format("csv")
      .mode(SaveMode.Overwrite)
      .option("header", "false")
      .save(dataDir + s"/out/${dsName}/${className}.csv")
  }
} finally {
  cleanDf.unpersist()
}