# NLP RECSYS Train Pipeline Stage1




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}


// Root directory holding the RecSys 2020 data and saved pipelines.
// HOME is preferred; when it is not exported (e.g. on Windows or in a stripped-down
// container) fall back to the JVM's `user.home` property instead of failing with a
// bare NoSuchElementException.
val dataDir = sys.env.getOrElse("HOME", sys.props("user.home")) + "/recsys2020"
// Base name (without extension) of the parquet dataset this notebook trains on.
val dsName = "training"

In [2]:
// Load the training set from <dataDir>/<dsName>.parquet.
val df = spark.read.parquet(s"${dataDir}/${dsName}.parquet")

In [3]:
package nlprecsys
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}

/**
 * Pipeline stage that applies Spark's `explode` to an array column and stores the
 * resulting elements in a new column.
 *
 * Params:
 *  - `inputCol`  — name of the array column to explode (default: "input")
 *  - `outputCol` — name of the column receiving the elements (default: "output")
 *
 * @param uid unique identifier of this stage
 */
class Exploder(override val uid: String) extends Transformer with DefaultParamsWritable {
  def this() = this(Identifiable.randomUID("Exploder"))

  /** Name of the array column to explode. */
  val inputCol = new Param[String](this, "inputCol", "input column")

  /** Name of the column that receives the exploded elements. */
  val outputCol = new Param[String](this, "outputCol", "output column")

  // Register the fallbacks as real param defaults so that the getters, transform and
  // transformSchema all agree when a column name was never set explicitly.
  // Must come after both Param vals are initialised.
  setDefault(inputCol -> "input", outputCol -> "output")

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def getInputCol: String = $(inputCol)
  def getOutputCol: String = $(outputCol)

  /**
   * Adds `outputCol` to the dataset by exploding `inputCol`.
   *
   * @param dataset input dataset containing the array column
   * @return a DataFrame with the exploded column added
   */
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn($(outputCol), explode(col($(inputCol))))

  /**
   * Derives the output schema: the input schema plus `outputCol`, typed as the element
   * type of the `inputCol` array.
   *
   * @param schema schema of the incoming dataset
   * @return the schema with the output column appended
   * @throws IllegalArgumentException if `inputCol` is missing or is not an array column
   */
  override def transformSchema(schema: StructType): StructType = {
    val inCol = $(inputCol)
    schema(inCol).dataType match {
      case arrayType: ArrayType =>
        schema.add($(outputCol), arrayType.elementType)
      case other =>
        // Fail with a message naming the column instead of an opaque ClassCastException.
        throw new IllegalArgumentException(
          s"Exploder input column '$inCol' must be of ArrayType but was ${other.simpleString}")
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
}
/** Companion that makes a persisted [[Exploder]] loadable through `Exploder.load(path)`. */
object Exploder extends DefaultParamsReadable[Exploder] {

  /** Reads back an [[Exploder]] from `path`, delegating to the default params reader. */
  override def load(path: String): Exploder = {
    val restored: Exploder = super.load(path)
    restored
  }
}



In [4]:
// --- Text branch: tweet text -> sentence embedding vector ---------------------------

// Wraps the raw tweet text into a Spark NLP "document" annotation ("shrink" cleanup mode).
val doc = new DocumentAssembler().setInputCol("tweet_text").setOutputCol("document").setCleanupMode("shrink")

// Pretrained Universal Sentence Encoder applied to the document annotation.
val use = UniversalSentenceEncoder.pretrained().setInputCols("document").setOutputCol("tweet_embeddings")

// Converts the embedding annotations into Spark ML vectors; the annotation columns are kept.
val fin = {
  val embeddingAnnotationCol = use.getOutputCol
  new EmbeddingsFinisher()
    .setInputCols(embeddingAnnotationCol)
    .setOutputCols("finished_tweet_embeddings")
    .setOutputAsVector(true)
    .setCleanAnnotations(false)
}

// The finisher emits an array of vectors; explode it into a plain vector column.
val exploder = {
  val finishedEmbeddingCol = fin.getOutputCols(0)
  new Exploder().setInputCol(finishedEmbeddingCol).setOutputCol("embedding_features")
}

// --- Tabular branch: tweet type + follower/following counts -------------------------

// Indexes tweet_type using a fixed, hand-specified label order (constructed directly
// as a model, so nothing is learned from the data for this stage).
val tweetTypeIndexer = {
  val tweetTypeLabels = Array("TopLevel", "Retweet", "Quote", "Reply")
  new StringIndexerModel(tweetTypeLabels).setInputCol("tweet_type").setOutputCol("tweet_type_idx")
}

// One-hot encodes the tweet type index.
val tweetTypeEncoder = {
  val indexedTypeCol = tweetTypeIndexer.getOutputCol
  new OneHotEncoder().setInputCol(indexedTypeCol).setOutputCol("tweet_type_onehot")
}

// Packs the four follower/following counts into a single vector so they can be scaled together.
val scaleAss = {
  val countColumns = Array(
    "author_follower_count",
    "author_following_count",
    "user_follower_count",
    "user_following_count")
  new VectorAssembler().setInputCols(countColumns).setOutputCol("count_features")
}

// Scales the count vector by its standard deviation, without mean-centering.
val scaler = {
  val rawCountsCol = scaleAss.getOutputCol
  new StandardScaler()
    .setInputCol(rawCountsCol)
    .setOutputCol("count_features_scaled")
    .setWithStd(true)
    .setWithMean(false)
}

// Final assembler: embedding, one-hot tweet type, scaled counts and the three flag columns.
val ass = {
  val featureColumns = Array(
    "embedding_features",
    tweetTypeEncoder.getOutputCol,
    scaler.getOutputCol,
    "author_is_verified",
    "user_is_verified",
    "follows")
  new VectorAssembler().setInputCols(featureColumns).setOutputCol("features")
}

// Names of the four engagement classes (not referenced by any stage built in this section).
val classNames: Array[String] = Array("retweet", "retweet_with_comment", "like", "reply")

// Stage-1 feature pipeline. Order matters: each stage consumes columns produced by the
// stages listed before it, and `ass` must run last because it reads every feature column.
val pipeline_stage1 = new Pipeline().setStages(
  Array(
    // count features: assemble, then scale
    scaleAss, scaler,
    // tweet type: index, then one-hot encode
    tweetTypeIndexer, tweetTypeEncoder,
    // tweet text: document -> sentence embedding -> vector -> exploded vector
    doc, use, fin, exploder,
    // final feature vector
    ass
  )
)

In [5]:
// Fit every stage on the training data, then persist the fitted pipeline,
// replacing any previously saved copy at the same location.
val fitted_pipeline_stage1 = pipeline_stage1.fit(df)
fitted_pipeline_stage1
  .write
  .overwrite()
  .save(s"${dataDir}/pipeline_stage1_v1")