# NLP RECSYS Run Pipeline Stage1




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}

// Root directory for all inputs/outputs of this notebook: $HOME/recsys2020.
// sys.env(...) throws NoSuchElementException when HOME is not set.
val dataDir = {
  val home = sys.env("HOME")
  s"$home/recsys2020"
}

In [2]:
// Evaluate `spark` so the notebook echoes it as the cell result.
// NOTE(review): presumably the SparkSession pre-bound by the notebook kernel — it is not created in this file.
spark

In [3]:
package nlprecsys
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}

/**
 * Pipeline stage that explodes an array column: each element of the array in
 * `inputCol` produces its own row, with the element stored in `outputCol`.
 *
 * Params:
 *  - `inputCol`:  name of the array column to explode (default `"input"`)
 *  - `outputCol`: name of the column that receives the elements (default `"output"`)
 *
 * @param uid unique identifier of this stage
 */
class Exploder(override val uid: String) extends Transformer with DefaultParamsWritable {
  def this() = this(Identifiable.randomUID("Exploder"))

  val inputCol: Param[String] = new Param[String](this, "inputCol", "input column")
  val outputCol: Param[String] = new Param[String](this, "outputCol", "output column")

  // Register the fallbacks as real param defaults so that the getters,
  // transform and transformSchema all agree on the column names when the
  // caller has not set them explicitly. Must come after the Param vals.
  setDefault(inputCol -> "input", outputCol -> "output")

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def getInputCol: String = $(inputCol)
  def getOutputCol: String = $(outputCol)

  /** Adds `outputCol` holding one element of `inputCol` per row. */
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn($(outputCol), explode(col($(inputCol))))

  /**
   * Appends `outputCol`, typed as the element type of the array in `inputCol`.
   *
   * @throws IllegalArgumentException if `inputCol` is missing from the schema
   *                                  or is not an array column
   */
  override def transformSchema(schema: StructType): StructType = {
    val inCol = $(inputCol)
    schema.fields(schema.fieldIndex(inCol)).dataType match {
      case ArrayType(elementType, _) =>
        schema.add($(outputCol), elementType)
      case other =>
        // Fail with a readable message instead of a bare ClassCastException.
        throw new IllegalArgumentException(
          s"Exploder input column '$inCol' must be an ArrayType but was $other")
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
}
/** Companion object that makes a saved [[Exploder]] loadable through `Exploder.load`. */
object Exploder extends DefaultParamsReadable[Exploder] {
  // Delegates directly to the inherited implementation.
  override def load(path: String): Exploder = {
    super.load(path)
  }
}



In [4]:
// Load the previously saved stage-1 pipeline model from the data directory.
val pipeline = PipelineModel.load(s"$dataDir/pipeline_stage1_v1")

In [5]:
// Name of the dataset; both the input and the output parquet paths derive from it.
val dsName = "training1m"

// Engagement classes; each one has a matching "has_<class>" label column.
val classNames = Array("retweet", "retweet_with_comment", "like", "reply")
val labelColumns = classNames.map(className => "has_" + className)
val idCols = Array("tweet_id", "user_id")

// Read the input parquet and cap it at 100000 rows before transforming.
val df = spark.read.parquet(s"$dataDir/${dsName}.parquet").limit(100000)

// Keep only the ids, the "features" column and the labels from the pipeline output.
val relevantCols = (idCols :+ "features") ++ labelColumns
val transDf = pipeline.transform(df).selectExpr(relevantCols: _*)

// Writing transDf directly fails with out of disk space because the data is too big:
// transDf
//     .write
//     .mode(SaveMode.Overwrite)
//     .parquet(dataDir + s"/${dsName}_stage1.parquet")

In [6]:
// Convert the feature vector into individual float columns to make the data smaller.

// Typed on the Vector trait rather than DenseVector: a UDF declared on
// DenseVector throws ClassCastException as soon as a row holds a SparseVector.
// toArray yields the dense representation for either implementation.
val vecToArray = udf((xs: org.apache.spark.ml.linalg.Vector) => xs.toArray)

// Number of entries read from the "features" vector; output columns are f_1 .. f_522.
val numFeatures = 522
val elements = for (i <- 1 to numFeatures) yield "f_" + i

// Output layout: id columns, one FloatType column per feature, then the
// label columns cast to BooleanType.
val sqlExpr = idCols.map { x => col(x) } ++
              elements.zipWithIndex.map { case (alias, idx) =>
                // Column names are 1-based, array positions are 0-based.
                col("feat_arr").getItem(idx).cast(FloatType).as(alias)
              } ++
              labelColumns.map { x => col(x).cast(BooleanType) }

transDf
    .withColumn("feat_arr", vecToArray(col("features")))
    .select(sqlExpr: _*)
    .write
    .mode(SaveMode.Overwrite)
    .parquet(dataDir + s"/${dsName}_stage1.parquet")