# NLP RECSYS Train Pipeline Stage2




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}


// Root directory for the parquet inputs read and the artifacts written by this notebook.
// HOME is used when it is set; otherwise fall back to the JVM's `user.home` property so the
// notebook does not abort with NoSuchElementException where HOME is not exported.
val dataDir = sys.env.getOrElse("HOME", sys.props("user.home")) + "/recsys2020"
// Dataset prefix: the training input is read from `<dataDir>/<dsName>_stage1.parquet`.
val dsName = "training"

In [2]:
// Load the `<dsName>_stage1.parquet` frame that the stage-2 classifiers are fitted on.
val df = spark.read.parquet(s"$dataDir/${dsName}_stage1.parquet")

In [3]:
// Engagement types to predict; one classifier is built per entry, in this order.
val classNames = Array("retweet", "retweet_with_comment", "like", "reply")

// Builds the gradient-boosted-tree classifier for a single engagement type.
// It reads the shared `features` column and the `has_<className>` label, and writes
// `prob_<className>`, `pred_<className>` and `predraw_<className>`, so all classifiers can be
// chained on one DataFrame without output-column clashes.
// RandomForestClassifier() should be faster if we run into problems
def classifierFor(className: String): GBTClassifier =
  new GBTClassifier()
    .setLabelCol(s"has_$className")
    .setFeaturesCol("features")
    .setProbabilityCol(s"prob_$className")
    .setPredictionCol(s"pred_$className")
    .setRawPredictionCol(s"predraw_$className")
    .setMaxIter(10)
    .setFeatureSubsetStrategy("auto")

val classifiers = classNames.map(classifierFor)

// A single pipeline whose stages are the per-class classifiers.
val pred_pipeline = new Pipeline().setStages(classifiers)

In [4]:
// Fit every stage of the prediction pipeline on `df`.
val fitted_pipeline: org.apache.spark.ml.PipelineModel = pred_pipeline.fit(df)

In [5]:
// Persist the fitted pipeline under dataDir, replacing any copy left by an earlier run.
fitted_pipeline.write.overwrite().save(s"$dataDir/pipeline_stage2_v1")

In [6]:
// Load `val_stage1.parquet` (presumably the stage-1 output of the validation split).
val val_df = spark.read.parquet(s"$dataDir/val_stage1.parquet")



In [7]:
// Score `val_df` rather than `df`: `df` is the frame the pipeline was fitted on, so predictions
// on it say little about unseen data, and `val_df` would otherwise be loaded but never used.
// NOTE(review): assumes val_stage1.parquet carries the same `features` column as the
// training frame — confirm against the stage-1 pipeline.
val finalDf = fitted_pipeline.transform(val_df)
finalDf.show()

In [8]:
// only get relevant columns
// Renamed on import so it cannot be confused with scala.collection.immutable.Vector.
import org.apache.spark.ml.linalg.{Vector => MLVector}

// Returns the entry at index 1 of a probability vector (the probability of label 1 in the
// layout Spark ML classifiers use). Typed on the ML Vector interface instead of `Any`, so no
// unchecked downcast is needed and both dense and sparse vectors are accepted.
val toArr: MLVector => Double = probs => probs(1)
val toArrUdf = udf(toArr)

// Name of the scalar output column for each class, in the same order as `classNames`.
val outputNames = classNames.map(className => "out_" + className)

// Add one `out_<class>` column per class, derived from the matching `prob_<class>` column.
// Folding over the class names threads the DataFrame through without a mutable variable.
val finalFinalDf = classNames.foldLeft(finalDf) { (acc, className) =>
  acc.withColumn("out_" + className, toArrUdf(col("prob_" + className)))
}

// Preview the identifiers together with the per-class probabilities.
finalFinalDf.selectExpr((Array("user_id", "tweet_id") ++ outputNames): _*).show()

In [9]:
// Write one headerless CSV output per class with the columns (tweet_id, user_id, out_<class>).
// SaveMode.Overwrite keeps the cell re-runnable: with the default mode the write fails as soon
// as the target path exists, whereas the fitted pipeline saved earlier is already overwritten.
for (className <- classNames) {
  finalFinalDf
    .select($"tweet_id", $"user_id", col("out_" + className))
    .write
    .format("csv")
    .option("header", "false")
    .mode(SaveMode.Overwrite)
    .save(dataDir + "/" + className + ".csv")
}