# NLP RECSYS Train Pipeline Stage2




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}

import org.apache.spark.ml.linalg.Vectors


val dataDir = sys.env("HOME") + "/recsys2020"
val dsName = "training1m"

In [2]:
val df = spark.read.parquet(dataDir + s"/${dsName}_stage1.parquet")

In [3]:
val classNames = Array(
  "retweet",
  "retweet_with_comment",
  "like",
  "reply")

val tweetTypeIndexer = new StringIndexerModel(Array("TopLevel", "Retweet", "Quote", "Reply"))
  .setInputCol("tweet_type")
  .setOutputCol("tweet_type_idx");

val tweetTypeEncoder = new OneHotEncoder()
  .setInputCol(tweetTypeIndexer.getOutputCol)
  .setOutputCol("tweet_type_onehot")

val scaleAss = new VectorAssembler()
  .setInputCols(Array(
    "author_follower_count", 
    "author_following_count", 
    "user_follower_count", 
    "user_following_count", 
    "num_hashtags",
    "num_media",
    "num_links",
    "num_domains"
    // "num_tokens"
  ))
  .setOutputCol("count_features")

val scaler = new StandardScaler()
  .setInputCol(scaleAss.getOutputCol)
  .setOutputCol("count_features_scaled")
  .setWithStd(true)
  .setWithMean(false)

val ass = new VectorAssembler()
  .setInputCols(Array(
      "embeddings",
      tweetTypeEncoder.getOutputCol,
      scaler.getOutputCol,
      "author_is_verified",
      "user_is_verified",
      "follows"
    ))
  .setOutputCol("features")

val classifiers = for (className <- classNames) yield new RandomForestClassifier()
                        .setLabelCol("has_" + className)
                        .setFeaturesCol("features")
                        .setProbabilityCol("prob_" + className)
                        .setPredictionCol("pred_" + className)
                        .setRawPredictionCol("predraw_" + className)
                        .setFeatureSubsetStrategy("auto");

val pred_pipeline = new Pipeline().setStages(Array(tweetTypeIndexer, tweetTypeEncoder, scaleAss, scaler, ass) ++ classifiers)

In [4]:
val udf_bool_to_int = udf[Integer, Boolean](x => if (x) 1 else 0)

val df_with_ints = df
    .withColumn("has_retweet", udf_bool_to_int(col("has_retweet")))
    .withColumn("has_retweet_with_comment", udf_bool_to_int(col("has_retweet_with_comment")))
    .withColumn("has_like", udf_bool_to_int(col("has_like")))
    .withColumn("has_reply", udf_bool_to_int(col("has_reply")))

val convertUDF = udf((array : Seq[Float]) => {
  Vectors.dense(array.toArray.map(_.toDouble))
})

val df_with_embeddings = df_with_ints
        .withColumn("embeddings", convertUDF('embeddings))

val fitted_pipeline = pred_pipeline.fit(df_with_embeddings)

In [5]:
fitted_pipeline.write.overwrite().save(dataDir + "/pipeline_stage2_v1")

In [6]:
var res = fitted_pipeline.transform(df_with_embeddings)
// only get relevant columns
val toArr: Any => Double = _.asInstanceOf[DenseVector].toArray(1)
val toArrUdf = udf(toArr)
val labelColumns = for (className <- classNames) yield "has_" + className;


val outputNames = for (className <- classNames) yield {
    val inputName = "prob_" + className;
    val outputName = "out_" + className;
    res = res.withColumn(outputName, toArrUdf(col(inputName)));
    outputName
}

res.selectExpr((Array("user_id", "tweet_id") ++ labelColumns ++ outputNames):_*).show()

+--------------------+--------------------+-----------+------------------------+--------+---------+--------------------+------------------------+-------------------+--------------------+
|             user_id|            tweet_id|has_retweet|has_retweet_with_comment|has_like|has_reply|         out_retweet|out_retweet_with_comment|           out_like|           out_reply|
+--------------------+--------------------+-----------+------------------------+--------+---------+--------------------+------------------------+-------------------+--------------------+
|5BCEFB026703939F2...|3C07C7EDBF91457F1...|          0|                       0|       0|        0| 0.05104809329448203|    2.290393797092667...|0.30713496706923177|0.007886866218065289|
|0D06557BA01CA9935...|85F62F7831FC3C991...|          0|                       0|       0|        0| 0.15327145131810457|    1.096491228070175...|0.39198552411620835|  0.0577880367844353|
|E05AFECF10AA77863...|26BB95D5CFEE7A4E4...|          0|          

In [7]:
res.show(false)

+--------------------------------+--------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+----------------------+------------------+-------------------+--------------------+----------------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
val val_df = spark.read.parquet(dataDir + s"/val_stage1.parquet")



org.apache.spark.sql.AnalysisException: Path does not exist: file:/home/iseratho/recsys2020/val_stage1.parquet;

In [9]:
var finalDf = fitted_pipeline.transform(val_df)
finalDf.show()

java.lang.IllegalArgumentException: Data type array<float> of column embeddings is not supported.

In [10]:
// only get relevant columns
val toArr: Any => Double = _.asInstanceOf[DenseVector].toArray(1)
val toArrUdf = udf(toArr)

var finalFinalDf = finalDf;
val outputNames = for (className <- classNames) yield {
    val inputName = "prob_" + className;
    val outputName = "out_" + className;
    finalFinalDf = finalFinalDf.withColumn(outputName, toArrUdf(col(inputName)));
    outputName
}

finalFinalDf.selectExpr((Array("user_id", "tweet_id") ++ outputNames):_*).show()

Error: not found: value finalDf (141)

In [11]:
for (className <- classNames) {
    finalFinalDf.select($"tweet_id", $"user_id", col("out_" + className)).write.format("csv").option("header", "false").save(dataDir + "/" + className + ".csv")
}

Error: not found: value finalFinalDf (36)