# NLP RECSYS Eval Stage




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import org.apache.spark.ml.{Pipeline, PipelineModel, Transformer}
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.storage._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, udf}
import org.apache.spark.sql.types.{DataTypes, StructType}
import org.apache.spark.ml.evaluation._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics


// Root directory of the RecSys 2020 data. Uses $HOME when it is set and otherwise falls back to
// the JVM's user.home property, so the lookup does not throw when HOME is not exported.
val dataDir = sys.env.getOrElse("HOME", sys.props("user.home")) + "/recsys2020"

// Dataset name; it is interpolated into the parquet path and the prediction-CSV directory.
val dsName = "val10k"

// Target classes. Each name selects a `<name>.csv` predictions file and a `has_<name>` label column.
val classNames = Array(
  "retweet",
  "retweet_with_comment",
  "like",
  "reply")

In [2]:
// Dataset that the per-class prediction files are joined against.
val df = spark.read.parquet(s"${dataDir}/${dsName}.parquet")

// Explicit column layout applied to every prediction CSV (all columns nullable).
val schema = StructType(Seq(
  StructField("tweet_id", StringType, nullable = true),
  StructField("user_id", StringType, nullable = true),
  StructField("out_pred", DoubleType, nullable = true)
))

// One predictions DataFrame per class, in the same order as classNames.
val preds = classNames.map { className =>
  spark.read
    .format("csv")
    .schema(schema)
    .option("delimiter", ",")
    .load(s"${dataDir}/out/${dsName}/${className}.csv")
}

In [5]:
// Maps a Boolean flag to a Double (1.0 for true, 0.0 for false).
val udf_bool_to_int = udf[Double, Boolean](flag => if (flag) 1.0 else 0.0)

// Fold every class's predictions into the dataset: outer-join on (user_id, tweet_id), rename the
// generic out_pred column to pred_<class>, and convert the has_<class> column to a Double.
val joined = classNames.zip(preds).foldLeft(df) { (acc, entry) =>
  val (name, predictions) = entry
  val labelColumn = "has_" + name
  val withPredictions = acc
    .join(predictions, Seq("user_id", "tweet_id"), "outer")
    .withColumnRenamed("out_pred", "pred_" + name)
  withPredictions.withColumn(labelColumn, udf_bool_to_int(col(labelColumn)))
}
joined

[user_id: string, tweet_id: string ... 26 more fields]

In [6]:
// Select (prediction, true label) pairs for each class and report PR-AUC and ROC-AUC.

// Binary cross-entropy (log loss) of a single example.
//   label: ground-truth value, expected to be 0.0 or 1.0
//   pred:  predicted probability of the positive class
// The prediction is clamped to [eps, 1 - eps] before taking logarithms. Without the clamp,
// pred == 0.0 or pred == 1.0 produces Infinity, or NaN through 0 * log(0), even when the
// prediction is exactly right.
val log_loss = { (label: Double, pred: Double) =>
  val eps = 1e-15
  val p = math.min(math.max(pred, eps), 1 - eps)
  -(label * math.log(p) + (1 - label) * math.log(1 - p))
}

// For every class, score the predictions against the labels and print PR-AUC and ROC-AUC.
for (className <- classNames)
{
  // (score, label) pairs. Rows where either value is null or NaN are dropped first: `joined` is
  // built with outer joins, so unmatched rows carry nulls and Row.getDouble would throw on them.
  val scoreAndLabels = joined
    .select(col("pred_" + className), col("has_" + className))
    .na.drop()
    .rdd
    .map { row => (row.getDouble(0), row.getDouble(1)) }

  val metrics = new BinaryClassificationMetrics(scoreAndLabels)

  println(s"${className} PRAUC = ${metrics.areaUnderPR}")
  println(s"${className} ROCAUC = ${metrics.areaUnderROC}")

  // Release the intermediate RDDs used by the metrics computation before the next class.
  metrics.unpersist()
}

retweet PRAUC = 0.1475587328164478
retweet ROCAUC = 0.5754907247228939
retweet_with_comment PRAUC = 0.012144980635543947
retweet_with_comment ROCAUC = 0.6017595169583473
like PRAUC = 0.5139128386938798
like ROCAUC = 0.5922976237216836
reply PRAUC = 0.03713096484535038
reply ROCAUC = 0.5748480506721724
