# NLP RECSYS preprocessing<br>




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import scala.collection.mutable.WrappedArray
import org.apache.spark.sql.functions.{udf,to_timestamp}

// Root directory of the dataset files, located under the current user's HOME.
val dataDir = s"${sys.env("HOME")}/recsys2020"
// Base name (without extension) of the dataset file to process.
val dsName = "training"


In [2]:
// Inverse of the "bert_multi_cased" vocabulary: the model's vocabulary entries
// are swapped so that lookups go from embedding index to token string.
// Declared as a `val` (it is never reassigned) and built with `_.swap` rather
// than the postfix form `_ swap`, which needs `scala.language.postfixOps`.
val embeddingIdxToTokenStringMap = BertEmbeddings
    .pretrained(name = "bert_multi_cased", lang = "xx")
    .vocabulary
    .getOrDefault
    .map(_.swap)

In [3]:
// Explicit schema of the raw dataset file. Columns are listed in file order
// and every one of them is declared nullable.
val schema = {
    val columns: Seq[(String, DataType)] = Seq(
        "text_tokens" -> StringType,
        "hashtags" -> StringType,
        "tweet_id" -> StringType,
        "present_media" -> StringType,
        "present_links" -> StringType,
        "present_domains" -> StringType,
        "tweet_type" -> StringType,
        "language" -> StringType,
        "tweet_timestamp" -> IntegerType,
        "author_id" -> StringType,
        "author_follower_count" -> IntegerType,
        "author_following_count" -> IntegerType,
        "author_is_verified" -> BooleanType,
        "author_account_creation" -> IntegerType,
        "user_id" -> StringType,
        "user_follower_count" -> IntegerType,
        "user_following_count" -> IntegerType,
        "user_is_verified" -> BooleanType,
        "user_account_creation" -> IntegerType,
        "follows" -> BooleanType,
        "reply_timestamp" -> IntegerType,
        "retweet_timestamp" -> IntegerType,
        "retweet_with_comment_timestamp" -> IntegerType,
        "like_timestamp" -> IntegerType
    )
    StructType(columns.map { case (name, dataType) =>
        StructField(name, dataType, nullable = true)
    })
}

// Loads the raw dataset with the explicit `schema`. The file is read through
// the CSV reader with the control character U+0001 as the field delimiter.
val df = {
    val inputPath = s"$dataDir/$dsName.tsv"
    spark.read
        .format("csv")
        .option("delimiter", "\u0001")
        .schema(schema)
        .load(inputPath)
}


In [4]:
// Decodes a tab-separated string of BERT token ids into the corresponding
// token strings, using embeddingIdxToTokenStringMap. Ids that are not in the
// map become "[UNK]".
// The source column is nullable, so a null input yields a null output instead
// of a NullPointerException. A piece that is not an integer still fails with
// NumberFormatException, so malformed rows are not silently hidden.
val udf_unbert = udf[Array[String], String] { rawTokenIds =>
    Option(rawTokenIds)
        .map(_.split("\t").map { strTokenIdx =>
            embeddingIdxToTokenStringMap.getOrElse(strTokenIdx.toInt, "[UNK]")
        })
        .orNull
}
// Turns a nullable engagement timestamp into a 0/1 label:
// 0 when the timestamp is null, 1 otherwise.
val udf_has_engagement = udf[Integer, Integer] {
    case null => 0
    case _    => 1
}
// Encodes a boolean flag as an integer: true -> 1, false -> 0.
val udf_bool_to_int = udf[Integer, Boolean] {
    case true  => 1
    case false => 0
}

In [5]:
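// If a tweet has media (e.g. a photo or video), the last link in its text is always a link to the tweet itself.
// NOTE: udf_tweet accepts present_media but does not use it yet, so that trailing link is kept in the text.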
// Tokens that are removed before the tweet text is reassembled.
val ignored_tokens: Set[String] = Set("[CLS]", "[UNK]", "[SEP]", "UNKN")
// Rebuilds a space-separated tweet text from decoded word-piece tokens.
//
// Tokens listed in ignored_tokens are dropped first. The remaining tokens are
// scanned left to right and a token is glued onto the previous output word
// when it continues that word:
//   * a URL in progress (previous word starts with "https") is extended piece
//     by piece through "https://t.co/" followed by alphanumeric pieces;
//   * a hashtag or mention (previous word starts with "#" or "@") is extended
//     with pieces made of letters, digits and underscores;
//   * any other "##"-prefixed continuation piece is appended without its "##".
// Every other token starts a new word.
//
// Parameters:
//   text_tokens   - decoded tokens of the tweet; null yields a null result.
//   present_media - currently unused; kept so existing call sites keep working.
//
// Differences from the previous implementation:
//   * a URL prefix of a length with no rule (e.g. 11) no longer throws
//     scala.MatchError; the token simply starts a new word;
//   * a "##" piece with no preceding word no longer calls `init`/`last` on an
//     empty collection; it is emitted unchanged as the first word;
//   * the accumulator is a Vector, so `last`/`init`/append are not linear.
val udf_tweet = udf((text_tokens: WrappedArray[String], present_media: WrappedArray[String]) => {
    // A hashtag/mention ending in one of these characters accepts any valid next piece.
    val tagJoiners = Set('_', '#', '@')

    // Text to append to the in-progress URL `url`, if `piece` is the fragment expected
    // at this point of "https://t.co/<id>". The URL's current length identifies the position.
    def urlSuffix(url: String, piece: String): Option[String] = url.length match {
        case 5          => Some(piece).filter(_ == ":")
        case 6 | 7 | 12 => Some(piece).filter(_ == "/")
        case 8          => Some(piece).filter(_ == "t")
        case 9          => Some(piece).filter(_ == ".")
        case 10         => Some(piece).filter(_ == "co")
        case 13         => Some(piece).filter(_.forall(_.isLetterOrDigit))
        case n if n > 13 =>
            if (piece.startsWith("##")) Some(piece.stripPrefix("##")).filter(_.forall(_.isLetterOrDigit))
            else None
        case _          => None // no rule for this length: do not merge
    }

    // Text to append to the in-progress hashtag/mention `tag`, if `piece` continues it.
    def tagSuffix(tag: String, piece: String): Option[String] = {
        val acceptsAnyPiece = tagJoiners.contains(tag.last)
        if (acceptsAnyPiece || piece.startsWith("##") || piece == "_")
            Some(piece.stripPrefix("##")).filter(_.forall(c => c.isLetterOrDigit || c == '_'))
        else None
    }

    // Text to append to the previous word `prev`, or None when `piece` starts a new word.
    def suffixFor(prev: String, piece: String): Option[String] =
        if (prev.startsWith("https") && piece != "https") urlSuffix(prev, piece)
        else if (prev.startsWith("#") || prev.startsWith("@")) tagSuffix(prev, piece)
        else if (piece.startsWith("##")) Some(piece.stripPrefix("##"))
        else None

    if (text_tokens == null) null
    else text_tokens
        .filterNot(ignored_tokens.contains)
        .foldLeft(Vector.empty[String]) { (words, piece) =>
            val extended = for {
                prev   <- words.lastOption
                suffix <- suffixFor(prev, piece)
            } yield words.init :+ (prev + suffix)
            extended.getOrElse(words :+ piece)
        }
        .mkString(" ")
})

In [6]:
// Derives the model-ready columns from the raw DataFrame:
//   * text_tokens is decoded from token ids to token strings;
//   * the tab-separated list columns are split into arrays;
//   * a 0/1 has_* label is appended for each engagement timestamp;
//   * the boolean flags are re-encoded as integers;
//   * tweet_text is rebuilt from the decoded tokens.
// Columns are replaced or appended in the order below, which fixes the
// resulting column order.
val converted_df = {
    val tabSeparatedColumns = Seq("hashtags", "present_media", "present_links", "present_domains")
    // (label column to append, timestamp column it is derived from)
    val engagementLabels = Seq(
        "has_retweet" -> "retweet_timestamp",
        "has_retweet_with_comment" -> "retweet_with_comment_timestamp",
        "has_like" -> "like_timestamp",
        "has_reply" -> "reply_timestamp"
    )
    val booleanFlagColumns = Seq("follows", "user_is_verified", "author_is_verified")

    val decoded = df.withColumn("text_tokens", udf_unbert(df("text_tokens")))
    val withArrays = tabSeparatedColumns.foldLeft(decoded) { (acc, name) =>
        acc.withColumn(name, split(acc(name), "\t"))
    }
    val withLabels = engagementLabels.foldLeft(withArrays) { case (acc, (label, source)) =>
        acc.withColumn(label, udf_has_engagement(acc(source)))
    }
    val withIntFlags = booleanFlagColumns.foldLeft(withLabels) { (acc, name) =>
        acc.withColumn(name, udf_bool_to_int(acc(name)))
    }
    withIntFlags.withColumn(
        "tweet_text",
        udf_tweet(withIntFlags("text_tokens"), withIntFlags("present_media"))
    )
}

converted_df

[text_tokens: array<string>, hashtags: array<string> ... 27 more fields]

In [7]:
// Keeps only the columns needed downstream and writes them as Parquet,
// overwriting any previous output at the same path.
converted_df
    .select(
        Seq(
            // for evaluation
            "user_id",
            "tweet_id",
            // model inputs
            "tweet_type",
            "tweet_text",
            "author_follower_count",
            "author_following_count",
            "author_is_verified",
            "user_follower_count",
            "user_following_count",
            "user_is_verified",
            "follows",
            // unused for now
            "tweet_timestamp",
            "hashtags",
            "present_media",
            "present_domains",
            // model outputs
            "has_retweet",
            "has_retweet_with_comment",
            "has_like",
            "has_reply"
        ).map(converted_df(_)): _*
    )
    .write
    .mode(SaveMode.Overwrite)
    .parquet(s"$dataDir/$dsName.parquet")

In [8]:
// Reads the written Parquet output back so its schema is echoed by the notebook.
spark.read.parquet(s"$dataDir/$dsName.parquet")

[user_id: string, tweet_id: string ... 17 more fields]