# NLP RecSys: Spark NLP Scala experiments




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}

// Base directory that the input TSV is read from and the Parquet output is written to.
val dataDir: String = "/home/sko/sync/data/recsys2020"


In [3]:
// Inverted vocabulary of the pretrained "bert_multi_cased" model: every
// (key, value) entry of the model's vocabulary map is flipped to (value, key),
// so the result can be queried by what used to be the value.
// Loading the pretrained model may trigger a download of the resource.
val embeddingIdxToTokenStringMap = {
  val bertModel = BertEmbeddings.pretrained(name = "bert_multi_cased", lang = "xx")
  bertModel.vocabulary.getOrDefault.map { case (token, index) => (index, token) }
}


bert_multi_cased download started this may take some time.
Approximate size to download 638.7 MB
Download done! Loading the resource.


In [4]:
// Explicit schema for the training file: 24 columns, all nullable, listed in
// the order they are expected to appear in each input row.
val schema = {
  val columns: Seq[(String, DataType)] = Seq(
    "text_tokens" -> StringType,
    "hashtags" -> StringType,
    "tweet_id" -> StringType,
    "present_media" -> StringType,
    "present_links" -> StringType,
    "present_domains" -> StringType,
    "tweet_type" -> StringType,
    "language" -> StringType,
    "tweet_timestamp" -> IntegerType,
    "engaged_with_user_id" -> StringType,
    "engaged_with_user_follower_count" -> IntegerType,
    "engaged_with_user_following_count" -> IntegerType,
    "engaged_with_user_is_verified" -> BooleanType,
    "engaged_with_user_account_creation" -> IntegerType,
    "engaging_user_id" -> StringType,
    "engaging_user_follower_count" -> IntegerType,
    "engaging_user_following_count" -> IntegerType,
    "engaging_user_is_verified" -> BooleanType,
    "engaging_user_account_creation" -> IntegerType,
    "engagee_follows_engager" -> BooleanType,
    "reply_timestamp" -> IntegerType,
    "retweet_timestamp" -> IntegerType,
    "retweet_with_comment_timestamp" -> IntegerType,
    "like_timestamp" -> IntegerType
  )
  StructType(columns.map { case (name, dataType) => StructField(name, dataType, nullable = true) })
}

// Load the raw training file as CSV using the explicit schema; fields are
// separated by the \u0001 control character rather than a comma or tab.
val df = {
  val trainingPath = dataDir + "/training1m.tsv"
  spark.read
    .schema(schema)
    .format("csv")
    .option("delimiter", "\u0001")
    .load(trainingPath)
}


[text_tokens: string, hashtags: string ... 22 more fields]

In [5]:
// Decodes a tab-separated string of token ids into the matching token strings.
// Each id is parsed with `toInt` and looked up in `embeddingIdxToTokenStringMap`;
// ids that are missing from the map become "UNKN".
// A null input produces a null result instead of failing the task with a
// NullPointerException on `split` (a null column value reaches the function
// as a null String).
// Ids that are not valid integers still raise NumberFormatException, as before,
// so corrupt input is not silently hidden.
val udf_unbert = udf[Array[String], String] { encodedTokens =>
  Option(encodedTokens)
    .map(_.split("\t").map { strTokenIdx =>
      embeddingIdxToTokenStringMap.getOrElse(key = strTokenIdx.toInt, default = "UNKN")
    })
    .orNull
}

In [6]:
// Replace `text_tokens` with its decoded form (via `udf_unbert`), then turn each
// of the four tab-separated string columns into an array column by splitting on
// the tab character. Columns are replaced in place, so names and order are kept.
val converted_df = Seq("hashtags", "present_media", "present_links", "present_domains")
  .foldLeft(df.withColumn("text_tokens", udf_unbert('text_tokens))) { (frame, columnName) =>
    frame.withColumn(columnName, split(Symbol(columnName), "\t"))
  }
// Timestamp casts are currently disabled; the *_timestamp columns keep their integer type.
//   .withColumn("tweet_timestamp", 'tweet_timestamp.cast(TimestampType))
//   .withColumn("reply_timestamp", 'reply_timestamp.cast(TimestampType))
//   .withColumn("retweet_timestamp", 'retweet_timestamp.cast(TimestampType))
//   .withColumn("retweet_with_comment_timestamp", 'retweet_with_comment_timestamp.cast(TimestampType))
//   .withColumn("like_timestamp", 'like_timestamp.cast(TimestampType))
converted_df

[text_tokens: array<string>, hashtags: array<string> ... 22 more fields]

In [7]:
// Persist the converted frame as Parquet, replacing any output already at that path.
converted_df.write
  .mode(SaveMode.Overwrite)
  .parquet(dataDir + "/training1m.parquet")

In [8]:
// Read the Parquet output back; the notebook echoes the resulting frame's schema.
spark.read
  .parquet(dataDir + "/training1m.parquet")

[text_tokens: array<string>, hashtags: array<string> ... 22 more fields]