# NLP RECSYS generate preds


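This notebook loads the RecSys 2020 validation split (`val.tsv`) and writes baseline engagement predictions (retweet, comment, like, reply) as CSV files.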




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}

// Root directory for the input data and the generated prediction files.
// The trailing slash is required: every path in this notebook is built as
// `dataDir + "<file name>"`, so without it the files would resolve to
// `$HOME/recsys2020val.tsv` instead of `$HOME/recsys2020/val.tsv`.
// Throws NoSuchElementException if HOME is not set in the environment.
val dataDir = sys.env("HOME") + "/recsys2020/"

In [2]:
// Explicit schema for the input file: 24 columns, all nullable, listed in
// file order. Timestamps and counts are read as integers, flags as booleans,
// and everything else as strings.
val schema = StructType(
  Seq[(String, DataType)](
    // Tweet features
    "text_tokens" -> StringType,
    "hashtags" -> StringType,
    "tweet_id" -> StringType,
    "present_media" -> StringType,
    "present_links" -> StringType,
    "present_domains" -> StringType,
    "tweet_type" -> StringType,
    "language" -> StringType,
    "tweet_timestamp" -> IntegerType,
    // "engaged_with" user features
    "engaged_with_user_id" -> StringType,
    "engaged_with_user_follower_count" -> IntegerType,
    "engaged_with_user_following_count" -> IntegerType,
    "engaged_with_user_is_verified" -> BooleanType,
    "engaged_with_user_account_creation" -> IntegerType,
    // "engaging" user features
    "engaging_user_id" -> StringType,
    "engaging_user_follower_count" -> IntegerType,
    "engaging_user_following_count" -> IntegerType,
    "engaging_user_is_verified" -> BooleanType,
    "engaging_user_account_creation" -> IntegerType,
    "engagee_follows_engager" -> BooleanType,
    // Engagement timestamps
    "reply_timestamp" -> IntegerType,
    "retweet_timestamp" -> IntegerType,
    "retweet_with_comment_timestamp" -> IntegerType,
    "like_timestamp" -> IntegerType
  ).map { case (fieldName, fieldType) => StructField(fieldName, fieldType, nullable = true) }
)

// Load `val.tsv` from the data directory as a DataFrame, applying the explicit
// schema instead of inferring one. Fields in the file are separated by the
// U+0001 control character rather than by tabs or commas.
val df = spark.read
  .schema(schema)
  .option("delimiter", "\u0001")
  .csv(dataDir + "val.tsv")

df

[text_tokens: string, hashtags: string ... 22 more fields]

In [3]:
// A DataFrame capped at 100 rows of `df`.
val test = df.limit(n = 100)

test

[text_tokens: string, hashtags: string ... 22 more fields]

In [4]:
// Random baseline: keep the (tweet_id, engaging_user_id) key of every row and
// attach four independently generated `rand()` columns, one per engagement type.
val rand_pred = df.select(
  $"tweet_id",
  $"engaging_user_id",
  rand().as("retweet_pred"),
  rand().as("comment_pred"),
  rand().as("like_pred"),
  rand().as("reply_pred")
)

rand_pred.show()

+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+
|            tweet_id|    engaging_user_id|        retweet_pred|        comment_pred|          like_pred|         reply_pred|
+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+
|7647B4E9DAF4C1D89...|0000006C307460705...| 0.11074563035059959|  0.6541691914881155| 0.5500873166106947|0.28970254447583577|
|408DB1803264B5FF5...|00001331538649227...|  0.7458503165317049|   0.695968301258484|0.45846555627584007| 0.7424707417776507|
|2EE951379C47E8BF6...|00001569CB28972FC...|  0.3415847271304381|0.011981681759338003| 0.7587005135832603|  0.635234528407645|
|2135F24B05DAE3EF2...|00001607209C5774D...| 0.03214864835719167| 0.44465062584338233| 0.3877920352603751| 0.5061259056189739|
|F5F712E11F0ED10C0...|00001607209C5774D...|  0.7778124437488826|  0.8610897126859887| 0.5319407944064418|0.39091365586

In [5]:
// Constant baseline: every row gets the same four scores. Each score is a
// count divided by one million (`m1`); per the original note, the counts are
// estimates from "training1m".
val m1 = 1000000.0

val lit_pred = df.select(
  $"tweet_id",
  $"engaging_user_id",
  lit(111500 / m1).as("retweet_pred"),
  lit(7571 / m1).as("comment_pred"),
  lit(438013 / m1).as("like_pred"),
  lit(26709 / m1).as("reply_pred")
)

lit_pred.show()

+--------------------+--------------------+------------+------------+---------+----------+
|            tweet_id|    engaging_user_id|retweet_pred|comment_pred|like_pred|reply_pred|
+--------------------+--------------------+------------+------------+---------+----------+
|7647B4E9DAF4C1D89...|0000006C307460705...|      0.1115|    0.007571| 0.438013|  0.026709|
|408DB1803264B5FF5...|00001331538649227...|      0.1115|    0.007571| 0.438013|  0.026709|
|2EE951379C47E8BF6...|00001569CB28972FC...|      0.1115|    0.007571| 0.438013|  0.026709|
|2135F24B05DAE3EF2...|00001607209C5774D...|      0.1115|    0.007571| 0.438013|  0.026709|
|F5F712E11F0ED10C0...|00001607209C5774D...|      0.1115|    0.007571| 0.438013|  0.026709|
|09143FEDE9BD494A6...|0000177705514C315...|      0.1115|    0.007571| 0.438013|  0.026709|
|60968762145D2AF58...|00001BC7053263218...|      0.1115|    0.007571| 0.438013|  0.026709|
|3487905D0C69B0FE4...|00001D9D15FBADE90...|      0.1115|    0.007571| 0.438013|  0.026709|

In [6]:
// Alias for the prediction table that the following cells preview and export;
// here it points at the constant baseline.
val pred = lit_pred

In [7]:
// Preview the retweet scores next to their (tweet_id, engaging_user_id) key.
pred
  .select("tweet_id", "engaging_user_id", "retweet_pred")
  .show()

+--------------------+--------------------+------------+
|            tweet_id|    engaging_user_id|retweet_pred|
+--------------------+--------------------+------------+
|7647B4E9DAF4C1D89...|0000006C307460705...|      0.1115|
|408DB1803264B5FF5...|00001331538649227...|      0.1115|
|2EE951379C47E8BF6...|00001569CB28972FC...|      0.1115|
|2135F24B05DAE3EF2...|00001607209C5774D...|      0.1115|
|F5F712E11F0ED10C0...|00001607209C5774D...|      0.1115|
|09143FEDE9BD494A6...|0000177705514C315...|      0.1115|
|60968762145D2AF58...|00001BC7053263218...|      0.1115|
|3487905D0C69B0FE4...|00001D9D15FBADE90...|      0.1115|
|706310D7975C15B9F...|00001F56CDCF81D2E...|      0.1115|
|DBC37B8C8DC70C70F...|00001F56CDCF81D2E...|      0.1115|
|BA7917AA4B620B132...|0000376314CAC0A3E...|      0.1115|
|60DD856C81BC0A115...|000043D9A730DF476...|      0.1115|
|CFBD0716FB1FE9969...|00004E42009644A76...|      0.1115|
|DDCA4D6A45EFFFAD4...|00004E42009644A76...|      0.1115|
|97B2B88F6A4C30F67...|000052B88

In [8]:
// Export one headerless CSV dataset per engagement type, each containing
// (tweet_id, engaging_user_id, <type>_pred). The target path is
// `dataDir + "<type>.csv"`, which Spark creates as a directory of part files.
//
// SaveMode.Overwrite replaces output left by a previous run; with the default
// mode (ErrorIfExists) re-running this cell fails as soon as the first output
// directory already exists.
Seq("retweet", "comment", "like", "reply").foreach { target =>
  pred
    .select("tweet_id", "engaging_user_id", s"${target}_pred")
    .write
    .format("csv")
    .option("header", "false")
    .mode(SaveMode.Overwrite)
    .save(dataDir + s"$target.csv")
}

// combine files with `cat like.csv/*.csv > likes.csv`