# generate preds

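Generates uniformly random baseline predictions for the four engagement types (retweet, retweet with comment, like, reply) for every row of `val.tsv`, and writes one headerless CSV output per engagement type.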

In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}

// Base path of the input and output data. File names are appended to it by plain
// string concatenation, so the trailing slash is required.
val dataDir = "../../data/recsys2020/"

In [2]:
// Explicit schema for the input file: (column name, Spark SQL type) pairs listed
// in file order. Every column is declared nullable.
val schema = StructType(
  Seq[(String, DataType)](
    "text_tokens"                        -> StringType,
    "hashtags"                           -> StringType,
    "tweet_id"                           -> StringType,
    "present_media"                      -> StringType,
    "present_links"                      -> StringType,
    "present_domains"                    -> StringType,
    "tweet_type"                         -> StringType,
    "language"                           -> StringType,
    "tweet_timestamp"                    -> IntegerType,
    "engaged_with_user_id"               -> StringType,
    "engaged_with_user_follower_count"   -> IntegerType,
    "engaged_with_user_following_count"  -> IntegerType,
    "engaged_with_user_is_verified"      -> BooleanType,
    "engaged_with_user_account_creation" -> IntegerType,
    "engaging_user_id"                   -> StringType,
    "engaging_user_follower_count"       -> IntegerType,
    "engaging_user_following_count"      -> IntegerType,
    "engaging_user_is_verified"          -> BooleanType,
    "engaging_user_account_creation"     -> IntegerType,
    "engagee_follows_engager"            -> BooleanType,
    "reply_timestamp"                    -> IntegerType,
    "retweet_timestamp"                  -> IntegerType,
    "retweet_with_comment_timestamp"     -> IntegerType,
    "like_timestamp"                     -> IntegerType
  ).map { case (columnName, columnType) =>
    StructField(columnName, columnType, nullable = true)
  }
)

// Read val.tsv as delimited text using the explicit schema instead of schema
// inference. Fields are separated by the U+0001 control character, not by tabs.
val df = {
  val inputPath = dataDir + "val.tsv"
  spark.read
    .format("csv")
    .schema(schema)
    .option("delimiter", "\u0001")
    .load(inputPath)
}

// Echo the DataFrame so the notebook prints its column summary.
df

[text_tokens: string, hashtags: string ... 22 more fields]

In [3]:
// Small sample of df (at most 100 rows) for quick experiments.
// NOTE(review): `test` is not referenced by any of the later visible cells.
val test = df.limit(100)
test

[text_tokens: string, hashtags: string ... 22 more fields]

In [4]:
// Random baseline: keep the two key columns of every input row and attach one
// independent rand() column per engagement type. rand() is called without a
// seed, so the values differ between runs of this cell.
val rand_pred = {
  val keyColumns = Seq($"tweet_id", $"engaging_user_id")
  val predictionColumns =
    Seq("retweet_pred", "comment_pred", "like_pred", "reply_pred")
      .map(alias => rand().as(alias))
  df.select(keyColumns ++ predictionColumns: _*)
}
rand_pred.show()

+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|            tweet_id|    engaging_user_id|       retweet_pred|        comment_pred|           like_pred|          reply_pred|
+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|7647B4E9DAF4C1D89...|0000006C307460705...| 0.4036248872770235|  0.0580106717104073|  0.6956724216590287|  0.8732374546130629|
|408DB1803264B5FF5...|00001331538649227...| 0.7350791454166347|  0.2296441695971918|  0.8979647611637663|  0.7668456453623054|
|2EE951379C47E8BF6...|00001569CB28972FC...|0.22283278604187862|0.052529003995441914| 0.39101942336068773|   0.826292099387204|
|2135F24B05DAE3EF2...|00001607209C5774D...|0.46427008648349377|  0.9645112161672722|  0.8952030165011892|0.011009512560946177|
|F5F712E11F0ED10C0...|00001607209C5774D...|0.18259647290861047|  0.9261971889328737|  0.2630592537290849|  0.65

In [6]:
// Preview the three columns that make up the retweet output:
// tweet_id, engaging_user_id and retweet_pred.
rand_pred
  .select("tweet_id", "engaging_user_id", "retweet_pred")
  .show()

+--------------------+--------------------+-------------------+
|            tweet_id|    engaging_user_id|       retweet_pred|
+--------------------+--------------------+-------------------+
|7647B4E9DAF4C1D89...|0000006C307460705...| 0.4036248872770235|
|408DB1803264B5FF5...|00001331538649227...| 0.7350791454166347|
|2EE951379C47E8BF6...|00001569CB28972FC...|0.22283278604187862|
|2135F24B05DAE3EF2...|00001607209C5774D...|0.46427008648349377|
|F5F712E11F0ED10C0...|00001607209C5774D...|0.18259647290861047|
|09143FEDE9BD494A6...|0000177705514C315...|0.18937689577016614|
|60968762145D2AF58...|00001BC7053263218...|0.40800875228365163|
|3487905D0C69B0FE4...|00001D9D15FBADE90...| 0.9636274248752141|
|706310D7975C15B9F...|00001F56CDCF81D2E...|  0.385405784803737|
|DBC37B8C8DC70C70F...|00001F56CDCF81D2E...| 0.9816287680846935|
|BA7917AA4B620B132...|0000376314CAC0A3E...|0.00899342893678634|
|60DD856C81BC0A115...|000043D9A730DF476...|0.15993884740431674|
|CFBD0716FB1FE9969...|00004E42009644A76.

In [5]:
// Write one headerless CSV output per engagement type; each row is
// (tweet_id, engaging_user_id, <prediction>).
//
// SaveMode.Overwrite makes this cell re-runnable: with the default save mode
// (ErrorIfExists) the first write fails as soon as an output directory from a
// previous or partially failed run is present.
Seq(
  "retweet_pred" -> "retweet.csv",
  "comment_pred" -> "comment.csv",
  "like_pred"    -> "like.csv",
  "reply_pred"   -> "reply.csv"
).foreach { case (predictionColumn, outputName) =>
  rand_pred
    .select("tweet_id", "engaging_user_id", predictionColumn)
    .write
    .format("csv")
    .option("header", "false")
    .mode(SaveMode.Overwrite)
    .save(dataDir + outputName)
}

// Each output path is a directory of part files rather than a single file;
// combine the parts with e.g. `cat like.csv/*.csv > likes.csv`