# NLP RECSYS analysis




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import org.apache.spark.storage.StorageLevel

// Root directory of the data files: "recsys2020" under the current user's HOME.
// sys.env("HOME") throws NoSuchElementException when HOME is not set.
val dataDir = {
  val home = sys.env("HOME")
  s"$home/recsys2020"
}

In [2]:
// Explicit schema for the input file: 24 columns, all nullable.
// Columns are strings except the integer count/timestamp columns and the boolean flags.
val schema = StructType(
  Seq[(String, DataType)](
    // Tweet features
    "text_tokens"                        -> StringType,
    "hashtags"                           -> StringType,
    "tweet_id"                           -> StringType,
    "present_media"                      -> StringType,
    "present_links"                      -> StringType,
    "present_domains"                    -> StringType,
    "tweet_type"                         -> StringType,
    "language"                           -> StringType,
    "tweet_timestamp"                    -> IntegerType,
    // "engaged_with" user columns
    "engaged_with_user_id"               -> StringType,
    "engaged_with_user_follower_count"   -> IntegerType,
    "engaged_with_user_following_count"  -> IntegerType,
    "engaged_with_user_is_verified"      -> BooleanType,
    "engaged_with_user_account_creation" -> IntegerType,
    // "engaging" user columns
    "engaging_user_id"                   -> StringType,
    "engaging_user_follower_count"       -> IntegerType,
    "engaging_user_following_count"      -> IntegerType,
    "engaging_user_is_verified"          -> BooleanType,
    "engaging_user_account_creation"     -> IntegerType,
    "engagee_follows_engager"            -> BooleanType,
    // Engagement timestamp columns
    "reply_timestamp"                    -> IntegerType,
    "retweet_timestamp"                  -> IntegerType,
    "retweet_with_comment_timestamp"     -> IntegerType,
    "like_timestamp"                     -> IntegerType
  ).map { case (name, dataType) => StructField(name, dataType, nullable = true) }
)

// Load val.tsv from dataDir with the explicit schema (no schema inference).
// Fields in the file are separated by the U+0001 control character rather than a tab.
val df = spark.read
  .schema(schema)
  .option("delimiter", "\u0001")
  .format("csv")
  .load(s"$dataDir/val.tsv")

In [3]:
// Language distribution: for each value of the "language" column, the fraction of
// all rows in df that carry it, largest fraction first.
val rows = df.count()
val res = {
  val perLanguage = df
    .select("language")
    .groupBy("language")
    .agg(count("*").as("cnt"))
  // Replace the raw per-language count with its share of the total row count.
  perLanguage
    .withColumn("cnt", col("cnt") / rows)
    .orderBy(col("cnt").desc)
}
// Sanity check: the shares should add up to (approximately) 1.
val sumPercent = res.agg(sum("cnt")).head().get(0)

res.show(numRows = 1000, truncate = false)
sumPercent

+--------------------------------+---------------------+
|language                        |cnt                  |
+--------------------------------+---------------------+
|D3164C7FBCF2565DDF915B1B3AEFB1DC|0.43447980391345364  |
|22C448FF81263D4BAF2A176145EE9EAD|0.15602849009586697  |
|06D61DCBBE938971E1EA0C38BD9B5446|0.08845777233092927  |
|ECED8A16BE2A5E8871FD55F4842F16B1|0.0704333397805851   |
|B9175601E87101A984A50F8A62A1C374|0.05070714245695474  |
|4DC22C3F31C5C43721E6B5815A595ED6|0.030931816511777777 |
|167115458A0DBDFF7E9C0C53A83BAC9B|0.0280888318677801   |
|022EC308651FACB02794A8147AEE1B78|0.021799946000747205 |
|125C57F4FA6D4E110983FB11B52EFD4E|0.020493532899706384 |
|FA3F382BC409C271E3D6EAF8BE4648DD|0.02047157282818412  |
|9BF3403E0EB7EA8A256DA9019C0B0716|0.01612917983919034  |
|975B38F44D65EE42A547283787FF5A21|0.009035724813276046 |
|2996EB2FE8162C076D070A4C8D6532CD|0.007845869014930314 |
|FF60A88F53E63000266F8B9149E35AD9|0.005542947283722153 |
|717293301FE296B0B61950D0414858

0.9999999999999998

In [5]:
// Histogram over "num_tweets": one row per distinct num_tweets value with the number of
// rows of num_tweets_reacted_with_by_user (defined in an earlier cell) that have it.
// Marked for in-memory persistence so later cells can reuse it.
val hist = {
  val rowsPerNumTweets = num_tweets_reacted_with_by_user
    .groupBy("num_tweets")
    .count()
  rowsPerNumTweets.persist(StorageLevel.MEMORY_ONLY)
}
hist

In [6]:
// Evaluate hist as the cell's final expression so the notebook echoes it.
hist

[num_tweets: bigint, count: bigint]