# NLP RECSYS Analyze the Dataset




In [1]:
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.ml.tensorflow.TensorflowBert
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.{udf,to_timestamp}
import org.apache.spark.ml.feature.QuantileDiscretizer

val dataDir = sys.env("HOME") + "/recsys2020"

In [2]:
val df = spark.read.parquet(dataDir + "/training1m.parquet")

In [3]:
df.select("language")

org.apache.spark.sql.AnalysisException: cannot resolve '`language`' given input columns: [user_following_count, has_retweet_with_comment, follows, present_domains, num_hashtags, tweet_type, hashtags, user_follower_count, user_is_verified, num_domains, author_follower_count, has_like, num_links, has_reply, present_media, tweet_id, user_id, num_media, tokens, has_retweet, tweet_timestamp, author_following_count, author_is_verified];;
'Project ['language]
+- Relation[user_id#0,tweet_id#1,tweet_type#2,tokens#3,author_follower_count#4,author_following_count#5,author_is_verified#6,user_follower_count#7,user_following_count#8,user_is_verified#9,follows#10,num_hashtags#11,num_media#12,num_links#13,num_domains#14,tweet_timestamp#15,hashtags#16,present_media#17,present_domains#18,has_retweet#19,has_retweet_with_comment#20,has_like#21,has_reply#22] parquet


In [4]:
df.describe("reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp").show()

org.apache.spark.sql.AnalysisException: cannot resolve '`reply_timestamp`' given input columns: [present_domains, user_following_count, follows, sentence, has_reply, hashtags, tweet_type, user_is_verified, user_follower_count, author_follower_count, has_retweet_with_comment, has_like, tweet_timestamp, tweet_id, user_id, has_retweet, author_is_verified, present_media, author_following_count];;
'Project ['reply_timestamp, 'retweet_timestamp, 'retweet_with_comment_timestamp, 'like_timestamp]
+- Relation[user_id#0,tweet_id#1,tweet_type#2,sentence#3,author_follower_count#4,author_following_count#5,author_is_verified#6,user_follower_count#7,user_following_count#8,user_is_verified#9,follows#10,tweet_timestamp#11,hashtags#12,present_media#13,present_domains#14,has_retweet#15,has_retweet_with_comment#16,has_like#17,has_reply#18] parquet


In [5]:
val udf_has_engagement = udf[Integer, Integer](x => if (x != null) 1 else 0)
val udf_bool_to_int = udf[Integer, Boolean](x => if (x) 1 else 0)

df.withColumn("follows", udf_bool_to_int('engagee_follows_engager)).show()
// df.withColumn("has_like", udf_has_engagement('like_timestamp)).show()
// df.withColumn("has_reply", udf_has_engagement('reply_timestamp)).show()

+-----------------------+--------------------+--------------------+-------------+-------------+---------------+----------+--------------------+---------------+--------------------+--------------------------------+---------------------------------+-----------------------------+----------------------------------+--------------------+----------------------------+-----------------------------+-------------------------+------------------------------+-----------------------+---------------+-----------------+------------------------------+--------------+-------+
|            text_tokens|            hashtags|            tweet_id|present_media|present_links|present_domains|tweet_type|            language|tweet_timestamp|engaged_with_user_id|engaged_with_user_follower_count|engaged_with_user_following_count|engaged_with_user_is_verified|engaged_with_user_account_creation|    engaging_user_id|engaging_user_follower_count|engaging_user_following_count|engaging_user_is_verified|engaging_user_accoun

In [6]:
df

[text_tokens: array<string>, hashtags: array<string> ... 22 more fields]

In [7]:
// Calculate average number of interactions
val int_counts = df.groupBy("engaging_user_id").count()
int_counts.agg(avg("count")).show()

+------------------+
|        avg(count)|
+------------------+
|1.0337498656125175|
+------------------+



In [8]:
int_counts

[engaging_user_id: string, count: bigint]

In [9]:
val discretizer = new QuantileDiscretizer()
  .setInputCol("count")
  .setOutputCol("hist")
  .setNumBuckets(5)

var result = discretizer.fit(int_counts).transform(int_counts)
result = result.withColumn("hist", col("hist").cast(IntegerType))
result.show(false)

+--------------------------------+-----+----+
|engaging_user_id                |count|hist|
+--------------------------------+-----+----+
|8E7635BA67D7D363604AAEF78869CF8A|1    |1   |
|AEC33191EEF88AE87172D35FD6B7DEB8|1    |1   |
|384773E169D00767CDBD36D8929CED75|1    |1   |
|A473AB8FAD982872C89742D724D4F6AB|1    |1   |
|D8ABFB24737488D0F30916871DC02E61|1    |1   |
|C328B8FF5225AF35AD213B586E3D10A6|1    |1   |
|D02ED9169AF2AF79BFAB2D04CA4AEE08|1    |1   |
|ADEB7FAB341E95C391F1394C496EF9ED|1    |1   |
|C0ACF8893A2F46D0259C7E6881D381A0|2    |1   |
|CA041E9EF58477D4B097BBF67E3411FD|1    |1   |
|EB123E68BC54C6B96D9BB27E5FF53D98|3    |1   |
|1C7DBF99B5F82B65B28155128DB40470|1    |1   |
|0413B272EEE0B987C2E23A31CA5AA110|1    |1   |
|89FE00F0BC717421923581681950A087|1    |1   |
|5A9714AC339C63AE0FC09E4FA00725AD|1    |1   |
|41EDFE146E65938905B80FFD312AD06F|1    |1   |
|536EC4034D544A6A17FD804D416FE4F0|1    |1   |
|1B1C8F849A91786F8E5DF69821F99980|1    |1   |
|3DF4EDB10EF6A5E01A2DB10A4EB55598|

In [10]:
int_counts.groupBy("count").agg(count("count") as "countcount")

[count: bigint, countcount: bigint]

In [11]:
int_counts.select(int_counts("count").cast(DoubleType)).rdd.map(x => x)

MapPartitionsRDD[500] at map at Cell10:1

In [12]:
val hist = int_counts.selectExpr("histogram_numeric(count, 50)")
hist.show(false)
hist.toDF()

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|histogram_numeric( count, 50)                                                                                                                                                                                                    

[histogram_numeric( count, 50): array<struct<x:double,y:double>>]

In [13]:
val newHist = Array([[1.3836935684628464, 550335.0], [16.50406504065044, 4059.0], [28.252234359483612, 1007.0], [37.101832993890014, 491.0], [46.77844311377246, 334.0], [56.30232558139532, 172.0], [64.57333333333332, 150.0], [74.26363636363635, 110.0], [82.6438356164384, 73.0], [91.74324324324326, 74.0], [101.29999999999998, 40.0], [109.3260869565217, 46.0], [120.0, 29.0], [133.10526315789474, 38.0], [148.8157894736842, 38.0], [162.09090909090907, 11.0], [174.80769230769232, 26.0], [190.0, 10.0], [204.39999999999998, 5.0], [218.66666666666666, 12.0], [229.63636363636365, 11.0], [239.60000000000002, 5.0], [248.77777777777777, 9.0], [257.2, 5.0], [266.66666666666663, 3.0], [284.8, 5.0], [294.3333333333333, 3.0], [306.0, 3.0], [332.0, 1.0], [341.4, 5.0], [355.3333333333333, 3.0], [371.0, 1.0], [386.0, 2.0], [401.0, 3.0], [419.0, 3.0], [429.0, 4.0], [440.0, 2.0], [460.5, 2.0], [477.5, 2.0], [490.0, 1.0], [501.0, 1.0], [513.0, 1.0], [602.0, 1.0], [628.0, 1.0], [649.0, 1.0], [883.5, 2.0], [977.0, 1.0], [1044.0, 1.0], [1312.0, 1.0], [2778.0, 1.0]])
newHist

Error: illegal start of simple expression (20)Error: ')' expected but eof found. (1065)

In [14]:
hist.flatMap(x => x.toS)

Error: value toS is not a member of org.apache.spark.sql.Row (18)

In [15]:
df.limit(10)

[text_tokens: array<string>, hashtags: array<string> ... 22 more fields]