https://ieeexplore.ieee.org/document/7072954
TDIDF cosine similarity

https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630
Calculating document similarities

https://edstem.org/courses/5492/discussion/469433
Usually would be using feature extraction on tweets themselves, but for sake of simplicity we are doing it on the document represention instead.

https://edstem.org/courses/5492/discussion/471511
NB: USE STRING REPRESENTATIONS OF THE DOCUMENT REPRESENTATION; (539, 47, 4) == "539 47 4"



In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("mdon9995 Assignment 2") \
    .getOrCreate()

sc = spark.sparkContext

file_path = "tweets.json"
tweets_data_raw = spark.read.option("multiline", "true").json(file_path)

In [2]:
tweets_data_raw.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- hash_tags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- id: long (nullable = true)
 |-- replyto_id: long (nullable = true)
 |-- replyto_user_id: long (nullable = true)
 |-- retweet_id: long (nullable = true)
 |-- retweet_user_id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_mentions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)



In [3]:
tweets_data_raw.createOrReplaceTempView("tweets")

In [81]:
df = spark.sql("""
                    SELECT user_id, replyto_id, retweet_id
                    FROM tweets
                    WHERE replyto_id IS NOT NULL
                    OR retweet_id IS NOT NULL
                    """)
df.show(1000,False)
df.count()

+-------------------+-------------------+-------------------+
|user_id            |replyto_id         |retweet_id         |
+-------------------+-------------------+-------------------+
|17799542           |null               |1390027514332991489|
|1166466828         |null               |1390022155350446082|
|1343606436149022723|null               |1390050885229817856|
|930226031276982273 |null               |1390066365046865929|
|920858307392192513 |null               |1390027514332991489|
|21458110           |null               |1390025466539614212|
|787062740183552000 |null               |1390023742194061312|
|2955789098         |null               |1390027514332991489|
|198453947          |null               |1390027514332991489|
|1431726547         |null               |1390066365046865929|
|1245145031045980163|null               |1390023742194061312|
|2181244875         |null               |1390039923588206598|
|34865264           |null               |1390023742194061312|
|1799129

8833

In [137]:
# # Concatenating reply and retweet ids per entry. This also conveniently removes the null values.
# df_concat = df.select(
#                     'user_id', f.concat_ws(", ", "replyto_id", "retweet_id")\
#                     .alias("doc_rep"))


# # Grouping by user_id, again concatenating across entries
# df_group = df_concat.groupby("user_id")\
#             .agg(f.concat_ws(", ", f.collect_list(df_concat.doc_rep))\
#             .alias("doc_rep"))
# df_group.show()

+----------+--------------------+
|   user_id|             doc_rep|
+----------+--------------------+
|  15466159| 1390027514332991489|
|  19652471| 1390023742194061312|
|  30616018| 1390026843068239874|
|  32947971| 1390027514332991489|
|  43301934| 1390022155350446082|
|  51797430| 1390023742194061312|
|  54579715| 1390027514332991489|
| 106456868| 1390060755995799552|
| 109826391| 1390027514332991489|
| 111249239| 1390023742194061312|
| 147126487| 1390043577703563265|
| 157101980| 1390056156756914182|
| 202170318|13900713418947502...|
| 228531805| 1390027514332991489|
| 257121078| 1390023742194061312|
| 302864870| 1390050885229817856|
| 417365324|13900561567569141...|
| 528706065| 1390067358534864897|
| 569489725| 1390023742194061312|
|1567435968| 1390022155350446082|
+----------+--------------------+
only showing top 20 rows



In [30]:
from pyspark.sql import functions as f

# Retrieve user, reply and retweet ids per entry.
# By using concat_ws we also remove the NULLS that occur.
df = spark.sql("""
                    SELECT user_id, CONCAT_WS(" ", replyto_id, retweet_id) AS reply_retweet
                    FROM tweets
                    WHERE replyto_id IS NOT NULL
                    OR retweet_id IS NOT NULL
                    """)
df.show(20, False)
print("Extracted {} entries from json file.".format(df.count()))
print("\n")

# Grouping by user_id, aggregate by concatenating across entries, with final column name of doc_rep
df_group = df.groupby("user_id")\
            .agg(f.concat_ws(" ", f.collect_list(df.reply_retweet))\
            .alias("doc_rep"))
df_group.show(20, False)
print("Grouped into {} entries.".format(df_group.count()))

+-------------------+-------------------+
|user_id            |reply_retweet      |
+-------------------+-------------------+
|17799542           |1390027514332991489|
|1166466828         |1390022155350446082|
|1343606436149022723|1390050885229817856|
|930226031276982273 |1390066365046865929|
|920858307392192513 |1390027514332991489|
|21458110           |1390025466539614212|
|787062740183552000 |1390023742194061312|
|2955789098         |1390027514332991489|
|198453947          |1390027514332991489|
|1431726547         |1390066365046865929|
|1245145031045980163|1390023742194061312|
|2181244875         |1390039923588206598|
|34865264           |1390023742194061312|
|179912903          |1390087644235902979|
|1173096863840423941|1390069325353033729|
|40404318           |1390071789208936452|
|1326851827879604226|1390084013864460296|
|5567892            |1390023742194061312|
|55199013           |1390086925780934662|
|2289225258         |1389978947723546625|
+-------------------+-------------

In [17]:
#TODO for q1: implement 2 feature extractors to perform cosine similarity.
import random

random.seed(430113983)
user_id = random.choice(df_group.select("user_id").collect())[0]
print(user_id)



174233697


In [125]:
sent = ("a b " * 100 + "a c " * 10).split(" ")
doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])

In [138]:
df_group.collect()

[Row(user_id=15466159, doc_rep='1390027514332991489'),
 Row(user_id=19652471, doc_rep='1390023742194061312'),
 Row(user_id=30616018, doc_rep='1390026843068239874'),
 Row(user_id=32947971, doc_rep='1390027514332991489'),
 Row(user_id=43301934, doc_rep='1390022155350446082'),
 Row(user_id=51797430, doc_rep='1390023742194061312'),
 Row(user_id=54579715, doc_rep='1390027514332991489'),
 Row(user_id=106456868, doc_rep='1390060755995799552'),
 Row(user_id=109826391, doc_rep='1390027514332991489'),
 Row(user_id=111249239, doc_rep='1390023742194061312'),
 Row(user_id=147126487, doc_rep='1390043577703563265'),
 Row(user_id=157101980, doc_rep='1390056156756914182'),
 Row(user_id=202170318, doc_rep='1390071341894750211, 1390047008971444231, 1390022155350446082, 1390062246034698240, 1390068042474917888'),
 Row(user_id=228531805, doc_rep='1390027514332991489'),
 Row(user_id=257121078, doc_rep='1390023742194061312'),
 Row(user_id=302864870, doc_rep='1390050885229817856'),
 Row(user_id=417365324, doc

In [126]:
doc.collect()

[Row(sentence=['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 

In [82]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer

tokenizer = Tokenizer(inputCol="doc_rep", outputCol="id")
data_id = tokenizer.transform(df_group)

numFeatures = 20

hashingTF = HashingTF(inputCol="id", outputCol="rawFeatures", numFeatures=numFeatures)
data_featurized = hashingTF.transform(data_id)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(data_featurized)
data_rescaled = idf_model.transform(data_featurized)

data_rescaled.select("user_id", "features").show(10, False)

+---------+------------------------------+
|user_id  |features                      |
+---------+------------------------------+
|15466159 |(20,[16],[1.133629785467845]) |
|19652471 |(20,[1],[1.311352939827772])  |
|30616018 |(20,[16],[1.133629785467845]) |
|32947971 |(20,[16],[1.133629785467845]) |
|43301934 |(20,[18],[2.32521271851598])  |
|51797430 |(20,[1],[1.311352939827772])  |
|54579715 |(20,[16],[1.133629785467845]) |
|106456868|(20,[19],[3.8790135506446846])|
|109826391|(20,[16],[1.133629785467845]) |
|111249239|(20,[1],[1.311352939827772])  |
+---------+------------------------------+
only showing top 10 rows



In [78]:
# # Taking L2 norm for matrix product
# normalizer = Normalizer(inputCol="features", outputCol="norm")
# data_norm = normalizer.transform(data_rescaled)
# data_norm.select("user_id", "norm").show(100,False)

# from pyspark.mllib.linalg.distributed import IndexedRowMatrix
# mat = IndexedRowMatrix(
#         data_norm.select("user_id", "norm")\
#         .rdd.map(lambda row: IndexedRow(row.user_id, row.norm.toArray()))).toBlockMatrix()
# input_user = top_5_interest(user_id, data_norm).collect()[0]
# # dot = mat.multiply(input_user)
# # dot.toLocalMatrix().toArray()
# # https://stackoverflow.com/questions/46758768/calculating-the-cosine-similarity-between-all-the-rows-of-a-dataframe-in-pyspark

+-------------------+-----------------------------------------------------------------------------------------------+
|user_id            |norm                                                                                           |
+-------------------+-----------------------------------------------------------------------------------------------+
|15466159           |(20,[16],[1.0])                                                                                |
|19652471           |(20,[1],[1.0])                                                                                 |
|30616018           |(20,[16],[1.0])                                                                                |
|32947971           |(20,[16],[1.0])                                                                                |
|43301934           |(20,[18],[1.0])                                                                                |
|51797430           |(20,[1],[1.0])                     

IllegalArgumentException: requirement failed: Number of rows divided by rowsPerBlock cannot exceed maximum integer.

In [112]:
#For a particular user, we calculate the cosine similarity with all other users, then return top 5 users with similar interest.

def top_5_interest(user_id, data):
    user_features = data.filter(data.user_id == user_id).select("features")
    return user_features

def cos_mul(x, y):
    return 1 - x.dot(y)/(x.norm(2)*y.norm(2))

def cos_mul_mapping(values):
    vlist = values.collect()
    feature = vlist[0]
    print(feature)
#     return vlist

data_rescaled.select("features").rdd.map(lambda vector: DenseVector(vector.toArray())).mapValues(cos_mul_mapping).collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 887.0 failed 1 times, most recent failure: Lost task 8.0 in stage 887.0 (TID 58746) (1b2e1bee6859 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-112-978e6444f111>", line 16, in <lambda>
NameError: name 'DenseVector' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-112-978e6444f111>", line 16, in <lambda>
NameError: name 'DenseVector' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [117]:
from pyspark.ml.linalg import DenseVector
data_rescaled.select("features").collect()

[Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {1: 1.3114})),
 Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {18: 2.3252})),
 Row(features=SparseVector(20, {1: 1.3114})),
 Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {19: 3.879})),
 Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {1: 1.3114})),
 Row(features=SparseVector(20, {15: 4.3801})),
 Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {7: 3.8908, 9: 2.9793, 16: 2.2673, 18: 2.3252})),
 Row(features=SparseVector(20, {16: 1.1336})),
 Row(features=SparseVector(20, {1: 1.3114})),
 Row(features=SparseVector(20, {8: 3.1483})),
 Row(features=SparseVector(20, {8: 3.1483, 16: 1.1336})),
 Row(features=SparseVector(20, {2: 3.6076})),
 Row(features=SparseVector(20, {1: 1.3114})),
 Row(features=SparseVector(20, {18: 2.3252})),
 Row(features=SparseVec

In [99]:
for i in range(100):
    print(cos_mul(data_rescaled.select("features").collect()[i][0], top_5_interest(user_id, data_rescaled).collect()[0][0]))


1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
0.24349033453573143
0.7995450774621465
1.0
1.0
1.0
1.0
0.0
