In [1]:
# Import all necessary libraries and setup the environment for matplotlib
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
import numpy as np
import matplotlib.pyplot as plt
import json
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec

In [2]:
spark = SparkSession \
    .builder \
    .appName("Python Spark Machine Learning Assignment") \
    .getOrCreate()

In [3]:
datafile = "tweets.json"
num_test_samples = 1000

with open(datafile) as f:
    data = spark.read.option("multiline", "true").json(datafile)

In [4]:
data.show(5)

+-------------------+--------------------+-------------------+----------+---------------+-------------------+---------------+--------------------+-------------------+--------------------+
|         created_at|           hash_tags|                 id|replyto_id|replyto_user_id|         retweet_id|retweet_user_id|                text|            user_id|       user_mentions|
+-------------------+--------------------+-------------------+----------+---------------+-------------------+---------------+--------------------+-------------------+--------------------+
|2021-05-05 23:37:51|                null|1390088382659895296|      null|           null|1390027514332991489|         807095|RT @nytimes: Brea...|           17799542| [{807095, [3, 11]}]|
|2021-05-05 23:37:45|[{[9, 18], BREAKI...|1390088354717474822|      null|           null|1390022155350446082|      380648579|RT @AFP: #BREAKIN...|         1166466828|[{380648579, [3, ...|
|2021-05-05 23:37:42|                null|139008834528864666

### Workload1

In [5]:
data1 = data.select(["user_id","replyto_id","retweet_id"])

In [6]:
data2 = data1.rdd.map(tuple)

In [7]:
data2.take(5)
#.groupByKey()\

[(17799542, None, 1390027514332991489),
 (1166466828, None, 1390022155350446082),
 (1343606436149022723, None, 1390050885229817856),
 (930226031276982273, None, 1390066365046865929),
 (920858307392192513, None, 1390027514332991489)]

In [8]:
data3 = data2.map(lambda a:(a[0],(a[1], a[2])))

In [9]:
data3.collect()

[(17799542, (None, 1390027514332991489)),
 (1166466828, (None, 1390022155350446082)),
 (1343606436149022723, (None, 1390050885229817856)),
 (930226031276982273, (None, 1390066365046865929)),
 (920858307392192513, (None, 1390027514332991489)),
 (21458110, (None, 1390025466539614212)),
 (787062740183552000, (None, 1390023742194061312)),
 (392646132, (None, None)),
 (2955789098, (None, 1390027514332991489)),
 (792380204287164416, (None, None)),
 (198453947, (None, 1390027514332991489)),
 (1431726547, (None, 1390066365046865929)),
 (1245145031045980163, (None, 1390023742194061312)),
 (2181244875, (None, 1390039923588206598)),
 (34865264, (None, 1390023742194061312)),
 (179912903, (None, 1390087644235902979)),
 (1173096863840423941, (None, 1390069325353033729)),
 (491594719, (None, None)),
 (40404318, (None, 1390071789208936452)),
 (1326851827879604226, (None, 1390084013864460296)),
 (5567892, (None, 1390023742194061312)),
 (55199013, (1390086925780934662, None)),
 (2289225258, (None, 13899

In [10]:
data3.reduceByKey(lambda a,b:(a,b)).collect()

[(17799542, (None, 1390027514332991489)),
 (1166466828, (None, 1390022155350446082)),
 (1343606436149022723, (None, 1390050885229817856)),
 (930226031276982273, (None, 1390066365046865929)),
 (920858307392192513, (None, 1390027514332991489)),
 (21458110, (None, 1390025466539614212)),
 (787062740183552000, (None, 1390023742194061312)),
 (392646132, (None, None)),
 (2955789098, (None, 1390027514332991489)),
 (792380204287164416, (None, None)),
 (198453947, (None, 1390027514332991489)),
 (1431726547, (None, 1390066365046865929)),
 (1245145031045980163, (None, 1390023742194061312)),
 (2181244875, (None, 1390039923588206598)),
 (34865264, (None, 1390023742194061312)),
 (179912903, (None, 1390087644235902979)),
 (1173096863840423941, (None, 1390069325353033729)),
 (491594719, (None, None)),
 (40404318, (None, 1390071789208936452)),
 (1326851827879604226, (None, 1390084013864460296)),
 (5567892, (None, 1390023742194061312)),
 (55199013, ((1390086925780934662, None), (1390032857846980608, None

In [11]:
from itertools import chain

def combineIds(values):
    values = list(values)
    clean = []
    for i in values:
        for j in i:
            if j != None:
                clean.append(str(j))
    return clean

In [12]:
data4 = data3.groupByKey().mapValues(combineIds).collect()

In [13]:
data4[:25]

[(17799542, ['1390027514332991489']),
 (1166466828, ['1390022155350446082']),
 (1343606436149022723, ['1390050885229817856']),
 (930226031276982273, ['1390066365046865929']),
 (920858307392192513, ['1390027514332991489']),
 (21458110, ['1390025466539614212']),
 (787062740183552000, ['1390023742194061312']),
 (392646132, []),
 (2955789098, ['1390027514332991489']),
 (792380204287164416, []),
 (198453947, ['1390027514332991489']),
 (1431726547, ['1390066365046865929']),
 (1245145031045980163, ['1390023742194061312']),
 (2181244875, ['1390039923588206598']),
 (34865264, ['1390023742194061312']),
 (179912903, ['1390087644235902979']),
 (1173096863840423941, ['1390069325353033729']),
 (491594719, []),
 (40404318, ['1390071789208936452']),
 (1326851827879604226, ['1390084013864460296']),
 (5567892, ['1390023742194061312']),
 (55199013, ['1390086925780934662', '1390032857846980608']),
 (2289225258, ['1389978947723546625']),
 (49613619, ['1390087448093503494']),
 (787859316392927232, ['1390023

In [14]:
schema =  StructType([
    StructField("uid", IntegerType(), True),
    StructField("tweets", StringType(), True)])

In [15]:
documentDF = spark.createDataFrame(data4)
#documentDF = data4.toDF()

In [16]:
documentDF.show()

+-------------------+--------------------+
|                 _1|                  _2|
+-------------------+--------------------+
|           17799542|[1390027514332991...|
|         1166466828|[1390022155350446...|
|1343606436149022723|[1390050885229817...|
| 930226031276982273|[1390066365046865...|
| 920858307392192513|[1390027514332991...|
|           21458110|[1390025466539614...|
| 787062740183552000|[1390023742194061...|
|          392646132|                  []|
|         2955789098|[1390027514332991...|
| 792380204287164416|                  []|
|          198453947|[1390027514332991...|
|         1431726547|[1390066365046865...|
|1245145031045980163|[1390023742194061...|
|         2181244875|[1390039923588206...|
|           34865264|[1390023742194061...|
|          179912903|[1390087644235902...|
|1173096863840423941|[1390069325353033...|
|          491594719|                  []|
|           40404318|[1390071789208936...|
|1326851827879604226|[1390084013864460...|
+----------

In [1]:
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="_2", outputCol="result")
model = word2Vec.fit(documentDF)

NameError: name 'Word2Vec' is not defined

In [18]:
result = model.transform(documentDF)
result.take(25)

[Row(_1=17799542, _2=['1390027514332991489'], result=DenseVector([1.4269, 1.4158, -0.2943])),
 Row(_1=1166466828, _2=['1390022155350446082'], result=DenseVector([0.2972, 0.0778, 0.054])),
 Row(_1=1343606436149022723, _2=['1390050885229817856'], result=DenseVector([-0.0964, 0.0917, 0.1609])),
 Row(_1=930226031276982273, _2=['1390066365046865929'], result=DenseVector([0.083, 0.149, -0.0041])),
 Row(_1=920858307392192513, _2=['1390027514332991489'], result=DenseVector([1.4269, 1.4158, -0.2943])),
 Row(_1=21458110, _2=['1390025466539614212'], result=DenseVector([0.3296, -0.0487, 0.6734])),
 Row(_1=787062740183552000, _2=['1390023742194061312'], result=DenseVector([0.3226, 0.0398, 0.0567])),
 Row(_1=392646132, _2=[], result=SparseVector(3, {})),
 Row(_1=2955789098, _2=['1390027514332991489'], result=DenseVector([1.4269, 1.4158, -0.2943])),
 Row(_1=792380204287164416, _2=[], result=SparseVector(3, {})),
 Row(_1=198453947, _2=['1390027514332991489'], result=DenseVector([1.4269, 1.4158, -0.294

In [19]:
vector_rep = dict()
for row in result.collect():
    text, _, vector = row
    vector_rep[text] = vector.values
vector_rep

{17799542: array([ 1.42692864,  1.41583669, -0.29434925]),
 1166466828: array([0.29721645, 0.07784763, 0.05403569]),
 1343606436149022723: array([-0.09635407,  0.09167962,  0.16087177]),
 930226031276982273: array([ 0.08295848,  0.14897694, -0.00406564]),
 920858307392192513: array([ 1.42692864,  1.41583669, -0.29434925]),
 21458110: array([ 0.32959867, -0.04871361,  0.67342329]),
 787062740183552000: array([0.32263699, 0.03976471, 0.05674132]),
 392646132: array([], dtype=float64),
 2955789098: array([ 1.42692864,  1.41583669, -0.29434925]),
 792380204287164416: array([], dtype=float64),
 198453947: array([ 1.42692864,  1.41583669, -0.29434925]),
 1431726547: array([ 0.08295848,  0.14897694, -0.00406564]),
 1245145031045980163: array([0.32263699, 0.03976471, 0.05674132]),
 2181244875: array([-0.05508367,  0.13104974, -0.03690213]),
 34865264: array([0.32263699, 0.03976471, 0.05674132]),
 179912903: array([ 0.11153234, -0.02442307,  0.12417023]),
 1173096863840423941: array([ 0.1079457

In [20]:
from numpy import dot
from numpy.linalg import norm

def cosine(a,b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [28]:
vector_rep

{17799542: array([ 1.42692864,  1.41583669, -0.29434925]),
 1166466828: array([0.29721645, 0.07784763, 0.05403569]),
 1343606436149022723: array([-0.09635407,  0.09167962,  0.16087177]),
 930226031276982273: array([ 0.08295848,  0.14897694, -0.00406564]),
 920858307392192513: array([ 1.42692864,  1.41583669, -0.29434925]),
 21458110: array([ 0.32959867, -0.04871361,  0.67342329]),
 787062740183552000: array([0.32263699, 0.03976471, 0.05674132]),
 392646132: array([], dtype=float64),
 2955789098: array([ 1.42692864,  1.41583669, -0.29434925]),
 792380204287164416: array([], dtype=float64),
 198453947: array([ 1.42692864,  1.41583669, -0.29434925]),
 1431726547: array([ 0.08295848,  0.14897694, -0.00406564]),
 1245145031045980163: array([0.32263699, 0.03976471, 0.05674132]),
 2181244875: array([-0.05508367,  0.13104974, -0.03690213]),
 34865264: array([0.32263699, 0.03976471, 0.05674132]),
 179912903: array([ 0.11153234, -0.02442307,  0.12417023]),
 1173096863840423941: array([ 0.1079457

In [36]:
vector_rep[55199013]

array([ 0.02121737, -0.05411746,  0.04283899])

Using ID "55199013"

In [40]:
cosine_sim = dict()
for row in vector_rep:
    if vector_rep[row].size > 0 and row != 55199013:
        cosine_sim[row] = cosine(vector_rep[55199013], vector_rep[row])

In [41]:
len(cosine_sim)

8222

In [54]:
cosine_sim_sorted = dict(sorted(cosine_sim.items(), key=lambda item: item[1], reverse=True)[:5])

In [55]:
cosine_sim_sorted

{898843662: 0.9947563278994694,
 14088332: 0.9915351452642759,
 23312234: 0.9907319186946104,
 3387727233: 0.9906133742714612,
 18399014: 0.9891317508157441}

### Workload2