In [15]:
# Import all necessary libraries and setup the environment for matplotlib
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, col, concat_ws, lit
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
import numpy as np

import json
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec
from numpy import dot
from numpy.linalg import norm

from pyspark.ml.recommendation import ALS

In [16]:
spark = SparkSession \
    .builder \
    .appName("lsah8006_assignment2_local_with_caching_less_jobs") \
    .getOrCreate()

#sc = spark.sparkContext

In [17]:
datafile = "tweets.json"

with open(datafile) as f:
    data = spark.read.option("multiline", "true").json(datafile).cache()

### Workload1

In [18]:
#data1 = data.select(data.user_id, concat_ws(" ",data.replyto_id, data.retweet_id).alias("reply_retweet"))\
#    .where("replyto_id IS NOT NULL OR retweet_id IS NOT NULL")
data1 = data.select(["user_id","replyto_id","retweet_id"]).where("replyto_id IS NOT NULL OR retweet_id IS NOT NULL")

In [19]:
#convert to RDD to (k,v) format
data1 = data1.rdd.map(lambda a:(a[0],(a[1], a[2])))

In [20]:
#Combine all ids from values except 'none'
def combineIds(values):
    values = list(values)
    clean = []
    for i in values:
        for j in i:
            if j != None:
                clean.append(str(j))
    return clean

In [21]:
#Combine all ids from value for each user
data1 = data1.groupByKey().mapValues(combineIds)

In [22]:
#Convert to spark dataframe
documentDF = spark.createDataFrame(data1).cache()

In [23]:
# Define cosine similarity function
def cosine(a,b):
    return dot(a, b)/(norm(a)*norm(b))

### Implement word2vec

In [24]:
word2Vec = Word2Vec(vectorSize=5, seed=9, minCount=2, inputCol="_2", outputCol="result")
model = word2Vec.fit(documentDF)

In [25]:
data_w2v = model.transform(documentDF)

Using ID "55199013" and show its top 5 similar users

In [26]:
feature = data_w2v.filter("_1=55199013").select("result").rdd.collect()[0][0]
#Remove this user to avoid calculating cosine similarity with itself
#Only retaining the columns needed for cosine similarity calculation
data_w2v = data_w2v.filter("_1!=55199013").select("_1", "result").rdd

In [27]:
# Computing cosine similarity between input user's features and the entire dataset
cos_sim = data_w2v.map(lambda x: (x[0], cosine(x[1],feature)))

In [28]:
# Sorting above.
cos_sim_sorted = cos_sim.sortBy(lambda x: x[1], ascending=False)

In [29]:
cos_sim_sorted.take(5)

[(1368336521602990081, 0.9222373122166656),
 (1302012787716222977, 0.8900282934912264),
 (132170371, 0.8183430839445668),
 (2391614580, 0.7935510614383173),
 (1155452351668318208, 0.7935510614383173)]

### Implement tf-idf

In [30]:
hashingTF = HashingTF(inputCol="_2", outputCol="hash_tf", numFeatures=10)
data_tf = hashingTF.transform(documentDF)

In [31]:
# Calculating IDF
idf = IDF(inputCol="hash_tf", outputCol="result")
idf_model = idf.fit(data_tf)
data_idf = idf_model.transform(data_tf)

In [32]:
feature = data_idf.filter("_1=55199013").select("result").rdd.collect()[0][0]
feature

SparseVector(10, {4: 4.1022, 5: 3.2912})

In [33]:
data_tfidf = data_idf.filter("_1!=55199013").select("_1", "result").rdd
cos_sim = data_tfidf.map(lambda x: (x[0], cosine(feature, x[1])))

In [34]:
# Sorting above.
cos_sim_sorted = cos_sim.sortBy(lambda x: x[1], ascending=False)
cos_sim_sorted.collect()[:5]

[(1164464791085342720, 1.0),
 (1303352004232830976, 1.0),
 (196110680, 1.0),
 (1078659401907621893, 0.869233156717764),
 (1338975633800536065, 0.8689424535712936)]

In [35]:
# Sorting above.
cos_sim_sorted = cos_sim.sortBy(lambda x: x[1], ascending=False)

In [36]:
cos_sim_sorted.take(5)

[(1164464791085342720, 1.0),
 (1303352004232830976, 1.0),
 (196110680, 1.0),
 (1078659401907621893, 0.869233156717764),
 (1338975633800536065, 0.8689424535712936)]

### Workload2

In [37]:
#Project only relevant columns(user id and mention Ids), converted mentions to string for easy separation of values
data2 = data.select(data.user_id, concat_ws(",",data.user_mentions.id)\
                    .alias("user_mention")).where("user_mentions IS NOT NULL")
#print(data2.count())
#data2.show(20)

In [38]:
#Convert short & fat dataframe to tall & skinny structure for calculating number of (user, mention) combinations
data2 = data2.withColumn("mentions", explode(split(data2["user_mention"], ","))).drop("user_mention")
#data2.show(20)

In [39]:
#Convert dataframe to rdd for map-reduce operations
data2_rdd = data2.rdd.map(lambda row: ((row[0], row[1]), 1))
#data2_rdd.take(15)

In [40]:
#Combine all (user, mention) combinations and count them
data2_rdd = data2_rdd.reduceByKey(lambda a, b: a+b).map(lambda a: (a[0][0], a[0][1], a[1]))
#data2_rdd.take(5)

In [41]:
#list all users for creating index, user dictionary
users = data2_rdd.map(lambda a:a[0]).collect()
#list all mention users
mentions = data2_rdd.map(lambda a:a[1]).collect()

In [42]:
#Combine users and mentions to create a common user base
users = users + mentions
#Remove duplicate user entries which may also be present in mentions
users = set(users)
len(users)

9116

In [43]:
#create indices for all users and put them in a dictionary
user_ind = dict()
ind = 0
for user in users:
    user_ind[ind] = user
    ind += 1
    
#list(user_ind.items())[:5]

In [44]:
#Functions to retrieve users from indices and vice-versa
def user2index(user_id):
    return list(user_ind.keys())[list(user_ind.values()).index(user_id)]

def index2user(index_id):
    return user_ind[index_id]

In [45]:
#Replace userIds with indices before passing to ALS algorithm
data2_rdd = data2_rdd.map(lambda row: ((user2index(row[0])), (user2index(row[1])), row[2]))
#data2_rdd.take(5)

In [46]:
#Convert RDD to DataFrame
schema = StructType([
    StructField("uid", IntegerType(), True),
    StructField("mention_id", IntegerType(), True),
    StructField("count", IntegerType(), True)])

data2 = spark.createDataFrame(data2_rdd, schema).cache()

In [59]:
#Run the ALS reccomendation
als = ALS(maxIter=10, regParam=0.01, userCol="uid", itemCol="mention_id", implicitPrefs=True, ratingCol="count",coldStartStrategy="drop")
model = als.fit(data2)

In [65]:
reco = model.recommendForAllUsers(5)

In [69]:
reco.show(5)

+----+--------------------+
| uid|     recommendations|
+----+--------------------+
|1580|[{4589, 0.2792199...|
|4900|[{4275, 0.9620833...|
|5300|[{6998, 0.9591287...|
|6620|[{6998, 0.9591287...|
|7240|[{4589, 0.0120965...|
+----+--------------------+
only showing top 5 rows



In [129]:
#Collect top 5 mention indices and convert to userIds
def rec_mention(line):
    mentions = []
    user, recommended_mention_indices = line
    user = index2user(user)
    for i in range(len(recommended_mention_indices)):
        mention_index = recommended_mention_indices[i][0]
        mention = index2user(mention_index)
        mentions.append(mention)
    return (user, mentions)

In [131]:
reco.rdd.map(lambda row: rec_mention(row)).take(5)

[(2453476904, ['428333', '18831926', '191807697', '759251', '380648579']),
 (365773760, ['807095', '360019454', '21802625', '20402945', '428333']),
 (227395882,
  ['26574283', '996693014251585536', '13850422', '185025785', '1004633989']),
 (768291338752135168,
  ['26574283', '996693014251585536', '13850422', '185025785', '1004633989']),
 (712795211131256833,
  ['428333', '18831926', '191807697', '380648579', '759251'])]