In [1]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

import os
import json
import time
import string
from collections import Counter
from pyspark import SparkContext, SparkConf
import re
import math
import itertools

In [2]:
data_dir = "/Volumes/oli2/inf533_datasets"

In [3]:
config = SparkConf().setMaster("local[*]") \
                    .setAppName("Task2Opt") \
                    .set("spark.executor.memory", "4g") \
                    .set("spark.driver.memory", "4g")
sc = SparkContext(conf=config).getOrCreate()

In [4]:
def loadStopWords(file_path):
    data = None
    with open(file_path, "rb") as file:
        data = file.read()
    return data.decode("utf-8").split()

In [5]:
stopwords = loadStopWords(os.path.join(data_dir, "stopwords"))

In [6]:
lines = sc.textFile(os.path.join(data_dir, "train_review.json")).map(json.loads).cache()

In [7]:
user_dict = lines.map(lambda x: x["user_id"]).distinct().zipWithIndex().collectAsMap()

In [8]:
business_dict = lines.map(lambda x: x["business_id"]).distinct().zipWithIndex().collectAsMap()

In [9]:
def tokens_to_model(tokens, tokens_type):
    model = []
    for k, v in tokens.items():
        model.append({"description": tokens_type, "id": k, "token": v})
    return model

In [10]:
model = tokens_to_model(business_dict, "business_tokens")

In [11]:
model += tokens_to_model(user_dict, "user_tokens")

In [12]:
model[:10]

[{'description': 'business_tokens',
  'id': 'bZMcorDrciRbjdjRyANcjA',
  'token': 0},
 {'description': 'business_tokens',
  'id': 'n8Zqqhff-2cxzWt_nwhU2Q',
  'token': 1},
 {'description': 'business_tokens',
  'id': '1Df5WnLX3DqN6ymlhqznaQ',
  'token': 2},
 {'description': 'business_tokens',
  'id': 'VfFHPsPtTW4Mgx0eHDyJiQ',
  'token': 3},
 {'description': 'business_tokens',
  'id': 'bPcqucuuClxYrIM8xWoArg',
  'token': 4},
 {'description': 'business_tokens',
  'id': '6Z6IyosSMciZtwk8hRLRag',
  'token': 5},
 {'description': 'business_tokens',
  'id': '1xieLFUt_lgTUuGRabiMpQ',
  'token': 6},
 {'description': 'business_tokens',
  'id': 'If6Bku2jkgPiikR6HBu-XQ',
  'token': 7},
 {'description': 'business_tokens',
  'id': 'wqFkAsxYPA5tcdSkYMtrrw',
  'token': 8},
 {'description': 'business_tokens',
  'id': 'aEGV0W2i8HRoDFsRrMz-BA',
  'token': 9}]

In [13]:
def filter_words(text):
    words = text.translate(str.maketrans('', '', string.punctuation))
    words = words.split()
    
    pattern = "[a-zA-Z]+"
    filtered_words = []
    for word in words:
        word = word.lower()
        if re.match(pattern, word) and word not in stopwords and len(word) > 3:
            filtered_words.append(word)
    return filtered_words

In [14]:
def compute_tf(doc, business_id):
    word_dict = Counter(doc)
    n_count = len(doc)
    threshold = 3
    
    tf = []
    for word, count in word_dict.items():
        if count > threshold:
            tf.append(((business_id, word), count / float(n_count)))
    if len(tf) == 0:
        for word, count in word_dict.items():
            tf.append(((business_id, word), count / float(n_count)))
    return tf

In [15]:
business_text_tf = lines.map(lambda x: (business_dict[x["business_id"]], filter_words(x["text"]))) \
                        .reduceByKey(lambda x, y: x + y, 7) \
                        .flatMap(lambda x: compute_tf(x[1], x[0])).cache()

In [16]:
num_doc = len(business_dict)

In [17]:
business_text_idf = business_text_tf.map(lambda x: (x[0][1], x[0][0])) \
                                    .groupByKey() \
                                    .mapValues(lambda x: math.log(num_doc/ len(set(x)))).collectAsMap()

In [18]:
print(len(business_text_idf))

22873


In [19]:
def get_top_words(tfidf):
    tfidf = list(tfidf)
    tfidf.sort(key=lambda x: x[1])
    return [pair[0] for pair in tfidf[:200]]

In [20]:
business_tfidf = business_text_tf.map(lambda x: (x[0][0], (x[0][1], x[1] * business_text_idf[x[0][1]]))) \
                                 .groupByKey() \
                                 .mapValues(get_top_words).cache()

In [21]:
word_tokens = business_tfidf.flatMap(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()

In [22]:
word_tokens

{'place': 0,
 'always': 1,
 'money': 2,
 'manager': 3,
 'call': 4,
 'rude': 5,
 'customer': 6,
 'package': 7,
 'monthly': 8,
 'tanning': 9,
 'sure': 10,
 'work': 11,
 'nice': 12,
 'last': 13,
 'maybe': 14,
 'without': 15,
 'give': 16,
 'trying': 17,
 'couldnt': 18,
 'second': 19,
 'away': 20,
 'still': 21,
 'check': 22,
 'cold': 23,
 'minutes': 24,
 'week': 25,
 'head': 26,
 'thank': 27,
 'condition': 28,
 'hansen': 29,
 'doctors': 30,
 'throat': 31,
 'medicine': 32,
 'lungs': 33,
 'little': 34,
 'buying': 35,
 'pricing': 36,
 'store': 37,
 'movies': 38,
 'absolutely': 39,
 'water': 40,
 'white': 41,
 'however': 42,
 'mean': 43,
 'extremely': 44,
 'garlic': 45,
 'seem': 46,
 'anyone': 47,
 'tell': 48,
 'kitchen': 49,
 'smaller': 50,
 'roasted': 51,
 'coffee': 52,
 'okay': 53,
 'large': 54,
 'different': 55,
 'reasonable': 56,
 'offered': 57,
 'says': 58,
 'pepper': 59,
 'dining': 60,
 'table': 61,
 'delivery': 62,
 'perfectly': 63,
 'feta': 64,
 'owner': 65,
 'behind': 66,
 'beef': 67,

In [23]:
start = time.time()
business_profile = business_tfidf.mapValues(lambda x: [word_tokens[word] for word in x]) \
                                 .collectAsMap()
print("Time ", time.time() - start)

Time  0.18369293212890625


In [27]:
user_profile = lines.map(lambda x: (user_dict[x["user_id"]], business_profile.get(business_dict[x["business_id"]]))) \
                                                                             .filter(lambda x: x[1] != None and len(x[1]) > 0) \
                                                                             .reduceByKey(lambda x, y: list(set(x)) + list(set(y))).collect()

In [30]:
print(len(user_profile))

26184


In [31]:
print(len(business_profile))

10253
