In [42]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

import os
import json
import time
import string
from collections import Counter
from pyspark import SparkContext, SparkConf
import re
import math
import itertools

In [43]:
data_dir = "/Volumes/oli2/inf533_datasets"

In [46]:
config = SparkConf().setMaster("local[*]") \
                    .setAppName("Task2") \
                    .set("spark.executor.memory", "4g") \
                    .set("spark.driver.memory", "4g")
sc = SparkContext(conf=config).getOrCreate().setLogLevel("ERROR")

In [5]:
def loadStopWords(file_path):
    data = None
    with open(file_path, "rb") as file:
        data = file.read()
    return data.decode("utf-8").split()

In [6]:
stopwords = loadStopWords(os.path.join(data_dir, "stopwords"))

In [7]:
lines = sc.textFile(os.path.join(data_dir, "train_review.json"))

In [8]:
user_dict = lines.map(json.loads) \
                 .map(lambda x: x["user_id"]) \
                 .distinct() \
                 .zipWithIndex() \
                 .collectAsMap()

In [9]:
business_dict = lines.map(json.loads) \
                     .map(lambda x: x["business_id"]) \
                     .distinct() \
                     .zipWithIndex() \
                     .collectAsMap()

In [10]:
rdd = lines.map(json.loads) \
           .map(lambda x: (user_dict[x["user_id"]], business_dict[x["business_id"]], x["text"])) \
           .cache()

In [11]:
def filter_words(text):
    words = text.translate(str.maketrans('', '', string.punctuation))
    words = words.split()
    
    pattern = "[a-zA-Z]+"
    filtered_words = []
    for word in words:
        word = word.lower()
        if re.match(pattern, word) and word not in stopwords and len(word) > 1:
            filtered_words.append(word)
    return filtered_words

In [12]:
business_text = rdd.map(lambda x: (x[1], filter_words(x[2]))) \
                   .reduceByKey(lambda x, y: x + y, 7) \
                   .cache()

In [13]:
def compute_tf(doc):
    word_dict = Counter(doc)
    n_count = len(doc)
    tf = [(word, count / float(n_count)) for word, count in word_dict.items()]
    return tf

In [14]:
business_text_tf = business_text.mapValues(compute_tf).cache()

In [15]:
num_doc = len(business_dict)

In [16]:
start = time.time()
business_text_df = business_text.flatMap(lambda x: [(word, x[0]) for word in x[1]]) \
                                .groupByKey(7, lambda x: hash(x) % 7) \
                                .mapValues(lambda x: math.log(num_doc/ len(set(x)))) \
                                .collectAsMap()
print("Time ", time.time() - start)

Time  51.871434926986694


In [20]:
def compute_tfidf(tf_list, idf_dict):
    tfidf = {}
    for pair in tf_list:
        tfidf[pair[0]] = pair[1] * idf_dict[pair[0]]
    tfidf_sorted = [word for word, _ in sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)]
    return tfidf_sorted[:200]

In [21]:
business_tfidf = business_text_tf.mapValues(lambda x: compute_tfidf(x, business_text_df))

In [26]:
top_word_tokens = business_tfidf.flatMap(lambda x: x[1]) \
                               .distinct() \
                               .zipWithIndex() \
                               .collectAsMap()

In [31]:
business_profile = business_tfidf.mapValues(lambda x: [top_word_tokens[word] for word in x]).cache()

In [32]:
business_profile_dict = business_profile.collectAsMap()

In [33]:
user_profile = rdd.map(lambda x: (x[0], x[1])) \
                  .mapValues(lambda x: business_profile_dict[x]) \
                  .reduceByKey(lambda x, y: list(set(x)) + list(set(y))) \
                  .filter(lambda x: len(x[1]) > 1)

In [35]:
len(user_profile.take(1)[0][1])

2428

In [39]:
def convert_to_model(profile, model_type):
    model = [{"model": model_type, "id": k, "profile": v} for k, v in profile.items()]
    return model

In [40]:
model = convert_to_model(business_profile_dict, "business_profile")

In [41]:
model[:10]

[{'model': 'business_profile',
  'id': 0,
  'profile': [0,
   86744,
   29072,
   115261,
   115262,
   57817,
   173414,
   144413,
   57818,
   86745,
   57819,
   115263,
   29073,
   57820,
   29074,
   173415,
   1,
   144414,
   2,
   115264,
   86746,
   144415,
   144416,
   57821,
   144417,
   57822,
   86747,
   115265,
   57823,
   29075,
   173416,
   173417,
   173418,
   57824,
   29076,
   3,
   173419,
   29077,
   57825,
   4,
   57826,
   86748,
   5,
   86749,
   144418,
   173420,
   115266,
   173421,
   173422,
   29078,
   144419,
   57827,
   6,
   7,
   8,
   9,
   173423,
   57828,
   57829,
   86750,
   173424,
   115267,
   173425,
   29079,
   173426,
   57830,
   57831,
   86751,
   10,
   57832,
   144420,
   173427,
   144421,
   29080,
   57833,
   11,
   86752,
   57834,
   86753,
   29081,
   57835,
   173428,
   173429,
   86754,
   12,
   144422,
   115268,
   29082,
   86755,
   29083,
   57836,
   173430,
   29084,
   173431,
   115269,
   86756,

In [45]:
sc.stop()