In [535]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

import os
import json
import time
import string
from collections import Counter
from pyspark import SparkContext, SparkConf
import re
import math
import itertools

In [536]:
data_dir = "/Volumes/oli2/inf533_datasets"

In [537]:
config = SparkConf().setMaster("local[*]") \
                    .setAppName("Task2Opt") \
                    .set("spark.executor.memory", "4g") \
                    .set("spark.driver.memory", "4g")
sc = SparkContext(conf=config).getOrCreate()

In [538]:
def loadStopWords(file_path):
    data = None
    with open(file_path, "rb") as file:
        data = file.read()
    return data.decode("utf-8").split()

In [539]:
stopwords = loadStopWords(os.path.join(data_dir, "stopwords"))

In [540]:
lines = sc.textFile(os.path.join(data_dir, "train_review.json")).map(json.loads).cache()

In [541]:
user_dict = lines.map(lambda x: x["user_id"]).distinct().zipWithIndex().collectAsMap()

In [542]:
business_dict = lines.map(lambda x: x["business_id"]).distinct().zipWithIndex().collectAsMap()

In [543]:
def tokens_to_model(tokens, tokens_type):
    model = []
    for k, v in tokens.items():
        model.append({"description": tokens_type, "id": k, "token": v})
    return model

In [544]:
model = tokens_to_model(business_dict, "business_tokens")

In [545]:
model += tokens_to_model(user_dict, "user_tokens")

In [546]:
model[:10]

[{'description': 'business_tokens',
  'id': 'bZMcorDrciRbjdjRyANcjA',
  'token': 0},
 {'description': 'business_tokens',
  'id': 'n8Zqqhff-2cxzWt_nwhU2Q',
  'token': 1},
 {'description': 'business_tokens',
  'id': '1Df5WnLX3DqN6ymlhqznaQ',
  'token': 2},
 {'description': 'business_tokens',
  'id': 'VfFHPsPtTW4Mgx0eHDyJiQ',
  'token': 3},
 {'description': 'business_tokens',
  'id': 'bPcqucuuClxYrIM8xWoArg',
  'token': 4},
 {'description': 'business_tokens',
  'id': '6Z6IyosSMciZtwk8hRLRag',
  'token': 5},
 {'description': 'business_tokens',
  'id': '1xieLFUt_lgTUuGRabiMpQ',
  'token': 6},
 {'description': 'business_tokens',
  'id': 'If6Bku2jkgPiikR6HBu-XQ',
  'token': 7},
 {'description': 'business_tokens',
  'id': 'wqFkAsxYPA5tcdSkYMtrrw',
  'token': 8},
 {'description': 'business_tokens',
  'id': 'aEGV0W2i8HRoDFsRrMz-BA',
  'token': 9}]

In [547]:
def filter_words(text):
    words = text.translate(str.maketrans('', '', string.punctuation))
    words = words.split()
    
    pattern = "[a-zA-Z]+"
    filtered_words = []
    for word in words:
        word = word.lower()
        if re.match(pattern, word) and word not in stopwords and len(word) > 3:
            filtered_words.append(word)
    return filtered_words

In [548]:
def compute_tf(doc, business_id):
    word_dict = Counter(doc)
    n_count = len(doc)
    
    tf = []
    threshold = 3 if n_count > 10 else 0
    for word, count in word_dict.items():
        if count > threshold:
            tf.append(((business_id, word), count / float(n_count)))
    return tf

In [549]:
business_text_tf = lines.map(lambda x: (business_dict[x["business_id"]], filter_words(x["text"]))) \
                        .reduceByKey(lambda x, y: x + y, 7) \
                        .flatMap(lambda x: compute_tf(x[1], x[0])).cache()

In [550]:
num_doc = len(business_dict)

In [551]:
business_text_idf = business_text_tf.map(lambda x: (x[0][1], x[0][0])) \
                                    .groupByKey() \
                                    .mapValues(lambda x: math.log(num_doc/ len(set(x)))).collectAsMap()

In [509]:
print(len(business_text_idf))

22469


In [510]:
def get_top_words(tfidf):
    tfidf = list(tfidf)
    tfidf.sort(key=lambda x: x[1])
    return [pair[0] for pair in tfidf[:200]]

In [511]:
business_tfidf = business_text_tf.map(lambda x: (x[0][0], (x[0][1], x[1] * business_text_idf[x[0][1]]))) \
                                 .groupByKey() \
                                 .mapValues(get_top_words).cache()

In [512]:
word_tokens = business_tfidf.flatMap(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()

In [513]:
word_tokens

{'always': 0,
 'money': 1,
 'manager': 2,
 'call': 3,
 'customer': 4,
 'monthly': 5,
 'water': 6,
 'however': 7,
 'mean': 8,
 'garlic': 9,
 'work': 10,
 'head': 11,
 'tell': 12,
 'anyone': 13,
 'trying': 14,
 'couldnt': 15,
 'last': 16,
 'minutes': 17,
 'large': 18,
 'give': 19,
 'week': 20,
 'sure': 21,
 'nice': 22,
 'different': 23,
 'offered': 24,
 'says': 25,
 'delivery': 26,
 'perfectly': 27,
 'feta': 28,
 'boys': 29,
 'basketball': 30,
 'jump': 31,
 'beef': 32,
 'full': 33,
 'style': 34,
 'maybe': 35,
 'finish': 36,
 'fairly': 37,
 'seating': 38,
 'simple': 39,
 'pass': 40,
 'store': 41,
 'combination': 42,
 'mixed': 43,
 'fine': 44,
 'past': 45,
 'casino': 46,
 'comfortable': 47,
 'walk': 48,
 'talking': 49,
 'hate': 50,
 'picked': 51,
 'vegetarian': 52,
 'instead': 53,
 'check': 54,
 'already': 55,
 'away': 56,
 'cold': 57,
 'truly': 58,
 'forgot': 59,
 'eyes': 60,
 'ticket': 61,
 'buying': 62,
 'towards': 63,
 'compare': 64,
 'intimate': 65,
 'round': 66,
 'wine': 67,
 'dress'

In [514]:
start = time.time()
business_profile = business_tfidf.mapValues(lambda x: [word_tokens[word] for word in x]) \
                                 .collectAsMap()
print("Time ", time.time() - start)

Time  0.1863088607788086


In [464]:
user_profile = lines.map(lambda x: (user_dict[x["user_id"]], business_profile.get(business_dict[x["business_id"]]))) \
                                                                             .filter(lambda x: x[1] != None and len(x[1]) > 0)

In [515]:
print(len(business_profile))
print(len(business_dict))

10127
10253


In [534]:
sc.stop()