In [1]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

import os
import json
import time
import string
from collections import Counter
from pyspark import SparkContext, SparkConf
import re
import math
import itertools

In [2]:
# Directory holding the input files read by main() ("stopwords" and
# "train_review.json"). Set the INF533_DATA_DIR environment variable to point
# at another location instead of editing this cell; the default is unchanged.
data_dir = os.environ.get("INF533_DATA_DIR", "/Volumes/oli2/inf533_datasets")

In [3]:
def loadStopWords(file_path):
    """Read a UTF-8 encoded stop-word file and return its tokens.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: The file content split on any run of whitespace, in file
        order. An empty file yields an empty list.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    # Decode once the handle is closed; split() with no argument drops blank
    # lines and surrounding whitespace.
    return raw_bytes.decode("utf-8").split()

In [4]:
def filter_words(text, stopwords):
    """Tokenize a review text and keep only the informative words.

    ASCII punctuation is deleted (not replaced by a space, so "don't" becomes
    "dont"), the text is split on whitespace and every token is lower-cased.
    A token is kept when it begins with an ASCII letter, is not contained in
    `stopwords`, and is longer than three characters.

    NOTE(review): the regex is applied with `match`, which anchors only at the
    start of the token, so a token such as "abc123" is kept.

    Args:
        text: Raw text to tokenize.
        stopwords: Container of lower-case words to drop; only membership
            (`in`) is used.

    Returns:
        list[str]: Surviving tokens in their original order, duplicates kept.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    begins_with_letter = re.compile("[a-zA-Z]+").match
    lowered = (raw.lower() for raw in without_punct.split())
    return [
        token
        for token in lowered
        if begins_with_letter(token) and token not in stopwords and len(token) > 3
    ]

In [5]:
def compute_tf(doc, business_id, threshold=3):
    """Compute term frequencies for one business document.

    The term frequency of a word is its number of occurrences divided by the
    total number of tokens in `doc`. Only words occurring more than
    `threshold` times are reported; when no word clears the threshold, every
    word is reported so that the business still gets a non-empty result.

    Args:
        doc: List of tokens belonging to the business.
        business_id: Identifier placed in the key of every emitted pair.
        threshold: Minimum occurrence count (exclusive) for a word to be
            kept. Defaults to 3, the value previously hard-coded.

    Returns:
        list[tuple[tuple, float]]: `((business_id, word), tf)` pairs. Empty
        when `doc` is empty.
    """
    word_dict = Counter(doc)
    n_count = float(len(doc))

    def to_tf(pairs):
        # Never divides when `doc` is empty because `pairs` is then empty too.
        return [((business_id, word), count / n_count) for word, count in pairs]

    frequent = [(word, count) for word, count in word_dict.items() if count > threshold]
    # Fall back to the full vocabulary when the threshold filters everything.
    return to_tf(frequent if frequent else word_dict.items())

In [6]:
def get_top_words(tfidf, top_n=200):
    """Return the words with the highest TF-IDF scores.

    The previous implementation sorted in ascending order and sliced the
    front of the list, which selected the *lowest* scoring words. Scores are
    now ranked from highest to lowest before the cut-off is applied.

    Args:
        tfidf: Iterable of `(word, score)` pairs; it is consumed once.
        top_n: Maximum number of words to return. Defaults to 200.

    Returns:
        list: Up to `top_n` words ordered by descending score. Pairs with
        equal scores keep their input order.
    """
    ranked = sorted(tfidf, key=lambda pair: pair[1], reverse=True)
    return [pair[0] for pair in ranked[:top_n]]

In [7]:
def convert_to_model(profile, model_type):
    """Turn a profile mapping into a list of model records.

    Args:
        profile: Mapping from an identifier to its profile value.
        model_type: Label stored under "description" in every record.

    Returns:
        list[dict]: One dict per entry of `profile`, in the mapping's
        iteration order, with the keys "description", "id" and "profile".
    """
    records = []
    for entity_id, features in profile.items():
        records.append({"description": model_type, "id": entity_id, "profile": features})
    return records

In [8]:
def main(output_path="output.json"):
    """Build business and user profiles from the reviews and write them out.

    Steps:
      1. Load the stop words and the reviews found under `data_dir`.
      2. Concatenate the filtered review words of each business and compute
         term frequencies with `compute_tf`.
      3. Compute the inverse document frequency of each word as
         log(number of businesses / number of businesses containing the word).
      4. Keep the words selected by `get_top_words` for each business and
         replace each word by an integer index.
      5. Build each user's profile as the union of the profiles of the
         businesses that user reviewed.
      6. Write one JSON object per line to `output_path` and print the
         elapsed wall-clock time.

    The SparkContext is obtained with `SparkContext.getOrCreate` and is not
    stopped here; it stays active after the function returns.

    Args:
        output_path: Destination file for the model. Defaults to
            "output.json".
    """
    start = time.time()
    config = SparkConf().setMaster("local[*]") \
                        .setAppName("Task2") \
                        .set("spark.executor.memory", "4g") \
                        .set("spark.driver.memory", "4g")

    # getOrCreate is a classmethod. Constructing SparkContext(conf=...) first
    # raises when a context is already active, e.g. when this cell is re-run.
    sc = SparkContext.getOrCreate(conf=config)

    # filter_words only tests membership, so a frozenset gives O(1) lookups.
    stopwords = frozenset(loadStopWords(os.path.join(data_dir, "stopwords")))

    lines = sc.textFile(os.path.join(data_dir, "train_review.json")) \
              .map(json.loads).cache()

    # ((business_id, word), tf) for every retained word of every business.
    business_text_tf = lines.map(lambda x: (x["business_id"], filter_words(x["text"], stopwords))) \
                            .reduceByKey(lambda x, y: x + y, 7) \
                            .flatMap(lambda x: compute_tf(x[1], x[0])) \
                            .cache()
    num_doc = lines.map(lambda x: x["business_id"]).distinct().count()

    # word -> idf, collected to the driver and captured by the closure below.
    business_text_idf = business_text_tf.map(lambda x: (x[0][1], x[0][0])) \
                                        .groupByKey() \
                                        .mapValues(lambda x: math.log(num_doc / len(set(x)))) \
                                        .collectAsMap()

    # business_id -> list of selected words.
    business_tfidf = business_text_tf.map(lambda x: (x[0][0], (x[0][1], x[1] * business_text_idf[x[0][1]]))) \
                                     .groupByKey() \
                                     .mapValues(get_top_words) \
                                     .cache()
    # word -> integer index, so profiles are stored as lists of ints.
    word_tokens = business_tfidf.flatMap(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()

    business_profile = business_tfidf.mapValues(lambda x: [word_tokens[word] for word in x]).collectAsMap()

    # Union the profiles of all businesses a user reviewed. A real set union
    # is required: concatenating the two de-duplicated sides would keep every
    # index that the two sides have in common twice.
    user_profile = lines.map(lambda x: (x["user_id"], business_profile.get(x["business_id"]))) \
                        .filter(lambda x: x[1] is not None and len(x[1]) > 0) \
                        .reduceByKey(lambda x, y: list(set(x) | set(y))) \
                        .collectAsMap()

    model = convert_to_model(business_profile, "business_profile")
    model += convert_to_model(user_profile, "user_profile")

    # The with-statement closes the file; each record goes on its own line.
    with open(output_path, "w") as file:
        for line in model:
            file.write(json.dumps(line) + "\n")
    print("Duration ", time.time() - start)

In [9]:
# Run the whole pipeline: main() writes the model to output.json and prints
# the elapsed time.
main()

Duration  139.01959323883057


In [32]:
# `sc` is a local variable of main() and does not exist at notebook scope, so
# `sc.stop()` raises NameError on a fresh kernel. Fetch the active context
# through the class instead (if none is active, one is created and then
# stopped immediately).
SparkContext.getOrCreate().stop()