In [1]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")
import os
import json
from pyspark import SparkContext, SparkConf
from datetime import datetime
import string
import re

In [2]:
# Directory holding the input files used below ("stopwords", "review.json").
# Set the INF533_DATA_DIR environment variable to point at another location;
# the original hard-coded path remains the default so existing setups keep working.
data_dir = os.environ.get("INF533_DATA_DIR", "/Volumes/oli2/inf533_datasets")

In [3]:
# Configuration for a local-mode run ("local[*]") registered under the app name "Task1".
conf = SparkConf().setMaster("local[*]").setAppName("Task1")
# getOrCreate is a classmethod: calling it on the class reuses an already-running
# context instead of constructing a new one first, so this cell can be re-executed
# without raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
def loadStopWords():
    """Read the "stopwords" file under ``data_dir`` and return its text.

    The file is read as raw bytes and decoded as UTF-8; the caller is
    responsible for splitting the returned string into individual words.

    Returns:
        str: the full decoded contents of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the contents are not valid UTF-8.
    """
    stopwords_path = os.path.join(data_dir, "stopwords")
    with open(stopwords_path, "rb") as handle:
        raw_bytes = handle.read()
    return raw_bytes.decode("utf-8")

In [19]:
def mapJsonObj(jsonObj):
    """Turn one parsed review record into a pair of countable (key, 1) tuples.

    Args:
        jsonObj: mapping with a "date" string formatted as
            "%Y-%m-%d %H:%M:%S" and a "user_id" entry.

    Returns:
        tuple: ``((year, 1), (user_id, 1))`` where ``year`` is the integer
        year parsed from "date".

    Raises:
        KeyError: if "date" or "user_id" is missing.
        ValueError: if "date" does not match the expected format.
    """
    timestamp = datetime.strptime(jsonObj["date"], "%Y-%m-%d %H:%M:%S")
    year_pair = (timestamp.year, 1)
    user_pair = (jsonObj["user_id"], 1)
    return (year_pair, user_pair)

In [20]:
# RDD of raw text lines from review.json; each line is parsed with json.loads
# further down when the per-review RDD is built.
lines = sc.textFile(os.path.join(data_dir, "review.json"))

In [5]:
import time

In [22]:
# Parse every line as JSON, reduce it to ((year, 1), (user_id, 1)), then
# hash-partition on the first element (the (year, 1) key) into 10 partitions
# and cache the result so the later jobs can reuse it.
rdd = (
    lines.map(json.loads)
    .map(mapJsonObj)
    .partitionBy(10, lambda key: hash(key))
    .cache()
)

In [23]:
# Count the reviews written in 2017: keep only the (year, 1) half of each
# record, retain the 2017 entries, and sum the ones per year.
reviews_y = (
    rdd.keys()
    .filter(lambda year_pair: year_pair[0] == 2017)
    .reduceByKey(lambda left, right: left + right)
)

In [24]:
# Reviews per user: take the (user_id, 1) half of each record, sum the ones
# per user_id, and order the totals from largest to smallest.
user_reviews = (
    rdd.values()
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda user_count: user_count[1], ascending=False)
)

In [25]:
# Time three actions back to back: the total record count, the number of
# distinct user ids, and the first ten entries of the sorted per-user totals.
start = time.time()
total_reviews = rdd.count()
# values() yields the (user_id, 1) pairs; keys() then extracts the user_id.
unique_users = rdd.values().keys().distinct().count()
top_users = user_reviews.take(10)
end = time.time()
print("Time ", end - start)

Time  2.391263008117676


In [26]:
# Display the ten (user_id, review_count) pairs taken from user_reviews.
top_users

[('CxDOIDnH8gp9KXzpBHJYXw', 715),
 ('bLbSNkLggFnqwNNzzq-Ijw', 424),
 ('PKEzKWv_FktMm2mGPjwd0Q', 322),
 ('DK57YibC5ShBmqQl97CKog', 291),
 ('ELcQDlf69kb-ihJfxZyL0A', 288),
 ('U4INQZOPSUaj8hMjLlZ3KA', 276),
 ('QJI9OSEn6ujRCtrX06vs1w', 258),
 ('d_TBs6J3twMy9GChqUEXkg', 253),
 ('hWDybu_KvYLSdEFzGrniTw', 239),
 ('dIIKEfOgo0KqUfGQvGikPg', 216)]

In [28]:
# Check the partition count of the review RDD (10 were requested via partitionBy).
rdd.getNumPartitions()

10

In [29]:
# Drop the cached review RDD; the word-count section below builds its own RDD.
rdd.unpersist()

MapPartitionsRDD[5] at mapPartitions at PythonRDD.scala:133

In [6]:
# Whitespace-separated stop words from the "stopwords" file, stored as a set.
# filterWords tests `word not in stopwords` once per token, so hashing
# membership avoids a linear scan of a list for every word in the corpus.
stopwords = set(loadStopWords().split())

In [7]:
# RDD of raw text lines from review.json for the word-frequency section
# (the same file that `lines` reads above).
text_lines = sc.textFile(os.path.join(data_dir, "review.json"))

In [8]:
def filterPunctuation(jsonObj):
    """Split a review's text into whitespace-separated tokens without punctuation.

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space) before splitting, so "It's" becomes "Its" and "a-b" becomes "ab".

    Args:
        jsonObj: mapping with a "text" string.

    Returns:
        list[str]: the tokens in their original order and case.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return jsonObj["text"].translate(strip_table).split()

In [9]:
def filterWords(word):
    """Decide whether a token should be counted.

    A token is kept when it begins with at least one ASCII letter and is not
    present in the module-level ``stopwords`` collection. ``re.match`` only
    anchors at the start of the string, so a token such as "abc123" is kept.

    Args:
        word: the token to test.

    Returns:
        bool: True to keep the token, False to drop it.
    """
    starts_with_letter = re.match("[a-zA-Z]+", word) is not None
    return starts_with_letter and word not in stopwords

In [10]:
# Tokenise every review: parse the JSON line, strip punctuation and split into
# words, lower-case each word into a (word, 1) pair, drop stop words and
# tokens that do not start with a letter, then cache the surviving pairs.
text_rdd = (
    text_lines.map(json.loads)
    .flatMap(filterPunctuation)
    .map(lambda token: (token.lower(), 1))
    .filter(lambda pair: filterWords(pair[0]))
    .cache()
)

In [11]:
# Sum the per-word ones and order the totals from most to least frequent.
# NOTE(review): the `take` that collects the results runs in a later cell, so
# the interval printed here likely excludes most of the computation - confirm
# before quoting this number as the job's run time.
start = time.time()
top_words = (
    text_rdd.reduceByKey(lambda left, right: left + right)
    .sortBy(lambda word_count: word_count[1], ascending=False)
)
end = time.time()
print("Time ", end - start)

In [12]:
# Collect the first ten (word, count) pairs of the descending-sorted totals.
results = top_words.take(10)

Time  0.5254449844360352


In [13]:
# Display the ten most frequent remaining words with their counts.
results

[('food', 576209),
 ('place', 560373),
 ('good', 560111),
 ('great', 498172),
 ('service', 417863),
 ('like', 405253),
 ('time', 398821),
 ('get', 385094),
 ('one', 377153),
 ('would', 353381)]