In [84]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")
import os
import json
from pyspark import SparkContext, SparkConf
import datetime
import string
import re

In [85]:
# Root folder that holds the notebook's input files. Set the REVIEW_DATA_DIR
# environment variable to point at another copy of the data; when it is not
# set, the original location is used unchanged.
data_dir = os.environ.get("REVIEW_DATA_DIR", "/Volumes/oli2/inf533_datasets")

In [87]:
# Run Spark in local mode ("local[*]") under the application name "Task1".
conf = SparkConf().setMaster("local[*]").setAppName("Task1")
# getOrCreate is called on the class so that re-running this cell reuses the
# context that is already active. Constructing SparkContext(conf=conf) a
# second time in the same kernel raises ValueError, because only one
# SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

In [88]:
def loadStopWords(path=None):
    """Read the stop-word file and return its whole content as one string.

    Parameters
    ----------
    path : str, optional
        File to read. When omitted, the file named "stopwords" inside the
        notebook-level ``data_dir`` is used (the original behaviour).

    Returns
    -------
    str
        The file's bytes decoded as UTF-8; line endings are left untouched.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    if path is None:
        path = os.path.join(data_dir, "stopwords")
    # Binary read plus explicit decode keeps the result independent of the
    # platform's default text encoding and of newline translation.
    with open(path, "rb") as file:
        return file.read().decode("utf-8")

In [89]:
# Split the stop-word text on runs of whitespace into a list of words.
stopwords = loadStopWords().split()

In [90]:
# Sanity check: show which container type holds the stop words.
print(type(stopwords))

<class 'list'>


In [91]:
# Visual check of the parsed stop words: one entry per output line.
for stop_word in stopwords:
    print(stop_word)

i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now


In [92]:
def mapJsonObj(jsonObj, stop_words=None):
    """Turn one parsed review record into the tuple used by the aggregations.

    Parameters
    ----------
    jsonObj : dict
        Parsed review. Must provide the keys "date" (formatted as
        "%Y-%m-%d %H:%M:%S"), "text", "review_id" and "user_id".
    stop_words : iterable of str, optional
        Lower-case words to leave out of the word counts. When omitted, the
        notebook-level ``stopwords`` collection is used.

    Returns
    -------
    tuple
        ``((year, 1), (review_id, 1), (user_id, 1), word_counts)`` where
        ``word_counts`` is a list of ``(word, count)`` pairs in dict iteration
        order (order of first appearance on Python 3.7+).

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    ValueError
        If "date" does not match the expected format.
    """
    year = datetime.datetime.strptime(jsonObj["date"], "%Y-%m-%d %H:%M:%S").year

    # Hash-based membership test: checking every token against a list costs a
    # full scan of the stop words per token, a frozenset lookup does not.
    excluded = frozenset(stopwords if stop_words is None else stop_words)

    # Remove ASCII punctuation first, then tokenise on whitespace.
    stripped = jsonObj["text"].translate(str.maketrans('', '', string.punctuation))

    # Compiled once per record instead of being looked up for every token.
    # NOTE(review): match() only anchors at the start of the token, so a token
    # such as "abc123" is kept while "123abc" is dropped. Confirm that a
    # whole-token check (fullmatch) is not what was intended.
    starts_with_letters = re.compile("[a-zA-Z]+").match

    words_dict = {}
    for token in stripped.split():
        word = token.lower()
        if starts_with_letters(word) and word not in excluded:
            words_dict[word] = words_dict.get(word, 0) + 1

    return (
        (year, 1),
        (jsonObj["review_id"], 1),
        (jsonObj["user_id"], 1),
        list(words_dict.items()),
    )
            

In [93]:
# Open review.json from the data folder as an RDD of text lines.
lines = sc.textFile(os.path.join(data_dir, "review.json"))

In [94]:
# Parse every line as JSON, convert it with mapJsonObj, and cache the result
# because the cells below run several independent aggregations over it.
rdd = lines.map(json.loads).map(mapJsonObj).cache()

In [95]:
# Pull one mapped record to the driver to inspect its structure.
vals = rdd.take(1)
for sample in vals:
    print(sample)

((2017, 1), ('-I5umRTkhw15RqpKMl_o1Q', 1), ('-mA3-1mN4JIEkqOtdbNXCQ', 1), [('walked', 4), ('around', 1), ('friday', 1), ('afternoon', 1), ('sat', 1), ('table', 1), ('bar', 2), ('min', 1), ('dont', 1), ('even', 1), ('think', 1), ('realized', 1), ('however', 1), ('everyone', 1), ('noticed', 1), ('service', 1), ('non', 1), ('existent', 1), ('best', 1), ('good', 1), ('way', 1), ('new', 1), ('business', 1), ('start', 1), ('oh', 1), ('well', 1), ('location', 1), ('different', 1), ('things', 1), ('past', 1), ('several', 1), ('years', 1), ('added', 1), ('list', 1), ('smdh', 1)])


In [96]:
# Total number of mapped review records.
total_reviews = rdd.count()
print(total_reviews)

1151625


In [99]:
# Number of reviews dated 2017: keep the records whose (year, 1) pair has
# year 2017, project that pair, and add up the ones per year.
reviews_y = (
    rdd.filter(lambda record: record[0][0] == 2017)
       .map(lambda record: record[0])
       .reduceByKey(lambda left, right: left + right)
)

In [100]:
# Bring the aggregated (year, count) pairs to the driver and show them.
print(reviews_y.collect())

[(2017, 209995)]


In [101]:
# Number of distinct users: element 2 of each record is a (user_id, 1) pair,
# so its keys are the user ids.
unique_users = (
    rdd.map(lambda record: record[2])
       .keys()
       .distinct()
       .count()
)
print(unique_users)

566269


In [102]:
# Reviews per user, largest count first; the ten most active users are shown.
user_reviews = (
    rdd.map(lambda record: record[2])
       .reduceByKey(lambda left, right: left + right)
       .sortBy(lambda pair: pair[1], ascending=False)
)
print(user_reviews.take(10))

[('CxDOIDnH8gp9KXzpBHJYXw', 715), ('bLbSNkLggFnqwNNzzq-Ijw', 424), ('PKEzKWv_FktMm2mGPjwd0Q', 322), ('DK57YibC5ShBmqQl97CKog', 291), ('ELcQDlf69kb-ihJfxZyL0A', 288), ('U4INQZOPSUaj8hMjLlZ3KA', 276), ('QJI9OSEn6ujRCtrX06vs1w', 258), ('d_TBs6J3twMy9GChqUEXkg', 253), ('hWDybu_KvYLSdEFzGrniTw', 239), ('dIIKEfOgo0KqUfGQvGikPg', 216)]


In [104]:
# Corpus-wide word frequencies: element 3 of each record is that review's
# list of (word, count) pairs. Flatten the lists, sum the counts per word,
# and sort by descending total before showing the ten most frequent words.
top_words = (
    rdd.flatMap(lambda record: record[3])
       .reduceByKey(lambda left, right: left + right)
       .sortBy(lambda pair: pair[1], ascending=False)
)
print(top_words.take(10))

[('food', 576209), ('place', 560373), ('good', 560111), ('great', 498172), ('service', 417863), ('like', 405253), ('time', 398821), ('get', 385094), ('one', 377153), ('would', 353381)]
