# TDT4305 - Project 2

In [1]:
from base64 import b64decode
import re

In [2]:
k = 10
NORMALIZE_SCORES = False

REDUCE_DATASET = False
DATASET_FRACTION = 0.1

## Load data

In [3]:
reviews = sc.textFile('../data/yelp_top_reviewers_with_reviews.csv') \
    .zipWithIndex() \
    .filter(lambda x: x[1] > 0) \
    .map(lambda x: x[0].replace('"', '').split('\t')) \
    .map(lambda row: tuple(b64decode(row[i]).decode('utf8') if i == 3 else row[i] for i in range(len(row))))
# "review_id","user_id","business_id","review_text","review_date"

afinn = sc.textFile('../data/AFINN-111.txt') \
    .map(lambda x: x.split('\t')) \
    .map(lambda row: (row[0], int(row[1])))

In [4]:
reviews.first()

('-lFvxYOmAuZMOelAs0dwgw',
 '---1lKK3aKOuomHnwAkAow',
 'XJGMgs9Kh4kcgf8Oskiewg',
 'I cant believe I have not Yelped this yet.  I think I only Yelp when I am hungry and then I only think food.  \n\nTheatre 7 is AMAZING!  Derek is one of the nicest guys in the community.  I Facebooked him and asked if I can use the space for TEDxYouth and he said yes.  He is very passionate about the independent film scene and the local Las Vegas Community.  You too can rent out the space for cool film events. \n\n@Misti we should have a YELP First Friday party here.  It would be really awesome! \n\nThey also did a Save the Huntridge build on a First Friday.\n\nDid I mention PolyGrind, zombies, blood gore, film festival right here in Vegas.  They take Red Carpet to a whole other level.  \n\nIt is also an art gallery space, improve class space, film networking space, and all around amazing place to be. \n\nSo if you are sick of watching romantic comedies at the megaplex grab a seat at Theatre 7.',
 '13470

In [5]:
afinn.first()

('abandon', -2)

I run a quick test to see that there are no duplicates in the AFINN set:

In [6]:
assert afinn.count() == afinn.map(lambda row: row[0]).distinct().count()

### Reduce data set (optional)
Since the calculations include a quite massive join, I include an option to reduce the dataset, so that the functionality can be tested more efficiently:

In [7]:
%%time

if REDUCE_DATASET:
    count = reviews.count()
    
    reviews = reviews.zipWithIndex() \
        .filter(lambda row: row[1] < count * DATASET_FRACTION) \
        .map(lambda row: row[0])

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.29 µs


## Tokenize review texts

In [8]:
def tokenize(review):
    return re.sub('[^a-zA-Z ]+', '', review).lower().split()

In [9]:
tokenized = reviews.map(lambda row: ((row[0], row[1], row[2]), row[3])) \
    .flatMapValues(tokenize)

## Calculate polarity

I start out by joining each word with its polarity score. This is a really heavy operation, so it will take a long time to run, but it is necessary. Note that I do a left outer join, keeping all words in each review, but setting a value of 0 for words not contained in AFINN. This is so that I later count the total number of words in each review, allowing for normalization.

In [15]:
if NORMALIZE_SCORES:
    word_polarity = tokenized \
        .map(lambda row: (row[1], row[0])) \
        .leftOuterJoin(afinn) \
        .map(lambda row: row[1]) \
        .map(lambda row: (row[0], 0 if row[1] is None else row[1]))
else:
    word_polarity = tokenized \
        .map(lambda row: (row[1], row[0])) \
        .join(afinn) \
        .map(lambda row: row[1]) \
        .map(lambda row: (row[0], 0 if row[1] is None else row[1]))

Now that I have the polarity of each word, I can sum to find the polarity of the review. I simultaneously count the total number of words, and divide by this in the end to find a normalized score. This is in order to avoid long reviews dominating.

In [11]:
if NORMALIZE_SCORES:
    review_polarity = word_polarity.map(lambda row: (row[0], (row[1], 1))) \
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
        .map(lambda row: (row[0], row[1][0] / row[1][1]))
else:
        review_polarity = word_polarity \
            .reduceByKey(lambda x, y: (x + y)

## Get most extreme reviews
Looking at the highest ranking reviews is a great sanity check to see how well our system works.

In [None]:
top_review_ids = review_polarity.sortBy(lambda row: row[1], ascending=False).map(lambda row: row[0][0]).take(10)

_ = list(map(lambda x: print(x[0] + 1, x[1], '\n\n'), 
         enumerate(reviews.filter(lambda row: row[0] in top_review_ids).map(lambda row: row[3]).collect())))

In [None]:
bottom_review_ids = review_polarity.sortBy(lambda row: row[1]).map(lambda row: row[0][0]).take(10)

_ = list(map(lambda x: print(x[0] + 1, x[1], '\n\n'), 
         enumerate(reviews.filter(lambda row: row[0] in bottom_review_ids).map(lambda row: row[3]).collect())))

## Rank businesses

In [13]:
%%time
business_ranking = review_polarity.map(lambda row: (row[0][2], row[1])) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda row: row[1], ascending=False) \
    .zipWithIndex().filter(lambda row: row[1] < k).map(lambda row: row[0])

CPU times: user 59.4 ms, sys: 19.7 ms, total: 79.1 ms
Wall time: 4min 7s


In [14]:
%%time
business_ranking.collect()

CPU times: user 9.36 ms, sys: 0 ns, total: 9.36 ms
Wall time: 227 ms


[('4JNXUYY8wbaaDmk3BPzlWw', 7306),
 ('RESDUcs7fIiihp38-d6_6g', 6461),
 ('igHYkXZMLAc9UdV5VnR_AA', 5784),
 ('k1QpHAkzKTrFYfk6u--VgQ', 5195),
 ('A5Rkh7UymKm0_Rxm9K2PJw', 5191),
 ('5LNZ67Yw9RD6nf4_UhXOjw', 5118),
 ('IMLrj2klosTFvPRLv56cng', 4975),
 ('PVTfzxu7of57zo1jZwEzkg', 4827),
 ('z6-reuC5BYf_Rth9gMBfgQ', 4816),
 ('7sPNbCx7vGAaH7SbNPZ6oA', 4813)]

## Save results

In [None]:
%%time
business_ranking.zipWithIndex().filter(lambda row: row[1] < k)