# TDT4305 Project 1 - RDD Tasks

In [82]:
from datetime import datetime

## Loading data files

In [67]:
# Review records, one list of string fields per line. Columns:
#   "review_id","user_id","business_id","review_text","review_date"
# The first line (index 0, the header) is dropped, every double quote is
# stripped, and the remaining text is split on tabs.
reviews = (
    sc.textFile('data/yelp_top_reviewers_with_reviews.csv')
    .zipWithIndex()
    .filter(lambda line_idx: line_idx[1] > 0)
    .map(lambda line_idx: line_idx[0].replace('"', '').split('\t'))
)

In [68]:
# Business records, one list of string fields per line. Columns:
#   "business_id","name","address","city","state","postal_code",
#   "latitude","longitude","stars","review_count","categories"
# The first line (index 0, the header) is dropped, every double quote is
# stripped, and the remaining text is split on tabs.
businesses = (
    sc.textFile('data/yelp_businesses.csv')
    .zipWithIndex()
    .filter(lambda line_idx: line_idx[1] > 0)
    .map(lambda line_idx: line_idx[0].replace('"', '').split('\t'))
)

In [69]:
# Friendship edges, one [src, dst] list per line. Columns:
#   "src_user_id","dst_user_id"
# The first line (index 0, the header) is dropped, every double quote is
# stripped, and the remaining text is split on commas.
friendships = (
    sc.textFile('data/yelp_top_users_friendship_graph.csv')
    .zipWithIndex()
    .filter(lambda line_idx: line_idx[1] > 0)
    .map(lambda line_idx: line_idx[0].replace('"', '').split(','))
)

## Task 1

### Counting number of rows

In [70]:
# Number of rows in the reviews RDD (the header line was filtered out on load).
reviews.count()

883737

In [71]:
# Number of rows in the businesses RDD (the header line was filtered out on load).
businesses.count()

192609

In [72]:
# Number of rows (edges) in the friendships RDD (the header line was filtered out on load).
friendships.count()

1938472

## Task 2

### a) Finding number of distinct users

In [73]:
# User ids that authored a review (column "user_id") and user ids that appear
# as the source of a friendship edge (column "src_user_id").
# NOTE(review): "dst_user_id" values are not included here - confirm that
# users who only ever appear as a destination should be left out.
r_users = reviews.map(lambda review: review[1])
f_users = friendships.map(lambda edge: edge[0])

# Concatenate both id streams, then de-duplicate before counting.
all_users = sc.union([f_users, r_users])
all_users.distinct().count()

160735

### b) Average number of characters in a review

In [74]:
# Mean length, in characters, of the "review_text" field over all reviews.
(
    reviews
    .map(lambda review: len(review[3]))
    .mean()
)

1144.164193645845

### c) Top 10 businesses by number of reviews

In [159]:
# Business ids ordered from most-reviewed to least-reviewed: count one per
# review, sum per business_id, sort by the count, then keep only the id.
top_businesses_by_review = (
    reviews
    .map(lambda review: (review[2], 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda id_count: id_count[1], ascending=False)
    .map(lambda id_count: id_count[0])
)

In [160]:
# The ten business ids with the most reviews (the RDD is sorted by count, descending).
top_businesses_by_review.take(10)

['FaHADZARwnY4yvlvpnsfGA',
 '4JNXUYY8wbaaDmk3BPzlWw',
 'JmI9nslLD7KZqRr__Bg6NQ',
 'RESDUcs7fIiihp38-d6_6g',
 '7sPNbCx7vGAaH7SbNPZ6oA',
 'iCQpiavjjPzJ5_3gPD5Ebg',
 'K7lWdNUhCbcnEvI0NhGewg',
 'A5Rkh7UymKm0_Rxm9K2PJw',
 '9a3DrZvpYxVs3k_qwlCNSw',
 '5LNZ67Yw9RD6nf4_UhXOjw']

### d) Reviews per year

In [157]:
# (year, number_of_reviews) pairs in ascending year order.
# "review_date" is parsed as a float and passed to datetime.fromtimestamp(),
# which converts it using the local time zone of the machine running the code.
reviews_per_year = (
    reviews
    .map(lambda review: (datetime.fromtimestamp(float(review[4])).year, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda year_count: year_count[0])
)

In [158]:
# Bring every (year, count) pair to the driver; the RDD is already sorted by year.
reviews_per_year.collect()

[(2004, 2),
 (2005, 215),
 (2006, 915),
 (2007, 6245),
 (2008, 17269),
 (2009, 24880),
 (2010, 49394),
 (2011, 73952),
 (2012, 83388),
 (2013, 98324),
 (2014, 100526),
 (2015, 109816),
 (2016, 114738),
 (2017, 114141),
 (2018, 89932)]

### e) First and last review

In [97]:
# "review_date" column parsed as floats; they are handed to
# datetime.fromtimestamp() below, i.e. treated as Unix timestamps.
dates = reviews.map(lambda row: float(row[4]))

# min()/max() each find the extreme value in one linear pass, instead of
# running a full distributed sort just to read its first element.
# Note: datetime.fromtimestamp() converts using the local time zone.
first_review = datetime.fromtimestamp(dates.min())
last_review = datetime.fromtimestamp(dates.max())

In [103]:
# Earliest review timestamp formatted as day.month.year, hour:minute:second.
first_review.strftime("%d.%m.%Y, %H:%M:%S")

'19.12.2004, 20:47:24'

In [102]:
# Latest review timestamp formatted as day.month.year, hour:minute:second.
last_review.strftime("%d.%m.%Y, %H:%M:%S")

'14.11.2018, 18:10:56'

### f) Pearson Correlation Coefficient

In [116]:
# One (user_id, review_count, average_review_length) triple per user.
# aggregateByKey keeps a running (count, total_length) per user: the first
# function folds a single review length into it, the second merges two
# partial results; the final map divides the total by the count.
count_and_length = (
    reviews
    .map(lambda review: (review[1], len(review[3])))
    .aggregateByKey(
        (0, 0),
        lambda acc, length: (acc[0] + 1, acc[1] + length),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .map(lambda user: (user[0], user[1][0], user[1][1] / user[1][0]))
)

In [117]:
# Peek at one (user_id, review_count, average_review_length) triple.
count_and_length.first()

('-InhDRRVG7wrwsgAUvN4Qw', 555, 1493.3261261261262)

In [123]:
# Running totals over all users:
#   (number of users,
#    sum of per-user review counts,
#    sum of per-user average review lengths)
agg = (
    count_and_length
    .map(lambda user: (user[1], user[2]))
    .aggregate(
        (0, 0, 0),
        lambda acc, pair: (acc[0] + 1, acc[1] + pair[0], acc[2] + pair[1]),
        lambda left, right: (left[0] + right[0], left[1] + right[1], left[2] + right[2]),
    )
)
# Means of the two variables: x = review count, y = average review length.
x_avg = agg[1] / agg[0]
y_avg = agg[2] / agg[0]

In [126]:
# Sums needed for the Pearson correlation coefficient, with x = a user's
# review count and y = that user's average review length:
#   (sum((x - x_avg) * (y - y_avg)), sum((x - x_avg)^2), sum((y - y_avg)^2))
pcc_agg = (
    count_and_length
    .map(lambda user: (
        (user[1] - x_avg) * (user[2] - y_avg),
        (user[1] - x_avg) ** 2,
        (user[2] - y_avg) ** 2,
    ))
    .reduce(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
)

# Covariance sum divided by the product of the two root sums of squares.
pcc = pcc_agg[0] / (pcc_agg[1] ** 0.5 * pcc_agg[2] ** 0.5)

In [127]:
# Pearson correlation between per-user review count and per-user average review length.
pcc

0.12597976815180284

## Task 3

### a) Average rating by city

In [154]:
# (city, mean_stars) pairs sorted by city name. Each business contributes
# (stars, 1); the per-city sums are then divided to get the average.
# City strings are used exactly as they appear in the data (no case or
# spelling normalisation).
avg_rating_by_city = (
    businesses
    .map(lambda business: (business[3], (float(business[8]), 1)))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .map(lambda city: (city[0], city[1][0] / city[1][1]))
    .sortBy(lambda city_avg: city_avg[0])
)

In [156]:
# Bring every (city, average_stars) pair to the driver; the RDD is sorted by city name.
avg_rating_by_city.collect()

[('110 Las Vegas', 5.0),
 ('AGINCOURT', 2.0),
 ('AVON', 5.0),
 ('AZ', 3.0),
 ('Agincourt', 2.0),
 ('Ahwahtukee', 5.0),
 ('Ahwatukee', 3.5789473684210527),
 ('Ahwatukee Foothills Village', 5.0),
 ('Airdrie', 2.9464285714285716),
 ('Ajax', 3.0495049504950495),
 ('Akron', 3.2129032258064516),
 ('Alberta', 2.0),
 ('Alburg', 5.0),
 ('Alburgh', 3.0),
 ('Aliquippa', 3.0),
 ('Allegheny', 5.0),
 ('Allentown', 4.0),
 ('Allison Park', 3.2083333333333335),
 ('Ambridge', 3.7222222222222223),
 ('Amherst', 3.142857142857143),
 ('Ange-Gardien', 1.0),
 ('Anjou', 3.0),
 ('Ansnorveldt', 4.0),
 ('Anthem', 3.6923076923076925),
 ('Antioch', 2.0),
 ('Apache Junction', 3.5384615384615383),
 ('Apache Trail', 3.0),
 ('Arizona', 5.0),
 ('Arnold', 4.25),
 ('Arrowhead', 3.0),
 ('Ashburn', 3.5),
 ('Aspinwall', 3.4444444444444446),
 ('Auburn', 5.0),
 ('Auburn Township', 4.0),
 ('Auburn Twp', 2.0),
 ('Aurora', 3.247191011235955),
 ('Austin', 5.0),
 ('Avalon', 4.0),
 ('Avon', 3.4043478260869566),
 ('Avon Lake', 3.3278

### b) Top 10 most frequent categories

In [152]:
# (category, business_count) pairs, most frequent first. The "categories"
# field is a comma-separated list; each entry is whitespace-trimmed and
# counted once per business row it appears in.
top_categories = (
    businesses
    .flatMap(lambda business: [(name.strip(), 1) for name in business[10].split(',')])
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda name_count: name_count[1], ascending=False)
)

In [153]:
# The ten most frequent categories (the RDD is sorted by count, descending).
top_categories.take(10)

[('Restaurants', 59371),
 ('Shopping', 31878),
 ('Food', 29989),
 ('Home Services', 19729),
 ('Beauty & Spas', 19370),
 ('Health & Medical', 17171),
 ('Local Services', 13932),
 ('Automotive', 13203),
 ('Nightlife', 13095),
 ('Bars', 11341)]

### c) Geographical centroid

In [150]:
# (postal_code, (mean_latitude, mean_longitude)) for every postal code.
# Each business contributes (1, latitude, longitude); the per-code sums are
# divided by the business count to obtain the arithmetic-mean centroid.
pc_centroids = (
    businesses
    .map(lambda business: (business[5], (1, float(business[6]), float(business[7]))))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    .map(lambda code: (code[0], (code[1][1] / code[1][0], code[1][2] / code[1][0])))
)

In [151]:
# Show five (postal_code, (mean_latitude, mean_longitude)) entries; the RDD is not sorted.
pc_centroids.take(5)

[('L4B 3P7', (43.84119090909091, -79.39952727272727)),
 ('85234', (33.368083565107476, -111.76009355246526)),
 ('89109', (36.11986979655711, -115.16799718309893)),
 ('85283', (33.369426519337, -111.93189088397787)),
 ('85281', (33.4254004230566, -111.92886144896912))]

## Task 4

### a) Top in and out degrees

In [164]:
# (user_id, (in_degree, out_degree)) for every user that appears in an edge.
# An edge src -> dst adds one outgoing edge to src, (0, 1), and one incoming
# edge to dst, (1, 0); the pairs are then summed per user.
in_out_degrees = (
    friendships
    .flatMap(lambda edge: [(edge[0], (0, 1)), (edge[1], (1, 0))])
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)

In [167]:
# (user_id, in_degree) pairs ordered from highest to lowest in-degree.
top_in = (
    in_out_degrees
    .map(lambda user: (user[0], user[1][0]))
    .sortBy(lambda id_degree: id_degree[1], ascending=False)
)

In [168]:
# The ten users with the highest in-degree.
top_in.take(10)

[('8DEyKVyplnOcSKx39vatbg', 4919),
 ('ZIOCmdFaMIF56FR-nWr_2A', 4597),
 ('YttDgOC9AlM4HcAlDsbB2A', 4222),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 4211),
 ('F_5_UNX-wrAFCXuAkBZRDw', 3943),
 ('dIIKEfOgo0KqUfGQvGikPg', 3651),
 ('GGTF7hnQi6D5W77_qiKlqg', 3609),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 3557),
 ('3gRfkaVcEWri-Ju7OQX7uQ', 3396),
 ('NhgU7RhuYYFmpkb1jlYJ6Q', 3330)]

In [169]:
# (user_id, out_degree) pairs ordered from highest to lowest out-degree.
top_out = (
    in_out_degrees
    .map(lambda user: (user[0], user[1][1]))
    .sortBy(lambda id_degree: id_degree[1], ascending=False)
)

In [170]:
# The ten users with the highest out-degree.
top_out.take(10)

[('ZIOCmdFaMIF56FR-nWr_2A', 9564),
 ('F_5_UNX-wrAFCXuAkBZRDw', 8586),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 8381),
 ('YttDgOC9AlM4HcAlDsbB2A', 6758),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 6506),
 ('dIIKEfOgo0KqUfGQvGikPg', 6187),
 ('ACUVZ4SiN0gni7dzVDm9EQ', 6065),
 ('8DEyKVyplnOcSKx39vatbg', 6026),
 ('w-w-k-QXosIKQ8HQVwU6IQ', 5987),
 ('Thc2zV-K-KLcvJn3fMPdqQ', 5821)]

### b) Mean and median in and out degrees

In [173]:
# Mean in-degree and mean out-degree over all users in in_out_degrees,
# whose values are (in_degree, out_degree) pairs.
mean_in = (
    in_out_degrees
    .map(lambda user: user[1][0])
    .mean()
)
mean_out = (
    in_out_degrees
    .map(lambda user: user[1][1])
    .mean()
)

In [174]:
# Mean in-degree over all users.
mean_in

3.863660734395157

In [175]:
# Mean out-degree over all users.
mean_out

3.863660734395106

In [197]:
# Median in-degree and out-degree over all users.
#
# Each degree list is sorted (descending) and keyed by its 0-based rank from
# zipWithIndex(), so a single position can be fetched with lookup(rank).
# NOTE: this rebinds `top_in` / `top_out`, which earlier held (user_id, degree)
# pairs; after this cell they hold (rank, degree) pairs.
count = in_out_degrees.count()

top_in = in_out_degrees.map(lambda row: row[1][0]) \
    .sortBy(lambda row: row, ascending=False) \
    .zipWithIndex() \
    .map(lambda row: (row[1], row[0]))

top_out = in_out_degrees.map(lambda row: row[1][1]) \
    .sortBy(lambda row: row, ascending=False) \
    .zipWithIndex() \
    .map(lambda row: (row[1], row[0]))

if count % 2 == 0:
    # Ranks start at 0, so the two middle elements of an even-sized sequence
    # are at count/2 - 1 and count/2 (not count/2 and count/2 + 1).
    r = count // 2
    l = r - 1
    median_in = (top_in.lookup(l)[0] + top_in.lookup(r)[0]) / 2
    median_out = (top_out.lookup(l)[0] + top_out.lookup(r)[0]) / 2
else:
    # Odd size: the single middle element is at 0-based rank count // 2.
    mid = count // 2
    median_in = top_in.lookup(mid)[0]
    median_out = top_out.lookup(mid)[0]

In [199]:
# Median in-degree over all users.
median_in

1

In [200]:
# Median out-degree over all users.
median_out

0