In [83]:
from collections import OrderedDict
import decimal
import json
import csv
import numpy as np
import matplotlib as plt
from scipy.sparse import linalg
from scipy import dot

In [118]:
def generate_user_set(user_file):
    """Load the user file and keep only the fields used downstream.

    Every non-blank line of ``user_file`` is parsed as one JSON object.  For
    each record the function:

    * removes ``user_id`` and uses it as the key of the result,
    * renames ``useful``/``cool``/``funny`` to ``gave_useful``/``gave_cool``/
      ``gave_funny``,
    * replaces the ``friends`` list by its length (``0`` when the list is
      empty or starts with the placeholder string ``'None'``),
    * drops ``type``, ``yelping_since``, ``elite``, ``name`` and the
      ``compliment_*`` fields (fields that are absent are ignored).

    Args:
        user_file: path of a file holding one JSON user record per line.

    Returns:
        ``(userset, users)``: the set of all user ids, and a dict mapping each
        user id to its trimmed record.

    Raises:
        KeyError: if a record lacks ``user_id``, ``friends``, ``useful``,
            ``cool`` or ``funny``.
        ValueError: if a non-blank line is not valid JSON.
    """
    # The list of discarded keys is the same for every record, so build it once.
    sufs = ['hot', 'more', 'profile', 'cute', 'list', 'note', 'plain', 'cool', 'funny', 'writer', 'photos']
    remove_keys = ['type', 'yelping_since', 'elite', 'name']
    remove_keys.extend('compliment_' + suf for suf in sufs)

    userset = set()
    users = {}

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(user_file, encoding='utf-8') as all_user_data:
        for record in all_user_data:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not record.strip():
                continue

            user = json.loads(record)

            user_id = user.pop('user_id')
            friends = user.pop('friends')
            user['gave_useful'] = user.pop('useful')
            user['gave_cool'] = user.pop('cool')
            user['gave_funny'] = user.pop('funny')
            # A friendless user is stored as ['None']; an empty list is
            # treated the same way instead of raising IndexError.
            has_friends = bool(friends) and friends[0] != 'None'
            user['friends'] = len(friends) if has_friends else 0

            for key in remove_keys:
                # These fields are thrown away, so a missing one is harmless.
                user.pop(key, None)

            userset.add(user_id)
            users[user_id] = user

    return userset, users

def process_reviews_by_user(review_file, user_set):
    """Aggregate the review file into one summary record per known user.

    Every non-blank line of ``review_file`` is parsed as one JSON review.  The
    ``type``, ``text``, ``date`` and ``review_id`` fields are discarded.  For
    each user the remaining fields of the review (other than ``user_id``,
    ``business_id`` and ``stars``) are summed across all of that user's
    reviews, and two parallel lists are collected in file order:
    ``business_ids`` and ``star_list``.

    Args:
        review_file: path of a file holding one JSON review record per line.
        user_set: collection of user ids to keep.  It is only read, never
            modified, so the function can be called repeatedly with the same
            set.  Reviews whose ``user_id`` is not in it are skipped.

    Returns:
        Dict mapping each user id that has at least one review to its summary
        record.

    Raises:
        KeyError: if a review lacks ``user_id``, ``business_id`` or ``stars``,
            or if a later review of a user carries a summed field that the
            user's first review did not have.
        ValueError: if a non-blank line is not valid JSON.
    """
    remove_keys = ('type', 'text', 'date', 'review_id')
    user_dict = {}

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(review_file, encoding='utf-8') as all_review_data:
        for record in all_review_data:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not record.strip():
                continue

            review = json.loads(record)

            for key in remove_keys:
                # These fields are thrown away, so a missing one is harmless.
                review.pop(key, None)

            user_id = review.pop('user_id')
            business_id = review.pop('business_id')
            stars = review.pop('stars')

            if user_id not in user_set:
                # Review written by a user we have no record for.
                continue

            summary = user_dict.get(user_id)
            if summary is None:
                # First review of this user: what is left of the review
                # becomes the running totals.
                review['business_ids'] = [business_id]
                review['star_list'] = [stars]
                user_dict[user_id] = review
            else:
                # `review` holds only the summed fields at this point.
                for key, value in review.items():
                    summary[key] += value
                summary['business_ids'].append(business_id)
                summary['star_list'].append(stars)

    return user_dict

# Input locations: one JSON record per line in each file.
users_file, review_file = (
    'yelp_data/yelp_academic_dataset_user.json',
    'yelp_data/yelp_academic_dataset_review.json',
)
            

In [119]:
# Build the set of user ids and the trimmed per-user records from the user file.
userset, users = generate_user_set(users_file)

In [120]:
# Aggregate every user's reviews: summed vote fields plus the lists of
# reviewed business ids and star ratings.
user_dict = process_reviews_by_user(review_file, userset)

In [121]:
# Peek at a few aggregated review records from the middle of user_dict.
for position, (uid, record) in enumerate(user_dict.items()):
    if position > 100000:
        print(uid)
        print(record)
    if position > 100010:
        break

SEh6win-GkWhb6m1txR8BA
{'cool': 1, 'business_ids': ['3Mx4renubPRnjHUw1n2UkA'], 'funny': 4, 'star_list': [1], 'useful': 16}
4fl071XokVipTCRGVTGMaA
{'cool': 0, 'business_ids': ['PsTzoIERiCjq6QHrOnk2Lg'], 'funny': 0, 'star_list': [4], 'useful': 1}
XBnhevK2NyS620Sn2VmzAA
{'cool': 1, 'business_ids': ['-Zk7UMlZqbwWYcyAghD-cw'], 'funny': 0, 'star_list': [1], 'useful': 2}
7hidZRikh-PpqTsXOORV7w
{'cool': 0, 'business_ids': ['pHJu8tj3sI8eC5aIHLFEfQ'], 'funny': 0, 'star_list': [5], 'useful': 0}
LOPyVI7uOLOCwvXBrQBnVg
{'cool': 0, 'business_ids': ['pwAUK5IdkBV2ddliPvNx-g'], 'funny': 0, 'star_list': [3], 'useful': 0}
N6__iT2ZijklpgxVmz7_ug
{'cool': 0, 'business_ids': ['BTcY04QFiS1uh-RpkR7rAg', 'MffsS7AXw8DahmQoTaNUGQ'], 'funny': 2, 'star_list': [1, 5], 'useful': 0}
nYCla-7QSdun4Wr0GcEEAw
{'cool': 0, 'business_ids': ['JkncPZ1jFi0_n23Ptl8jGQ'], 'funny': 0, 'star_list': [5], 'useful': 0}
9v0y9tSYO-G06M_a0jUT6A
{'cool': 0, 'business_ids': ['rXUZNVlpWMV5ORDDSguOEQ', 'vHlF8VSkELv6nKz5W4GDnw', 'DrJ6xjGcQJf

In [122]:
# Merge each user's review summary with the profile record; on a key clash
# the profile value (from `users`) wins.
master_users = {
    uid: {**review_summary, **users[uid]}
    for uid, review_summary in user_dict.items()
}

In [123]:
# Number of merged users, then number of users in the profile file.
print(len(master_users), len(users), sep='\n')

1029432
1029432


In [124]:
# Peek at a few merged records from the middle of master_users.
for position, (user_id, merged) in enumerate(master_users.items()):
    if position > 100000:
        print(user_id)
        print(merged)
    if position > 100010:
        break

vrJIsz1BbGUl24zQRw08cQ
{'business_ids': ['bcyKUgFBUogjjInuhsloqw', '0NxNTtgpfjBIOdLW5IJhCQ'], 'review_count': 3, 'star_list': [5, 5], 'average_stars': 3.67, 'gave_funny': 0, 'fans': 0, 'cool': 1, 'gave_useful': 0, 'friends': 0, 'funny': 1, 'useful': 2, 'gave_cool': 0}
Q96ZvYyAYvfkhGDObyHhEA
{'business_ids': ['JHWBGE2Yy_AIadqSTr5LNA', 'FsCujpVh9Za2Dl5MIYLCxA'], 'review_count': 2, 'star_list': [5, 5], 'average_stars': 5.0, 'gave_funny': 0, 'fans': 0, 'cool': 1, 'gave_useful': 0, 'friends': 245, 'funny': 0, 'useful': 2, 'gave_cool': 1}
sHrZaI6beSY_osgMdlIWww
{'business_ids': ['oN_A87dyVD2FBk7coLnhCg'], 'review_count': 1, 'star_list': [5], 'average_stars': 5.0, 'gave_funny': 0, 'fans': 0, 'cool': 0, 'gave_useful': 0, 'friends': 0, 'funny': 0, 'useful': 0, 'gave_cool': 0}
UQwlQQcw8FsU5zIKlgoWIA
{'business_ids': ['lSdTgM_JOdGd7Wd6nR77sA'], 'review_count': 18, 'star_list': [5], 'average_stars': 4.89, 'gave_funny': 5, 'fans': 1, 'cool': 0, 'gave_useful': 18, 'friends': 35, 'funny': 0, 'useful'

In [125]:
biz_file = 'yelp_data/business_compressed.csv'

# Index the CSV rows by business id; the id column is removed from each row.
# All remaining values stay strings, exactly as csv.DictReader yields them.
with open(biz_file, newline='') as business_data:
    businesses = {
        row.pop('business_id'): row
        for row in csv.DictReader(business_data, delimiter=",")
    }
        

In [126]:
# Spot-check one business row loaded from the CSV.
businesses['gQMAcDm8kv8ev7x2BshMwg']

{'city': 'Phoenix',
 'latitude': '33.5100117',
 'longitude': '-112.0960331',
 'postal_code': '85015',
 'review_count': '582',
 'stars': '4.0',
 'topic': '11'}

In [127]:
# Replace each user's 'business_ids' list with per-business attributes looked
# up in `businesses`.  All attribute values are the raw CSV strings.
for uid, data in master_users.items():
    # 'business_ids' is removed once a user has been enriched, so a record
    # without it was already handled by an earlier run of this cell.  Skipping
    # it keeps the cell re-runnable instead of wiping the lists and raising
    # KeyError.
    if 'business_ids' not in data:
        continue

    # Resolve every business first: an unknown id raises KeyError here,
    # before the user's record has been modified at all.
    reviewed = [businesses[bid] for bid in data['business_ids']]

    data['lat_longs'] = [(biz['latitude'], biz['longitude']) for biz in reviewed]
    data['topics'] = [biz['topic'] for biz in reviewed]
    # Businesses with an empty postal code are left out, so 'zips' can be
    # shorter than the other lists.
    data['zips'] = [biz['postal_code'] for biz in reviewed if biz['postal_code']]
    # review_count of each reviewed business
    data['mutual_reviews'] = [biz['review_count'] for biz in reviewed]
    data['business_stars'] = [biz['stars'] for biz in reviewed]

    del data['business_ids']

In [128]:
# Spot-check one enriched user record.
master_users['Q96ZvYyAYvfkhGDObyHhEA']

{'average_stars': 5.0,
 'business_stars': ['5.0', '4.5'],
 'cool': 1,
 'fans': 0,
 'friends': 245,
 'funny': 0,
 'gave_cool': 1,
 'gave_funny': 0,
 'gave_useful': 0,
 'lat_longs': [('33.699168', '-111.892549'),
  ('33.4525298384', '-111.926886571')],
 'mutual_reviews': ['34', '295'],
 'review_count': 2,
 'star_list': [5, 5],
 'topics': ['24', '1'],
 'useful': 2,
 'zips': ['85253', '85281']}

In [129]:
user_out = 'yelp_data/final_user.json'

# Export as JSON lines: one user object per line, with the id stored back
# into the record under 'user_id'.
with open(user_out, 'w') as outfile:
    for uid, user in master_users.items():
        user['user_id'] = uid
        print(json.dumps(user), file=outfile)