In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
import pickle

## Review

In [232]:
# Locations of the raw Yelp review and tip files (read further down as
# line-delimited JSON).
review_json_path = '../data/yelp_dataset/yelp_academic_dataset_review.json'
tip_json_path = '../data/yelp_dataset/yelp_academic_dataset_tip.json'
# Previously pickled business and user tables; reviews and tips are later
# inner-joined against their `business_id` / `user_id` columns.
business = pd.read_pickle('tmp_data/business.pickle')
users = pd.read_pickle('tmp_data/Users.pickle')

In [233]:
size = 500000

# Read the review file in chunks of `size` rows, with an explicit type per field.
review_dtypes = {'review_id':str,'user_id':str,'business_id':str,'stars':int,'date':str,'text':str,'useful':int,'funny':int,'cool':int}
review = pd.read_json(review_json_path, lines=True, dtype=review_dtypes, chunksize=size)

chunk_list = []
for chunk in review:
    # Discard the original id and rename the rating/text columns in one pass.
    chunk = chunk.drop(columns=['review_id']).rename(columns={'stars': 'review_stars', 'text': 'review'})
    # Inner joins keep only reviews whose business and author both appear in
    # the loaded `business` and `users` tables.
    chunk_merged = chunk.merge(business[['business_id']], on='business_id', how='inner')
    chunk_merged = chunk_merged.merge(users[['user_id']], on='user_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)

# Stack the per-chunk results into a single frame with a fresh index.
review_df = pd.concat(chunk_list, axis=0, join='outer', ignore_index=True)
print(review_df.shape)
review_df.sample(3)

1312 out of 500,000 related reviews
1312 out of 500,000 related reviews
1153 out of 500,000 related reviews
1399 out of 500,000 related reviews
1268 out of 500,000 related reviews
1205 out of 500,000 related reviews
1364 out of 500,000 related reviews
1438 out of 500,000 related reviews
1364 out of 500,000 related reviews
1222 out of 500,000 related reviews
1164 out of 500,000 related reviews
1437 out of 500,000 related reviews
1426 out of 500,000 related reviews
1342 out of 500,000 related reviews
(18406, 8)


Unnamed: 0,user_id,business_id,review_stars,useful,funny,cool,review,date
11872,5Xnz5T8V2BgstQbYupTn5w,6a4gLLFSgr-Q6CZXDLzBGQ,4,0,0,0,Food was excellent. Rabbit & dumplings were o...,2010-10-31 05:43:44
13487,1LBiaytob4-fL1sK87jwtg,zjTBfbvbN2Ps6_Ar0w-fuQ,5,1,0,1,My dad is a regular here. He lives in port Ric...,2017-06-25 17:43:32
6193,lYonv0jBeu65-TcU_7uM0A,3WySw_caRJy7YVPLlWtBRw,5,1,0,0,Encanto is a MUST TRY for anyone that enjoys a...,2013-07-13 03:22:35


In [234]:
size = 500000

# `user_id` is an alphanumeric Yelp id, so it is declared as str. The earlier
# `int` declaration could never be applied to these values.
tip = pd.read_json(tip_json_path, lines=True, dtype={'text':str,'date':str,'compliment_count':int,'business_id':str,'user_id': str},chunksize=size)

chunk_list_tip = []
for chunk in tip:
    chunk = chunk.rename(columns={'text': 'tip'})
    # Inner joins keep only tips whose business and author both appear in the
    # loaded `business` and `users` tables.
    chunk_merged = pd.merge(chunk, business[['business_id']], on='business_id', how='inner')
    chunk_merged = pd.merge(chunk_merged, users[['user_id']], on='user_id', how='inner')
    # Report against the real chunk length: the final chunk is usually
    # shorter than `size`.
    print(f"{chunk_merged.shape[0]} out of {len(chunk):,} related tips")
    chunk_list_tip.append(chunk_merged)

tip_df = pd.concat(chunk_list_tip, ignore_index=True, join='outer', axis=0)
print(tip_df.shape)

# Distribution of compliments per tip (displayed as the cell output).
tip_df.compliment_count.value_counts()

1238 out of 500,000 related reviews
1215 out of 500,000 related reviews
(2453, 5)


0    2432
1      20
2       1
Name: compliment_count, dtype: int64

In [238]:
# Full outer join of tips and reviews: rows sharing user, business and date are
# combined, everything else is kept with the other side's columns left empty.
review_dfs = tip_df.merge(review_df, how='outer', on=['user_id','business_id','date'])

In [236]:
# Print the longest and then the shortest text length (in characters),
# first for the tip column and then for the review column.
for text_col in ('tip', 'review'):
    char_counts = review_dfs[text_col].str.len()
    print(char_counts.max())
    print(char_counts.min())

500.0
1.0
4998.0
29.0


In [240]:
# Remove the rows whose review text is exactly 29 characters long; rows
# without a review text have no length and are therefore kept.
is_outlier = review_dfs['review'].str.len().eq(29)
review_dfs = review_dfs[~is_outlier]
# Switch to the final column names.
review_dfs = review_dfs.rename(columns={'tip': 'short_tip','date':'review_date','review_stars':'stars','review':'detailed_review','compliment_count':'likes'})
# Missing counters become 0 so the columns can be stored as integers.
count_cols = ['likes','stars','useful','funny','cool']
review_dfs[count_cols] = review_dfs[count_cols].fillna(0).astype(int)
# Sequential review_id values, starting at 100000.
review_dfs['review_id'] = list(range(100000, 100000 + len(review_dfs)))


In [241]:
# Keep only the listed columns, in this order (user_id is left out here).
review = review_dfs.loc[:, ['review_id','review_date','business_id','short_tip','likes','detailed_review','stars','useful','funny','cool']]


In [242]:
# Save the selected review columns as a pickle file under tmp_data/.
review.to_pickle('tmp_data/Review_of_Business.pickle')

## Users write review

In [248]:
# Link table between authors and reviews: one (user_id, review_id) pair per
# row of review_dfs, keeping that frame's index.
users_write_review = review_dfs[['user_id', 'review_id']].copy()

In [252]:
# Preview the first rows of the user/review link table.
users_write_review.head()

Unnamed: 0,user_id,review_id
0,oCgXFRb3v4vO2Q0OVmPBkA,100000
1,oCgXFRb3v4vO2Q0OVmPBkA,100001
2,oCgXFRb3v4vO2Q0OVmPBkA,100002
3,oCgXFRb3v4vO2Q0OVmPBkA,100003
4,6OZDhVrt81UC3b-gDEw_oA,100004


In [250]:
# Save the user/review link table as a pickle file under tmp_data/.
users_write_review.to_pickle('tmp_data/Users_write_Review.pickle')

## Users_follow_Users

In [311]:
# Reload the user table from its pickle so this section starts from the stored version.
users = pd.read_pickle('tmp_data/Users.pickle')

#Assume 70% of users don't follow any users, and each of the remaining 30% (the active users) follows max(1, round(𝑁(3, 5))) other users, and follow_since is a random date between the signup date and the globally latest signup date.

In [312]:
# Pick the 30% of users that follow other users (the "fans"). A fixed
# random_state makes the selection identical on every run.
users_follow = (
    users[['user_id']]
    .sample(frac=0.3, random_state=4111)
    .reset_index(drop=True)
    .rename(columns={'user_id': 'fan_user_id'})
)

In [313]:
# Draw one value per fan from N(3, 5), floor it at 1 and round to an integer.
# A seeded generator keeps the synthetic counts reproducible across runs.
rng = np.random.default_rng(4111)
draws = rng.normal(3, 5, len(users_follow))
users_follow['n_followers'] = np.round(np.maximum(1, draws)).astype('int')

In [314]:
# Display the sampled fans with their drawn n_followers value.
users_follow

Unnamed: 0,fan_user_id,n_followers
0,7FkS4-iTNCmnsg6domqhWQ,7
1,DE1f22CynZpyUK7JaivLmw,1
2,I6S3570pSQ37GTibA3ttQw,3
3,Ac0pi_xqI-j02bZw5_HpRw,3
4,GTsOMS5HOgc_KG725S1XUg,8
...,...,...
8886,ZbrQsrtRqXkCZd7wBIOCyQ,1
8887,CzhKoVMsRK-zylwUuaAjRQ,2
8888,DLo53C3BgJkXZPoRz19V4Q,5
8889,fhCVm8NUkbq9Prec94Y6ng,3


In [315]:
def get_fan(fan, pool=None):
    """Attach a random list of followee ids to one fan row.

    Parameters
    ----------
    fan : pandas.Series
        Row holding ``fan_user_id`` and ``n_followers``. Despite its name,
        ``n_followers`` is used here as the number of users this fan follows.
    pool : pandas.DataFrame, optional
        Frame with a ``user_id`` column to draw followees from. Defaults to
        the notebook-level ``users`` table.

    Returns
    -------
    pandas.Series
        The same row with an added ``followee_user_id`` entry: a list of at
        most ``n_followers`` user ids that never contains the fan's own id.
    """
    candidates = users if pool is None else pool
    fan_id = fan['fan_user_id']
    n_wanted = int(fan['n_followers'])
    # Draw one spare row: if the fan is sampled, it is discarded below and
    # n_wanted other users still remain. The draw is capped at the pool size
    # so a small pool does not raise.
    drawn = candidates.sample(min(n_wanted + 1, len(candidates)))
    fan['followee_user_id'] = [uid for uid in drawn.user_id.tolist() if uid != fan_id][:n_wanted]
    return fan
# DataFrame.sample inside get_fan falls back to NumPy's global random state,
# so seed it once here to make the followee assignment reproducible.
np.random.seed(4111)
# Attach a followee list to every fan, then unpack the lists so that each
# (fan, followee) pair occupies its own row.
users_follow = users_follow.apply(get_fan, axis=1).explode(["followee_user_id"])


In [318]:
def random_dates(start, end=max(users.yealping_since), n=10):
    """Return a random date between ``start`` and ``end``, both inclusive.

    Parameters
    ----------
    start : earliest admissible date; it must support ``end - start`` and
        the addition of a ``pd.DateOffset`` (presumably a pandas Timestamp).
    end : latest admissible date; defaults to the largest ``yealping_since``
        value in ``users``, evaluated once when the function is defined.
    n : unused; kept so existing calls remain valid.

    The generator is deliberately NOT seeded here. Reseeding on every call
    would restart the random stream each time, so the draws would no longer
    be independent.
    """
    d = random.randint(0, (end - start).days)
    return start + pd.DateOffset(days=d)


# Seed once so the column is reproducible while successive draws still differ.
random.seed(4111)
# Look up each fan's own signup date by user id. Assigning the result of
# users.apply(...) directly would align on index labels and pair every row
# with an unrelated user's signup date.
signup_by_user = users.drop_duplicates('user_id').set_index('user_id')['yealping_since']
fan_signup_dates = users_follow['fan_user_id'].map(signup_by_user)
users_follow['follow_since'] = [random_dates(signup) for signup in fan_signup_dates]


In [319]:
# Keep only the three listed columns, in this order; n_followers is dropped.
users_follow = users_follow.loc[:, ['fan_user_id', 'followee_user_id', 'follow_since']]

In [321]:
# Save the fan/followee table as a pickle file under tmp_data/.
users_follow.to_pickle("tmp_data/Users_follow_Users.pickle")

## photo

In [10]:
# Location of the Yelp photo metadata (read as line-delimited JSON).
photo_json_path = '../data/yelp_photos/photos.json'

# Every listed field of a photo record is read as a string.
photo_dtypes = dict.fromkeys(['photo_id', 'business_id', 'caption', 'label'], str)
photo = pd.read_json(photo_json_path, dtype=photo_dtypes, lines=True)
                                                                                                      

In [12]:
# Save the photo table as a pickle file under tmp_data/.
photo.to_pickle("tmp_data/Photo_contained_Business.pickle")