## 2. Data Preparation

In this part, we clean the Yelp dataset which contains mostly real-word data. That being said, we still need to generate some of the features, like `password` of User but all with reasonable assumptions described.

To re-run the code, download the [Yelp Dataset](https://www.yelp.com/dataset/documentation/main) into a folder named "data" in the parent level of this file.

To access the cleaned data output from this part, we upload the dataframes in pickle format into a [Google Drive](https://drive.google.com/drive/folders/1RvL6q7U1eeMvFVpNyH-dJ1AdkIf_E7l0?usp=sharing).

In [1]:
import pickle
pickle.HIGHEST_PROTOCOL = 4
import pandas as pd
import numpy as np
import random, string
from tqdm import tqdm
from datetime import datetime

### 2.1 Entity

#### Business

In [2]:
business = pd.read_json("../data/yelp_dataset/yelp_academic_dataset_business.json", lines=True)

In [3]:
# Only include restaurant of main category
business_category = business[['business_id', 'categories']]
business_category = business_category.assign(category = business.categories.str.split(', ')).explode('category').drop('categories', axis=1)
MAIN_FOOD_CATEGORIES = """Bars
Sandwiches
Fast Food
Pizza
Coffee & Tea
Breakfast & Brunch
Burgers
Mexican
Specialty Food
Italian
Seafood
Chicken Wings
Chinese
Salad
Bakeries
Cafes""".split('\n')
business_category = business_category[business_category.category.isin(MAIN_FOOD_CATEGORIES)]
business = business.merge(business_category[['business_id']].drop_duplicates(),
                          on="business_id",
                          how="inner").drop(['categories'], axis=1)

# Select N_USER business with moderate amount of reviews
## Note: We finished the data cleaning pipeline, but for the purpose of this homework, we only includes 100 restaurant and 200 users that has wrote reviews on them!
## For project 1 part 3, we will request more resources to allow us populate the whole dataset ^.^
N_BUSINESS = 100
business = business[(business['review_count'] >= business['review_count'].quantile(.25)) & 
                    (business['review_count'] <= business['review_count'].quantile(.75))].sample(n=N_BUSINESS, random_state=4111)
business = business.drop(['review_count', 'stars', 'hours'], axis=1).reset_index(drop=True)

# Extract whether allow takeout information
attribute_values = ['False' if attributes is None else attributes.get('RestaurantsTakeOut') for attributes in business.attributes]
business['is_takeout'] =  ['False' if value is None or value == 'None' else value for value in attribute_values]
business['is_open'] = business['is_open'].astype(bool)
business = business.drop('attributes', axis=1)

In [4]:
business.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,is_open,is_takeout
0,2gTQ0X9iRTs5zIKs97IWOA,Wayback Burgers,200 West Alexander St,Plant City,FL,33563,27.989818,-82.122716,True,True
1,TlaXeIvS_0yQhjWTLHCLSA,Sciarrinos Pizza,2310 Carpenter Station Rd,Wilmington,DE,19810,39.818019,-75.466534,True,True
2,sBRKxVbRBmeXY9mVwliqYw,Someone's in the Kitchen,109 Walton Ferry Rd,Hendersonville,TN,37075,36.303316,-86.619735,True,True
3,KfBpr_NoldM9w0CugY32ow,Wawa,2177 Gulf To Bay Blvd,Clearwater,FL,33765,27.959926,-82.747455,True,True
4,t24_JnNptChMXityiA6mgQ,Forest Hills Brick Oven Pizza,905 W Linebaugh Ave,Tampa,FL,33612,28.040184,-82.468059,True,True


In [5]:
business.to_pickle('data/Business.pickle')

##### Business_tagged_Category

In [6]:
business_category = business_category.merge(business[['business_id']],
                                            on="business_id",
                                            how="inner")

In [7]:
business_category.head(5)

Unnamed: 0,business_id,category
0,TEP-73fGvgSUmtQFaOTA_g,Burgers
1,L0-MS0MbQhEWAPLkjCqhpg,Mexican
2,L0-MS0MbQhEWAPLkjCqhpg,Seafood
3,L0-MS0MbQhEWAPLkjCqhpg,Burgers
4,NKplFLr1UebQxMLQAxALPQ,Pizza


In [8]:
business_category.to_pickle('data/Business_tagged_Category.pickle')

#### Category

In [9]:
category = pd.DataFrame({'name': MAIN_FOOD_CATEGORIES})

In [10]:
category.head(5)

Unnamed: 0,name
0,Bars
1,Sandwiches
2,Fast Food
3,Pizza
4,Coffee & Tea


In [11]:
category.to_pickle('data/Category.pickle')

#### Users

In [12]:
N_USER = 200 
size = 500000
# Sample N_USER users among those who have commented on those business 
review_json_path = '../data/yelp_dataset/yelp_academic_dataset_review.json'
review = pd.read_json(review_json_path, lines=True, 
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)
chunk_list = []
for chunk in tqdm(review):
    chunk = chunk[['business_id', 'user_id']].merge(business[['business_id']], how='inner')
    chunk_list.append(chunk)
user_id_list = pd.concat(chunk_list, ignore_index=True, axis=0)[["user_id"]].drop_duplicates().sample(n=N_USER)

14it [01:17,  5.55s/it]


In [13]:
#users_raw = pd.read_json("../data/yelp_dataset/yelp_academic_dataset_user.json", lines=True)
#users_raw.to_pickle("../data/users_raw.pickle")
users_raw = pd.read_pickle('../data/users_raw.pickle')
users = pd.merge(users_raw, user_id_list, how="inner")

In [14]:
# Extract date of registrating
users["yealping_since"] = pd.to_datetime(users["yelping_since"]).dt.date

# Generate a password for existing users
def password_generator(prefix, length_min=8, length_max=16):
    chars = string.ascii_letters + string.digits + '!@#$%*'
    # Generate a [1, 6], but mostly 1~3 random suffix
    random_suffix_length = round(np.clip(np.random.normal(2, 1), a_min=1, a_max=6))
    random_suffix_length = max(length_min-len(prefix), random_suffix_length)
    random_suffix_length = min(length_max-len(prefix), random_suffix_length)
    random_suffix = ''.join(random.choice(chars) for _ in range(random_suffix_length))
    return prefix+random_suffix

users["password"] = users.apply(lambda x: password_generator(prefix=x['name']), axis=1)

# Generate an email address for existing users
EMAIL_DOMAIN_OPTIONS = ["@gmail.com", "@hotmail.com", "@outlook.com", "@inbox.com", "@qq.com"]
users['email'] = users.groupby("name")["name"].rank(method="first", ascending=True).astype(int)
users['email'] = users.apply(lambda x: x['name'] + "_" +str(x['email']) + random.choice(EMAIL_DOMAIN_OPTIONS), axis=1).str.replace("_1@", "@", regex=False)

# Select the columns
users = users[["user_id", "email", "name", "password", "yealping_since"]].reset_index(drop=True)

In [15]:
users.to_pickle('data/Users.pickle')

#### Collection_of_User

Assume 85% of users don't have collection, and 15% of the active users have min(1, $N(2, 4)$) collections that created from a random date from after signup date till the globally maximum signup date.

In [16]:
collection = users[['user_id', 'yealping_since']].sample(frac=0.15).reset_index(drop=True)

# Generate local id for each user_id
collection['n_collection'] = pd.Series(np.around(np.random.normal(2, 4, len(collection)))).clip(1, 20)
collection['collection_id'] = collection['n_collection'].apply(lambda x: list(range(1, int(x)+1)))
collection = collection.explode('collection_id').reset_index(drop=True)

# Generate a random collection create date
def random_dates(start, end=max(users.yealping_since), n=10):
    random.seed(4111)
    d = random.randint(0, (end - start).days)
    return start + pd.DateOffset(days=d)
collection['created_time'] = collection.apply(lambda x: random_dates(x['yealping_since']), axis=1)

# Select columns
collection = collection[["user_id", "collection_id", "created_time"]]

In [17]:
collection.head()

Unnamed: 0,user_id,collection_id,created_time
0,k6AwCajLT06J6cwC3SqFcg,1,2020-10-07
1,73oVGO52bkKmjgx4Nwat4Q,1,2021-01-11
2,73oVGO52bkKmjgx4Nwat4Q,2,2021-01-11
3,73oVGO52bkKmjgx4Nwat4Q,3,2021-01-11
4,73oVGO52bkKmjgx4Nwat4Q,4,2021-01-11


In [18]:
collection.to_pickle("data/Collection_of_User.pickle")

#### Review_of_Business

In [19]:
size = 500000
review_json_path = '../data/yelp_dataset/yelp_academic_dataset_review.json'
review = pd.read_json(review_json_path, lines=True, dtype={'review_id':str,'user_id':str,'business_id':str,'stars':int,'date':str,'text':str,'useful':int,'funny':int,'cool':int},chunksize=size)

chunk_list = []
for chunk in review:
    chunk = chunk.drop(['review_id'], axis=1)
    chunk = chunk.rename(columns={'stars': 'review_stars'})
    chunk = chunk.rename(columns={'text': 'review'})
    chunk_merged = pd.merge(chunk, business[['business_id']], on='business_id', how='inner')
    chunk_merged = pd.merge(chunk_merged, users[['user_id']], on='user_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)

review_df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)


16 out of 500,000 related reviews
17 out of 500,000 related reviews
9 out of 500,000 related reviews
10 out of 500,000 related reviews
16 out of 500,000 related reviews
15 out of 500,000 related reviews
15 out of 500,000 related reviews
10 out of 500,000 related reviews
15 out of 500,000 related reviews
30 out of 500,000 related reviews
23 out of 500,000 related reviews
5 out of 500,000 related reviews
14 out of 500,000 related reviews
14 out of 500,000 related reviews


In [20]:
size = 500000
tip_json_path = '../data/yelp_dataset/yelp_academic_dataset_tip.json'
tip = pd.read_json(tip_json_path, lines=True, dtype={'text':str,'date':str,'compliment_count':int,'business_id':str,'user_id': int},chunksize=size)

chunk_list_tip = []
for chunk in tip:
    chunk = chunk.rename(columns={'text': 'tip'})
    chunk_merged = pd.merge(chunk, business[['business_id']], on='business_id', how='inner')
    chunk_merged = pd.merge(chunk_merged, users[['user_id']], on='user_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {size:,} related tips")
    chunk_list_tip.append(chunk_merged)

tip_df = pd.concat(chunk_list_tip, ignore_index=True, join='outer', axis=0)

8 out of 500,000 related tips
2 out of 500,000 related tips


In [21]:
review_df = review_df[review_df['review'].str.len()>=30]
review_dfs = pd.merge(tip_df, review_df, on=['user_id','business_id','date'], how='outer')
review_dfs = review_dfs.rename(columns={'tip': 'short_tip','date':'review_date','review_stars':'stars','review':'detailed_review','compliment_count':'likes'})
review_dfs['review_date'] =  pd.to_datetime(review_dfs['review_date']).dt.date
# review_dfs[['likes','stars','useful','funny','cool','detailed_review']] = review_dfs[['likes','stars','useful','funny','cool', 'detailed_review']]
#review_dfs= review_dfs.where(review_dfs.notnull(),None)

In [22]:
# Add globally unique review_id
review_dfs['review_id'] = [i + 100000 for i in range(len(review_dfs))]

# Select columns
review = review_dfs[['review_id','review_date','business_id','short_tip','likes','detailed_review','stars','useful','funny','cool']]

In [23]:
review.head()

Unnamed: 0,review_id,review_date,business_id,short_tip,likes,detailed_review,stars,useful,funny,cool
0,100000,2013-05-13,inyckJCTAiQ8ro5ShDi6OQ,"What's not to like....fresh produce, vegetable...",0.0,,,,,
1,100001,2013-06-11,BFPxxguGxBZQ0kR0rWePTQ,Good as any.....,0.0,,,,,
2,100002,2015-09-03,xuFjcrdGxISZVfLVl0mttA,Starter menu trying out a few key entrees and ...,0.0,,,,,
3,100003,2015-08-24,xuFjcrdGxISZVfLVl0mttA,If you need an accessible table I suggest call...,0.0,,,,,
4,100004,2014-02-08,MN5A-cUnGnffkN2wf6Y6MQ,I have an addiction to their crab rangoons. No...,0.0,,,,,


In [24]:
review.to_pickle('data/Review_of_Business.pickle')

##### Users_write_Review

In [25]:
users_write_review = review_dfs[['user_id', 'review_id']]

In [26]:
users_write_review.head()

Unnamed: 0,user_id,review_id
0,RKULSOrIvvYpDmtuYXEXzA,100000
1,RKULSOrIvvYpDmtuYXEXzA,100001
2,Kgb1KdaTrRnGqQ4misL12w,100002
3,96f2e36vpRvMBEZ92xiwIQ,100003
4,kkNFDL_bfM4BP65UEaDs_w,100004


In [27]:
users_write_review.to_pickle('data/Users_write_Review.pickle')

#### Photo_contained_Business

In [28]:
photo_json_path = '../data/yelp_photos/photos.json'
photo = pd.read_json(photo_json_path, lines=True, dtype={'photo_id':str,'business_id':str,'caption':str,'label':str})                       
photo = pd.merge(photo, business[['business_id']], on=['business_id'], how='inner')

In [29]:
photo.head()

Unnamed: 0,photo_id,business_id,caption,label
0,YytrZGwGLAscS-270DUU_w,tLMWVzUBGjklGIyEQQLxXQ,,outside
1,26WQjQQO6dpXQh__MQfzfA,tLMWVzUBGjklGIyEQQLxXQ,,inside
2,lPDxpMwbSldbX5atTBNUng,tLMWVzUBGjklGIyEQQLxXQ,Aja channelside in downtown Tampa,inside
3,AFo1Dt0NUc2MBcdBSahYSA,tLMWVzUBGjklGIyEQQLxXQ,Upstairs VIP,inside
4,L96SgH_HdVQe0y3XrdmcGg,OLGqB9dRca8Vib7lMdtC8A,NYE,drink


In [30]:
photo.to_pickle("data/Photo_contained_Business.pickle")

### 2.2 Relationship

#### Users_favorite_Business

Assume 60% of users don't have collection, and 40% of the active users have min(1,  𝑁(3, 5)) business favorated that they have reviewed, which happened at a random date from after signup date till the globally maximum signup date.

In [31]:
user_business = users[['user_id']].sample(frac=0.4).reset_index(drop=True)

# Generate local id for each user_id
user_business['n_bz_follow'] = pd.Series(np.around(np.random.normal(3, 5, len(user_business)))).clip(lower=1).astype('int')
user_business['business_id'] = user_business['n_bz_follow'].apply(lambda n: business.sample(n).business_id.tolist())
user_business = user_business.explode(["business_id"])
user_business = user_business[['user_id', 'business_id']]

In [32]:
user_business.head(5)

Unnamed: 0,user_id,business_id
0,SyOG4eHZK3wv3MnpiZVX9w,HCUsFoHYsMc_-qvIVA19IA
1,K-B9Ir8e0B-aro8p1VYDqw,Kt6aKSP97edaiSs-hEZzNw
1,K-B9Ir8e0B-aro8p1VYDqw,u48EHQFGHF5FwDTqAFXVwQ
1,K-B9Ir8e0B-aro8p1VYDqw,79sRRGDXhjRvxvBkQ4W9CQ
1,K-B9Ir8e0B-aro8p1VYDqw,aq5Y8xr0pwrwRI9yL4KgoA


In [33]:
user_business.to_pickle("data/Users_favorite_Business.pickle")

#### Users_follow_Collection

Assume 80% of users don't follow any collection, and 20% of the active users follow max(1, 𝑁(2, 5)) collections (include theirselves)>

In [34]:
users_collection = pd.DataFrame()
users_collection['fan_user_id'] = users[['user_id']]

# Random some collections for user to follow
users_collection['n_collection_follow'] = np.round(np.maximum(1, np.random.normal(2, 5, len(users_collection)))).astype('int')
def get_collection(fan):
    df = collection.sample(fan['n_collection_follow'])
    fan['followee_user_id'] = df.user_id.tolist()
    fan['collection_id'] = df.collection_id.tolist()
    return fan
users_collection = users_collection.apply(get_collection, axis=1).explode(["followee_user_id", "collection_id"])
users_collection = users_collection[['fan_user_id','followee_user_id','collection_id']]

In [35]:
collection.head()

Unnamed: 0,user_id,collection_id,created_time
0,k6AwCajLT06J6cwC3SqFcg,1,2020-10-07
1,73oVGO52bkKmjgx4Nwat4Q,1,2021-01-11
2,73oVGO52bkKmjgx4Nwat4Q,2,2021-01-11
3,73oVGO52bkKmjgx4Nwat4Q,3,2021-01-11
4,73oVGO52bkKmjgx4Nwat4Q,4,2021-01-11


In [36]:
users_collection.to_pickle('data/Users_follow_Collection.pickle')

#### Collection_contain_Business

Assume every collections collects max(1, 𝑁(3, 5)) business.

In [37]:
collection_business = pd.DataFrame()
collection_business['collection_owner_id'] = collection['user_id']
collection_business['collection_id'] = collection['collection_id']
collection_business['n_business_contain'] = np.round(np.maximum(1, np.random.normal(3, 5, len(collection_business)))).astype('int')
collection_business['business_id'] = collection_business['n_business_contain'].apply(lambda n: business.sample(n).business_id.tolist())
collection_business = collection_business.explode('business_id')
collection_business = collection_business[['collection_owner_id','collection_id', 'business_id']]

In [38]:
collection_business.head()

Unnamed: 0,collection_owner_id,collection_id,business_id
0,k6AwCajLT06J6cwC3SqFcg,1,kS-UWuhV8kxAdMN3RW0Ssw
0,k6AwCajLT06J6cwC3SqFcg,1,MSnWYdS0w5m9JLcr1wHO4w
0,k6AwCajLT06J6cwC3SqFcg,1,tLMWVzUBGjklGIyEQQLxXQ
0,k6AwCajLT06J6cwC3SqFcg,1,9TvKg94l4PA2xuViDlvOuA
0,k6AwCajLT06J6cwC3SqFcg,1,BXPyrf12pvtP6fXNvYZUeg


In [39]:
collection_business.to_pickle('data/Collection_contain_Business.pickle')

#### Users_follow_Users

Assume 70% of users don't follow any users, and 30% of the active users have min(0,  $N(3, 5)$) fans, and follow_since happened at a random date from after signup date till the globally maximum signup date.

In [40]:
users_follow= pd.DataFrame()
users_follow = users[['user_id']].sample(frac=0.3).reset_index(drop=True) # only 30%users have fans
users_follow = users_follow.rename(columns={'user_id': 'fan_user_id'})

# Generate follower
users_follow['n_followers'] = np.round(np.maximum(1, np.random.normal(3, 5, len(users_follow)))).astype('int')
def get_fan(fan):
    df = users.sample(fan['n_followers'])
    fan['followee_user_id'] = df.user_id.tolist()
    #fan['collection_id'] = df.collection_id.tolist()
    return fan
users_follow = users_follow.apply(get_fan, axis=1).explode(["followee_user_id"])

# Generate follow date
def random_dates(start, end=max(users.yealping_since), n=10):
    random.seed(4111)
    d = random.randint(0, (end - start).days)
    return start + pd.DateOffset(days=d)
users_follow['follow_since'] = users.apply(lambda x: random_dates(x['yealping_since']), axis=1)

users_follow = users_follow[['fan_user_id','followee_user_id','follow_since']]

In [41]:
users_follow.head()

Unnamed: 0,fan_user_id,followee_user_id,follow_since
0,7YSylmBoZxHTWF3j-2a4zA,OiqOaU31KVRB7RfyvZeM1w,2020-04-10
1,uOkJt5kTu7wafarASZRvqQ,ixh7QWfylnpeFQ1rGG-8eg,2019-12-02
1,uOkJt5kTu7wafarASZRvqQ,og-NLmnYAAr1CC1iYDIuog,2019-12-02
1,uOkJt5kTu7wafarASZRvqQ,hdrfHfM-wd9P5VCmMnJv1A,2019-12-02
1,uOkJt5kTu7wafarASZRvqQ,RdLTUqd662yMT1IcCG5sQw,2019-12-02


In [42]:
users_follow.to_pickle("data/Users_follow_Users.pickle")