## Yelp inference (test-set) generation

In [2]:
YELPPATH = '/share/data/yelp'
RP = 'yelp_academic_dataset_review.json'
BUSINESS = 'yelp_academic_dataset_business.json'
rppath = f'{YELPPATH}/{RP}'
busipath = f'{YELPPATH}/{BUSINESS}'

In [3]:
import pandas as pd
import json

In [37]:
Bid2Info = {}
with open (busipath, 'r') as f:
    for line in f:
        line = json.loads(line)
        busi_id = line['business_id']
        category = line['categories']
        
        name = line['name']
        star = line['stars']
        Bid2Info[busi_id] = {'name':name, 
                             'category':category, 
                             'star':star, 
                             'review_ids':[], 
                             'business_id':busi_id}

In [29]:
tags = set()
from collections import defaultdict
counter = defaultdict(int)
NoCat = 0 
for ts in Bid2Info.values():
    try:
        ts = ts['category'].split(',')
    except:
        NoCat += 1
        continue
    for t in ts:
        t = t.strip()
        tags.add(t)
        counter[t]+=1
print(f'No category: {NoCat}')
# print(f'#tags: {len(tags)}') 

No category: 103


In [30]:
path = '../data/yelp_tags.csv'
tags = pd.DataFrame({'tag': list(tags)})
tags.to_csv(path, index = False)

In [7]:
scounter = sorted(counter.items(), key = lambda x:-x[1])
# ('Eyewear & Opticians', 1016)
scounter.index(('Eyewear & Opticians', 1016)) # 124
# scounter[:124]

124

In [31]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../data')
from yelp_tags import YelpFoodTags

RID_Dict = {}
FoodReviewIDs = set()
isCoveredReview = 0 
YelpFoodTags = set(YelpFoodTags)
KnownRestaurants = set()
with open (rppath, 'r') as f:
    for id, line in enumerate(f):
        line = json.loads(line)
        store = Bid2Info[line['business_id']]
        # 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services'
        flag = False
        if store['business_id'] in KnownRestaurants:
            isCoveredReview += 1 
            flag = True
        elif store['category']:
            for tag in store['category'].split(','):
                tag = tag.strip()
                if tag in YelpFoodTags:
                    isCoveredReview += 1
                    flag = True
                    KnownRestaurants.add(store['business_id'])
                break 
        if flag:
            FoodReviewIDs.add(line['review_id'])
            
        RID_Dict[line['review_id']] = line

In [13]:
MYSEED = 1024
import random 
random.seed(MYSEED)
print(f'Successfully seeding {MYSEED}')
len(FoodReviewIDs)

Successfully seeding 1024


44555

### Generate random/general inference data 
- 100 筆

In [14]:
n = 100
chosenIDs = random.sample(FoodReviewIDs, n)
chosenReviews = [RID_Dict[x] for x in chosenIDs]

In [19]:
import json 
with open('../data/yelp-food-test100.json', 'w') as f:
    json.dump(chosenReviews, f)

### Generate restaurant-specific data

In [44]:
## add review number attribute
for frid in FoodReviewIDs:
    review = RID_Dict[frid]
    bid = review['business_id']
    if bid == '_ab50qdWOk0DdB6XOrBitw':
        print('!')
    Bid2Info[bid]['review_ids'].append(frid)    
sbid = sorted(Bid2Info.items(), key = lambda x:-len(x[1]['review_ids']))

In [20]:
# randomly pick from these 2 resta

{'Mm6dl2ZANXk4vzzJ9Nztjg',
 'z4Xtxe6demG-4N5KkkSE1Q',
 'k9DdPdI1gF8K4tkIuvp02A',
 'F-DYslH7w4jfxfACNrRKNA',
 'f57iqQ8dnQLYq3bcrLD7Qg',
 '1JViNL_VNI8tBjHmtojEQw',
 '0W2KiL_Rgqd9GvIjFNpGEA',
 'U2kb6WvFHIdLWuXL5dLKPQ',
 'KW2RrtrPWyeBeYlyd2vQ4Q',
 'BK_k_IUsEHMagmHffdD-bA',
 'q09EustC0HN2KMP5-EkuKg',
 'ZEzC-_x0hc6bDC0aHymywQ',
 '95ObZeCtNILkMzAN2IShOg',
 'dSD4WmBMYqWY5vj8L4Rr_Q',
 'icRR4qIwgl9Z_zIYf-pi7Q',
 'q7Ws2ivuReuAXAd3FxJizw',
 'BCjaPNM_MkQYmr_8wcrDUg',
 '3xwr6ZAzr0G4lT1iXfoR9w',
 'NFHXnOKfJBYIXNqpxfTIXg',
 'OwWs4uIEa7Ku00PjCYJLKw',
 'yXmlElvKxV6FvAlOCKaVfA',
 'VIUwZgCQnwpbB2w2Xbr5CA',
 'HX5NSN8QVPSLKWVP4gtwNQ',
 'z9EMTyvEC97ctufcFqQrhQ',
 'GXWLELDC9zBlIve_zr8_yg',
 'hU9PXpBfS0r7znrM8PiipA',
 'AlcXocLsvTBNVNtOb7k5gw',
 'nIBJCJohyaFjcqwrjZkJ3A',
 'dCKYcIuhOhRR3655x4Eq5Q',
 'vKKKPhWcyGGp2fR78_7ZLg',
 'SscWp3LWat9g-OeMJ90idw',
 '1Qsgm63Bff1l3s7CWtWQCw',
 '_h6g05qFAtg5uvneDmHVpw',
 '6Kja66E10JI2zjl-G7zMSg',
 'Bus_xuP6j5BrRpam_mZsuQ',
 '87G2E8eqekaqs1YCJgBhfg',
 'zGir0CMCi2jL-_0BFQnIpg',
 

### Restaurant Covered Ratio
- Out of the 15K (unique business_id) stores(business) that yelp records, 

  about 7K (46%) are restaurants/food-related stores. 

In [48]:
restaurant = 0
for ts in Bid2Info.values():
    try:
        ts = ts['category'].split(',')
    except:
        NoCat += 1
        continue
    for t in ts:
        t = t.strip()
        if t in YelpFoodTags:
            restaurant += 1
            break 
restaurant/len(Bid2Info) 

0.4609633778085217

In [51]:
restaurant

69304

### Review Covered Ratio 
- Out of the 699K data, roughly 325K are commenting on restuarants/stores relevant to food. 
- The coverage is 46.6%. 

In [60]:
# review covered ratio: 
isCoveredReview/len(REVIEWS) 

0.4662050733303959

In [7]:
# !cp /share/home/yuxiang/yelp_data/mapping/tags.txt /share/home/nana2929/repo_en/data

In [62]:
isCoveredReview # inference 的時候把不是餐廳的丟掉

3258904

In [39]:
len(REVIEWS) 

6990280