In [2]:
import pandas as pd 
import json 
from collections import defaultdict

### Extract User+Comment

In [None]:
# Peek at the first five review records to inspect the schema before processing.
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding (the user.json cell further down does the same).
with open('Yelp/yelp_dataset/reviews.json', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        # Each line of the file is parsed as one standalone JSON object.
        data = json.loads(line)
        print(data)

{'review_id': 'KU_O5udG6zpxOg-VcAEodg', 'user_id': 'mh_-eMZ6K5RLWhZyISBhwA', 'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw', 'stars': 3.0, 'useful': 0, 'funny': 0, 'cool': 0, 'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'date': '2018-07-07 22:09:11'}
{'review_id': 'BiTunyQ73aT9WBnpR9DZGw', 'user_id': 'OyoGAe7OKpv6SyGZT5g77Q', 'business_id': '7ATYjTIgM3jUlt4UM3IypQ', 'stars': 5.0, 'useful': 1, 'funny': 0, 'cool': 1, 'text': "I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycl

In [2]:
import json 
from collections import defaultdict

# 1. Stream reviews.json (one JSON object per line) and group the reviews by user_id.
#    The file is opened explicitly as UTF-8 so decoding does not depend on the
#    platform's default encoding.
review_fields = ('business_id', 'review_id', 'stars', 'useful', 'funny', 'cool', 'date', 'text')
user_reviews = defaultdict(list)

with open('Yelp/yelp_dataset/reviews.json', 'r', encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)
        # Keep only the fields needed downstream, in a fixed key order.
        user_reviews[review['user_id']].append({key: review[key] for key in review_fields})

# 2. Keep only users with >= 5 reviews.
qualified_users = []
for user_id, reviews in user_reviews.items():
    if len(reviews) < 5:
        continue
    # Newest first, then keep the 5 most recent reviews. Dates are compared as
    # strings; the sample record printed above uses 'YYYY-MM-DD HH:MM:SS', which
    # sorts chronologically.
    sorted_reviews = sorted(reviews, key=lambda x: x['date'], reverse=True)[:5]
    qualified_users.append({
        'user_id': user_id,
        'reviews': sorted_reviews,
        # Parallel to 'reviews': business_ids[i] belongs to reviews[i].
        'business_ids': [r['business_id'] for r in sorted_reviews]
    })

# 3. Save the result.
with open('user_reviews_filtered.json', 'w', encoding='utf-8') as f:
    json.dump(qualified_users, f, indent=2)

print(f"Found {len(qualified_users)} users with >= 5 reviews")


Found 287116 users with >= 5 reviews


### Business Information Extraction

In [11]:
# Load the business table; business.json is read as JSON Lines (one record per line).
business_data = pd.read_json(
    path_or_buf='Yelp/yelp_dataset/business.json',
    lines=True,
)
# Last expression of the cell: shows (rows, columns).
business_data.shape

(150346, 14)

In [12]:
# Preview the first three business rows.
business_data.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


In [6]:
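# Disabled alternative: reload the filtered users from disk as a DataFrame.
# The next cell works from the in-memory `qualified_users` list instead.
# If re-enabled, use a different variable name so the `user_reviews` dict built
# in the review-extraction cell is not overwritten.
#user_reviews = pd.read_json('user_reviews_filtered.json')
#user_reviews.shape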

In [10]:
# Collect every business that appears in the selected reviews of the qualified users.
all_business_ids = {
    bid
    for user in qualified_users
    for bid in user['business_ids']
}

print(f"Total unique business IDs: {len(all_business_ids)}")

# Save as JSON. Iteration order of a set of strings can differ between
# interpreter runs, so the ids are sorted to keep the file identical across runs.
with open('qualified_business_ids.json', 'w') as f:
    json.dump(sorted(all_business_ids), f)

Total unique business IDs: 133560


In [16]:
import random

# Dedicated, seeded generator so the sampled reviews are the same every time this
# cell runs, independent of the global `random` state.
SAMPLE_SEED = 42
sampler = random.Random(SAMPLE_SEED)

# 2. Build the business_id -> review texts mapping from the qualified users.
#    'business_ids' and 'reviews' are parallel lists, so they are walked together.
business_reviews = defaultdict(list)
for user in qualified_users:
    for bid, review in zip(user['business_ids'], user['reviews']):
        business_reviews[bid].append(review['text'])

# 3. Pull the matching rows out of business_data.
business_info_list = []

for _, row in business_data.iterrows():
    bid = row['business_id']
    if bid not in all_business_ids:
        continue

    # Sample up to 2 reviews (all of them when fewer than 2 are available).
    # .get() is used so a missing id does not insert an empty list into the defaultdict.
    reviews = business_reviews.get(bid, [])
    sample_reviews = sampler.sample(reviews, min(2, len(reviews)))

    business_info_list.append({
        'business_id': bid,
        'name': row['name'],
        'city': row['city'],
        'state': row['state'],
        'stars': row['stars'],
        'attributes': row['attributes'],
        'categories': row['categories'],
        'sample_reviews': sample_reviews
    })

print(f"Matched {len(business_info_list)} businesses")

# 4. Save as JSON. ensure_ascii=False writes non-ASCII characters verbatim, so the
#    file must be opened as UTF-8 explicitly; otherwise the platform default
#    encoding is used and may be unable to encode the review text.
with open('business_with_reviews.json', 'w', encoding='utf-8') as f:
    json.dump(business_info_list, f, indent=2, ensure_ascii=False)

print("Saved to business_with_reviews.json")


Matched 133560 businesses
Saved to business_with_reviews.json


### User Information Extraction

In [None]:
# Load the full user table; user.json is read as JSON Lines (one record per line).
# NOTE(review): `user_data` is not referenced by any later cell visible here, which
# stream the file line by line instead — confirm whether this load is still needed.
user_data = pd.read_json(
    path_or_buf='Yelp/yelp_dataset/user.json',
    lines=True,
)
# Last expression of the cell: shows (rows, columns).
user_data.shape

In [3]:
# Reload the filtered users written by the review-extraction step.
with open('user_reviews_filtered.json', 'r') as src:
    qualified_users = json.load(src)

# One user_id per qualified user, in the same order as the file.
all_user_ids = list(map(lambda entry: entry['user_id'], qualified_users))

print(f"Total qualified user IDs: {len(all_user_ids)}")

# Save the id list as JSON.
with open('qualified_user_ids.json', 'w') as dst:
    json.dump(all_user_ids, dst)

print("Saved to qualified_user_ids.json")

Total qualified user IDs: 287116
Saved to qualified_user_ids.json


In [14]:
# NOTE: the extraction cell below rebinds `user_profiles` to a plain dict before use.
user_profiles = defaultdict(list)

# Set copy of the qualified ids, used for the membership check on each user record.
all_user_ids_set = {*all_user_ids}

# 1-based, inclusive range of user.json lines handled by the extraction cell.
START_LINE, END_LINE = 1, 2_000_000

In [7]:
import pickle
from datetime import datetime

In [15]:
def _count_friends(friends):
    """Return how many friends a user record lists.

    Accepts either a list of ids or a comma-separated string of ids.
    NOTE(review): the Yelp dump appears to store `friends` as a single
    comma-separated string (with the literal 'None' for no friends), in which
    case a plain len() would count characters rather than friends — confirm
    against the actual user.json records.
    """
    if friends is None:
        return 0
    if isinstance(friends, str):
        ids = (part.strip() for part in friends.split(','))
        return sum(1 for fid in ids if fid and fid != 'None')
    return len(friends)


# Fields copied verbatim from each user record. They are split around 'friend_num'
# so the key order of every saved profile stays the same as before.
_leading_fields = ('name', 'review_count', 'yelping_since')
_trailing_fields = (
    'useful', 'funny', 'cool', 'fans', 'average_stars',
    'compliment_hot', 'compliment_more', 'compliment_profile',
    'compliment_cute', 'compliment_list', 'compliment_note',
    'compliment_plain', 'compliment_cool', 'compliment_funny',
    'compliment_writer',
)

user_profiles = {}
processed = 0   # lines inside [START_LINE, END_LINE] that were parsed
matched = 0     # parsed lines whose user_id is one of the qualified users

print(f"开始处理: 第 {START_LINE:,} 行 到 第 {END_LINE:,} 行")
print(f"开始时间: {datetime.now()}")
print("=" * 70)

with open('Yelp/yelp_dataset/user.json', 'r', encoding='utf-8') as f:
    for current_line, line in enumerate(f, start=1):
        # Skip lines before the requested window.
        if current_line < START_LINE:
            continue

        # Stop once the window has been passed.
        if current_line > END_LINE:
            break

        # Each line is one JSON user record.
        user = json.loads(line)
        processed += 1

        # Keep the profile only for users selected in the review step.
        if user['user_id'] in all_user_ids_set:
            matched += 1
            profile = {key: user[key] for key in _leading_fields}
            profile['friend_num'] = _count_friends(user['friends'])
            profile.update((key, user[key]) for key in _trailing_fields)
            user_profiles[user['user_id']] = profile

        # Report progress every 50,000 processed lines.
        if processed % 50000 == 0:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] "
                  f"已处理: {processed:,} 行, "
                  f"匹配: {matched:,} 个用户")

output_file = 'user_profiles.json'

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(user_profiles, f, ensure_ascii=False, indent=2)

print("=" * 70)
print(f"完成时间: {datetime.now()}")
# Report the number of lines processed (previously this printed the match count).
print(f"总共处理: {processed:,} 行")
print(f"匹配到: {matched:,} 个用户")

开始处理: 第 1 行 到 第 2,000,000 行
开始时间: 2025-11-16 14:29:29.662226
[14:29:30] 已处理: 50,000 行, 匹配: 31,849 个用户
[14:29:30] 已处理: 100,000 行, 匹配: 59,188 个用户
[14:29:30] 已处理: 150,000 行, 匹配: 83,603 个用户
[14:29:30] 已处理: 200,000 行, 匹配: 102,576 个用户
[14:29:31] 已处理: 250,000 行, 匹配: 124,638 个用户
[14:29:31] 已处理: 300,000 行, 匹配: 144,202 个用户
[14:29:31] 已处理: 350,000 行, 匹配: 159,065 个用户
[14:29:31] 已处理: 400,000 行, 匹配: 171,617 个用户
[14:29:32] 已处理: 450,000 行, 匹配: 187,511 个用户
[14:29:32] 已处理: 500,000 行, 匹配: 200,755 个用户
[14:29:32] 已处理: 550,000 行, 匹配: 211,791 个用户
[14:29:32] 已处理: 600,000 行, 匹配: 221,341 个用户
[14:29:33] 已处理: 650,000 行, 匹配: 231,513 个用户
[14:29:33] 已处理: 700,000 行, 匹配: 240,354 个用户
[14:29:33] 已处理: 750,000 行, 匹配: 246,782 个用户
[14:29:33] 已处理: 800,000 行, 匹配: 252,753 个用户
[14:29:34] 已处理: 850,000 行, 匹配: 258,721 个用户
[14:29:34] 已处理: 900,000 行, 匹配: 264,136 个用户
[14:29:34] 已处理: 950,000 行, 匹配: 267,860 个用户
[14:29:34] 已处理: 1,000,000 行, 匹配: 271,488 个用户
[14:29:34] 已处理: 1,050,000 行, 匹配: 274,579 个用户
[14:29:35] 已处理: 1,100,000 行, 匹配: 277