In [1]:
import json
import os
import random

import pandas as pd
import psycopg

In [2]:
# Load the IPython extension named `sql`; presumably this is what provides the
# `%sql` magic used in the following cell -- TODO confirm which package supplies it.
%load_ext sql

In [3]:
# Hand the `%sql` magic a PostgreSQL URL: user `michellelin`, host `localhost`,
# database `yelp` (no password appears in the URL).
# NOTE(review): the URL, including a personal username, is hardcoded -- consider
# reading it from an environment variable so the notebook runs on other machines.
%sql postgresql://michellelin@localhost/yelp

# Sample data from the Yelp JSON files

In [4]:
def reservoir_sampling(filename, k=1000, seed=None):
    """
    Draw a uniform random sample of up to `k` records from a line-delimited
    JSON file, streaming it line by line (reservoir sampling, Algorithm R).

    Parameters
    ----------
    filename : str or path-like
        Path to a text file holding one JSON document per line.
    k : int, default 1000
        Maximum number of records to return.
    seed : int, optional
        When given, sampling uses a private `random.Random(seed)` so the
        result is reproducible. When omitted, the module-level `random`
        generator is used, exactly as before.

    Returns
    -------
    list
        The parsed records. If the file holds at most `k` records, all of
        them are returned in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    rng = random.Random(seed) if seed is not None else random
    reservoir = []
    seen = 0  # records read so far; blank lines are not counted
    # JSON text is UTF-8, so do not depend on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            if seen < k:
                # Fill phase: the first k records are always kept.
                reservoir.append(record)
            else:
                # randint is inclusive on both ends, so j < k happens with
                # probability k / (seen + 1); the record then evicts slot j.
                j = rng.randint(0, seen)
                if j < k:
                    reservoir[j] = record
            seen += 1
    return reservoir


# Configuration for the sampling pipeline
DATA_DIR = '../data/yelp_dataset'
OUTPUT_DIR = '../sampled_data'
SEED = 42  # fixed so that Restart & Run All draws the same business sample


def _filter_json_lines(path, keep):
    """
    Stream a line-delimited JSON file and return, in file order, the parsed
    records for which `keep(record)` is truthy.
    """
    kept = []
    # JSON text is UTF-8, so do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            if keep(record):
                kept.append(record)
    return kept


# Every to_csv() below fails if the output folder does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Step 1: Sample `business.json`
# Seed the module-level generator that reservoir_sampling draws from by default.
random.seed(SEED)
sampled_business = reservoir_sampling(f'{DATA_DIR}/yelp_academic_dataset_business.json')
df_business = pd.DataFrame(sampled_business)
sampled_business_ids = set(df_business['business_id'])
df_business.to_csv(f'{OUTPUT_DIR}/business_sample.csv', index=False)

# Step 2: Filter `reviews.json` by `business_id`
filtered_reviews = _filter_json_lines(
    f'{DATA_DIR}/yelp_academic_dataset_review.json',
    lambda review: review['business_id'] in sampled_business_ids,
)
# Authors of the kept reviews; used to filter users and tips below.
user_ids = {review['user_id'] for review in filtered_reviews}

df_reviews = pd.DataFrame(filtered_reviews)
df_reviews.to_csv(f'{OUTPUT_DIR}/reviews_sample.csv', index=False)

# Step 3: Filter `user.json` by `user_id`
filtered_users = _filter_json_lines(
    f'{DATA_DIR}/yelp_academic_dataset_user.json',
    lambda user: user['user_id'] in user_ids,
)

df_users = pd.DataFrame(filtered_users)
df_users.to_csv(f'{OUTPUT_DIR}/users_sample.csv', index=False)

# Step 4: Filter `tips.json` by `business_id` and `user_id`
# A tip is kept only when both its business and its author are already in the sample.
filtered_tips = _filter_json_lines(
    f'{DATA_DIR}/yelp_academic_dataset_tip.json',
    lambda tip: tip['business_id'] in sampled_business_ids and tip['user_id'] in user_ids,
)

df_tips = pd.DataFrame(filtered_tips)
df_tips.to_csv(f'{OUTPUT_DIR}/tips_sample.csv', index=False)

print("Sampling and filtering complete.")


Sampling and filtering complete.


## EDA

In [5]:
# Print the per-column dtypes of the business sample, then show its
# (rows, columns) shape as the cell's value.
print('df_business schema: {}'.format(df_business.dtypes))
df_business.shape

df_business schema: business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object


(1000, 14)

In [6]:
# Preview the first five sampled businesses (five is the `head` default).
df_business.head(n=5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,yFrOuce72KhvW0LpU5FY6A,Earnest Bar & Hideaway,"438 Houston St, Ste 160",Nashville,TN,37203,36.143036,-86.767485,4.0,265,1,"{'WheelchairAccessible': 'True', 'HasTV': 'Tru...","American (New), American (Traditional), Nightl...","{'Monday': '0:0-0:0', 'Tuesday': '17:0-22:0', ..."
1,WfGXVT-WOgDymYv_kL28GQ,Vanessa Cafe Restaraunt Pizzeria,3815 Church Rd,Mount Laurel Township,NJ,8054,39.928964,-74.97033,5.0,15,1,"{'RestaurantsReservations': 'True', 'BusinessA...","Pizza, Italian, Restaurants, Sicilian","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
2,otm8R7rkCtCNM0g0TC55-Q,The Gables Cafe,4600 Woodland Ave,Philadelphia,PA,19143,39.944194,-75.209709,5.0,14,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Food, Coffee & Tea, Restaurants, Italian, Dess...",
3,3xMtsSQ42lBpkP-R3726lw,Pinky's Nails,63 Boone Vlg,Zionsville,IN,46077,39.951867,-86.278108,3.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Nail Salons, Beauty & Spas","{'Monday': '9:30-19:30', 'Tuesday': '9:30-19:3..."
4,ne1EabaPSD9WjIpWZ26JMQ,Reno Tahoe Window Cleaning,"59 Damonte Ranch Pkwy, Ste B-187",Reno,NV,89521,39.420298,-119.756945,4.5,94,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Pest Control, Local Services, Home Services, W...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-17:0', '..."


In [7]:
# Print the per-column dtypes of the filtered reviews, then show the
# (rows, columns) shape as the cell's value.
print('df_reviews schema: {}'.format(df_reviews.dtypes))
df_reviews.shape

df_reviews schema: review_id       object
user_id         object
business_id     object
stars          float64
useful           int64
funny            int64
cool             int64
text            object
date            object
dtype: object


(44650, 9)

In [8]:
# Print the per-column dtypes of the filtered users, then show the
# (rows, columns) shape as the cell's value.
print('df_users schema: {}'.format(df_users.dtypes))
df_users.shape

df_users schema: user_id                object
name                   object
review_count            int64
yelping_since          object
useful                  int64
funny                   int64
cool                    int64
elite                  object
friends                object
fans                    int64
average_stars         float64
compliment_hot          int64
compliment_more         int64
compliment_profile      int64
compliment_cute         int64
compliment_list         int64
compliment_note         int64
compliment_plain        int64
compliment_cool         int64
compliment_funny        int64
compliment_writer       int64
compliment_photos       int64
dtype: object


(38921, 22)

In [9]:
# Print the per-column dtypes of the filtered tips, then show the
# (rows, columns) shape as the cell's value.
print('df_tips schema: {}'.format(df_tips.dtypes))
df_tips.shape

df_tips schema: user_id             object
business_id         object
text                object
date                object
compliment_count     int64
dtype: object


(3042, 5)