In [32]:
import random 
import json
import pandas as pd

In [33]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Sample business data from JSON file

In [18]:
def reservoir_sampling(filename, k=1000):
    """
    Perform reservoir sampling from a large JSON file (line-by-line streaming)
    Args:
    - filename (str): Path to the JSON file
    - k (int): The size of the sample (number of rows to sample)
    
    Returns:
    - List of k sampled rows
    """
    reservoir = []  # The reservoir will store k items
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            record = json.loads(line)  # Parse JSON line
            if i < k:
                reservoir.append(record)  # Fill the reservoir initially
            else:
                j = random.randint(0, i)  # Randomly select an index from 0 to i
                if j < k:
                    reservoir[j] = record  # Replace one of the items in the reservoir
    
    return reservoir


# Step 1: Sample 100000 of the data from business.json
sampled_data = reservoir_sampling('../data/yelp_dataset/yelp_academic_dataset_business.json', k=100000) 

# Step 2: Convert the sample to a DataFrame
df_sample = pd.DataFrame(sampled_data)

# Step 3: Save the sampled data as CSV
df_sample.to_csv('../data/business_sample.csv', index=False)

print("Sample saved as business_sample.csv")

Sample saved as business_sample.csv


In [20]:
import psycopg

conn = psycopg.connect("postgresql://localhost/yelp")
cursor = conn.cursor()

create_table_query = """
CREATE TABLE IF NOT EXISTS business (
    business_id VARCHAR PRIMARY KEY,
    name VARCHAR,
    address VARCHAR,
    city VARCHAR,
    state VARCHAR(2),
    postal_code VARCHAR(10),
    latitude NUMERIC,
    longitude NUMERIC,
    stars NUMERIC,
    review_count INTEGER
);
"""

cursor.execute(create_table_query)
conn.commit()
print("Table created successfully.")

cursor.close()
conn.close()


Table created successfully.


In [21]:
%sql postgresql://michellelin@localhost/yelp

In [23]:
import psycopg
import pandas as pd

# Step 1: Read the sampled CSV file
df_sample = pd.read_csv('../data/business_sample.csv')

# Step 2: Connect to the PostgreSQL database
conn = psycopg.connect("postgresql://michellelin@localhost/yelp")
cursor = conn.cursor()

# Step 3: Insert each row of the DataFrame into the 'business' table
for index, row in df_sample.iterrows():
    business_id = row['business_id']
    name = row['name']
    address = row['address']
    city = row['city']
    state = row['state']
    postal_code = row['postal_code']
    latitude = row['latitude']
    longitude = row['longitude']
    stars = row['stars']
    review_count = row['review_count']
    
    # SQL Insert Query with ON CONFLICT to avoid duplicate errors
    insert_query = """
    INSERT INTO business (business_id, name, address, city, state, postal_code, latitude, longitude, stars, review_count) 
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (business_id) DO NOTHING;
    """
    
    try:
        cursor.execute(insert_query, (business_id, name, address, city, state, postal_code, latitude, longitude, stars, review_count))
    except Exception as e:
        print(f"Error inserting row {index}: {e}")

# Step 4: Commit the transaction to make the changes permanent
conn.commit()
print("Data inserted successfully.")

# Step 5: Close the cursor and connection
cursor.close()
conn.close()


Data inserted successfully.


In [24]:
%%sql
SELECT COUNT(*) FROM business;

count
100635


In [27]:
%%sql
SELECT * FROM business LIMIT 5;

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count
ij01QEUh2uK3W27dN4knwQ,Dutch Hollow Medical Day Spa,3042 Godfrey Rd,Godfrey,IL,62035,38.930379,-90.19284,4.0,15
OXZU-xctypqqAaCQQ9Eiqg,Great Clips,3188 Telegraph Rd,Saint Louis,MO,63125,38.4973695,-90.2984265,2.0,5
D9sMUiSSzR0Q1WrXvlZ20Q,The Gravity Forge,2920 Turnpike Dr,Hatboro,PA,19040,40.1624444027,-75.1068388,2.5,9
RsGp61dhtKNonHqaUQQniw,Somerdale Cold Cuts,501 Warwick Rd,Somerdale,NJ,8083,39.850103,-75.032732,4.5,26
5eHsFwEG9UkVNaMpcbZjEg,Ms. Nancy's Place,177 S Ctr St,Merchantville,NJ,8109,39.9476273,-75.0480838,4.0,44


### Filter reviews.json by business_id

In [30]:
def reservoir_sampling(filename, k=10000):
    """
    Perform reservoir sampling from a large JSON file (line-by-line streaming)
    """
    reservoir = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            record = json.loads(line)
            if i < k:
                reservoir.append(record)
            else:
                j = random.randint(0, i)
                if j < k:
                    reservoir[j] = record
    return reservoir


# Step 1: Sample `business.json`
sampled_business = reservoir_sampling('../data/yelp_dataset/yelp_academic_dataset_business.json', k=1000)
df_business = pd.DataFrame(sampled_business)
sampled_business_ids = set(df_business['business_id'])
df_business.to_csv('../sampled_data/business_sample.csv', index=False)

# Step 2: Filter `reviews.json` by `business_id`
filtered_reviews = []
user_ids = set()

with open('../data/yelp_dataset/yelp_academic_dataset_review.json', 'r') as f:
    for line in f:
        review = json.loads(line)
        if review['business_id'] in sampled_business_ids:
            filtered_reviews.append(review)
            user_ids.add(review['user_id'])

df_reviews = pd.DataFrame(filtered_reviews)
df_reviews.to_csv('../sampled_data/reviews_sample.csv', index=False)

# Step 3: Filter `user.json` by `user_id`
filtered_users = []

with open('../data/yelp_dataset/yelp_academic_dataset_user.json', 'r') as f:
    for line in f:
        user = json.loads(line)
        if user['user_id'] in user_ids:
            filtered_users.append(user)

df_users = pd.DataFrame(filtered_users)
df_users.to_csv('../sampled_data/users_sample.csv', index=False)

# Step 4: Filter `tips.json` by `business_id` and `user_id`
filtered_tips = []

with open('../data/yelp_dataset/yelp_academic_dataset_tip.json', 'r') as f:
    for line in f:
        tip = json.loads(line)
        if tip['business_id'] in sampled_business_ids and tip['user_id'] in user_ids:
            filtered_tips.append(tip)

df_tips = pd.DataFrame(filtered_tips)
df_tips.to_csv('../sampled_data/tips_sample.csv', index=False)

# Step 5: Filter `checkin.json` by `business_id`
filtered_checkins = []

with open('../data/yelp_dataset/yelp_academic_dataset_checkin.json', 'r') as f:
    for line in f:
        checkin = json.loads(line)
        if checkin['business_id'] in sampled_business_ids:
            filtered_checkins.append(checkin)

df_checkins = pd.DataFrame(filtered_checkins)
df_checkins.to_csv('../sampled_data/checkins_sample.csv', index=False)

print("Sampling and filtering complete.")


Sampling and filtering complete.


## EDA

In [40]:
print(f'df_business schema: {df_business.dtypes}')
df_business.shape

df_business schema: business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object


(1000, 14)

In [45]:
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,UVfYRQIr6u_WrFSY5Cv7xw,InCycle,736 Hanover Pl,Carmel,IN,46032,39.969726,-86.127815,5.0,32,1,"{'GoodForKids': 'False', 'BusinessAcceptsCredi...","Active Life, Cycling Classes, Gyms, Fitness & ...","{'Monday': '16:0-20:0', 'Tuesday': '16:0-20:0'..."
1,wH6QxQv31IJ-qQ-FKIc5TA,Roman Plumbing,6125 Grand Blvd,New Port Richey,FL,34652,28.247926,-82.719939,4.5,29,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Water Heater Installation/Repair, Co...","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
2,K1NdCTaJWK1ezkZpPyB7-A,Desert Rain Coffee,1551 E Tangerine Rd,Oro Valley,AZ,85755,32.429167,-110.948631,5.0,46,0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Coffee & Tea, Food","{'Monday': '7:0-15:0', 'Tuesday': '7:0-15:0', ..."
3,juL8ovMlnjkXNACxZ8HLAQ,Sam Levitz Furniture,3750 W Orange Grove Rd,Tucson,AZ,85741,32.324899,-111.044485,2.0,186,1,"{'BusinessParking': '{'garage': False, 'street...","Shopping, Mattresses, Home & Garden, Furniture...","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'..."
4,2eN2pfPCear_ofmgQ0peCw,"David Caplan, CPA",301 Andorra Glen Ct,Lafayette Hill,PA,19444,40.088457,-75.267412,4.5,13,1,,"Professional Services, Accountants",


In [41]:
print(f'df_reviews schema: {df_reviews.dtypes}')
df_reviews.shape

df_reviews schema: review_id       object
user_id         object
business_id     object
stars          float64
useful           int64
funny            int64
cool             int64
text            object
date            object
dtype: object


(48743, 9)

In [42]:
print(f'df_users schema: {df_users.dtypes}')
df_users.shape

df_users schema: user_id                object
name                   object
review_count            int64
yelping_since          object
useful                  int64
funny                   int64
cool                    int64
elite                  object
friends                object
fans                    int64
average_stars         float64
compliment_hot          int64
compliment_more         int64
compliment_profile      int64
compliment_cute         int64
compliment_list         int64
compliment_note         int64
compliment_plain        int64
compliment_cool         int64
compliment_funny        int64
compliment_writer       int64
compliment_photos       int64
dtype: object


(42593, 22)

In [43]:
print(f'df_tips schema: {df_tips.dtypes}')
df_tips.shape

df_tips schema: user_id             object
business_id         object
text                object
date                object
compliment_count     int64
dtype: object


(3171, 5)

In [44]:
print(f'df_checkins schema: {df_checkins.dtypes}')
df_checkins.shape

df_checkins schema: business_id    object
date           object
dtype: object


(883, 2)