In [2]:
import random 
import json
import pandas as pd

In [3]:
%load_ext sql

### Sample business data from JSON file

In [7]:
def reservoir_sampling(filename, k=1000):
    """
    Perform reservoir sampling from a large JSON file (line-by-line streaming)
    Args:
    - filename (str): Path to the JSON file
    - k (int): The size of the sample (number of rows to sample)
    
    Returns:
    - List of k sampled rows
    """
    reservoir = []  # The reservoir will store k items
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            record = json.loads(line)  # Parse JSON line
            if i < k:
                reservoir.append(record)  # Fill the reservoir initially
            else:
                j = random.randint(0, i)  # Randomly select an index from 0 to i
                if j < k:
                    reservoir[j] = record  # Replace one of the items in the reservoir
    
    return reservoir


# Step 1: Sample 1/1000 of the data from business.json
sampled_data = reservoir_sampling('../data/yelp_dataset/yelp_academic_dataset_business.json', k=1000)  # Take 1000 samples

# Step 2: Convert the sample to a DataFrame
df_sample = pd.DataFrame(sampled_data)

# Step 3: Save the sampled data as CSV
df_sample.to_csv('business_sample.csv', index=False)

print("Sample saved as business_sample.csv")

Sample saved as business_sample.csv


In [8]:
business = pd.read_csv('business_sample.csv')
business.columns


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [9]:
import psycopg

conn = psycopg.connect("postgresql://localhost/yelp")
cursor = conn.cursor()

create_table_query = """
CREATE TABLE IF NOT EXISTS business (
    business_id VARCHAR PRIMARY KEY,
    name VARCHAR,
    address VARCHAR,
    city VARCHAR,
    state VARCHAR(2),
    postal_code VARCHAR(10),
    latitude NUMERIC,
    longitude NUMERIC,
    stars NUMERIC,
    review_count INTEGER
);
"""

cursor.execute(create_table_query)
conn.commit()
print("Table created successfully.")

cursor.close()
conn.close()


Table created successfully.


In [None]:
%sql postgresql://michellelin@localhost/yelp

In [13]:
import psycopg
import pandas as pd

# Step 1: Read the sampled CSV file
df_sample = pd.read_csv('business_sample.csv')

# Step 2: Connect to the PostgreSQL database
conn = psycopg.connect("postgresql://michellelin@localhost/yelp")
cursor = conn.cursor()

# Step 3: Insert each row of the DataFrame into the 'business' table
for index, row in df_sample.iterrows():
    business_id = row['business_id']
    name = row['name']
    address = row['address']
    city = row['city']
    state = row['state']
    postal_code = row['postal_code']
    latitude = row['latitude']
    longitude = row['longitude']
    stars = row['stars']
    review_count = row['review_count']
    
    # SQL Insert Query with ON CONFLICT to avoid duplicate errors
    insert_query = """
    INSERT INTO business (business_id, name, address, city, state, postal_code, latitude, longitude, stars, review_count) 
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (business_id) DO NOTHING;
    """
    
    try:
        cursor.execute(insert_query, (business_id, name, address, city, state, postal_code, latitude, longitude, stars, review_count))
    except Exception as e:
        print(f"Error inserting row {index}: {e}")

# Step 4: Commit the transaction to make the changes permanent
conn.commit()
print("Data inserted successfully.")

# Step 5: Close the cursor and connection
cursor.close()
conn.close()


Data inserted successfully.


In [14]:
%%sql
SELECT * FROM business LIMIT 5;

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count
ij01QEUh2uK3W27dN4knwQ,Dutch Hollow Medical Day Spa,3042 Godfrey Rd,Godfrey,IL,62035,38.930379,-90.19284,4.0,15
OXZU-xctypqqAaCQQ9Eiqg,Great Clips,3188 Telegraph Rd,Saint Louis,MO,63125,38.4973695,-90.2984265,2.0,5
D9sMUiSSzR0Q1WrXvlZ20Q,The Gravity Forge,2920 Turnpike Dr,Hatboro,PA,19040,40.1624444027,-75.1068388,2.5,9
RsGp61dhtKNonHqaUQQniw,Somerdale Cold Cuts,501 Warwick Rd,Somerdale,NJ,8083,39.850103,-75.032732,4.5,26
5eHsFwEG9UkVNaMpcbZjEg,Ms. Nancy's Place,177 S Ctr St,Merchantville,NJ,8109,39.9476273,-75.0480838,4.0,44
