In [4]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import getpass
import json
import os

import kagglehub
import pandas as pd
import pyodbc

# Resolve the local path of the Yelp dataset through kagglehub.
yelp_dataset = kagglehub.dataset_download('yelp-dataset/yelp-dataset')

# Business records: line-delimited JSON, read in a single pass.
business_data = pd.read_json(f"{yelp_dataset}/yelp_academic_dataset_business.json", lines=True)

# Reviews: read 10,000 lines at a time, keep every chunk, then stitch the
# chunks into one DataFrame with a fresh 0..n-1 index.
reviews_chunks = pd.read_json(f"{yelp_dataset}/yelp_academic_dataset_review.json", lines=True, chunksize=10000)
reviews_df_list = list(reviews_chunks)
reviews_df = pd.concat(reviews_df_list, ignore_index=True)

# Check-ins: same chunked read-and-concatenate approach as the reviews.
checkin_chunks = pd.read_json(f"{yelp_dataset}/yelp_academic_dataset_checkin.json", lines=True, chunksize=10000)
checkin_df_list = list(checkin_chunks)
checkin_df = pd.concat(checkin_df_list, ignore_index=True)

print('Data source import complete.')


Data source import complete.


### Flattening Business into 3 tables

In [5]:
def flatten_attributes(row):
    """Turn one business row's nested attributes into key/value records.

    Args:
        row: Mapping-like object (dict or pandas Series) exposing ``.get`` and
            item access, with a ``business_id`` entry and an optional
            ``attributes`` entry holding either a dict or a JSON string.

    Returns:
        A list with one dict per attribute, each carrying ``business_id``,
        ``attribute_key`` and ``attribute_value``. Three restaurant-specific
        keys are shortened (see ``short_names``). An empty list is returned
        when the row has no (or empty) attributes.
    """
    # Verbose Yelp attribute names and the shorter names stored instead.
    short_names = {
        'RestaurantsPriceRange2': 'PriceRange',
        'RestaurantsDelivery': 'Delivery',
        'RestaurantsTakeOut': 'Takeout',
    }

    raw = row.get('attributes', {})
    if not raw:
        return []

    # Attributes may arrive already parsed or still serialized as JSON text.
    parsed = json.loads(raw) if isinstance(raw, str) else raw
    return [
        {
            'business_id': row['business_id'],
            'attribute_key': short_names.get(name, name),
            'attribute_value': setting,
        }
        for name, setting in parsed.items()
    ]

def flatten_hours(row):
    """Turn one business row's nested opening hours into per-day records.

    Args:
        row: Mapping-like object (dict or pandas Series) exposing ``.get`` and
            item access, with a ``business_id`` entry and an optional ``hours``
            entry holding either a dict or a JSON string that maps a day name
            to an ``"open-close"`` string.

    Returns:
        A list with one dict per day, each carrying ``business_id``, ``day``,
        ``open_time`` and ``close_time``. An empty list is returned when the
        row has no (or empty) hours.

    Raises:
        ValueError: If a time range does not split into exactly two parts
            on ``'-'``.
    """
    raw_hours = row.get('hours', {})
    if not raw_hours:
        return []

    # Hours may arrive already parsed or still serialized as JSON text.
    schedule = json.loads(raw_hours) if isinstance(raw_hours, str) else raw_hours

    records = []
    for weekday, span in schedule.items():
        # The two-name unpack doubles as validation of the "open-close" shape.
        opens, closes = span.split('-')
        records.append(dict(
            business_id=row['business_id'],
            day=weekday,
            open_time=opens,
            close_time=closes,
        ))
    return records

# Flatten attributes and hours
# Collect the per-business attribute and hours records in a single pass over
# the business table.
attributes_data, hours_data = [], []
for _, business_row in business_data.iterrows():
    attributes_data += flatten_attributes(business_row)
    hours_data += flatten_hours(business_row)

# One DataFrame per child table.
attributes_df, hours_df = pd.DataFrame(attributes_data), pd.DataFrame(hours_data)

# The business table itself keeps every column except the two nested ones,
# which now live in their own tables.
business_flattened = business_data.drop(['attributes', 'hours'], axis=1)


### Create Tables


In [17]:
# T-SQL DDL for the five target tables. These strings are only defined here;
# nothing in this cell executes them.
#
# Unbounded string columns use VARCHAR(MAX) throughout: the TEXT type is
# deprecated in SQL Server, and attributes.attribute_value already used
# VARCHAR(MAX), so this keeps the schema consistent.

# Parent table: one row per business; every other table references it.
create_business_table = """
CREATE TABLE business (
    business_id VARCHAR(22) PRIMARY KEY,
    name VARCHAR(255),
    address VARCHAR(255),
    city VARCHAR(100),
    state VARCHAR(10),
    postal_code VARCHAR(20),
    latitude FLOAT,
    longitude FLOAT,
    stars FLOAT,
    review_count INT,
    is_open BIT,
    categories VARCHAR(MAX)
);
"""

# One row per (business, attribute) pair; surrogate key generated by the server.
create_attributes_table = """
CREATE TABLE attributes (
    attribute_id INT IDENTITY(1,1) PRIMARY KEY,
    business_id VARCHAR(22),
    attribute_key VARCHAR(100),
    attribute_value VARCHAR(MAX),
    FOREIGN KEY (business_id) REFERENCES business(business_id)
);
"""

# One row per (business, day); open/close times are stored as text.
create_hours_table = """
CREATE TABLE hours (
    hour_id INT IDENTITY(1,1) PRIMARY KEY,
    business_id VARCHAR(22),
    day VARCHAR(20),
    open_time VARCHAR(20),
    close_time VARCHAR(20),
    FOREIGN KEY (business_id) REFERENCES business(business_id)
);
"""

# One row per review, keyed by the review's own id.
create_review_table = """
CREATE TABLE review (
    review_id VARCHAR(22) PRIMARY KEY,
    business_id VARCHAR(22),
    user_id VARCHAR(22),
    stars INT,
    date DATETIME,
    text VARCHAR(MAX),
    useful INT,
    funny INT,
    cool INT,
    FOREIGN KEY (business_id) REFERENCES business(business_id)
);
"""

# One row per business; `date` is an unbounded string column.
create_checkin_table = """
CREATE TABLE checkin (
    business_id VARCHAR(22) PRIMARY KEY,
    date VARCHAR(MAX),
    FOREIGN KEY (business_id) REFERENCES business(business_id)
);
"""

### Database Information and Connection

In [14]:
# Connection settings. Each value can be overridden through an environment
# variable; the defaults below are the project's Azure SQL server and database.
server = os.environ.get('YELP_DB_SERVER', 'group15-server.database.windows.net')
database = os.environ.get('YELP_DB_NAME', 'group15_Yelp_Database')
username = os.environ.get('YELP_DB_USER', 'group15')

# The password is never stored in the notebook: it is read from
# YELP_DB_PASSWORD, or prompted for (without echo) when that variable is unset.
password = os.environ.get('YELP_DB_PASSWORD') or getpass.getpass(
    f'Password for {username}@{server}: '
)

driver = '{ODBC Driver 18 for SQL Server}'  # Ensure you have this driver installed

# Encrypted connection with certificate validation and a 60-second timeout.
conn = pyodbc.connect(
    f'DRIVER={driver};SERVER={server};DATABASE={database};UID={username};PWD={password};'
    'Encrypt=yes;TrustServerCertificate=no;Connection Timeout=60;'
)
cursor = conn.cursor()
print("Connected.")

Connected.


### Widening columns

### Upload Data in Chunks

In [19]:

def upload_with_query(conn, df, insert_sql, param_order, batch_size=10000):
    """Insert a DataFrame into the database in committed batches.

    Args:
        conn: Open DB-API connection (pyodbc here); must provide ``cursor()``
            and ``commit()``.
        df: DataFrame holding the rows to insert.
        insert_sql: Parameterized ``INSERT`` statement with one ``?``
            placeholder per entry of ``param_order``.
        param_order: Column names of ``df``, in the order the placeholders
            expect them.
        batch_size: Number of rows sent per ``executemany`` call; each batch
            is committed on its own, so batches uploaded before a failure stay
            in the database.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If ``param_order`` names a column that ``df`` does not have.

    NOTE(review): the recorded run of this notebook ended with
    "String data, right truncation: length 602 buffer 510". Presumably a
    string value was longer than the declared width of its target column;
    verify the live table schema before re-running.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently upload nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    cur = conn.cursor()
    try:
        cur.fast_executemany = True
        for start in range(0, len(df), batch_size):
            # Cast to object first so missing values can be swapped for None,
            # which the driver sends as SQL NULL.
            batch = df.iloc[start:start + batch_size][param_order].astype(object)
            batch = batch.where(pd.notna(batch), None)  # NaN -> None
            cur.executemany(insert_sql, batch.values.tolist())
            conn.commit()
    finally:
        # Release the cursor even when a batch fails.
        cur.close()

# Parameterized INSERT statements, one per table. The IDENTITY keys
# (attributes.attribute_id, hours.hour_id) are omitted because the server
# generates them.
business_insert_sql = """
INSERT INTO business (
  business_id, name, address, city, state, postal_code,
  latitude, longitude, stars, review_count, is_open, categories
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
"""
attributes_insert_sql = "INSERT INTO attributes (business_id, attribute_key, attribute_value) VALUES (?,?,?)"
hours_insert_sql = "INSERT INTO hours (business_id, day, open_time, close_time) VALUES (?,?,?,?)"
review_insert_sql = """
  INSERT INTO review (
  review_id, business_id, user_id, stars, date, text, useful, funny, cool
) VALUES (?,?,?,?,?,?,?,?,?)
"""
checkin_insert_sql = "INSERT INTO checkin (business_id, date) VALUES (?,?)"

# Param orders: DataFrame columns in the exact order of the `?` placeholders
# of the matching INSERT statement above.
business_params = ["business_id","name","address","city","state","postal_code",
                   "latitude","longitude","stars","review_count","is_open","categories"]
attributes_params = ["business_id","attribute_key","attribute_value"]
hours_params = ["business_id","day","open_time","close_time"]
review_params = ["review_id","business_id","user_id","stars","date","text","useful","funny","cool"]
# The checkin INSERT binds only these two columns; there is no checkin_id
# column in the table or in the INSERT.
checkin_params = ["business_id","date"]

# Upload the parent table first so the foreign keys of the child tables resolve.
upload_with_query(conn, business_flattened, business_insert_sql, business_params, batch_size=10000)
upload_with_query(conn, attributes_df, attributes_insert_sql, attributes_params, batch_size=10000)
upload_with_query(conn, hours_df, hours_insert_sql, hours_params, batch_size=10000)
upload_with_query(conn, reviews_df, review_insert_sql, review_params, batch_size=5000)
upload_with_query(conn, checkin_df, checkin_insert_sql, checkin_params, batch_size=10000)


ProgrammingError: ('String data, right truncation: length 602 buffer 510', 'HY000')