# **Yelp Dataset**
### Author: Jennifer Nguyen

In [6]:
# Import libraries and load datasets
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm  # For progress bars


In [None]:
business_path = 'yelp_academic_dataset_business.json'
review_path = 'yelp_academic_dataset_review.json'
user_path = 'yelp_academic_dataset_user.json'
checkin_path = 'yelp_academic_dataset_checkin.json'
tip_path = 'yelp_academic_dataset_tip.json'

### Function to read JSON files line by line (since they're too large to load at once)

In [8]:
def read_json(file_path, max_records=None):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f)):
            if max_records and i >= max_records:
                break
            data.append(json.loads(line))
    return pd.DataFrame(data)

### Load a sample of each dataset

In [3]:
# Removed 'max_records' for business and users since we have to clean it
# Saved 'max_records' for others for faster processing but can remove when utilizing entire dataset
business_df = read_json(business_path)
review_df = read_json(review_path, max_records=5000)
user_df = read_json(user_path)
checkin_df = read_json(checkin_path, max_records=5000)
tip_df = read_json(tip_path, max_records=5000)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

### Display basic information about each dataset

In [4]:
print("Business Dataset Shape:", business_df.shape)
print("Review Dataset Shape:", review_df.shape)
print("User Dataset Shape:", user_df.shape)
print("Checkin Dataset Shape:", checkin_df.shape)

Business Dataset Shape: (150346, 12)
Review Dataset Shape: (5000, 9)
User Dataset Shape: (1987897, 22)
Checkin Dataset Shape: (5000, 2)


### Sample view of business data

In [5]:
# Clean business data - remove latitude and longitude if they exist
columns_to_drop = ['latitude', 'longitude']
existing_columns = [col for col in columns_to_drop if col in business_df.columns]

if existing_columns:
    business_df_clean = business_df.drop(existing_columns, axis=1)
    print(f"Removed columns from business data: {existing_columns}")
else:
    business_df_clean = business_df
    print("No columns needed to be removed from business data (removed already)")

# Write to file
business_df_clean.to_json(business_path, orient='records', lines=True)

# Display sample
print("\nBusiness Data Sample:")
display(business_df.head())

No columns needed to be removed from business data (removed already)

Business Data Sample:


Unnamed: 0,business_id,name,address,city,state,postal_code,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


### Sample view of review data

In [6]:
print("\nReview Data Sample:")
display(review_df.head())


Review Data Sample:


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


### Sample view of user data

In [14]:
# Define columns to remove
additional_cols_to_remove = ['yelping_since', 'friends', 'useful', 'funny', 'cool']

# Create the new compliments_total column
compliment_cols = [col for col in user_df.columns if 'compliment_' in col]

# Add compliments total if it doesn't exist
if 'compliments_total' not in user_df.columns and compliment_cols:
    user_df['compliments_total'] = user_df[compliment_cols].sum(axis=1)

# Combine all columns to remove
columns_to_drop = compliment_cols + [col for col in additional_cols_to_remove if col in user_df.columns]

# Drop columns and create clean dataframe
user_df_clean = user_df.drop(columns_to_drop, axis=1)

# Replace the original dataframe with the clean one
user_df = user_df_clean

# Print columns before saving to verify what will be written
print("Columns being saved to file:", user_df.columns.tolist())

# Write to file with explicit parameters to ensure proper writing
user_df.to_json(user_path, orient='records', lines=True)

# Display sample
print("\nUser Data Sample:")
display(user_df.head())

Columns being saved to file: ['user_id', 'name', 'review_count', 'elite', 'fans', 'average_stars', 'compliments_total']

User Data Sample:


Unnamed: 0,user_id,name,review_count,elite,fans,average_stars,compliments_total
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007,267,3.91,2873
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...",3138,3.74,20631
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,20092010201120122013,52,3.32,585
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,200920102011,28,4.27,136
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,,1,3.54,4


### Sample view of checkin data

In [8]:
print("\nCheckin Data Sample:")
display(checkin_df.head())


Checkin Data Sample:


Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


### Sample view of tip data

In [9]:
print("\nTip Data Sample:")
display(tip_df.head())


Tip Data Sample:


Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


### Display column names for each dataset

In [15]:
print("\nBusiness columns:", business_df_clean.columns.tolist())
print("Review columns:", review_df.columns.tolist())
print("User columns:", user_df.columns.tolist())
print("Checkin columns:", checkin_df.columns.tolist())
print("Tip columns:", tip_df.columns.tolist())


Business columns: ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']
Review columns: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']
User columns: ['user_id', 'name', 'review_count', 'elite', 'fans', 'average_stars', 'compliments_total']
Checkin columns: ['business_id', 'date']
Tip columns: ['user_id', 'business_id', 'text', 'date', 'compliment_count']
