In [1]:
import pandas as pd
import numpy as np
import glob

import pyarrow as pa
import pyarrow.dataset as ds

from itertools import islice

In [2]:
businesses_df = pd.read_json("yelp_dataset/yelp_academic_dataset_business.json", lines=True)
# reviews_df = pd.read_json("yelp_dataset/yelp_academic_dataset_review.json", lines=True)

In [3]:
print(pd.__version__, pa.__version__)

2.3.3 22.0.0


In [4]:
reader = pd.read_json(
    "yelp_dataset/yelp_academic_dataset_review.json",
    lines=True,
    chunksize=200_000,
    dtype_backend="pyarrow"
)

for i, chunk in enumerate(reader):
    chunk.to_parquet(
        f"yelp_dataset_parquet/reviews_part_{i:04}.parquet",
        index=False
    )

In [5]:
reader2 = pd.read_json(
    "yelp_dataset/yelp_academic_dataset_review.json",
    lines=True,
    chunksize=5_000  # tiny chunk just to inspect columns/dtypes
)
sample = next(reader2)
col_names = sample.columns.tolist()
print(col_names)

['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']


In [6]:
dataset = ds.dataset("yelp_dataset_parquet", format="parquet")
print("rows:", dataset.count_rows())
print(dataset.schema)

rows: 6990280
review_id: string
user_id: string
business_id: string
stars: int64
useful: int64
funny: int64
cool: int64
text: string
date: timestamp[ns]
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1180


In [7]:
sorted_file_paths = sorted(glob.glob("yelp_dataset_parquet/reviews_part_*.parquet"))
reviews_df = pd.concat([pd.read_parquet(p, columns=col_names) for p in sorted_file_paths], ignore_index=True)

In [8]:
reviews_df.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [9]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype                 
---  ------       -----                 
 0   review_id    string                
 1   user_id      string                
 2   business_id  string                
 3   stars        int64[pyarrow]        
 4   useful       int64[pyarrow]        
 5   funny        int64[pyarrow]        
 6   cool         int64[pyarrow]        
 7   text         string                
 8   date         timestamp[ns][pyarrow]
dtypes: int64[pyarrow](4), string(4), timestamp[ns][pyarrow](1)
memory usage: 4.6 GB


In [10]:
businesses_df.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [19]:
businesses_df.loc[businesses_df["state"] == "CO", "name"].nunique()

3

In [25]:
businesses_df.loc[businesses_df["state"] == "FL", "name"].unique().tolist()

['Temple Beth-El',
 'Marshalls',
 'Vietnamese Food Truck',
 'Adams Dental',
 "Zio's Italian Market",
 "Gold's Gym",
 "Charlie's Market",
 'Roman Forum',
 "Joe's Pizza",
 'Educational Outfitters',
 'Publix Super Market',
 'Holly Nails & Spa',
 'Top Shelf Sports Lounge',
 'Impasto',
 'The Pearl',
 'Walmart Supercenter',
 'Mirror Lake Community Library',
 'Laser Spine Institute',
 'Meineke Car Care Center',
 'The Cake Drip',
 'Professional Healthcare of Pinellas',
 'Chipotle Mexican Grill',
 'Carsmetics',
 'Tuffy Tire & Auto Service Center',
 'Bay Area Appliance',
 'Trust Me BBQ',
 "Kinjo's Japanese Restaurant",
 'PDQ Temple Terrace',
 'Seaview Condominiums',
 'Aussie Grill',
 '530 Pub & Grill',
 'Option 1 Barber Shop',
 'Zesty Tsunami',
 "O'Briens Irish Pub",
 'Thach Used Tires',
 "Lee Roy Selmon's",
 'Union on Fletcher',
 'Nails & Beauty Lounge',
 'Four Green Fields',
 'Cafe Con Leche',
 'Master Bait and Fishing Supplies',
 '1-275 Rest Area Manatee County Mile 7',
 'Sake House',
 'Camde

In [None]:
businesses_df.loc[businesses_df["state"] == "FL"]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
7,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,FL,33707,27.766590,-82.732983,3.5,5,1,,"Synagogues, Religious Organizations","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.457380,3.5,6,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","{'Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,33755,27.966235,-82.787412,5.0,10,1,{'ByAppointmentOnly': 'True'},"General Dentistry, Dentists, Health & Medical,...","{'Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150280,37G3SzO7RS1qSthACCG5SQ,Tampa Bay Club Sport,380 105th Terrace Ne,Saint Petersburg,FL,33716,27.867754,-82.632602,2.5,7,0,{'GoodForKids': 'True'},"Active Life, Sports Clubs",
150289,Fck8i0fNQCa22ERz5Fa21w,Thoughtful Moving,5004 E Fowler Ave,Tampa,FL,33617,28.054934,-82.400832,2.0,27,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Packing Services, Home Services, Movers, Local...","{'Monday': '22:0-22:30', 'Tuesday': '8:0-19:0'..."
150292,esBGrrmuZzSiECyRBoKvvA,Colony Grill - St. Petersburg,670 Central Ave,St. Petersburg,FL,33701,27.770872,-82.643069,4.5,38,1,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...","Bars, Beer Bar, Nightlife, Wine Bars, Pizza, R...","{'Monday': '11:30-23:0', 'Tuesday': '11:30-23:..."
150317,Q7JYAMNzI1IpUd2edflmTA,21 Barber,10937 56th St N,Temple Terrace,FL,33617,28.047632,-82.393519,4.5,18,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Men's Hair Salons, Hair Salons, Barbers, Beaut...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [28]:
businesses_FL = businesses_df.loc[businesses_df["state"] == "FL", "business_id"].unique().tolist()

In [None]:
reviews_df.loc[reviews_df["business_id"].isin(businesses_FL)]

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
10,rGQRf8UafX7OTlMNN19I8A,1WHRWwQmZOZDAhp2Qyny4g,uMvVYRgGNXf5boolA9HXTw,5,2,0,0,My experience with Shalimar was nothing but wo...,2015-06-21 14:48:06
15,OAhBYw8IQ6wlfw1owXWRWw,1C2lxzUo1Hyye4RFIXly3g,BVndHaLihEYbr76Z0CMEGw,5,0,0,0,"Great place for breakfast! I had the waffle, w...",2014-10-11 16:22:06
18,u2vzZaOqJ2feRshaaF1doQ,NDZvyYHTUWWu-kqgQzzDGQ,CLEWowfkj-wKYJlQDqT1aw,5,2,0,1,I go to blow bar to get my brows done by natal...,2016-03-07 00:02:18
40,mO398Ed5dpv1H5ZsKc8KXw,yobeeTUBfaTBcnk26mXNuA,hKameFsaXh9g8WQbv593UA,4,0,0,0,Food was good- atmosphere/decor is like a fish...,2015-04-15 15:30:48
47,TcCcHzc3L6Aboq3DteEfZA,OuatwND396ZQxm2zK8WlUQ,jNL5KUPz2-tHUJM__ysSaw,1,1,0,0,If you want to pay for everything a la carte t...,2014-08-24 20:14:12
...,...,...,...,...,...,...,...,...,...
6990233,kZiKvXxK7o5i7fa32u5Jgw,6jjHo9Lilv3kTy87pm2ycw,pQAQwhBlSQdG1HuuLuCqXw,5,46,17,45,"Just $5 every SUNDAY in October! Do it!\n\nOh,...",2020-10-11 00:09:30
6990243,8yc7YWCe6lVU4RDmh-Ww1Q,3kJ4ktaXV-RVm49M-y1uTg,B2sF6QFCUFlfGb1QzWAyqg,3,0,0,0,My favorite place to pick up sports supplement...,2017-11-09 15:23:57
6990247,CphIcLTNU26TDb_YTOSgUQ,Is5IWDXWtqVMCVtedjJIQQ,VOITl4HlmC1EGJ_wVMS6_w,5,0,0,0,I compete in Jiu-Jitsu and every competition I...,2018-11-09 18:09:35
6990261,34M6AEbY84174OBerbm96Q,i48cHEyRBl5g9_npYIG7dA,ReVpjIDupK_VMPn7ZxPvOQ,4,2,0,1,This place never fails the food is absolutely ...,2019-08-21 20:49:13


: 