Importing Libraries and Initial Setup

In [29]:
import pandas as pd
import json
import seaborn as sns
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

 Display options for pandas


In [30]:
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)

Load and Clean Yelp Business Data


In [31]:
path = "data source/archive/yelp_academic_dataset_business.json"
business = pd.read_json(path,lines=True)

In [32]:
business.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'], dtype='object')

In [33]:

# Drop excess column
business.drop(columns=['attributes','hours'], inplace=True)

In [34]:
business.business_id.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
150341    False
150342    False
150343    False
150344    False
150345    False
Name: business_id, Length: 150346, dtype: bool

In [35]:
business.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories'], dtype='object')

In [36]:
# Remove quotation marks in 'name' and 'address'
business['name'] = business['name'].str.replace('"', '', regex=False)
business['address'] = business['address'].str.replace('"', '', regex=False)

In [37]:
# Define US state abbreviations
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# Filter businesses located in the US
us_businesses = business[business['state'].isin(states)]

# Select restaurants only

us_restaurants = us_businesses[us_businesses['categories'].str.contains('Restaurants', na=False)]



In [38]:
us_businesses.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories'], dtype='object')

In [39]:
us_restaurants

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"Pubs, Restaurants, Italian, Bars, American (Tr..."
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.768170,1.5,10,1,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.456320,4.0,10,1,"Vietnamese, Food, Restaurants, Food Trucks"
...,...,...,...,...,...,...,...,...,...,...,...,...
150323,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,246 N 8th St,Boise,ID,83702,43.616590,-116.202383,4.5,998,1,"Bars, Gastropubs, Sandwiches, Nightlife, Resta..."
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,1,"Restaurants, Sandwiches, Convenience Stores, C..."
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,33,1,"Cafes, Juice Bars & Smoothies, Coffee & Tea, R..."
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,"Restaurants, Mexican"


Categorize Restaurants by Cuisine

In [40]:
# Make a copy of the us_restaurants DataFrame to avoid modifying a slice
us_restaurants = us_restaurants.copy()

# Define cuisine categories and assign them
cuisine_types = {
    'American': 'American', 'Mexican': 'Mexican', 'Italian': 'Italian',
    'Japanese': 'Japanese', 'Chinese': 'Chinese', 'Thai': 'Thai',
    'Mediterranean': 'Mediterranean', 'French': 'French', 'Vietnamese': 'Vietnamese',
    'Greek': 'Greek', 'Indian': 'Indian', 'Korean': 'Korean', 'Hawaiian': 'Hawaiian',
    'African': 'African', 'Spanish': 'Spanish', 'Middle Eastern': 'Middle_eastern'
}

# Initialize the 'category' column with empty strings
us_restaurants['category'] = ''

# Assign categories based on cuisine types
for cuisine, pattern in cuisine_types.items():
    us_restaurants.loc[us_restaurants['categories'].str.contains(pattern, na=False), 'category'] = cuisine

# Drop restaurants without a recognized category
us_restaurants = us_restaurants[us_restaurants['category'] != ''].reset_index(drop=True)


In [41]:
us_restaurants.drop(columns=['categories'], inplace=True)

In [42]:
us_restaurants.dropna(axis=0,subset=["category"])
us_restaurants=us_restaurants.reset_index(drop=True)

In [43]:
us_restaurants.shape

(28450, 12)

In [44]:
us_restaurants.isnull().sum()

business_id     0
name            0
address         0
city            0
state           0
postal_code     0
latitude        0
longitude       0
stars           0
review_count    0
is_open         0
category        0
dtype: int64

clean yelp review

In [45]:
path1 = "data source/archive/yelp_academic_dataset_review.json"

In [46]:
# Load Yelp review dataset
review_list=[]
chunk_size = 1000  # Adjust the chunk size based on your system's memory
chunks = pd.read_json(path1, lines=True, chunksize=chunk_size)

In [47]:
# Process each chunk and append it to the list
for chunk in chunks:
    # Perform any processing on the chunk if needed
    # e.g., filtering, transformation, etc.
    review_list.append(chunk)

# Concatenate all chunks into a single DataFrame
review1 = pd.concat(review_list, ignore_index=True)

In [48]:
review1.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'], dtype='object')

In [49]:

# Merge business and review data on 'business_id'
restaurants_reviews = pd.merge(us_restaurants, review1, on='business_id')

# Rename columns for clarity
restaurants_reviews.rename(columns={'stars_x': 'avg_star', 'stars_y': 'review_star'}, inplace=True)

In [50]:
restaurants_reviews.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'avg_star', 'review_count', 'is_open', 'category', 'review_id', 'user_id', 'review_star', 'useful', 'funny', 'cool', 'text', 'date'], dtype='object')

In [51]:
restaurants_reviews

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_star,review_count,is_open,category,review_id,user_id,review_star,useful,funny,cool,text,date
0,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,Greek,9fmAJ76g2-CKbbU14Ai1aw,LNprC9Mi8Xqtgk1KCFlKAg,4,0,0,0,"I like Tsevis because they have good gyros, wh...",2014-02-25 22:34:59
1,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,Greek,OCmjANuYad62GAyY0NI8bQ,z-i_Qv-E3qeHfdPZddpwYQ,4,0,0,0,Good Greek American food. I highly recomend th...,2013-01-29 04:27:04
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,Greek,QPZ66Xk54CprqZgTW1QTdQ,m6YhwUNoehMm6s52w9A4eA,2,0,0,0,Wife and I have eaten lunch here a few times o...,2013-10-25 15:39:01
3,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,Greek,yUpKEiSWjcix-zWHFMT39w,-YAnRx8VSDkASxlylv3dyg,1,0,0,0,After about 7 minutes of waiting patiently for...,2014-07-16 19:17:34
4,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,Greek,JR0MWE4psJqD2MyHbMckxA,WJ-veSDe63t0HnCu2E1NSA,1,3,0,0,Three of us decided to try this place out last...,2012-12-17 18:37:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3191289,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,Mexican,qBcwQEQPnLxjkw-xbUIF4Q,6nF5PT1c0dF6EpOgQdF2tw,5,0,0,0,Ordered delivery for some tacos on a Saturday ...,2020-12-06 00:19:29
3191290,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,Mexican,G8fbysnUAUmqq1XWTjMQ4Q,1M78_w4J9f5S8xmUVYyxdQ,5,2,1,1,First time trying this restaurant and I had a ...,2020-11-30 23:19:40
3191291,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,Mexican,JKiy0aeyGd3KmXN7uRPFLw,B7TD5yTemGv50y4wM2EVNA,5,1,1,1,This restaurant is truly amazing!!! The owner ...,2021-03-27 19:33:29
3191292,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,Mexican,JITY01bGbdsiUBznLz9rdg,HI8QwhpeP_ZRY5JZy11VDw,4,1,0,0,Recently got take out from adelita; they were ...,2021-02-07 15:09:25


Add Review Metadata

In [52]:

# Add number of words in each review
restaurants_reviews['num_words_review'] = (
    restaurants_reviews['text']
    .str.replace('\n', '', regex=True)
    .str.replace(r'[!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', '', regex=True)
    .str.split()
    .str.len()
)
# Label reviews as positive or negative
restaurants_reviews['labels'] = restaurants_reviews['review_star'].apply(
    lambda x: 'positive' if x >= 4 else 'negative' if x < 3 else 'neutral'
)

# Drop neutral reviews for simplicity
restaurants_reviews = restaurants_reviews[restaurants_reviews['labels'] != 'neutral'].reset_index(drop=True)


Final Data Inspection

In [53]:
# Inspect the cleaned and prepared DataFrame
restaurants_reviews.head

<bound method NDFrame.head of                     business_id                           name            address          city state postal_code   latitude  longitude  avg_star  review_count  is_open category               review_id                 user_id  review_star  useful  funny  cool                                               text                date  num_words_review    labels
0        k0hlBqXX-Bt0vf1op7Jr1w          Tsevi's Pub And Grill  8025 Mackenzie Rd        Affton    MO       63123  38.565165 -90.321087       3.0            19        0    Greek  9fmAJ76g2-CKbbU14Ai1aw  LNprC9Mi8Xqtgk1KCFlKAg            4       0      0     0  I like Tsevis because they have good gyros, wh... 2014-02-25 22:34:59                53  positive
1        k0hlBqXX-Bt0vf1op7Jr1w          Tsevi's Pub And Grill  8025 Mackenzie Rd        Affton    MO       63123  38.565165 -90.321087       3.0            19        0    Greek  OCmjANuYad62GAyY0NI8bQ  z-i_Qv-E3qeHfdPZddpwYQ            4       0      

In [55]:
restaurants_reviews.to_parquet('output_file.parquet', engine='pyarrow', index=False)