# Project B


This notebook will cover only cover the data prep and merge of the two larger json files.

## Packages Needed

In [1]:
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt

## Load Data Business Dataset + Prep

In [2]:
df = pd.read_json('data/yelp_academic_dataset_business.json', lines=True)
print(df.shape)

(160585, 14)


In [3]:
df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

We only want to look at the restaurants that are open and not closed.

In [4]:
df = df[df['is_open']==1]

In [5]:
df= df.drop(['is_open'], axis=1)

Start by select all the rows that mention restaurants.

In [6]:
df = df[df['categories'].str.contains('Restaurants',
              case=False, na=False)]

Try to split the category column into different subgroups, so we can do some nice plotting.

In [7]:
cat_countries = ['thai', 'chinese','japanese','korean','indian','american',
                 'caribbean','italian','mediterranean','mexican', 'cajun',
                'vietnamese','greek']

cat_type = ['fast food', 'salad','buffet','cafe','bar','pub',
                 'vegetarian','barbeque','steakhouse','sushi','diner']

In [8]:
df_explode = df.assign(categories = df.categories
                         .str.split(', ')).explode('categories')

In [9]:
df_cat_countries = df_explode[df_explode['categories'].str.contains(
              '|'.join(cat_countries),
              case=False, na=False)]

df_cat_countries = df_cat_countries.rename(columns={'categories':'Cat Countries'})
df_cat_countries = df_cat_countries['Cat Countries']

In [10]:
df_cat_type = df_explode[df_explode['categories'].str.contains(
              '|'.join(cat_type),
              case=False, na=False)]

df_cat_type = df_cat_type.rename(columns={'categories':'Cat Type'})
df_cat_type = df_cat_type['Cat Type']

In [11]:
df_merge_list = [df, df_cat_countries, df_cat_type]

In [12]:
df_merged = reduce(lambda  left, right: pd.merge(left,right,left_index=True,
                                            right_index=True), df_merge_list)

In [13]:
df_merged.shape

(17700, 15)

In [14]:
df_merged.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,Cat Countries,Cat Type
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",American (Traditional),Gastropubs
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",American (Traditional),Bars
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",American (Traditional),Beer Bar
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,2128,42.363442,-71.025781,3.5,856,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ...",Italian,Cocktail Bars
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,2128,42.363442,-71.025781,3.5,856,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ...",Italian,Bars


## Load Review Data (Massive dataset)

In [15]:
size = 500000
review = pd.read_json('data/yelp_academic_dataset_review.json', lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

In [16]:
# There are multiple chunks to be read
chunk_list = []
for chunk_review in review:
    # Drop columns that aren't needed
    chunk_review = chunk_review.drop(['review_id','useful','funny','cool'], axis=1)
    # Renaming column name to avoid conflict with business overall star rating
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    # Inner merge with edited business file so only reviews related to the business remain
    chunk_merged = pd.merge(df_merged, chunk_review, on='business_id', how='inner')
    # Show feedback on progress
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)
# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

243213 out of 500,000 related reviews
231744 out of 500,000 related reviews
237529 out of 500,000 related reviews
234259 out of 500,000 related reviews
230877 out of 500,000 related reviews
201396 out of 500,000 related reviews
200280 out of 500,000 related reviews
250300 out of 500,000 related reviews
230987 out of 500,000 related reviews
222641 out of 500,000 related reviews
211402 out of 500,000 related reviews
206222 out of 500,000 related reviews
233358 out of 500,000 related reviews
230987 out of 500,000 related reviews
256635 out of 500,000 related reviews
257087 out of 500,000 related reviews
271064 out of 500,000 related reviews
68120 out of 500,000 related reviews


In [17]:
df.shape

(4018101, 19)

## More data prep

Select 2005-2020, so we have full years

In [18]:
print (df.date.min())
print (df.date.max())

2004-10-14 02:57:52
2021-01-28 15:23:52


In [19]:
df.date.dtypes

dtype('O')

In [20]:
df['date'] = pd.to_datetime(df['date'])

In [21]:
df.date.dtypes

dtype('<M8[ns]')

In [22]:
time_start = pd.Timestamp(2005,1,1)
time_end = pd.Timestamp(2020,12,31)

In [23]:
#Filter the data
df = df.loc[(df['date']>= time_start) & (df['date']<= time_end)]

In [24]:
print (df.date.min())
print (df.date.max())

2005-01-06 07:49:21
2020-12-30 23:55:24


Save to csv file

In [25]:
csv_name = "data/yelp_reviews_RV_categories.csv"
df.to_csv(csv_name, index=False)