# Project B


This notebook covers only the data preparation and the merge of the two larger JSON files.

## Packages Needed

In [6]:
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
import os

In [7]:
# Directory that holds the raw Yelp JSON dumps. Set the YELP_DATA_DIR
# environment variable to run the notebook on another machine; the original
# location is kept as the fallback so existing setups behave as before.
path = os.environ.get(
    'YELP_DATA_DIR',
    'C:\\Users\\Miche\\OneDrive - Danmarks Tekniske Universitet\\MMC\\2. Semester\\Social Data\\websites\\data',
)
# Later cells open the JSON files by bare file name, so they are resolved
# against this working directory.
os.chdir(path)
os.getcwd()

'C:\\Users\\Miche\\OneDrive - Danmarks Tekniske Universitet\\MMC\\2. Semester\\Social Data\\websites\\data'

## Load Data Business Dataset + Prep

In [8]:
# lines=True: the business file is parsed as one JSON object per line.
business_file = 'yelp_academic_dataset_business.json'
df = pd.read_json(business_file, lines=True)
print(df.shape)

(160585, 14)


In [9]:
# Column names of the business table, to see which fields are available.
df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

We only want to look at the restaurants that are currently open (`is_open == 1`).

In [10]:
# Keep only the rows whose is_open flag equals 1.
df = df.loc[df['is_open'].eq(1)]

In [11]:
# After the open-only filter the flag is constant, so the column is dropped.
df = df.drop(columns=['is_open'])

Start by selecting all the rows whose categories mention restaurants.

In [12]:
# Case-insensitive substring test on the category string; rows with a missing
# category value count as non-matching (na=False) and are therefore dropped.
is_restaurant = df['categories'].str.contains('Restaurants', case=False, na=False)
df = df[is_restaurant]

In [13]:
# Turn the ', '-separated category string into a list, then explode it so that
# every (business, single category) pair gets its own row. The original index
# label is repeated on each exploded row.
category_lists = df['categories'].str.split(', ')
df_explode = df.assign(categories=category_lists).explode('categories')

In [14]:
# The 20 most frequent individual categories among the exploded rows.
df_explode['categories'].value_counts().head(20)

Restaurants                  32022
Food                         10732
Nightlife                     5550
Bars                          5345
Sandwiches                    4795
American (Traditional)        4379
Fast Food                     4326
Pizza                         3890
Breakfast & Brunch            3872
American (New)                3254
Burgers                       3246
Coffee & Tea                  3010
Mexican                       2815
Chinese                       2329
Italian                       2130
Seafood                       1998
Salad                         1975
Event Planning & Services     1916
Cafes                         1871
Japanese                      1773
Name: categories, dtype: int64

In [15]:
# Category labels that contain "Food"/"Chinese"/"Coffee & Tea" as a substring
# but do not describe a place to eat.
remove = [
    'Food Delivery Services',
    'Food Safety Training',
    'Food Tours',
    'Food Banks',
    'Chinese Martial Arts',
    'Traditional Chinese Medicine',
    'Coffee & Tea Supplies',
]

# A business is discarded when its full category string contains any of the
# terms above (case-insensitive; the terms are joined into one regex alternation).
remove_pattern = '|'.join(remove)
has_removed_category = df['categories'].str.contains(remove_pattern, case=False, na=False)
df = df[~has_removed_category]

Try to split the category column into different subgroups, so we can do some nice plotting.

In [16]:
# Cuisine keywords. They are used further down as case-insensitive substrings
# (str.contains), so e.g. 'american' covers every category containing that word.
cat_kitchens = [
    'thai', 'chinese', 'japanese', 'korean', 'indian',
    'american', 'caribbean', 'italian', 'mediterranean',
    'mexican', 'cajun', 'vietnamese', 'greek',
]

# Venue / meal-type labels. They are used further down with str.match
# (case-sensitive, anchored at the start of the category only), so each entry
# behaves as a prefix: 'Steakhouse' also selects 'Steakhouses', and 'Food'
# also selects categories such as 'Food Court'.
cat_type = [
    'Food', 'Nightlife', 'Bars', 'Sandwiches', 'Pizza',
    'Breakfast & Brunch', 'Fast Food', 'Burgers', 'Salad',
    'Buffet', 'Cafes', 'Coffee & Tea', 'Vegetarian',
    'Steakhouse', 'Sushi Bars', 'Diners', 'Wine Bars',
]

In [17]:
# Rebuild the exploded frame from the current `df`, so that it reflects the
# rows removed by the category filter above: one row per
# (business, single category) pair, index label repeated per business.
category_lists = df['categories'].str.split(', ')
df_explode = df.assign(categories=category_lists).explode('categories')

In [18]:
# Select the exploded category rows that contain any cuisine keyword
# (case-insensitive substring match; missing categories never match).
kitchen_pattern = '|'.join(cat_kitchens)
is_kitchen = df_explode['categories'].str.contains(kitchen_pattern, case=False, na=False)

# Keep only the matched category itself, as a Series named 'cat_kitchen' that
# still carries the business index label for the index-based merge later on.
df_cat_kitchen = df_explode.loc[is_kitchen, 'categories'].rename('cat_kitchen')

In [19]:
# Select the exploded category rows that START with one of the type labels.
# str.match is anchored at the beginning of the string only and is
# case-sensitive here, so the labels act as prefixes rather than exact names.
type_pattern = '|'.join(cat_type)
is_type = df_explode['categories'].str.match(type_pattern, case=True, na=False)

# Keep only the matched category itself, as a Series named 'cat_type' that
# still carries the business index label for the index-based merge later on.
df_cat_type = df_explode.loc[is_type, 'categories'].rename('cat_type')

In [20]:
# Distinct category labels that passed the type filter (sanity check).
df_cat_type.unique()

array(['Food', 'Bars', 'Nightlife', 'Salad', 'Sandwiches', 'Cafes',
       'Vegetarian', 'Breakfast & Brunch', 'Pizza', 'Food Court',
       'Sushi Bars', 'Steakhouses', 'Fast Food', 'Burgers',
       'Coffee & Tea', 'Diners', 'Food Trucks', 'Buffets', 'Food Stands',
       'Wine Bars'], dtype=object)

In [21]:
# Objects to combine on their shared index: the business table followed by
# the two category Series.
df_merge_list = [df, df_cat_kitchen, df_cat_type]

In [22]:
def _merge_on_index(left, right):
    """Join two pandas objects on their index labels (pd.merge default: inner join)."""
    return pd.merge(left, right, left_index=True, right_index=True)


# Fold the list left to right. Because the category Series repeat a business's
# index label once per matched category, the result holds one row per
# (business, cat_kitchen, cat_type) combination, and businesses lacking a
# match in either Series are left out by the inner join.
df_merged = reduce(_merge_on_index, df_merge_list)

In [23]:
# (rows, columns) of the merged business/category table.
df_merged.shape

(32700, 15)

In [24]:
# Preview the first rows; a business appears once per kitchen/type combination.
df_merged.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,cat_kitchen,cat_type
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",American (Traditional),Food
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",American (Traditional),Bars
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",American (Traditional),Nightlife
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",American (New),Food
16,GfWJ19Js7wX9rwaHQ7KbGw,Everything POP Shopping & Dining,1050 Century Dr,Orlando,FL,32830,28.350498,-81.542819,3.0,7,"{'HasTV': 'False', 'Caters': 'False', 'Busines...","Restaurants, American (New), Food Court, Flowe...","{'Monday': '0:0-0:0', 'Tuesday': '7:0-22:0', '...",American (New),Food Court


## Load Review Data (Massive dataset)

In [25]:
# Number of lines (reviews) per chunk.
size = 500000

# Column types requested for the review fields.
review_dtypes = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'stars': int,
    'date': str,
    'text': str,
    'useful': int,
    'funny': int,
    'cool': int,
}

# Because chunksize is given, this does not return a DataFrame but a reader
# that is iterated over in the next cell, yielding one DataFrame per chunk.
review = pd.read_json(
    'yelp_academic_dataset_review.json',
    lines=True,
    dtype=review_dtypes,
    chunksize=size,
)

In [26]:
# `review` yields the review file one chunk at a time; each chunk is trimmed
# and merged immediately so only the rows we need are kept.
chunk_list = []
for chunk_review in review:
    # Drop columns that aren't needed and rename the per-review rating so it
    # does not clash with the business-level `stars` column.
    chunk_review = (
        chunk_review
        .drop(columns=['review_id', 'useful', 'funny', 'cool'])
        .rename(columns={'stars': 'review_stars'})
    )
    # Inner merge with the prepared business table so only reviews of the
    # selected businesses remain. `df_merged` holds one row per
    # (business, cat_kitchen, cat_type) combination, so a single review is
    # repeated once for every combination of its business.
    chunk_merged = pd.merge(df_merged, chunk_review, on='business_id', how='inner')
    # Progress feedback. Report merged rows (not reviews, because of the
    # repetition above) against the real chunk length, which can be smaller
    # than `size` for the final chunk.
    print(f"{len(chunk_merged):,} merged rows from {len(chunk_review):,} reviews in this chunk")
    chunk_list.append(chunk_merged)

# Stack all per-chunk results into one frame with a fresh 0..n-1 index.
# NOTE: this rebinds `df`, which until here held the business table.
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

435330 out of 500,000 related reviews
400770 out of 500,000 related reviews
387009 out of 500,000 related reviews
379983 out of 500,000 related reviews
369463 out of 500,000 related reviews
370686 out of 500,000 related reviews
366913 out of 500,000 related reviews
422926 out of 500,000 related reviews
385030 out of 500,000 related reviews
370800 out of 500,000 related reviews
374630 out of 500,000 related reviews
374870 out of 500,000 related reviews
402781 out of 500,000 related reviews
390999 out of 500,000 related reviews
427114 out of 500,000 related reviews
434869 out of 500,000 related reviews
455023 out of 500,000 related reviews
113635 out of 500,000 related reviews


In [27]:
# (rows, columns) of the combined business + review table.
df.shape

(6862831, 19)

## More data prep

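The raw reviews span 2004-10 to 2021-01. We keep only reviews from 2014-01-01 onwards; note that the filter below has no upper bound, so the incomplete year 2021 (January only) is still included.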

In [28]:
# Earliest and latest review date in the combined table.
print(df['date'].min())
print(df['date'].max())

2004-10-14 02:57:52
2021-01-28 15:23:52


In [29]:
# Check the dtype of the date column before converting it.
df.date.dtypes

dtype('O')

In [30]:
# Parse the date column into pandas datetimes so it can be compared with a
# Timestamp in the filter further down.
df['date'] = pd.to_datetime(df['date'])

In [31]:
# Confirm the dtype of the date column after the conversion.
df.date.dtypes

dtype('<M8[ns]')

In [35]:
# Lower bound for the review dates: midnight on 1 January 2014.
time_start = pd.Timestamp(year=2014, month=1, day=1)

In [36]:
# Keep reviews dated on or after `time_start`; no upper bound is applied.
df = df[df['date'].ge(time_start)]

In [37]:
# Earliest and latest review date after the filter, to verify the cut-off.
print(df['date'].min())
print(df['date'].max())

2014-01-01 00:01:51
2021-01-28 15:23:52


Save to csv file

In [38]:
# Directory the CSV export is written to. Set the YELP_OUTPUT_DIR environment
# variable to run the notebook on another machine; the original location is
# kept as the fallback so existing setups behave as before.
path = os.environ.get(
    'YELP_OUTPUT_DIR',
    'C:\\Users\\Miche\\OneDrive - Danmarks Tekniske Universitet\\MMC\\2. Semester\\Social Data\\websites\\Restaurant-Guide\\data',
)
# The export cell writes by bare file name, so it lands in this directory.
os.chdir(path)
os.getcwd()

'C:\\Users\\Miche\\OneDrive - Danmarks Tekniske Universitet\\MMC\\2. Semester\\Social Data\\websites\\Restaurant-Guide\\data'

In [39]:
# Write the filtered table to the current working directory.
# index=False leaves the DataFrame index out of the file.
csv_name = "yelp_reviews_RV_categories.csv"
df.to_csv(csv_name, index=False)