# (1) Data Preparation

## Introduction

This notebook uses the well-known Yelp academic dataset.

**Description of the data:**

Get the data <a href='https://www.yelp.com/dataset' target='_blank'>here</a>


In [1]:
#!pip install pandas-profiling[notebook,html]
#!pip install plotly
#!pip install chart_studio


In [2]:
# plotly.offline doesn't push your charts to the cloud
import plotly.offline as pyo
#allows us to create the Data and Figure objects
from plotly.graph_objs import *
#plotly.plotly pushes your charts to the cloud  
# import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.express as px

# set the default plotly template (theme) used by all figures
import plotly.io as pio
pio.templates.default = "plotly_white"

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
from pandas_profiling import ProfileReport
import datetime as dt

import numpy as np

import os
import json
import re
import sys


Bad key "text.kerning_factor" on line 4 in
D:\Anaconda3\envs\tf2\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
print(pd.__version__)

1.2.1


In [4]:
def print_full(x):
    """Turn off pandas column-width truncation and hand ``x`` back unchanged.

    Side effect: the global ``display.max_colwidth`` option is set to ``None``
    and stays that way until it is reset (see ``reset()``).
    """
    pd.options.display.max_colwidth = None
    return x

def reset():
    """Restore pandas' ``display.max_colwidth`` option to its default value."""
    option_name = 'display.max_colwidth'
    pd.reset_option(option_name)

### Reading and Cleaning The Business Data 

In [5]:
# The business file is line-delimited JSON: one record per line
business = pd.read_json(
    path_or_buf='data/yelp_dataset/yelp_academic_dataset_business.json',
    orient='records',
    lines=True,
)

In [6]:
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209393 entries, 0 to 209392
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   209393 non-null  object 
 1   name          209393 non-null  object 
 2   address       209393 non-null  object 
 3   city          209393 non-null  object 
 4   state         209393 non-null  object 
 5   postal_code   209393 non-null  object 
 6   latitude      209393 non-null  float64
 7   longitude     209393 non-null  float64
 8   stars         209393 non-null  float64
 9   review_count  209393 non-null  int64  
 10  is_open       209393 non-null  int64  
 11  attributes    180348 non-null  object 
 12  categories    208869 non-null  object 
 13  hours         164550 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 22.4+ MB


In [7]:
business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...","Health & Medical, Fitness & Instruction, Yoga,...",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Hardware Stores, Home Services, Building Suppl...","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '..."


Keep only the businesses that are still open.

In [8]:
# Keep only rows flagged as open (is_open: 1 = open, 0 = closed)
business = business.loc[business['is_open'].eq(1)]
business.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168903 entries, 0 to 209392
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   168903 non-null  object 
 1   name          168903 non-null  object 
 2   address       168903 non-null  object 
 3   city          168903 non-null  object 
 4   state         168903 non-null  object 
 5   postal_code   168903 non-null  object 
 6   latitude      168903 non-null  float64
 7   longitude     168903 non-null  float64
 8   stars         168903 non-null  float64
 9   review_count  168903 non-null  int64  
 10  is_open       168903 non-null  int64  
 11  attributes    143470 non-null  object 
 12  categories    168401 non-null  object 
 13  hours         137502 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 19.3+ MB


Drop the irrelevant columns (`business_id` must be kept so the reviews can be merged in later).

In [9]:
# 'is_open' is constant after the open-only filter above
drop_columns = ['hours', 'is_open']
business = business.drop(columns=drop_columns)

In [10]:
# One row per (business, category): split the ', '-separated string into a list,
# then explode the list into separate rows
business_explode = (
    business
    .assign(categories=lambda frame: frame['categories'].str.split(', '))
    .explode('categories')
)

In [11]:
business_explode.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",Active Life
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",Gun/Rifle Ranges
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",Guns & Ammo
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",Shopping
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...",Health & Medical


### Keep only Restaurants with High Review Volume

In [13]:
# Rows without a categories value cannot be matched against 'Restaurants'
business = business.loc[business['categories'].notna()]

In [14]:
# Case-insensitive match of 'Restaurants' anywhere in the category string
restaurants = business.loc[
    business['categories'].str.contains('Restaurants', flags=re.IGNORECASE)
]
restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
8,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,61820,40.110446,-88.233073,4.5,5,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...","Ethnic Food, Food Trucks, Specialty Food, Impo..."
24,eBEfgOPG7pvFhb2wcG9I7w,Philthy Phillys,"15480 Bayview Avenue, unit D0110",Aurora,ON,L4G 7J1,44.010962,-79.448677,4.5,4,"{'RestaurantsTableService': 'False', 'Restaura...","Restaurants, Cheesesteaks, Poutineries"
25,lu7vtrp_bE9PnxWfA8g4Pg,Banzai Sushi,300 John Street,Thornhill,ON,L3T 5W4,43.820492,-79.398466,4.5,7,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...","Japanese, Fast Food, Food Court, Restaurants"
30,9sRGfSVEfLhN_km60YruTA,Apadana Restaurant,13071 Yonge Street,Richmond Hill,ON,L4E 1A5,43.947011,-79.454862,3.0,3,"{'Ambience': '{'touristy': False, 'hipster': F...","Persian/Iranian, Turkish, Middle Eastern, Rest..."
33,vjTVxnsQEZ34XjYNS-XUpA,Wetzel's Pretzels,"4550 East Cactus Rd, #KSFC-4",Phoenix,AZ,85032,33.602822,-111.983533,4.0,10,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...","Food, Pretzels, Bakeries, Fast Food, Restaurants"


In [15]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43965 entries, 8 to 209390
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   43965 non-null  object 
 1   name          43965 non-null  object 
 2   address       43965 non-null  object 
 3   city          43965 non-null  object 
 4   state         43965 non-null  object 
 5   postal_code   43965 non-null  object 
 6   latitude      43965 non-null  float64
 7   longitude     43965 non-null  float64
 8   stars         43965 non-null  float64
 9   review_count  43965 non-null  int64  
 10  attributes    42804 non-null  object 
 11  categories    43965 non-null  object 
dtypes: float64(3), int64(1), object(8)
memory usage: 4.4+ MB


In [16]:
# Threshold on the summed review_count per restaurant name; names must exceed it to be kept
n_review = 3_000

In [19]:
# Sum review_count over every row that shares the same restaurant name
review_count_by_restaurant = (
    restaurants['review_count']
    .groupby(restaurants['name'])
    .sum()
)
review_count_by_restaurant.head(10)

name
 China                       10
#1 Hawaiian Barbecue         47
#1Brothers Pizza             84
&Company Resto Bar           54
'ONO Poké Bar                90
00 Gelato                    67
0109 Dessert & Chocolate    213
1 Brother's Pizza            36
1 Brothers Pizza             68
1 Hawaiian Barbecue         140
Name: review_count, dtype: int64

In [20]:
# Names whose summed review count is strictly greater than n_review
big_restaurant_names = review_count_by_restaurant.loc[
    review_count_by_restaurant.gt(n_review)
].index
big_restaurant_names

Index(['Angry Crab Shack', 'Applebee's Grill + Bar',
       'BJ's Restaurant & Brewhouse', 'Bacchanal Buffet', 'Barro's Pizza',
       'Black Bear Diner', 'Blaze Fast-Fire'd Pizza', 'Bobby Q', 'Bouchon',
       'Buca di Beppo Italian Restaurant',
       ...
       'The Habit Burger Grill', 'The Peppermill Restaurant & Fireside Lounge',
       'The Venetian Las Vegas', 'Topgolf', 'Tropical Smoothie Cafe',
       'Wendy's', 'Wicked Spoon', 'Yard House',
       'Yardbird Southern Table & Bar', 'Zipps Sports Grill'],
      dtype='object', name='name', length=110)

In [21]:
# Keep every restaurant row whose name is in the high-volume list
big_restaurant = restaurants.loc[restaurants['name'].isin(big_restaurant_names)]

In [22]:
big_restaurant.groupby('name')['review_count'].sum().sort_values()

name
Circus Circus Las Vegas Hotel and Casino     3037
Pampas Las Vegas                             3040
Lazy Dog Restaurant & Bar                    3041
The Habit Burger Grill                       3079
Barro's Pizza                                3079
                                            ...  
Mon Ami Gabi                                 9264
Bacchanal Buffet                            10129
Chipotle Mexican Grill                      10789
Hash House A Go Go                          12767
McDonald's                                  18980
Name: review_count, Length: 110, dtype: int64

In [23]:
reset()

In [26]:
big_restaurant.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
49,tLpkSwdtqqoXwU0JAGnApw,Wendy's,4602 Northfield Road,Cleveland,OH,44128,41.434614,-81.527026,3.5,7,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Fast Food, Burgers"
101,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants"
250,ypILNgy7QFskKAdcPKB2RQ,KFC,1667 Ebenezer Rd,Rock Hill,SC,29732,34.956131,-81.046257,2.0,5,"{'GoodForKids': 'True', 'WiFi': ''free'', 'Res...","Restaurants, Fast Food, Chicken Shop, Chicken ..."
280,CfwrsG76Wm4iLS22v_wAcg,McDonald's,6421 Pearl Rd,Parma Heights,OH,44130,41.385949,-81.768113,2.5,15,"{'Ambience': '{'romantic': False, 'intimate': ...","Restaurants, Coffee & Tea, Burgers, Fast Food,..."
283,_V4CzzA7Z9h4qyLIdG-KUg,Wendy's,5214 Sunset Rd,Charlotte,NC,28269,35.307994,-80.841429,1.5,18,"{'RestaurantsTakeOut': 'True', 'GoodForKids': ...","Restaurants, Hot Dogs, Burgers, Fast Food"


In [27]:
big_restaurant.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5909 entries, 49 to 209390
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   5909 non-null   object 
 1   name          5909 non-null   object 
 2   address       5909 non-null   object 
 3   city          5909 non-null   object 
 4   state         5909 non-null   object 
 5   postal_code   5909 non-null   object 
 6   latitude      5909 non-null   float64
 7   longitude     5909 non-null   float64
 8   stars         5909 non-null   float64
 9   review_count  5909 non-null   int64  
 10  attributes    5838 non-null   object 
 11  categories    5909 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 600.1+ KB


#### Additional Cleaning 

In [29]:
big_restaurant.isna().sum()

business_id      0
name             0
address          0
city             0
state            0
postal_code      0
latitude         0
longitude        0
stars            0
review_count     0
attributes      71
categories       0
dtype: int64

## Reading the Reviews 

In [31]:
import gc
gc.collect()

20

In [32]:
# Number of review rows per chunk (used as chunksize when reading the review file)
size = 100_000

In [34]:
# Explicit dtypes for the review columns
review_dtypes = {
    'review_id': str, 'user_id': str, 'business_id': str,
    'text': str, 'date': str,
    'stars': int, 'useful': int, 'funny': int, 'cool': int,
}
# With chunksize set, read_json returns a reader that yields DataFrames of
# up to `size` rows instead of loading the whole file at once
review_reader = pd.read_json(
    'data/yelp_dataset/yelp_academic_dataset_review.json',
    lines=True,
    dtype=review_dtypes,
    chunksize=size,
)



In [36]:
review_reader

<pandas.io.json._json.JsonReader at 0x1a8ca028f88>

In [37]:
# Stream the review file chunk by chunk, keeping only the reviews that belong
# to the selected restaurants, then stitch the pieces into one DataFrame.
# NOTE: review_reader is consumed by this loop; re-create it (re-run the cell
# that builds it) before running this cell a second time.
chunk_list = []
for review_chunk in review_reader:
    review_chunk = (
        review_chunk
        # Drop columns that aren't needed
        .drop(columns=['review_id', 'funny', 'cool'])
        # Rename to avoid a clash with the business-level 'stars' rating
        .rename(columns={'stars': 'review_stars'})
    )
    # Inner merge so only reviews of the remaining businesses survive
    chunk_merged = pd.merge(big_restaurant, review_chunk, on='business_id', how='inner')
    # Progress feedback, reported against the rows actually read: the final
    # chunk can be shorter than the configured chunk size
    print(f"{chunk_merged.shape[0]} out of {len(review_chunk):,} related reviews")
    chunk_list.append(chunk_merged)
# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True, axis=0)

5897 out of 100,000 related reviews
5876 out of 100,000 related reviews
5990 out of 100,000 related reviews
6115 out of 100,000 related reviews
5886 out of 100,000 related reviews
5568 out of 100,000 related reviews
5333 out of 100,000 related reviews
5642 out of 100,000 related reviews
7217 out of 100,000 related reviews
7198 out of 100,000 related reviews
7227 out of 100,000 related reviews
7234 out of 100,000 related reviews
7157 out of 100,000 related reviews
7219 out of 100,000 related reviews
6474 out of 100,000 related reviews
6353 out of 100,000 related reviews
6831 out of 100,000 related reviews
6735 out of 100,000 related reviews
6743 out of 100,000 related reviews
6785 out of 100,000 related reviews
6899 out of 100,000 related reviews
7131 out of 100,000 related reviews
6830 out of 100,000 related reviews
7235 out of 100,000 related reviews
8780 out of 100,000 related reviews
8681 out of 100,000 related reviews
8606 out of 100,000 related reviews
8874 out of 100,000 related 

In [38]:
gc.collect()

20

In [39]:
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,user_id,review_stars,useful,text,date
0,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants",OE5MNd5PVORXxcrHEoWPdA,1,1,Do not stop here if you are wanting fast food....,2016-07-04 21:04:10
1,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants",z0O_MNIPTvwjw-YCjCM5rw,1,0,I was really irritated because i said no chees...,2018-05-20 08:12:21
2,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants",7S2wwOSVSRn4CEZdtQKG_Q,4,3,"""Yo Quiero Taco Bell!"" \n\nFor a Taco Bell thi...",2014-05-12 00:45:44
3,ypILNgy7QFskKAdcPKB2RQ,KFC,1667 Ebenezer Rd,Rock Hill,SC,29732,34.956131,-81.046257,2.0,5,"{'GoodForKids': 'True', 'WiFi': ''free'', 'Res...","Restaurants, Fast Food, Chicken Shop, Chicken ...",tDXj_pyGBFrQ9dj29uErBA,1,0,Ordered 2 large sides at drive up window. Got ...,2017-03-04 12:58:01
4,CfwrsG76Wm4iLS22v_wAcg,McDonald's,6421 Pearl Rd,Parma Heights,OH,44130,41.385949,-81.768113,2.5,15,"{'Ambience': '{'romantic': False, 'intimate': ...","Restaurants, Coffee & Tea, Burgers, Fast Food,...",VULFcmvS5-zp1wt-aokPwA,1,0,Try cleaning your play area once in a while. I...,2016-02-13 22:57:36


In [42]:
df.isna().sum()

business_id       0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars             0
review_count      0
attributes      336
categories        0
user_id           0
review_stars      0
useful            0
text              0
date              0
dtype: int64

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564245 entries, 0 to 564244
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   564245 non-null  object 
 1   name          564245 non-null  object 
 2   address       564245 non-null  object 
 3   city          564245 non-null  object 
 4   state         564245 non-null  object 
 5   postal_code   564245 non-null  object 
 6   latitude      564245 non-null  float64
 7   longitude     564245 non-null  float64
 8   stars         564245 non-null  float64
 9   review_count  564245 non-null  int64  
 10  attributes    563909 non-null  object 
 11  categories    564245 non-null  object 
 12  user_id       564245 non-null  object 
 13  review_stars  564245 non-null  int32  
 14  useful        564245 non-null  int32  
 15  text          564245 non-null  object 
 16  date          564245 non-null  object 
dtypes: float64(3), int32(2), int64(1), object(11)
me

In [44]:
df['name'].value_counts()

McDonald's                                  19849
Hash House A Go Go                          13004
Chipotle Mexican Grill                      11188
Bacchanal Buffet                            10417
Mon Ami Gabi                                 9536
                                            ...  
Barro's Pizza                                3227
Lazy Dog Restaurant & Bar                    3185
The Habit Burger Grill                       3184
Circus Circus Las Vegas Hotel and Casino     3110
Pampas Las Vegas                             3098
Name: name, Length: 110, dtype: int64

In [49]:
df.memory_usage(deep=True).sum()/1024**2

1194.603063583374

### Formulation of the Problem

We'll treat the problem as a multi-class classification problem, where we have 3 classes:

- Positive: corresponds to 4 & 5-star reviews

- Negative: corresponds to 1 & 2-star reviews

- Neutral: corresponds to 3-star reviews

In [50]:
def star_to_sentiment(star_rating):
    """Map a star rating to a sentiment label.

    Ratings above 3 give 'Positive', ratings below 3 give 'Negative', and
    every other value (such as exactly 3) gives 'Neutral'.
    """
    if star_rating > 3:
        return 'Positive'
    if star_rating < 3:
        return 'Negative'
    return 'Neutral'

In [51]:
# Label each review from its own star rating (review_stars, not the business-level stars)
df['sentiment'] = df['review_stars'].map(star_to_sentiment)

In [52]:
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,user_id,review_stars,useful,text,date,sentiment
0,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants",OE5MNd5PVORXxcrHEoWPdA,1,1,Do not stop here if you are wanting fast food....,2016-07-04 21:04:10,Negative
1,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants",z0O_MNIPTvwjw-YCjCM5rw,1,0,I was really irritated because i said no chees...,2018-05-20 08:12:21,Negative
2,MTx-Zdl_KcU_z9G832XAjg,Taco Bell,8033 N Durango Dr,Las Vegas,NV,89131,36.307495,-115.279059,2.5,37,"{'OutdoorSeating': 'False', 'Alcohol': 'u'none...","Fast Food, Restaurants",7S2wwOSVSRn4CEZdtQKG_Q,4,3,"""Yo Quiero Taco Bell!"" \n\nFor a Taco Bell thi...",2014-05-12 00:45:44,Positive
3,ypILNgy7QFskKAdcPKB2RQ,KFC,1667 Ebenezer Rd,Rock Hill,SC,29732,34.956131,-81.046257,2.0,5,"{'GoodForKids': 'True', 'WiFi': ''free'', 'Res...","Restaurants, Fast Food, Chicken Shop, Chicken ...",tDXj_pyGBFrQ9dj29uErBA,1,0,Ordered 2 large sides at drive up window. Got ...,2017-03-04 12:58:01,Negative
4,CfwrsG76Wm4iLS22v_wAcg,McDonald's,6421 Pearl Rd,Parma Heights,OH,44130,41.385949,-81.768113,2.5,15,"{'Ambience': '{'romantic': False, 'intimate': ...","Restaurants, Coffee & Tea, Burgers, Fast Food,...",VULFcmvS5-zp1wt-aokPwA,1,0,Try cleaning your play area once in a while. I...,2016-02-13 22:57:36,Negative


In [53]:
df['attributes'].iloc[0]

{'OutdoorSeating': 'False',
 'Alcohol': "u'none'",
 'RestaurantsAttire': "u'casual'",
 'BusinessAcceptsCreditCards': 'True',
 'NoiseLevel': "u'average'",
 'Ambience': "{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}",
 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}",
 'RestaurantsReservations': 'False',
 'RestaurantsTakeOut': 'True',
 'RestaurantsPriceRange2': '1',
 'WiFi': "u'no'",
 'RestaurantsGoodForGroups': 'True',
 'GoodForKids': 'True',
 'RestaurantsDelivery': 'False',
 'BikeParking': 'False',
 'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}",
 'DriveThru': 'True',
 'Caters': 'False',
 'HasTV': 'True',
 'DogsAllowed': 'False'}

In [54]:
# Make sure the output folder exists before the cleaned CSV is written.
# os.makedirs is the correct API (os.path has no makedirs attribute, so the
# previous call raised AttributeError whenever the folder was missing);
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs('data', exist_ok=True)

In [55]:
# Persist the merged business + review frame; the index is not written
df.to_csv(path_or_buf='data/data_clean_1.csv', index=False)

In [56]:
# Reload the saved file, parsing 'date' into datetimes.
# NOTE(review): after this CSV round trip postal_code reports 564240 non-null
# values versus 564245 before saving; presumably empty strings are read back
# as NaN. Confirm before relying on that column.
df = pd.read_csv(
    filepath_or_buffer='data/data_clean_1.csv',
    parse_dates=['date'],
)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564245 entries, 0 to 564244
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   business_id   564245 non-null  object        
 1   name          564245 non-null  object        
 2   address       564245 non-null  object        
 3   city          564245 non-null  object        
 4   state         564245 non-null  object        
 5   postal_code   564240 non-null  object        
 6   latitude      564245 non-null  float64       
 7   longitude     564245 non-null  float64       
 8   stars         564245 non-null  float64       
 9   review_count  564245 non-null  int64         
 10  attributes    563909 non-null  object        
 11  categories    564245 non-null  object        
 12  user_id       564245 non-null  object        
 13  review_stars  564245 non-null  int64         
 14  useful        564245 non-null  int64         
 15  text          564