In [1]:
import pandas as pd
import numpy as np

# Fastfood Dataset

In [3]:
# Load the fast-food restaurant listing from CSV and preview the first rows
df_fastfood = pd.read_csv(
    '../data/FastFoodRestaurants.csv',
)
df_fastfood.head(n=3)

Unnamed: 0,address,city,country,keys,latitude,longitude,name,postalCode,province,websites
0,324 Main St,Massena,US,us/ny/massena/324mainst/-1161002137,44.9213,-74.89021,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
1,530 Clinton Ave,Washington Court House,US,us/oh/washingtoncourthouse/530clintonave/-7914...,39.53255,-83.44526,Wendy's,43160,OH,http://www.wendys.com
2,408 Market Square Dr,Maysville,US,us/ky/maysville/408marketsquaredr/1051460804,38.62736,-83.79141,Frisch's Big Boy,41056,KY,"http://www.frischs.com,https://www.frischs.com..."


In [4]:
# Report the number of rows (entities) and columns (attributes),
# then display the column labels
print(df_fastfood.shape[0])
print(df_fastfood.shape[1])
df_fastfood.columns

10000
10


Index(['address', 'city', 'country', 'keys', 'latitude', 'longitude', 'name',
       'postalCode', 'province', 'websites'],
      dtype='object')

In [5]:
# Fraction of missing values per column.
# Empty strings are converted to NaN first so they count as missing.
# NOTE(review): pd.read_csv already parses empty fields as NaN by default,
# so the replacement may be a no-op here — confirm.
df_fastfood.replace(to_replace='', value=np.nan, inplace=True)
df_fastfood.isna().mean()

address       0.0000
city          0.0000
country       0.0000
keys          0.0000
latitude      0.0000
longitude     0.0000
name          0.0000
postalCode    0.0000
province      0.0000
websites      0.0465
dtype: float64

In [129]:
# Count distinct (city, country, province) combinations
df_fastfood.drop_duplicates(subset=['city', 'country', 'province']).shape[0]

3439

In [130]:
# The 15 (city, country, province) groups with the most rows
(
    df_fastfood
    .groupby(['city', 'country', 'province'])
    .size()
    .sort_values(ascending=False)
    .head(15)
)

city           country  province
Cincinnati     US       OH          119
Las Vegas      US       NV           65
Houston        US       TX           62
Miami          US       FL           57
Denver         US       CO           51
Chicago        US       IL           51
Phoenix        US       AZ           42
Oklahoma City  US       OK           41
Atlanta        US       GA           38
New York       US       NY           36
Albuquerque    US       NM           35
Rapid City     US       SD           34
Saint Paul     US       MN           33
Los Angeles    US       CA           33
Saint Louis    US       MO           33
dtype: int64

# Yelp Dataset

In [6]:
# Load the Yelp business records (line-delimited JSON), then drop rows
# that have no `categories` value; print the row count before and after
df_yelp_business = pd.read_json(
    '../data/yelp-dataset/yelp_academic_dataset_business.json',
    lines=True,
)
print(df_yelp_business.shape[0])
df_yelp_business = df_yelp_business.dropna(subset=['categories'])
print(df_yelp_business.shape[0])
df_yelp_business.head(n=2)

188593
188052


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'RestaurantsPriceRange2': '2', 'BikeParking':...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Thursday': '11:0-21:0', 'Tuesday': '11:0-21:...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'GoodForMeal': '{'dessert': False, 'latenight...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Sunday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV


In [7]:
# filter restaurants
df_yelp_business_restaurants = df_yelp_business.loc[df_yelp_business['categories'].str.contains('Restaurants')]
print(len(df_yelp_business_restaurants))
df_yelp_business_restaurants.head(3)

57173


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'RestaurantsPriceRange2': '2', 'BikeParking':...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Thursday': '11:0-21:0', 'Tuesday': '11:0-21:...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'GoodForMeal': '{'dessert': False, 'latenight...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Sunday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV
2,1335 rue Beaubien E,"{'GoodForMeal': '{'dessert': False, 'latenight...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Thursday': '10:0-22:0', 'Tuesday': '10:0-22:...",0,45.540503,-73.5993,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC


Check whether the Yelp restaurant data contains fast-food chains

In [8]:
# Name fragments used to spot fast-food chains in the Yelp restaurant names
chains = ['Donald', 'Burger King', 'Subway', 'Wendy', 'Domino']

for chain in chains:
    # Filter once per chain and reuse the result for both the count and the preview.
    # regex=False matches the fragment literally; na=False treats a missing name
    # as "no match" instead of producing a NaN entry in the boolean mask.
    chain_matches = df_yelp_business_restaurants.loc[
        df_yelp_business_restaurants['name'].str.contains(chain, regex=False, na=False)
    ]
    print(chain, len(chain_matches))
    print(chain_matches[['address', 'city', 'name']].head(2))
    print()

Donald 810
                     address        city        name
107        901 E Arrowood Rd   Charlotte   McDonalds
486  8001 E Indian School Rd  Scottsdale  McDonald's

Burger King 304
              address         city                     name
878   6010 Dixie Road  Mississauga  Burger King Restaurants
1579   1201 W Main St         Mesa              Burger King

Subway 745
                  address       city    name
32    2255 N Rampart Blvd  Las Vegas  Subway
59  2812 W Sugar Creek Rd  Charlotte  Subway

Wendy 289
                 address     city     name
106  7109 MacLeod Trl SW  Calgary  Wendy's
675          660 Park St  Belmont  Wendy's

Domino 245
                         address       city            name
321               3214 17 Ave SW    Calgary  Domino's Pizza
391  8532 Del Webb Blvd, Ste 116  Las Vegas  Domino's Pizza



Flatten the nested `attributes` column into separate columns

In [None]:
# Attribute keys that are kept as columns in the flattened restaurant frame
selected_attribute_columns = [
    'BikeParking', 'BusinessAcceptsCreditCards', 'BusinessParking', 'GoodForKids',
    'HasTV', 'NoiseLevel', 'OutdoorSeating', 'RestaurantsAttire',
    'RestaurantsDelivery', 'RestaurantsGoodForGroups', 'RestaurantsPriceRange2',
    'RestaurantsReservations', 'RestaurantsTakeOut',
    'DriveThru', 'GoodForMeal', 'RestaurantsTableService', 'WheelchairAccessible', 'WiFi',
]

# Expand each row's `attributes` dict into one column per key.
# The frame is built from the list of dicts in a single constructor call, which
# avoids creating a separate pd.Series for every row as `.apply(pd.Series)` does.
# A value that is not a dict is treated as having no attributes (all-NaN row).
attributes_flattened = pd.DataFrame(
    [
        attrs if isinstance(attrs, dict) else {}
        for attrs in df_yelp_business_restaurants['attributes']
    ],
    index=df_yelp_business_restaurants.index,  # keep row labels aligned for the concat below
)

# Replace the nested column with the selected flattened attribute columns
df_yelp_business_restaurants_flattened = pd.concat(
    [
        df_yelp_business_restaurants.drop(columns=['attributes']),
        attributes_flattened[selected_attribute_columns],
    ],
    axis=1,
)

In [None]:
# List every attribute key that became a column after flattening
attributes_flattened.columns

In [None]:
# Preview the first rows of the flattened restaurant frame
df_yelp_business_restaurants_flattened.head(n=3)

Number of entities, Number of attributes

In [None]:
# Report the number of rows (entities) and columns (attributes),
# then display the column labels
print(df_yelp_business_restaurants_flattened.shape[0])
print(df_yelp_business_restaurants_flattened.shape[1])
df_yelp_business_restaurants_flattened.columns

Missing Values (more than 30%)

In [None]:
# Convert empty strings to NaN (in place) so they are treated as missing values
df_yelp_business_restaurants_flattened.replace(to_replace='', value=np.nan, inplace=True)

In [None]:
# Fraction of missing values per column
df_yelp_business_restaurants_flattened.isna().mean()

City entities

In [None]:
# Count distinct (city, state) combinations
df_yelp_business_restaurants_flattened.drop_duplicates(subset=['city', 'state']).shape[0]

In [None]:
# The 15 (city, state) groups with the most rows
(
    df_yelp_business_restaurants_flattened
    .groupby(['city', 'state'])
    .size()
    .sort_values(ascending=False)
    .head(15)
)

In [None]:
# TODO: save as csv (no export code written yet)
