# 3. Feature Engineering

In [1]:
# Import python packages
import ast

import numpy as np
import pandas as pd

## 3.1 Business Features

In [2]:
# Read the cleaned business data (only the business file is loaded in this cell)
business = pd.read_csv('clean_business.csv')
business.shape

(2493, 13)

In [3]:
# Preview the first rows of the raw business table
business.head()

Unnamed: 0,attributes,business_id,categories,city,latitude,longitude,name,postal_code,review_count,stars,state,isBankrupt,county
0,"{'RestaurantsTableService': False, 'GoodForMea...",rDMptJYWtnMhpQu_rRXHng,"['Fast Food', 'Burgers', 'Restaurants']",Phoenix,33.60707,-112.064382,McDonald's,85022,10,1.0,AZ,0,Maricopa County
1,"{'RestaurantsTableService': True, 'GoodForMeal...",1WBkAuQg81kokZIPMpn9Zg,"['Burgers', 'Restaurants']",Phoenix,33.60731,-112.063404,Charr An American Burger Bar,85022,232,3.0,AZ,0,Maricopa County
2,"{'RestaurantsTableService': False, 'GoodForMea...",iPa__LOhse-hobC2Xmp-Kw,"['Restaurants', 'Burgers', 'Fast Food']",Phoenix,33.508765,-112.04624,McDonald's,85016,34,3.0,AZ,0,Maricopa County
3,"{'BusinessAcceptsCreditCards': True, 'Business...",YhV93k9uiMdr3FlV4FHjwA,"['Marketing', ""Men's Clothing"", 'Restaurants',...",Phoenix,33.449967,-112.070222,Caviness Studio,85001,4,5.0,AZ,0,Maricopa County
4,"{'RestaurantsTableService': True, 'GoodForMeal...",QkG3KUXwqZBW18A9k1xqCA,"['American (Traditional)', 'Restaurants', 'Sea...",Phoenix,33.478735,-112.221379,Red Lobster,85035,37,2.5,AZ,0,Maricopa County


In [4]:
# Work on a copy so the frame loaded from disk (`business`) stays unmodified
business_df = business.copy()

## 3.1.1 Create isChain

We define the `isChain` variable by checking whether the restaurant's name appears more than once in the dataset.

In [5]:
# Create new feature isChain: 1 when the restaurant name occurs more than once, else 0
name_counts = business_df['name'].value_counts()
chain_dict = name_counts.to_dict()  # name -> number of occurrences, kept as a plain dict
# Vectorised lookup of each row's name count. A name that is absent from the
# counts (value_counts skips NaN) maps to NaN and therefore yields 0 instead of
# raising KeyError the way a direct dict lookup would.
business_df['isChain'] = business_df['name'].map(name_counts).gt(1).astype(int)

## 3.1.2 Create Neighborhood Density (Do this later)

## 3.1.3 Extract Business Categories

In [6]:
# Inspect the Python type of the first `categories` value as loaded from the CSV
type(business_df['categories'][0])

str

In [7]:
# Convert the stringified category lists read from the CSV back into Python lists.
# ast.literal_eval only accepts Python literals, so text in the file cannot be
# executed as code the way it could with eval().
# Values that are not strings (e.g. lists already parsed on a re-run of this cell)
# are passed through unchanged.
business_df['categories'] = business_df['categories'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [8]:
# Flatten every business's category list, then keep the distinct labels (sorted by np.unique)
flat_categories = []
for category_list in business_df['categories']:
    flat_categories.extend(category_list)
categories = np.unique(flat_categories)
print("Number of unique categories:", len(categories))

Number of unique categories: 271


## Restaurant categories I - Ethnicity

### a) Extract all ethnics

In [9]:
# Category labels treated as ethnic cuisines. According to the original note these
# are all the ethnics found in the dataset; the list order is also the key order of
# the count dictionary built from it.
all_ethnics = ['Afghan', 'African', 'American (New)', 'American (Traditional)', 
               'Arabian', 'Argentine','Armenian', 'Asian Fusion', 'British', 
               'Cajun/Creole', 'Cambodian', 'Cantonese', 'Caribbean', 'Chinese', 
               'Indian', 'Cuban', 'Empanadas', 'Ethiopian', 'Filipino', 'French', 
               'Greek', 'Hawaiian', 'Irish', 'Italian', 'Japanese', 'Korean', 
               'Kosher', 'Halal', 'Latin American', 'Lebanese', 'Mediterranean', 
               'Mexican', 'Middle Eastern', 'Modern European', 'Mongolian',
               'Moroccan', 'Pakistani', 'Persian/Iranian', 'Peruvian', 'Puerto Rican', 
               'Russian', 'Salvadoran', 'Southern', 'Spanish', 'Szechuan', 
               'Thai', 'Tex-Mex', 'Vietnamese', 'Turkish', 'Ukrainian', 'Uzbek']

In [10]:
# Keep, for every business, only the category labels listed in all_ethnics
def _pick_ethnics(labels):
    """Return the entries of `labels` that appear in all_ethnics, in their original order."""
    return [label for label in labels if label in all_ethnics]

business_df['ethnics'] = business_df['categories'].apply(_pick_ethnics)

In [11]:
# Count, for each ethnic label, the number of businesses whose `ethnics` list contains it
ethnics_dict = {
    ethnic_name: sum(ethnic_name in row for row in business_df['ethnics'])
    for ethnic_name in all_ethnics
}
print('Total Number of Ethnics:', len(ethnics_dict))

Total Number of Ethnics: 51


In [12]:
# Turn the count dictionary into a two-column table ordered from most to least frequent
ethnics_df = (
    pd.DataFrame(list(ethnics_dict.items()), columns=['Ethnics', 'Count'])
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)
ethnics_df.head()

Unnamed: 0,Ethnics,Count
0,Mexican,490
1,American (Traditional),339
2,American (New),246
3,Italian,190
4,Chinese,144


### b) Combine ethnicity into smaller groups

Since there are too many ethnic categories in the dataset, we combine them into a smaller number of groups.

In [13]:
# Combine ethnics into fewer categories: group name -> original labels merged into it.
# Some group names ('African', 'Chinese', 'Mexican') are also members of their own group.
# NOTE(review): 'Laotian' is not listed in all_ethnics, so it can never be present in
# business_df['ethnics'] and this entry currently has no effect — confirm whether it
# should be added to all_ethnics.
ethnic_dict = {
    'African': ['African', 'Ethiopian', 'Moroccan'],
    'American': ['American (New)', 'American (Traditional)', 'Hawaiian', 'Southern'],
    'AsiaContinent': ['Mongolian', 'Russian', 'Ukrainian', 'Uzbek'],
    'Chinese': ['Cantonese', 'Chinese', 'Szechuan'],
    'EastAsia': ['Japanese', 'Korean', 'Asian Fusion'],
    'European': ['British', 'French', 'Greek', 'Irish', 'Kosher', 'Modern European', 'Cajun/Creole', 'Spanish'],
    'LatinAmerica': ['Argentine', 'Caribbean', 'Cuban', 'Empanadas', 
                     'Latin American', 'Peruvian', 'Puerto Rican', 'Salvadoran'],
    'Mexican': ['Mexican','Tex-Mex'],
    'MiddleEast': ['Arabian', 'Armenian', 'Halal', 'Lebanese', 'Middle Eastern', 
                   'Turkish','Mediterranean', 'Afghan', 'Pakistani', 'Persian/Iranian'],
    'SouthEastAsia': ['Cambodian', 'Filipino', 'Laotian', 'Thai', 'Vietnamese','Indian']
}

In [14]:
# Merge ethnics into smaller categories
# Reverse lookup: original label -> name of the group that contains it.
# (Each label in ethnic_dict belongs to exactly one group, so the lookup is unambiguous.)
ethnic_group_of = {
    label: group for group, labels in ethnic_dict.items() for label in labels
}


def merge_ethnic(row):
    """Replace every label in `row` by its group name and de-duplicate.

    Labels that belong to no group in ethnic_dict are kept unchanged. The result
    is the sorted array returned by np.unique. Unlike the previous lambda, this
    does not depend on a global loop variable and needs a single pass over the
    column instead of one pass per group.
    """
    return np.unique([ethnic_group_of.get(x, x) for x in row])


business_df['ethnics'] = business_df['ethnics'].apply(merge_ethnic)

In [15]:
# Recount businesses per merged ethnic group (rows with no ethnic label are skipped)
ethnics = ethnic_dict.keys()
ethnics_dict = {
    group: sum(group in row for row in business_df['ethnics'] if len(row) > 0)
    for group in ethnics
}
print('Total Number of Ethnics:', len(ethnics_dict))

Total Number of Ethnics: 10


In [16]:
# Display the number of businesses per merged ethnic group
ethnics_dict

{'African': 9,
 'American': 542,
 'AsiaContinent': 5,
 'Chinese': 144,
 'EastAsia': 117,
 'European': 85,
 'LatinAmerica': 37,
 'Mexican': 496,
 'MiddleEast': 92,
 'SouthEastAsia': 78}

### c) Combine less frequent ethnic group

We will combine the ethnic groups that occur fewer than 50 times into `Other_Ethnic`.

In [17]:
# Groups counted for fewer than 50 businesses; these get folded into 'Other_Ethnic'
other_ethnic = [group for group, count in ethnics_dict.items() if count < 50]
other_ethnic

['African', 'AsiaContinent', 'LatinAmerica']

In [18]:
# Update feature ethnics: relabel the rare groups
def convert_to_other(row):
    """Replace every label found in other_ethnic by 'Other_Ethnic' and return the
    sorted unique labels (as produced by np.unique)."""
    relabelled = []
    for label in row:
        if label in other_ethnic:
            relabelled.append('Other_Ethnic')
        else:
            relabelled.append(label)
    return np.unique(relabelled)

business_df['ethnics'] = business_df['ethnics'].apply(convert_to_other)

In [19]:
# Recount after relabelling: the remaining groups first, then the 'Other_Ethnic' bucket
ethnics = [group for group in ethnics_dict.keys() if group not in other_ethnic]
ethnics.append('Other_Ethnic')
ethnics_dict = {
    group: sum(group in row for row in business_df['ethnics'] if len(row) > 0)
    for group in ethnics
}
print('Total Number of Ethnics:', len(ethnics_dict))

Total Number of Ethnics: 8


In [20]:
# Tabulate the final ethnic group counts, most frequent first
ethnics_df = (
    pd.DataFrame(list(ethnics_dict.items()), columns=['Ethnics', 'Count'])
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)
ethnics_df

Unnamed: 0,Ethnics,Count
0,American,542
1,Mexican,496
2,Chinese,144
3,EastAsia,117
4,MiddleEast,92
5,European,85
6,SouthEastAsia,78
7,Other_Ethnic,51


## Restaurant categories II - Food Types

### a) Extract all food types

In [21]:
# Food-type category labels, selected by manual inspection of the category list.
# The list order is also the key order of the count dictionary built from it.
all_types = ['Acai Bowls', 'Bagels', 'Bubble Tea', 'Burgers', 'Cheesesteaks', 'Chicken Wings', 
             'Creperies', 'Cupcakes', 'Custom Cakes', 'Donuts', 'Falafel','Fish & Chips', 
             'Fruits & Veggies', 'Gelato', 'Gluten-Free', 'Hot Dogs', 'Ice Cream & Frozen Yogurt', 
             'Imported Food', 'Juice Bars & Smoothies','Local Flavor', 'Macarons','Noodles', 
             'Pizza','Sandwiches','Pretzels','Ramen','Salad','Seafood','Shaved Ice', 'Soup', 
             'Tacos', 'Waffles', 'Wraps', 'Bakeries', 'Barbeque','Beverage Store', 'Buffets','Cafes',
             'Candy Stores','Cheese Shops','Chicken Shop','Chocolatiers & Shops', 'Coffee & Tea', 
             'Coffee Roasteries','Desserts', 'Delis', 'Fast Food','Internet Cafes','Organic Stores', 
             'Patisserie/Cake Shop','Popcorn Shops', 'Seafood Markets','Steakhouses','Tea Rooms', 'Vegan', 
             'Vegetarian','Bars', 'Beer', 'Beer Bar','Beer Gardens', 'Breweries', 'Cocktail Bars', 'Dive Bars',
             'Gay Bars', 'Gastropubs','Hotel bar','Irish Pub', 'Pubs', 'Speakeasies', 'Sports Bars',
             'Tapas Bars','Whiskey Bars','Wine & Spirits','Wine Bars']

In [22]:
# Create new feature types: the category labels of each business that are listed in all_types
def _pick_types(labels):
    """Return the entries of `labels` that appear in all_types, in their original order."""
    return [label for label in labels if label in all_types]

business_df['types'] = business_df['categories'].apply(_pick_types)

In [23]:
# Count, for each food type, the number of businesses whose `types` list contains it
types_dict = {
    type_name: sum(type_name in row for row in business_df['types'])
    for type_name in all_types
}
print('Total Number of Types:', len(types_dict))

Total Number of Types: 74


### b) Combine types into smaller groups

As with ethnics, we will apply the same grouping process to the food types.

In [24]:
# Combine types into fewer categories: group name -> original labels merged into it.
# Some group names ('Vegetarian', 'Noodles', 'Seafood') are also members of their own group.
type_dict = {
    'Fast_Food': ['Burgers','Fish & Chips','Hot Dogs','Sandwiches','Fast Food','Pizza'],
    'Alcohol': ['Bars','Beer', 'Beer Bar','Beer Gardens', 'Breweries',
                'Cocktail Bars','Dive Bars','Gay Bars', 'Gastropubs','Hotel bar',
                'Irish Pub', 'Pubs','Speakeasies','Sports Bars','Tapas Bars','Whiskey Bars',
                'Wine & Spirits','Wine Bars'],
    'Beverage': ['Bubble Tea','Juice Bars & Smoothies','Beverage Store'],
    'Bakeries_Desserts': ['Bagels','Cupcakes','Custom Cakes','Patisserie/Cake Shop','Shaved Ice',
                            'Gelato','Ice Cream & Frozen Yogurt','Macarons','Chocolatiers & Shops','Donuts',
                            'Waffles','Bakeries','Desserts','Creperies'],
    'Vegetarian': ['Fruits & Veggies','Vegan','Vegetarian','Falafel','Acai Bowls','Salad'],
    'Coffee_Tea': ['Coffee Roasteries','Tea Rooms','Coffee & Tea','Cafes',
                    'Internet Cafes'],
    'Noodles': ['Noodles','Ramen'],
    'Seafood': ['Seafood Markets','Seafood'],
    'Snacks': ['Pretzels','Candy Stores','Popcorn Shops'],
    'Chicken': ['Chicken Shop','Chicken Wings'],
    'Meat': ['Barbeque','Delis','Steakhouses','Cheesesteaks'],
}

In [25]:
# Food types from all_types that are kept under their own name because they do not
# belong to any of the groups in type_dict.
# NOTE(review): 'Cheese Shops' and 'Organic Stores' are in all_types but appear neither
# in a type_dict group nor in this list, so they are never counted — confirm whether
# that is intended.
other_type = ['Gluten-Free', 'Imported Food', 'Local Flavor','Soup', 'Tacos', 'Wraps', 'Buffets']

In [26]:
# Update feature types
# Reverse lookup: original label -> name of the group that contains it.
# (Each label in type_dict belongs to exactly one group, so the lookup is unambiguous.)
type_group_of = {
    label: group for group, labels in type_dict.items() for label in labels
}


def merge_type(row):
    """Replace every label in `row` by its group name and de-duplicate.

    Labels that belong to no group in type_dict are kept unchanged. The result is
    the sorted array returned by np.unique. Unlike the previous lambda, this does
    not depend on a global loop variable and needs a single pass over the column
    instead of one pass per group.
    """
    return np.unique([type_group_of.get(x, x) for x in row])


business_df['types'] = business_df['types'].apply(merge_type)

In [27]:
# Recount businesses per merged type (rows with no type label are skipped).
# From this cell on, type_dict maps each type name to its count and no longer
# holds the grouping lists.
types = list(type_dict.keys()) + other_type
type_dict = {
    type_name: sum(type_name in row for row in business_df['types'] if len(row) > 0)
    for type_name in types
}
print('Total Number of Types:', len(type_dict))

Total Number of Types: 18


In [28]:
# Tabulate the type counts, most frequent first
type_df = (
    pd.DataFrame(list(type_dict.items()), columns=['Type', 'Count'])
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)
type_df

Unnamed: 0,Type,Count
0,Fast_Food,1175
1,Alcohol,339
2,Vegetarian,231
3,Meat,220
4,Bakeries_Desserts,192
5,Coffee_Tea,183
6,Chicken,150
7,Seafood,119
8,Buffets,56
9,Beverage,47


## 3.1.4 Extract Business Attributes

In [29]:
# Inspect the Python type of the first `attributes` value as loaded from the CSV
type(business_df['attributes'][0])

str

In [30]:
# Convert the stringified attribute dictionaries read from the CSV into Python dicts.
# ast.literal_eval only accepts Python literals, so text in the file cannot be
# executed as code the way it could with eval().
# Values that are not strings (e.g. dicts already parsed on a re-run of this cell)
# are passed through unchanged.
business_df['attributes'] = business_df['attributes'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [31]:
# Expand each business's attributes dict into one column per attribute key
attributes_df = business_df['attributes'].apply(pd.Series)

In [32]:
# Preview the expanded attribute columns
attributes_df.head()

Unnamed: 0,RestaurantsTableService,GoodForMeal,Alcohol,Caters,HasTV,RestaurantsGoodForGroups,NoiseLevel,WiFi,RestaurantsAttire,RestaurantsReservations,...,Smoking,BYOBCorkage,RestaurantsCounterService,BYOB,Open24Hours,DietaryRestrictions,Corkage,AgesAllowed,ByAppointmentOnly,AcceptsInsurance
0,False,"{'dessert': False, 'latenight': False, 'lunch'...",none,False,True,True,loud,free,casual,False,...,,,,,,,,,,
1,True,"{'dessert': False, 'latenight': False, 'lunch'...",full_bar,True,True,True,average,free,casual,False,...,,,,,,,,,,
2,False,"{'dessert': False, 'latenight': True, 'lunch':...",none,False,False,True,loud,paid,casual,False,...,,,,,,,,,,
3,,,,,,,,,,False,...,,,,,,,,,,
4,True,"{'dessert': False, 'latenight': False, 'lunch'...",full_bar,False,True,True,quiet,no,casual,True,...,,,,,,,,,,


In [33]:
# Attribute columns that are not carried forward as features
dropped_attribute_columns = [
    'Ambience', 'BikeParking', 'BusinessAcceptsBitcoin', 'Open24Hours', 'BYOBCorkage',
    'BYOB', 'Corkage', 'DietaryRestrictions', 'GoodForDancing',
    'Music', 'BestNights', 'CoatCheck', 'ByAppointmentOnly', 'RestaurantsCounterService',
    'AgesAllowed', 'AcceptsInsurance', 'RestaurantsTableService',
    'Caters', 'HasTV', 'RestaurantsTakeOut', 'WheelchairAccessible', 'DogsAllowed',
    'Smoking',
]
attributes_df = attributes_df.drop(columns=dropped_attribute_columns)

In [34]:
# Mapping from the original attribute column names to the feature names used from here on.
# NOTE(review): 'noiceLevel' looks like a misspelling of 'noiseLevel'; it is left as is
# because other code may already refer to this column name — confirm before renaming.
column_name_change = {
    'GoodForMeal':'openFor',
    'Alcohol':'alcohol',
    'RestaurantsGoodForGroups':'forGroups',
    'NoiseLevel':'noiceLevel',
    'WiFi':'hasWiFi',
    'RestaurantsAttire':'attire',
    'RestaurantsReservations':'reservations',
    'OutdoorSeating':'outdoorSeating',
    'BusinessAcceptsCreditCards':'acceptCreditCard',
    'RestaurantsPriceRange2':'priceRange',
    'RestaurantsDelivery':'delivery',
    'GoodForKids':'goodForKids',
    'DriveThru':'driveThru',
    'BusinessParking':'businessParking',
    'HappyHour':'happyHour'}

In [35]:
# Rename the retained attribute columns according to column_name_change
attributes_df = attributes_df.rename(columns=column_name_change)

In [36]:
# Preview the attribute table after dropping and renaming columns
attributes_df.head()

Unnamed: 0,openFor,alcohol,forGroups,noiceLevel,hasWiFi,attire,reservations,outdoorSeating,acceptCreditCard,priceRange,delivery,goodForKids,driveThru,businessParking,happyHour
0,"{'dessert': False, 'latenight': False, 'lunch'...",none,True,loud,free,casual,False,False,True,1.0,False,True,True,"{'garage': False, 'street': False, 'validated'...",
1,"{'dessert': False, 'latenight': False, 'lunch'...",full_bar,True,average,free,casual,False,True,True,2.0,True,True,False,"{'garage': False, 'street': False, 'validated'...",
2,"{'dessert': False, 'latenight': True, 'lunch':...",none,True,loud,paid,casual,False,False,True,1.0,False,True,True,"{'garage': False, 'street': False, 'validated'...",
3,,,,,,,False,,True,,False,,,,
4,"{'dessert': False, 'latenight': False, 'lunch'...",full_bar,True,quiet,no,casual,True,False,True,2.0,False,True,,"{'garage': False, 'street': False, 'validated'...",


## Restaurant categories III - Time of The Day

We will use the `openFor` column extracted from the `attributes` feature to build our third set of restaurant categories.

### a) Preprocess

In [37]:
# Expand each row's openFor value into its own columns via pd.Series.
# NOTE(review): rows without an openFor dict presumably end up in a stray column
# named 0, which the next cell drops — confirm.
openFor_df = attributes_df['openFor'].apply(pd.Series)

  result = result.union(other)
  index = _union_indexes(indexes, sort=sort)
  result = result.union(other)


In [38]:
# Remove the column labelled 0 and the 'dessert' flag, which are not used for time_of_day
unused_openfor_columns = [0, 'dessert']
openFor_df = openFor_df.drop(columns=unused_openfor_columns)

### b) Create new feature time_of_day

In [39]:
# Preview the remaining meal-time flags
openFor_df.head()

Unnamed: 0,latenight,lunch,dinner,breakfast,brunch
0,False,True,False,True,False
1,False,False,True,False,False
2,True,True,True,True,False
3,,,,,
4,False,False,True,False,False


In [40]:
# Create new feature time_of_day: one fresh, independent empty list per row.
# The lists are built directly instead of going through an empty pd.Series(),
# whose implicit default dtype triggers a deprecation warning on pandas 1.x.
openFor_df['time_of_day'] = pd.Series(
    [[] for _ in range(len(openFor_df))],
    index=openFor_df.index,
    dtype=object,
)

In [41]:
# Convert several features into one: for every row, list the meal times whose
# flag is True. The comparison on str(flag) matches the boolean True as well as
# the string 'True'; missing values never match.
# The column is rebuilt in a single pass, so re-running this cell no longer
# appends duplicate entries to lists filled by an earlier run.
openFor_ls = ['latenight','lunch','dinner','breakfast','brunch']
openFor_df['time_of_day'] = pd.Series(
    [
        [meal for meal, flag in zip(openFor_ls, flags) if str(flag) == 'True']
        for flags in openFor_df[openFor_ls].itertuples(index=False, name=None)
    ],
    index=openFor_df.index,
    dtype=object,
)

### c) Merge back to attributes

In [42]:
# The raw openFor dicts are replaced by time_of_day, so remove them from attributes_df
attributes_df = attributes_df.drop(columns=['openFor'])

In [43]:
# Remove the individual meal-time flag columns from openFor_df
openFor_df = openFor_df.drop(columns=openFor_ls)

In [44]:
# Attach the remaining openFor_df columns to attributes_df, matching rows on the index
attributes_df = pd.merge(
    attributes_df,
    openFor_df,
    left_index=True,
    right_index=True,
)

## Clean Feature Alcohol

### a) Fill NA's using keyword alcohol in restaurant categories II 

In [45]:
# Fill NA's: a business with no alcohol attribute gets 'True' when its food types
# include the 'Alcohol' group; otherwise the value stays missing.
# Membership is tested on list(row) rather than on the stored array:
# NOTE(review): `'Alcohol' in <empty numeric array>` is presumably what produced the
# warning previously emitted by this cell — confirm.
alcohol_from_types = business_df['types'].apply(
    lambda row: 'True' if 'Alcohol' in list(row) else np.nan)
attributes_df['alcohol'] = attributes_df['alcohol'].fillna(alcohol_from_types)

  This is separate from the ipykernel package so we can avoid doing imports until


### b) Simplify values

In [46]:
# Show how often each alcohol value occurs (missing values are not listed)
attributes_df['alcohol'].value_counts()

none             1261
full_bar          593
beer_and_wine     290
True               15
Name: alcohol, dtype: int64

Now we will do some combinations: <br>
`none` -> `False` <br>
`full_bar`, `beer_and_wine`, `True` -> `True`.

In [47]:
# Change value names: 'none' -> 'False'; 'full_bar' and 'beer_and_wine' -> 'True'.
# A single whole-value mapping replaces the three substring str.replace passes, so
# only exact matches are rewritten and the result does not depend on the pandas
# version's regex default. Missing values stay missing.
alcohol_value_map = {
    'none': 'False',
    'full_bar': 'True',
    'beer_and_wine': 'True',
}
attributes_df['alcohol'] = attributes_df['alcohol'].replace(alcohol_value_map)

In [48]:
# Convert values to boolean while leaving missing entries missing
def _alcohol_flag(value):
    """Return a missing value unchanged; otherwise True only for the exact string 'True'."""
    if pd.isna(value):
        return value
    return value == 'True'

attributes_df['alcohol'] = attributes_df['alcohol'].apply(_alcohol_flag)

## Clean Feature businessParking

Since the feature `businessParking` also has values in dictionary format, we will apply to it a process similar to the one used for `openFor`.

### a) Preprocess

In [49]:
# Expand each row's businessParking value into its own columns via pd.Series.
# NOTE(review): rows without a parking dict presumably end up in a stray column
# named 0, which the next cell drops — confirm.
parking_df = attributes_df['businessParking'].apply(pd.Series)

In [50]:
# Remove the column labelled 0, which is not one of the parking flags
parking_df = parking_df.drop(columns=[0])

### b) Create new feature parking

In [51]:
# Preview the parking flags
parking_df.head()

Unnamed: 0,garage,street,validated,lot,valet
0,False,False,False,True,False
1,False,False,False,True,False
2,False,False,False,True,False
3,,,,,
4,False,False,False,False,False


In [52]:
# Create new feature parking: one fresh, independent empty list per row.
# The lists are built directly instead of going through an empty pd.Series(),
# whose implicit default dtype triggers a deprecation warning on pandas 1.x.
parking_df['parking'] = pd.Series(
    [[] for _ in range(len(parking_df))],
    index=parking_df.index,
    dtype=object,
)

In [53]:
# Convert several features into one: for every row, list the parking options whose
# flag is True. The comparison on str(flag) matches the boolean True as well as
# the string 'True'; missing values never match.
# The column is rebuilt in a single pass, so re-running this cell no longer
# appends duplicate entries to lists filled by an earlier run.
parking_ls = ['garage','street','validated','lot','valet']
parking_df['parking'] = pd.Series(
    [
        [option for option, flag in zip(parking_ls, flags) if str(flag) == 'True']
        for flags in parking_df[parking_ls].itertuples(index=False, name=None)
    ],
    index=parking_df.index,
    dtype=object,
)

### c) Merge back to attributes

In [54]:
# The raw businessParking dicts are replaced by parking, so remove them from attributes_df
attributes_df = attributes_df.drop(columns=['businessParking'])

In [55]:
# Remove the individual parking flag columns from parking_df
parking_df = parking_df.drop(columns=parking_ls)

In [56]:
# Attach the remaining parking_df columns to attributes_df, matching rows on the index
attributes_df = pd.merge(
    attributes_df,
    parking_df,
    left_index=True,
    right_index=True,
)

## 3.1.5 Final Merge

In [57]:
# Merge the engineered attribute columns back into business_df, matching rows on the index
business_df = pd.merge(
    business_df,
    attributes_df,
    left_index=True,
    right_index=True,
)

In [58]:
# Drop the original attributes dict column from business_df
business_df = business_df.drop(columns=['attributes'])

In [59]:
# Check the (rows, columns) dimensions of the final business feature table
business_df.shape

(2493, 30)