In [None]:
import ast
import re, json

import numpy as np
import pandas as pd

### Read data & Extract attributes

In [None]:
# Load the business records; the file is newline-delimited JSON (one record per line).
business = pd.read_json(path_or_buf="../data/business_city.json", lines=True)

In [None]:
# Flag businesses whose category text mentions "steakhouse" (case-insensitive).
# Comparing with True turns missing category values into False.
is_steakhouse = business["categories"].str.lower().str.contains("steakhouse").eq(True)

# Keep only the identifier, name, location, rating and raw attribute columns.
steak_columns = ["business_id", "name", "latitude", "longitude", "stars", "attributes"]
steak_business = business.loc[is_steakhouse, steak_columns]

In [None]:
# Discard rows with a missing value in any kept column (one record has no
# attributes), then renumber the remaining rows from 0.
steak_business = steak_business.dropna().reset_index(drop=True)

In [None]:
# Collect every attribute name used by at least one steakhouse,
# de-duplicated and sorted alphabetically.
attributes = sorted({
    attr_name
    for attr_map in steak_business["attributes"]
    for attr_name in attr_map.keys()
})

In [None]:
# Build one column (list) per attribute name; each business contributes its
# value for that attribute, or NaN when the attribute is absent.
attr_dict = {attr: [] for attr in attributes}
for attribute in steak_business["attributes"]:
    for attr in attributes:
        attr_dict[attr].append(attribute.get(attr, np.nan))

In [None]:
# Turn the per-attribute lists into a frame and place it beside the business
# columns; both frames share the same 0..n-1 row labels.
attr_df = pd.DataFrame(data=attr_dict)
steak_attributes = pd.concat(objs=[steak_business, attr_df], axis="columns")

In [None]:
# Count the missing values in each attribute column.
na_num = {col: attr_df[col].isnull().sum() for col in attr_df.columns}

# Keep the attributes that are missing in fewer than 35 businesses.
# NOTE(review): 35 is an absolute count, while the next cell describes the
# cutoff as a 10% missing proportion -- confirm the two agree for this data.
extract_attr = [attr for attr, missing in na_num.items() if missing < 35]
extract_attr

In [None]:
# Keep the business columns plus the attributes with a missing proportion below 10%.
kept_columns = [
    "business_id", "name", "latitude", "longitude", "stars",
    "Alcohol", "Ambience", "BusinessAcceptsCreditCards", "BusinessParking",
    "GoodForKids", "HasTV", "OutdoorSeating", "RestaurantsAttire",
    "RestaurantsDelivery", "RestaurantsGoodForGroups", "RestaurantsPriceRange2",
    "RestaurantsReservations", "RestaurantsTakeOut",
]
steak_attributes = steak_attributes[kept_columns]

### Summary of each attribute

#### Alcohol

In [None]:
# Frequency of each Alcohol value.
steak_attributes["Alcohol"].value_counts()

Most steakhouse businesses provide alcohol. omit!

#### BusinessAcceptsCreditCards

In [None]:
# Frequency of each BusinessAcceptsCreditCards value.
steak_attributes["BusinessAcceptsCreditCards"].value_counts()

Most steakhouse businesses have BusinessAcceptsCreditCards attribute. omit!

#### GoodForKids

In [None]:
# Frequency of each GoodForKids value.
steak_attributes["GoodForKids"].value_counts()

#### HasTV

In [None]:
# Frequency of each HasTV value.
steak_attributes["HasTV"].value_counts()


Most steakhouse businesses have HasTV attribute. omit!

#### OutdoorSeating

In [None]:
# Frequency of each OutdoorSeating value.
steak_attributes["OutdoorSeating"].value_counts()

#### RestaurantsAttire

In [None]:
# Frequency of each RestaurantsAttire value.
steak_attributes["RestaurantsAttire"].value_counts()

#### RestaurantsDelivery

In [None]:
# Frequency of each RestaurantsDelivery value.
steak_attributes["RestaurantsDelivery"].value_counts()

In [None]:
# Print the star-rating distribution separately for each RestaurantsDelivery
# value, with the rarest ratings listed first.
stars_by_delivery = steak_attributes.groupby("RestaurantsDelivery")["stars"]
for delivery_value, delivery_stars in stars_by_delivery:
    print(delivery_value)
    print(delivery_stars.value_counts(ascending=True))

#### RestaurantsGoodForGroups

In [None]:
# Frequency of each RestaurantsGoodForGroups value.
steak_attributes["RestaurantsGoodForGroups"].value_counts()

Most steakhouse businesses have RestaurantsGoodForGroups attribute. omit!

#### RestaurantsPriceRange2

In [None]:
# Frequency of each RestaurantsPriceRange2 value.
steak_attributes["RestaurantsPriceRange2"].value_counts()

#### RestaurantsReservations

In [None]:
# Frequency of each RestaurantsReservations value.
steak_attributes["RestaurantsReservations"].value_counts()

In [None]:
# Print the star-rating distribution separately for each
# RestaurantsReservations value, with the rarest ratings listed first.
stars_by_reservation = steak_attributes.groupby("RestaurantsReservations")["stars"]
for reservation_value, reservation_stars in stars_by_reservation:
    print(reservation_value)
    print(reservation_stars.value_counts(ascending=True))

The star distributions are similar whether or not a restaurant takes reservations (True vs. False), so this attribute is not informative. Omit!

#### RestaurantsTakeOut

In [None]:
# Frequency of each RestaurantsTakeOut value.
steak_attributes["RestaurantsTakeOut"].value_counts()

Most steakhouse businesses have RestaurantsTakeOut attribute. omit! 

Only retain Alcohol, Ambience, BusinessParking, GoodForKids, OutdoorSeating, RestaurantsAttire, RestaurantsDelivery and RestaurantsPriceRange2.

In [None]:
# Remove two of the attributes marked "omit" in the summaries above.
steak_attributes = steak_attributes.drop(
    columns=["BusinessAcceptsCreditCards", "RestaurantsGoodForGroups"]
)

In [None]:
# Missing values cannot be imputed, so incomplete records are omitted.
# Rows whose Ambience is the string "None" are blanked out entirely so that
# the dropna below removes them together with rows that have any missing value.
ambience_is_none = steak_attributes["Ambience"] == "None"
steak_mask_amb = steak_attributes.mask(ambience_is_none)
steak_attributes = steak_mask_amb.dropna().reset_index(drop=True)

### Extract information from Ambience

expand Ambience information

In [None]:
# Ambience flags to expand into one 0/1 column each.
ambience = ["romantic", "intimate", "classy", 'upscale', "touristy", "trendy", "casual", "divey", "hipster"]

# One output list per flag, keyed "Ambience.<flag>"; columns follow the sorted flag names.
ambiences = {"Ambience." + amb: [] for amb in sorted(ambience)}

for raw_ambience in steak_attributes["Ambience"]:
    # Each value is the text of a Python dict literal (True/False/None values).
    # ast.literal_eval parses that text directly. The previous approach rewrote
    # it into JSON with regex substitutions and a quote swap, which breaks on
    # u'...' prefixed keys or on any apostrophe inside a string.
    amb_dict = ast.literal_eval(raw_ambience)
    for amb_type in ambience:
        # True -> 1; False, None and a missing flag -> 0.
        ambiences["Ambience." + amb_type].append(int(bool(amb_dict.get(amb_type))))

steak_ambience = pd.DataFrame(ambiences)

In [None]:
# Append the expanded Ambience flag columns; rows are matched by index label.
steak_attributes = pd.concat(objs=[steak_attributes, steak_ambience], axis="columns")

### Extract information from BusinessParking

expand BusinessParking information

In [None]:
# Show the raw BusinessParking value of the first record.
steak_attributes.at[0, "BusinessParking"]

In [None]:
# Parking flags to expand into one 0/1 column each.
park_type = ["garage", "street", "validated", "lot", "valet"]

# One output list per flag, keyed "BusinessParking.<flag>"; columns follow the sorted flag names.
parkings = {"BusinessParking." + park: [] for park in sorted(park_type)}

for raw_parking in steak_attributes["BusinessParking"]:
    if raw_parking != "None":
        # The value is the text of a Python dict literal (True/False/None values).
        # ast.literal_eval parses that text directly. The previous approach
        # rewrote it into JSON with regex substitutions and a quote swap, which
        # breaks on u'...' prefixed keys or on any apostrophe inside a string.
        parking_dict = ast.literal_eval(raw_parking)
    else:
        # The string "None" means no parking information: every flag becomes 0.
        parking_dict = {}
    for p in park_type:
        # True -> 1; False, None and a missing flag -> 0.
        parkings["BusinessParking." + p].append(int(bool(parking_dict.get(p))))

steak_parking = pd.DataFrame(parkings)

In [None]:
# Append the expanded BusinessParking flag columns; rows are matched by index label.
steak_attributes = pd.concat(objs=[steak_attributes, steak_parking], axis="columns")

In [None]:
# The raw "Ambience" and "BusinessParking" columns are replaced by their
# expanded flag columns, so remove them and preview the result.
steak_attributes = steak_attributes.drop(columns=["Ambience", "BusinessParking"])
steak_attributes.head()

In [None]:
# Save the final attribute table as CSV, without the row index.
steak_attributes.to_csv(path_or_buf="../data/steak_attributes.csv", index=False)