In [29]:
import requests
import json
import time
import pandas as pd
import numpy as np

# Authentication

In [13]:
def load_credentials(filename):
    """Read an API credentials file and return ``(client_id, access_token)``.

    The file must contain exactly two comma-separated values. Whitespace
    around each value -- including the trailing newline most editors append --
    is stripped, so the token can be placed directly in an HTTP header.

    Args:
        filename: Path to the credentials file.

    Returns:
        A ``(client_id, access_token)`` tuple of stripped strings.

    Raises:
        ValueError: if the file does not hold exactly two comma-separated values.
    """
    with open(filename) as f:
        client_id, access_token = (part.strip() for part in f.read().split(','))
    return client_id, access_token

In [14]:
# Load the Yelp credentials; `api_key` is the value `business_search` puts in the
# Authorization header.
# NOTE(review): absolute, user-specific path -- unlikely to resolve on another machine.
client_id, api_key = load_credentials('/Users/haivule/Dropbox/licenses/yelp_api.txt')

# CA cities

In [16]:
cities_fname = "/Users/haivule/Documents/USF/spring2/data-viz/uscities.csv"

# Read the cities table, then keep just the city names whose state_id is 'CA'.
cities = pd.read_csv(cities_fname)
CA_cities = cities.loc[cities['state_id'] == 'CA', 'city'].values

In [17]:
# Preview the array of California city names.
CA_cities

array(['La Quinta', 'Saint Helena', 'Burney', ..., 'La Mesa',
       'Montebello', 'Canyondam'], dtype=object)

In [18]:
# Number of California city entries that the loops below iterate over.
len(CA_cities)

1625

# All "Food" results in each city

In [7]:
def business_search(search_term, location, offset, limit=50):
    """Query the Yelp v3 business-search endpoint for a California location.

    Authenticates with the module-level ``api_key``.

    Args:
        search_term: Value sent as the ``term`` query parameter.
        location: City name; ``", CA"`` is appended before it is sent.
        offset: Value sent as the ``offset`` query parameter (used for paging).
        limit: Value sent as the ``limit`` query parameter (page size).

    Returns:
        The decoded JSON body of the response.
    """
    url = 'https://api.yelp.com/v3/businesses/search'
    headers = {'Authorization': 'bearer %s' % api_key}
    params = {'location': f'{location}, CA',
              'term': search_term,
              'offset': offset,
              # Honour the caller's page size. This was hard-coded to 50, so
              # callers passing limit=1 just to read the total still requested
              # a full page.
              'limit': limit}

    resp = requests.get(url=url, params=params, headers=headers)
    return resp.json()

In [27]:
# For every California city, record the 'total' count reported for a "Food" search.
# This issues one API request per city; only the count is kept from each response.
total = {}
for city in CA_cities:
    n_businesses = business_search(search_term='Food', location=city, offset=0, limit=1)['total']
    total[city] = n_businesses

In [35]:
# Turn the {city: count} mapping into a two-column table: city, num_resto.
all_cuisine_CA = pd.DataFrame.from_dict(total, orient='index').reset_index()
all_cuisine_CA.columns = ['city', 'num_resto']

In [39]:
# Persist the per-city counts (without the index column).
# NOTE(review): absolute, user-specific path.
all_cuisine_CA.to_csv('/Users/haivule/Documents/USF/spring2/data-viz/final-project/all_cuisine.csv', index=False)

In [81]:
# Reload the saved per-city counts and preview the first rows.
# NOTE(review): this reads from .../final-project/data/ whereas the save cell writes to
# .../final-project/ -- confirm the file was moved between the two steps.
all_cuisine_CA = pd.read_csv('/Users/haivule/Documents/USF/spring2/data-viz/final-project/data/all_cuisine.csv')
all_cuisine_CA.head()

Unnamed: 0,city,num_resto
0,La Quinta,634
1,Saint Helena,336
2,Burney,17
3,Kensington,306
4,Upper Lake,89


In [98]:
# Show the count row for a single city.
# NOTE(review): `location` is not assigned in any visible cell; it presumably lingered in
# the kernel from earlier work -- define it explicitly before running this cell.
all_cuisine_CA[all_cuisine_CA['city']==location]

Unnamed: 0,city,num_resto
205,Santa Fe Springs,18000


# All "Vietnamese Restaurant" results in each city

In [22]:

# Nested response fields to flatten, as {parent key: [child keys]}.
# Lists (not sets) are used so the derived column order is identical on every
# run; the iteration order of a set of strings can differ between sessions.
nested_fields = {'categories': ['alias', 'title'],
                 'coordinates': ['latitude', 'longitude'],
                 'location': ['address1', 'address2', 'address3', 'city',
                              'country', 'state', 'zip_code']}

# Top-level response fields that are copied through as-is.
single_fields = ['id', 'display_phone', 'distance', 'alias', 'image_url',
                 'name', 'phone', 'price', 'rating',
                 'review_count', 'url', 'transactions']

# Flattened "<parent>_<child>" names, one per nested field.
unnested_fields = []
for field, sub_fields in nested_fields.items():
    for sub_field in sub_fields:
        unnested_fields.append(f"{field}_{sub_field}")

print(unnested_fields)

['categories_alias', 'categories_title', 'coordinates_latitude', 'coordinates_longitude', 'location_address1', 'location_city', 'location_state', 'location_address2', 'location_zip_code', 'location_country', 'location_address3']


In [23]:
def extract(raw_json, field):
    """Extraction robust to null values.

    Args:
        raw_json: Parsed JSON object to read from.
        field: A list of keys. A two-element list ``[parent, child]`` performs
            the nested lookup ``raw_json[parent][child]``; otherwise only the
            first element is used, as ``raw_json[field[0]]``.

    Returns:
        The value found, or ``np.nan`` when the lookup fails because a key is
        absent or an intermediate value is not subscriptable (e.g. ``None``).
    """
    # Only lookup failures mean "value missing". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    lookup_errors = (KeyError, IndexError, TypeError)

    if len(field) == 2:
        try:
            parent, child = field
            return raw_json[parent][child]
        except lookup_errors:
            return np.nan
    try:
        return raw_json[field[0]]
    except lookup_errors:
        return np.nan

In [24]:
def parse_response(response):
    """
    Take the raw json output returned from the api call
    and return the parsed data as a dictionary.

    The result maps each business's position in ``response['businesses']``
    to a flat dict containing:
      * one entry per name in ``single_fields``;
      * ``categories_<sub>`` entries, each a list holding that sub-field from
        every category of the business (an empty list when it has none);
      * ``<parent>_<child>`` entries for the other ``nested_fields``, looked
        up with ``extract`` (NaN when missing).
    """
    parsed_all = {}

    for i, business in enumerate(response['businesses']):
        # parse single fields
        parsed_each = {field: extract(business, [field]) for field in single_fields}

        # parse nested fields
        for field, sub_fields in nested_fields.items():
            if field == 'categories':
                # `categories` is a list of dicts, so it cannot go through
                # extract(); gather each sub-field across all categories.
                # A missing or null entry is treated as "no categories".
                categories = business.get('categories') or []
                for sub_field in sub_fields:
                    parsed_each[f"{field}_{sub_field}"] = [
                        category[sub_field] for category in categories]
                continue

            for sub_field in sub_fields:
                parsed_each[f"{field}_{sub_field}"] = extract(business, [field, sub_field])

        parsed_all[i] = parsed_each

    return parsed_all

In [25]:
def response_to_df(response):
    """Convert a raw API response into a DataFrame with one row per business."""
    records_by_position = parse_response(response)
    # The parsed dict is keyed by business position, which DataFrame() lays out
    # as columns; transposing puts each business on its own row.
    column_oriented = pd.DataFrame(records_by_position)
    return column_oriented.transpose()


def accumulate_df(current_df, new_df):
    """Return a new DataFrame with the rows of ``new_df`` placed after ``current_df``.

    The result is re-indexed from 0. Neither input is modified.

    Uses ``pd.concat`` because ``DataFrame.append`` was deprecated in
    pandas 1.4 and removed in pandas 2.0.
    """
    return pd.concat([current_df, new_df], ignore_index=True)

In [26]:
def df_to_file(df, location:str):
    """Write ``df`` as CSV (no index column) to a per-location file under ./data.

    Runs of whitespace in ``location`` are collapsed to single underscores to
    form the file name.
    """
    words = location.split()
    slug = '_'.join(words)
    df.to_csv("./data/business_info_{}.csv".format(slug), index=False)

In [105]:
def scrape(location):
    """Collect the "Vietnamese Restaurant" search results for one location.

    A first request reads the reported total; results are then fetched in
    pages of 50 and the growing table is written to disk after every page.

    Returns:
        The accumulated DataFrame, or ``None`` when the search reports no
        businesses.
    """
    term = "Vietnamese Restaurant"
    n_results = business_search(search_term=term, location=location, offset=0, limit=1)['total']
    if n_results == 0:
        return None

    # Yelp only returns 1000 results max, so cap what we try to page through.
    n_results = min(n_results, 1000)
    n_batches = (n_results - 1) // 50 + 1

    collected = pd.DataFrame()
    for batch_idx in range(n_batches):
        page = business_search(search_term=term, location=location, offset=batch_idx * 50)
        page_df = response_to_df(page)
        collected = accumulate_df(current_df=collected, new_df=page_df)
        # The last batch may bring duplicate results; trim to the expected count.
        collected = collected.iloc[:n_results]
        df_to_file(collected, location)
    return collected

In [193]:
# vn_CA = {}
# df_all = pd.DataFrame()  # already run cities[70:100]

# Scrape one slice of the city list. `vn_CA` and `df_all` must already exist
# (their initialisation above is commented out so earlier slices are kept).
for city in CA_cities[1600:]:
    # San Francisco and San Jose are not scraped here.
    if city in ('San Francisco', 'San Jose'):
        continue

    df_city = scrape(city)
    if df_city is None:
        # scrape() returns None when the search reports no businesses.
        vn_CA[city] = 0
        continue

    df_all = accumulate_df(current_df=df_all, new_df=df_city)
    vn_CA[city] = df_city.shape[0]

# Save to csv

In [194]:
# (rows, columns) of the accumulated results.
df_all.shape

(135044, 23)

In [48]:
# t = df_all

In [195]:
# Number of cities with a recorded result count.
len(vn_CA)

1623

In [197]:
# Load the per-city CSV files for the two cities the scraping loop skips.
# NOTE(review): presumably these were written by earlier scrape() runs -- confirm.
df_sf = pd.read_csv('./data/business_info_San_Francisco.csv')
df_sj = pd.read_csv('./data/business_info_San_Jose.csv')

In [178]:
# business_search(search_term="Vietnamese Restaurant", location="San Francisco", offset=0)

In [203]:
# Add the San Jose rows to the combined table.
# NOTE(review): re-running this cell adds the same rows again. Also, df_sf is loaded
# but not appended in any visible cell -- confirm whether that is intentional.
df_all = accumulate_df(current_df=df_all, new_df=df_sj)

In [204]:
# (rows, columns) after adding the San Jose rows.
df_all.shape

(136875, 23)

In [214]:
# Flatten each list of category aliases into one comma-separated string.
# Only real lists/tuples are joined: values that are already strings (rows read
# back from CSV, or this cell being re-run) are left as they are, because
# str.join on a string would put a comma between every character.
df_all['categories_alias'] = df_all['categories_alias'].apply(
    lambda x: ','.join(x) if isinstance(x, (list, tuple)) else x)

In [217]:
# Flatten each list of category titles into one comma-separated string.
# Only real lists/tuples are joined: values that are already strings (rows read
# back from CSV, or this cell being re-run) are left as they are, because
# str.join on a string would put a comma between every character.
df_all['categories_title'] = df_all['categories_title'].apply(
    lambda x: ','.join(x) if isinstance(x, (list, tuple)) else x)

In [218]:
# Flatten each list of transaction types into one comma-separated string.
# Only real lists/tuples are joined: values that are already strings (rows read
# back from CSV, or this cell being re-run) are left as they are, because
# str.join on a string would put a comma between every character.
df_all['transactions'] = df_all['transactions'].apply(
    lambda x: ','.join(x) if isinstance(x, (list, tuple)) else x)

In [221]:
# Remove rows that are identical across every column.
df_all = df_all.drop_duplicates()

In [227]:
# Save the full combined table (not yet filtered by category), without the index.
df_all.to_csv('vietnamese_restaurant_CA_all_results.csv', index=False)

In [226]:
# (rows, columns) of the combined table.
df_all.shape

(128493, 23)

In [228]:
# Keep only rows whose category titles mention "Vietnamese"; rows where the
# match is undefined (missing title) count as non-matching.
vn_df = df_all[df_all['categories_title'].str.contains('Vietnamese', na=False)]

In [229]:
# (rows, columns) of the Vietnamese-only subset.
vn_df.shape

(46135, 23)

In [230]:
# Save the Vietnamese-only subset, without the index column.
vn_df.to_csv('vietnamese_restaurant_CA.csv', index=False)

In [231]:
# Read the saved subset back and display its shape.
t = pd.read_csv('vietnamese_restaurant_CA.csv')
t.shape

(46135, 23)

In [232]:
# Preview the first rows of the reloaded subset.
t.head()

Unnamed: 0,alias,categories_alias,categories_title,coordinates_latitude,coordinates_longitude,display_phone,distance,id,image_url,location_address1,...,location_country,location_state,location_zip_code,name,phone,price,rating,review_count,transactions,url
0,pho-vu-la-quinta,vietnamese,Vietnamese,33.707539,-116.272491,(760) 775-2417,1937.578321,FWhjge1DykCtlVjBU5KhmA,https://s3-media3.fl.yelpcdn.com/bphoto/gWXGfx...,79-630 Hwy 111,...,US,CA,92253,Pho Vu,17607750000.0,$,4.0,441,,https://www.yelp.com/biz/pho-vu-la-quinta?adju...
1,pho-of-the-desert-indio,vietnamese,Vietnamese,33.716305,-116.232306,(760) 775-1500,5697.759701,86ljJKAJv5bLzEVveCrf9g,https://s3-media1.fl.yelpcdn.com/bphoto/sFD7PR...,82128 US Hwy 111,...,US,CA,92201,Pho of the Desert,17607750000.0,$,4.0,252,,https://www.yelp.com/biz/pho-of-the-desert-ind...
2,kettles-vietnamese-bistro-santa-rosa,vietnamese,Vietnamese,38.459496,-122.732201,(707) 528-3747,23740.152826,wxPfjrSBQWcSRCPy56jA1w,https://s3-media4.fl.yelpcdn.com/bphoto/KhD18f...,1202 W Steele Ln,...,US,CA,95403,Kettles Vietnamese Bistro,17075280000.0,$$,4.0,380,pickup,https://www.yelp.com/biz/kettles-vietnamese-bi...
3,pho-sonoma-vietnamese-cuisine-petaluma,vietnamese,Vietnamese,38.23323,-122.63737,(707) 762-6888,33687.384504,chcy5OiTcxXXi9U_nYCR4A,https://s3-media3.fl.yelpcdn.com/bphoto/boCwI_...,140 2nd St,...,US,CA,94952,Pho Sonoma Vietnamese Cuisine,17077630000.0,$$,4.5,133,,https://www.yelp.com/biz/pho-sonoma-vietnamese...
4,bui-bistro-napa,"vietnamese,french","Vietnamese,French",38.30085,-122.28641,(707) 255-5417,27512.61615,6keF8wbnbKzOAMRsyZ9SuA,https://s3-media1.fl.yelpcdn.com/bphoto/dZhs67...,976 Pearl St,...,US,CA,94559,Bui Bistro,17072560000.0,$$,3.5,250,,https://www.yelp.com/biz/bui-bistro-napa?adjus...
