# Yelp API

In [1]:
import psycopg2
import yaml
import requests
from pprint import pprint
from datetime import datetime
import json
import pandas as pd
from sqlalchemy import create_engine

## Connect to PostgreSQL database

In [2]:
# connect to the databse
conn = psycopg2.connect(database="postgres",
                        user="postgres",
                        password="apassword",
                        host="192.168.0.104",
                        port="5432")

# enable autocommit
conn.autocommit = True

# define cursor
cur = conn.cursor()

# create a table
cur.execute("""CREATE TABLE IF NOT EXISTS yelp_business_search
               (id varchar PRIMARY KEY NOT NULL,
                business jsonb NOT NULL)""")

## Collect data from Yelp API

In [3]:
# read in config file
with open('/home/curtis/etc/yelp.yaml') as f:
    config = yaml.load(f)
    
# get the API key
api_key = config['yelp.com'][0]['key']

In [4]:
def query_api(limit=50, offset=0):
    
    # define the base URL for the request
    base_url = 'https://api.yelp.com/v3/businesses/search'

    # define the header for the request
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    # define the parameters for the request
    params = {
        'location': 'boston',
        'categories': 'coffee',
        'limit': limit,
        'offset': offset,
    }

    # submit the request to the API
    response = requests.get(base_url, headers=headers, params=params)

    # checek the status of the response
    response.status_code

    # save response data
    data = response.json()
    
    return data

In [5]:
# iterate over range and submit queries
responses = []
for i, n in enumerate(list(range(0,1051, 50))):
    r = query_api(limit=50, offset=n)
    print("query #{} completed".format(i))
    responses.append(r)

query #0 completed
query #1 completed
query #2 completed
query #3 completed
query #4 completed
query #5 completed
query #6 completed
query #7 completed
query #8 completed
query #9 completed
query #10 completed
query #11 completed
query #12 completed
query #13 completed
query #14 completed
query #15 completed
query #16 completed
query #17 completed
query #18 completed
query #19 completed
query #20 completed
query #21 completed


In [6]:
# pull data frmo individual responses and combine
data = []
for line in responses:
    if 'businesses' in line.keys():
        data.extend(line['businesses'])
    
print(len(data))

1000


## Investigate structure of data returned from API

In [7]:
# inspect repsonse structure
data[0].keys()

dict_keys(['review_count', 'categories', 'location', 'coordinates', 'url', 'transactions', 'price', 'is_closed', 'distance', 'phone', 'display_phone', 'image_url', 'alias', 'rating', 'name', 'id'])

In [8]:
# what is the first business look like?
pprint(data[0])

{'alias': 'modern-pastry-shop-boston',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
                {'alias': 'coffee', 'title': 'Coffee & Tea'},
                {'alias': 'desserts', 'title': 'Desserts'}],
 'coordinates': {'latitude': 42.36324, 'longitude': -71.05474},
 'display_phone': '(617) 523-3783',
 'distance': 2185.181158712615,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'is_closed': False,
 'location': {'address1': '257 Hanover St',
              'address2': '',
              'address3': '',
              'city': 'Boston',
              'country': 'US',
              'display_address': ['257 Hanover St', 'Boston, MA 02113'],
              'state': 'MA',
              'zip_code': '02113'},
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'transactions': ['delivery'],
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston

## Load raw source data into PostgreSQL database

In [10]:
# iterate over response data and insert into a postgreSQL
for n, i in enumerate(data):
    
    try:

        # put data into databse
        cur.execute("""INSERT INTO yelp_business_search
                       (id, business) 
                       VALUES (%s, %s)""", [i['id'], json.dumps(i)])

    except:
        
        # print warning
        #print(n)
        pass

In [11]:
len(data)

1000

In [12]:
pprint(data[0])

{'alias': 'modern-pastry-shop-boston',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
                {'alias': 'coffee', 'title': 'Coffee & Tea'},
                {'alias': 'desserts', 'title': 'Desserts'}],
 'coordinates': {'latitude': 42.36324, 'longitude': -71.05474},
 'display_phone': '(617) 523-3783',
 'distance': 2185.181158712615,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'is_closed': False,
 'location': {'address1': '257 Hanover St',
              'address2': '',
              'address3': '',
              'city': 'Boston',
              'country': 'US',
              'display_address': ['257 Hanover St', 'Boston, MA 02113'],
              'state': 'MA',
              'zip_code': '02113'},
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'transactions': ['delivery'],
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston

## Investigate repsonse fields

In [13]:
categories = set()
for line in data:
    try:
        for cat in line['categories']:
            categories.add(cat['alias'])
    except:
        print(line['id'])
    
categories

{'australian',
 'bagels',
 'bakeries',
 'banks',
 'barbers',
 'bars',
 'basque',
 'beer_and_wine',
 'beerbar',
 'bike_repair_maintenance',
 'bikes',
 'bookstores',
 'breakfast_brunch',
 'breweries',
 'bubbletea',
 'burgers',
 'cafes',
 'cakeshop',
 'candy',
 'cantonese',
 'caribbean',
 'catering',
 'chocolate',
 'cocktailbars',
 'coffee',
 'coffeeroasteries',
 'colombian',
 'convenience',
 'creperies',
 'cupcakes',
 'customcakes',
 'delis',
 'desserts',
 'diners',
 'dominican',
 'donuts',
 'ethiopian',
 'falafel',
 'food_court',
 'foodstands',
 'foodtrucks',
 'galleries',
 'gelato',
 'giftshops',
 'gluten_free',
 'grocery',
 'healthmarkets',
 'herbsandspices',
 'hotdogs',
 'icecream',
 'importedfood',
 'internetcafe',
 'italian',
 'juicebars',
 'kitchenandbath',
 'kosher',
 'latin',
 'mexican',
 'museums',
 'musicvenues',
 'musicvideo',
 'newamerican',
 'nonprofit',
 'pizza',
 'poke',
 'ramen',
 'restaurants',
 'salad',
 'sandwiches',
 'seafood',
 'servicestations',
 'soup',
 'souvenir

In [14]:
transactions = set()
for line in data:
    try:
        for i in line['transactions']:
            transactions.add(i)
    except:
        print(line['id'])
    
transactions

{'delivery', 'pickup'}

## Extract raw source data to perform light ETL

In [15]:
def parse_yelp_business(line):
    """
    Flatten nested-json and pull key features from dataset
    """
    
    row = {
        'alias': line['alias'],
        'latitude': line['coordinates']['latitude'],
        'longitude': line['coordinates']['longitude'],
        'id': line['id'],
        'image_url': line['image_url'],
        'address1': line['location']['address1'],
        'address2': line['location']['address2'],
        'address3': line['location']['address3'],
        'city': line['location']['city'],
        'state': line['location']['state'],
        'zip_code': line['location']['zip_code'],
        'name': line['name'],
        'phone': line['phone'],
        'rating': line['rating'],
        'review_count': line['review_count'],
        'url': line['url'],
    }
    
    if 'price' in line.keys():
        row['price'] = line['price']
    else:
        row['price'] = ''
    
    # create indicator variables for category
    for cat in categories:
        for i in line['categories']:
            if i['alias'] in cat:
                row[cat] = True
            else:
                row[cat] = False
                
    # create indicator variables for transaction
    for trans in transactions:
        for i in line['transactions']:
            if i in trans:
                row[trans] = True
            else:
                row[trans] = False
                
    return row

In [16]:
# review raw data
data[0]

{'alias': 'modern-pastry-shop-boston',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
  {'alias': 'coffee', 'title': 'Coffee & Tea'},
  {'alias': 'desserts', 'title': 'Desserts'}],
 'coordinates': {'latitude': 42.36324, 'longitude': -71.05474},
 'display_phone': '(617) 523-3783',
 'distance': 2185.181158712615,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'is_closed': False,
 'location': {'address1': '257 Hanover St',
  'address2': '',
  'address3': '',
  'city': 'Boston',
  'country': 'US',
  'display_address': ['257 Hanover St', 'Boston, MA 02113'],
  'state': 'MA',
  'zip_code': '02113'},
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'transactions': ['delivery'],
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston?adjust_creative=MGVKNU5prVDnLKTWHJebZQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=MG

In [17]:
# review parses data
parse_yelp_business(data[0])

{'address1': '257 Hanover St',
 'address2': '',
 'address3': '',
 'alias': 'modern-pastry-shop-boston',
 'australian': False,
 'bagels': False,
 'bakeries': False,
 'banks': False,
 'barbers': False,
 'bars': False,
 'basque': False,
 'beer_and_wine': False,
 'beerbar': False,
 'bike_repair_maintenance': False,
 'bikes': False,
 'bookstores': False,
 'breakfast_brunch': False,
 'breweries': False,
 'bubbletea': False,
 'burgers': False,
 'cafes': False,
 'cakeshop': False,
 'candy': False,
 'cantonese': False,
 'caribbean': False,
 'catering': False,
 'chocolate': False,
 'city': 'Boston',
 'cocktailbars': False,
 'coffee': False,
 'coffeeroasteries': False,
 'colombian': False,
 'convenience': False,
 'creperies': False,
 'cupcakes': False,
 'customcakes': False,
 'delis': False,
 'delivery': True,
 'desserts': True,
 'diners': False,
 'dominican': False,
 'donuts': False,
 'ethiopian': False,
 'falafel': False,
 'food_court': False,
 'foodstands': False,
 'foodtrucks': False,
 'galle

In [18]:
# create an empty list to hold transformed data
clean = []

# iterate over raw data
for line in data:
    parsed = parse_yelp_business(line)
    clean.append(parsed)

In [19]:
# load cleaned data into a Pandas DataFrame
df = pd.DataFrame(clean)

In [20]:
# inspect cleaned data
df.head()

Unnamed: 0,address1,address2,address3,alias,australian,bagels,bakeries,banks,barbers,bars,...,vegetarian,venezuelan,venues,videoandgames,vietnamese,waffles,wholesale_stores,wine_bars,wraps,zip_code
0,257 Hanover St,,,modern-pastry-shop-boston,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2113
1,12 Farnsworth St,,,flour-bakery-café-boston-4,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2210
2,323 Hanover St,,,the-daily-catch-boston,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2113
3,165 Tremont St,,,thinking-cup-boston-2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2111
4,1595 Washington St,,,flour-bakery-and-cafe-boston-2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2118


In [21]:
len(df)

1000

## Load cleaned data into PostgreSQL

In [22]:
# create a connection to write df to database
engine = create_engine('postgresql://postgres:apassword@localhost:5432/postgres')
df.to_sql(name='yelp_businesses_clean', con=engine, if_exists = 'replace', chunksize=2500, index=False) 