# Yelp API

In [2]:
import psycopg2
import yaml
import requests
from pprint import pprint
from datetime import datetime
import json
import pandas as pd
from sqlalchemy import create_engine

## Connect to PostgreSQL database

In [3]:
# connect to the databse
conn = psycopg2.connect(database="postgres",
                        user="postgres",
                        password="apassword",
                        host="192.168.0.104",
                        port="5432")

# enable autocommit
conn.autocommit = True

# define cursor
cur = conn.cursor()

# create a table
cur.execute("""CREATE TABLE IF NOT EXISTS yelp_business_search
               (id varchar PRIMARY KEY NOT NULL,
                business jsonb NOT NULL)""")

## Collect data from Yelp API

In [4]:
# read in config file
with open('/home/curtis/etc/yelp.yaml') as f:
    config = yaml.load(f)
    
# get the API key
api_key = config['yelp.com'][0]['key']

In [5]:
# define the base URL for the request
base_url = 'https://api.yelp.com/v3/businesses/search'

# define the header for the request
headers = {
    'Authorization': 'Bearer %s' % api_key,
}

# define the parameters for the request
params = {
    'location': 'Boston',
    'categories': 'coffee'
}

# submit the request to the API
response = requests.get(base_url, headers=headers, params=params)

# checek the status of the response
response.status_code

# save response data
data = response.json()

## Investigate structure of data returned from API

In [6]:
# inspect repsonse structure
data.keys()

dict_keys(['total', 'region', 'businesses'])

In [7]:
# what is region?
data['region']

{'center': {'latitude': 42.34784169448538, 'longitude': -71.07124328613281}}

In [8]:
# what is total?
data['total']

1100

In [9]:
# what type of object is businesses?
type(data['businesses'])

list

In [10]:
# what is the first business look like?
pprint(data['businesses'][0])

{'alias': 'modern-pastry-shop-boston',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
                {'alias': 'coffee', 'title': 'Coffee & Tea'},
                {'alias': 'desserts', 'title': 'Desserts'}],
 'coordinates': {'latitude': 42.36324, 'longitude': -71.05474},
 'display_phone': '(617) 523-3783',
 'distance': 2185.181158712615,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'is_closed': False,
 'location': {'address1': '257 Hanover St',
              'address2': '',
              'address3': '',
              'city': 'Boston',
              'country': 'US',
              'display_address': ['257 Hanover St', 'Boston, MA 02113'],
              'state': 'MA',
              'zip_code': '02113'},
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'transactions': ['delivery'],
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston

In [11]:
# get the unique ids
ids = set(line['id'] for line in data['businesses'])

print(len(ids))

20


## Load raw source data into PostgreSQL database

In [12]:
# iterate over response data and insert into a postgreSQL
for i in data['businesses']:
    
    try:

        # put data into databse
        cur.execute("""INSERT INTO yelp_business_search
                       (id, business) 
                       VALUES (%s, %s)""", [i['id'], json.dumps(i)])

    except:
        
        # print warning
        print(i['id'])
        break

54ElwAyN-o8e4uvOkC85hw


In [13]:
# what is the first business look like?
pprint(data['businesses'][0])

{'alias': 'modern-pastry-shop-boston',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
                {'alias': 'coffee', 'title': 'Coffee & Tea'},
                {'alias': 'desserts', 'title': 'Desserts'}],
 'coordinates': {'latitude': 42.36324, 'longitude': -71.05474},
 'display_phone': '(617) 523-3783',
 'distance': 2185.181158712615,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'is_closed': False,
 'location': {'address1': '257 Hanover St',
              'address2': '',
              'address3': '',
              'city': 'Boston',
              'country': 'US',
              'display_address': ['257 Hanover St', 'Boston, MA 02113'],
              'state': 'MA',
              'zip_code': '02113'},
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'transactions': ['delivery'],
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston

## Investigate repsonse fields

In [14]:
categories = set()
for line in data['businesses']:
    try:
        for cat in line['categories']:
            categories.add(cat['alias'])
    except:
        print(line['id'])
    
categories

{'bagels',
 'bakeries',
 'breakfast_brunch',
 'bubbletea',
 'cafes',
 'chocolate',
 'coffee',
 'desserts',
 'icecream',
 'italian',
 'juicebars',
 'sandwiches',
 'seafood'}

In [15]:
transactions = set()
for line in data['businesses']:
    try:
        for i in line['transactions']:
            transactions.add(i)
    except:
        print(line['id'])
    
transactions

{'delivery', 'pickup'}

## Extract raw source data to perform light ETL

In [16]:
def parse_yelp_business(line):
    """
    Flatten nested-json and pull key features from dataset
    """
    
    row = {
        'alias': line['alias'],
        'latitude': line['coordinates']['latitude'],
        'longitude': line['coordinates']['longitude'],
        'id': line['id'],
        'image_url': line['image_url'],
        'address1': line['location']['address1'],
        'address2': line['location']['address2'],
        'address3': line['location']['address3'],
        'city': line['location']['city'],
        'state': line['location']['state'],
        'zip_code': line['location']['zip_code'],
        'name': line['name'],
        'phone': line['phone'],
        'price': line['price'],
        'rating': line['rating'],
        'review_count': line['review_count'],
        'url': line['url'],
    }
    
    # create indicator variables for category
    for cat in categories:
        for i in line['categories']:
            if i['alias'] in cat:
                row[cat] = True
            else:
                row[cat] = False
                
    # create indicator variables for transaction
    for trans in transactions:
        for i in line['transactions']:
            if i in trans:
                row[trans] = True
            else:
                row[trans] = False
                
    return row

In [17]:
# review raw data
data['businesses'][0]

{'alias': 'modern-pastry-shop-boston',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
  {'alias': 'coffee', 'title': 'Coffee & Tea'},
  {'alias': 'desserts', 'title': 'Desserts'}],
 'coordinates': {'latitude': 42.36324, 'longitude': -71.05474},
 'display_phone': '(617) 523-3783',
 'distance': 2185.181158712615,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'is_closed': False,
 'location': {'address1': '257 Hanover St',
  'address2': '',
  'address3': '',
  'city': 'Boston',
  'country': 'US',
  'display_address': ['257 Hanover St', 'Boston, MA 02113'],
  'state': 'MA',
  'zip_code': '02113'},
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'transactions': ['delivery'],
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston?adjust_creative=MGVKNU5prVDnLKTWHJebZQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=MG

In [18]:
# review parses data
parse_yelp_business(data['businesses'][0])

{'address1': '257 Hanover St',
 'address2': '',
 'address3': '',
 'alias': 'modern-pastry-shop-boston',
 'bagels': False,
 'bakeries': False,
 'breakfast_brunch': False,
 'bubbletea': False,
 'cafes': False,
 'chocolate': False,
 'city': 'Boston',
 'coffee': False,
 'delivery': True,
 'desserts': True,
 'icecream': False,
 'id': '54ElwAyN-o8e4uvOkC85hw',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/BBuJF89-g0zFa1HcCHmF0w/o.jpg',
 'italian': False,
 'juicebars': False,
 'latitude': 42.36324,
 'longitude': -71.05474,
 'name': 'Modern Pastry Shop',
 'phone': '+16175233783',
 'pickup': False,
 'price': '$',
 'rating': 4.0,
 'review_count': 1594,
 'sandwiches': False,
 'seafood': False,
 'state': 'MA',
 'url': 'https://www.yelp.com/biz/modern-pastry-shop-boston?adjust_creative=MGVKNU5prVDnLKTWHJebZQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=MGVKNU5prVDnLKTWHJebZQ',
 'zip_code': '02113'}

In [19]:
# create an empty list to hold transformed data
clean = []

# iterate over raw data
for line in data['businesses']:
    parsed = parse_yelp_business(line)
    clean.append(parsed)

In [20]:
# load cleaned data into a Pandas DataFrame
df = pd.DataFrame(clean)

In [21]:
# inspect cleaned data
df.head()

Unnamed: 0,address1,address2,address3,alias,bagels,bakeries,breakfast_brunch,bubbletea,cafes,chocolate,...,phone,pickup,price,rating,review_count,sandwiches,seafood,state,url,zip_code
0,257 Hanover St,,,modern-pastry-shop-boston,False,False,False,False,False,False,...,16175233783,False,$,4.0,1594,False,False,MA,https://www.yelp.com/biz/modern-pastry-shop-bo...,2113
1,12 Farnsworth St,,,flour-bakery-café-boston-4,False,False,False,False,False,False,...,16173384333,,$$,4.5,999,True,False,MA,https://www.yelp.com/biz/flour-bakery-caf%C3%A...,2210
2,323 Hanover St,,,the-daily-catch-boston,False,False,False,False,False,False,...,16175238567,,$$,4.0,1396,False,False,MA,https://www.yelp.com/biz/the-daily-catch-bosto...,2113
3,165 Tremont St,,,thinking-cup-boston-2,False,False,False,False,False,False,...,16174825555,,$$,4.0,1048,False,False,MA,https://www.yelp.com/biz/thinking-cup-boston-2...,2111
4,1595 Washington St,,,flour-bakery-and-cafe-boston-2,False,False,False,False,False,False,...,16172674300,True,$$,4.0,826,True,False,MA,https://www.yelp.com/biz/flour-bakery-and-cafe...,2118


In [22]:
len(df)

20

## Load cleaned data into PostgreSQL

In [89]:
# create a connection to write df to database
engine = create_engine('postgresql://postgres:apassword@localhost:5432/postgres')
df.to_sql(name='yelp_businesses_clean', con=engine, if_exists = 'replace', chunksize=2500, index=False) 