# Yelp API

In [1]:
import psycopg2
from pprint import pprint
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

## Connect to PostgreSQL database

In [2]:
# connect to the databse
conn = psycopg2.connect(database="postgres",
                        user="postgres",
                        password="apassword",
                        host="192.168.0.104",
                        port="5432")

# enable autocommit
conn.autocommit = True

# define cursor
cur = conn.cursor()

## Get raw source data from database to process

In [3]:
cur.execute("""SELECT business
               FROM yelp_business_search;""")

# load the records into a list
data = [record[0] for record in cur]

print(len(data))

1000


## Investigate repsonse fields

In [4]:
# review raw data
data[1]

{'alias': 'flour-bakery-café-boston-4',
 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'},
  {'alias': 'coffee', 'title': 'Coffee & Tea'},
  {'alias': 'sandwiches', 'title': 'Sandwiches'}],
 'coordinates': {'latitude': 42.35137, 'longitude': -71.04881},
 'display_phone': '(617) 338-4333',
 'distance': 1886.6887381554013,
 'id': '-5gWvrcKOPmhlcZju3tpbw',
 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/I3n77tHGUtZRerpsU8CtVQ/o.jpg',
 'is_closed': False,
 'location': {'address1': '12 Farnsworth St',
  'address2': '',
  'address3': '',
  'city': 'Boston',
  'country': 'US',
  'display_address': ['12 Farnsworth St', 'Boston, MA 02210'],
  'state': 'MA',
  'zip_code': '02210'},
 'name': 'Flour Bakery + Café',
 'phone': '+16173384333',
 'price': '$$',
 'rating': 4.5,
 'review_count': 999,
 'transactions': [],
 'url': 'https://www.yelp.com/biz/flour-bakery-caf%C3%A9-boston-4?adjust_creative=MGVKNU5prVDnLKTWHJebZQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_so

In [5]:
categories = set()
for line in data:
    try:
        for cat in line['categories']:
            categories.add(cat['alias'])
    except:
        print(line['id'])
    
list(categories)[:5]

['beerbar', 'kitchenandbath', 'tea', 'australian', 'seafood']

In [6]:
transactions = set()
for line in data:
    try:
        for i in line['transactions']:
            transactions.add(i)
    except:
        print(line['id'])
    
transactions

{'delivery', 'pickup'}

## Extract raw source data to perform light ETL

In [7]:
def get_categories(categories):
    
    """
    Pull all categories and convert to list
    """
    
    all_categories = []
    for i in categories:
        all_categories.append(i['alias'])
        
    return all_categories

In [8]:
def parse_yelp_business(line):
    """
    Flatten nested-json and pull key features from dataset
    """
    
    row = {
        'latitude': line['coordinates']['latitude'],
        'longitude': line['coordinates']['longitude'],
        'id': line['id'],
        'address1': line['location']['address1'],
        'address2': line['location']['address2'],
        'address3': line['location']['address3'],
        'city': line['location']['city'],
        'state': line['location']['state'],
        'zip_code': line['location']['zip_code'],
        'name': line['name'],
        'phone': line['phone'],
        'rating': line['rating'],
        'review_count': line['review_count'],
        'categories': get_categories(line['categories']),
    }
    
    if 'price' in row.keys():
        row['price'] = len(row['price'].strip())
    else:
        row['price'] = np.nan
                
    return row

In [9]:
# create an empty list to hold transformed data
clean = []

# iterate over raw data
for line in data:
    parsed = parse_yelp_business(line)
    clean.append(parsed)

In [10]:
# load cleaned data into a Pandas DataFrame
df = pd.DataFrame(clean)

In [11]:
# inspect cleaned data
df.head()

Unnamed: 0,address1,address2,address3,categories,city,id,latitude,longitude,name,phone,price,rating,review_count,state,zip_code
0,257 Hanover St,,,"[bakeries, coffee, desserts]",Boston,54ElwAyN-o8e4uvOkC85hw,42.36324,-71.05474,Modern Pastry Shop,16175233783,,4.0,1594,MA,2113
1,12 Farnsworth St,,,"[bakeries, coffee, sandwiches]",Boston,-5gWvrcKOPmhlcZju3tpbw,42.35137,-71.04881,Flour Bakery + Café,16173384333,,4.5,999,MA,2210
2,323 Hanover St,,,"[italian, seafood, coffee]",Boston,uXOVFQraz1va1TrtgiqYTg,42.36406,-71.05397,The Daily Catch,16175238567,,4.0,1396,MA,2113
3,165 Tremont St,,,[coffee],Boston,YPMMkUSAvZX4K-JsUbREyQ,42.3539,-71.06372,Thinking Cup,16174825555,,4.0,1048,MA,2111
4,1595 Washington St,,,"[bakeries, coffee, sandwiches]",Boston,SvSjcS0jjnFwl27RMnrkQQ,42.338443,-71.074548,Flour Bakery & Cafe,16172674300,,4.0,826,MA,2118


## Load cleaned data into PostgreSQL

In [12]:
# create a connection to write df to database
engine = create_engine('postgresql://postgres:apassword@localhost:5432/postgres')
df.to_sql(name='yelp_business_clean', con=engine, if_exists = 'replace', chunksize=2500, index=False) 