# Yelp API data

## 1. Data requesting

In [16]:
from __future__ import print_function

import argparse
import json
import pprint
import requests
import sys
import urllib
import progressbar

try:
    # For Python 3.0 and later
    from urllib.error import HTTPError
    from urllib.parse import quote
    from urllib.parse import urlencode
except ImportError:
    # Fall back to Python 2's urllib2 and urllib
    from urllib2 import HTTPError
    from urllib import quote
    from urllib import urlencode

In [46]:
import os

# Yelp Fusion authenticates every request with a private API key, sent as a
# bearer token. Keys are managed at
# https://www.yelp.com/developers/v3/manage_app
#
# NOTE(review): the fallback literal below is a credential committed to source.
# Prefer supplying the key through the YELP_API_KEY environment variable and
# rotate this key, since anyone who can read this file can use it.
API_KEY = os.environ.get(
    'YELP_API_KEY',
    "bxRZR_xJJUk2x9rEDFC9p7G-eP6UtgBO4gjLK1uajbWFZX4LifAy7yJXQmgmjKHdTIbngaIF0spwIRFhAcp4-Ro2RNBuKxL5xflSQ5mQ-usaMfsJWweDjQDuiqQNYnYx",
)

# API constants; the query parameters the search endpoint accepts are listed at
# https://www.yelp.com/developers/documentation/v3/business_search
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/'

# Defaults for simple example.
DEFAULT_TERM = 'restaurant'
DEFAULT_LOCATION = 'Manhattan'
SEARCH_LIMIT = 50  # value sent as the 'limit' query parameter
DEFAULT_OFFSET = 1000  # exclusive upper bound for paging offsets; should be no more than 1000


def request(host, path, api_key, url_params=None, timeout=30):
    """Send an authenticated GET request to the API and decode the JSON reply.

    Args:
        host (str): The domain host of the API
        path (str): The path of the API after the domain
        api_key (str): Your API Key, sent as a bearer token
        url_params (dict): An optional set of query parameters in the request
        timeout (float or tuple): Seconds to wait for the server before giving
            up; handed to ``requests`` unchanged. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        dict: The JSON response from the request. The HTTP status code is not
        checked here, so an error reply comes back as its decoded JSON body.

    Raises:
        requests.exceptions.RequestException: The request could not be
            completed (connection failure, timeout, ...).
        ValueError: The response body is not valid JSON.
    """
    url_params = url_params or {}
    # Only the path is percent-quoted; the host is used verbatim.
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    response = requests.request(
        'GET', url, headers=headers, params=url_params, timeout=timeout)

    return response.json()


def search(api_key, term, location, offset, category):
    """Query the Search API by a search term and location.

    Args:
        api_key (str): Your API Key
        term (str): The search term passed to the API
        location (str): The search location passed to the API
        offset (int): Value sent as the ``offset`` query parameter (paging)
        category (str): Value sent as the ``categories`` query parameter

    Returns:
        dict: The JSON response from the request.
    """
    # Hand over the raw strings: the values travel through the ``params``
    # argument of ``requests``, which percent-encodes them itself. Replacing
    # spaces with '+' beforehand made the API receive a literal plus sign
    # (encoded as %2B) instead of a space.
    url_params = {
        'term': term,
        'location': location,
        'limit': SEARCH_LIMIT,
        'offset': offset,
        'categories': category,
    }

    return request(API_HOST, SEARCH_PATH, api_key, url_params=url_params)


def get_business(api_key, business_id):
    """Fetch a single business from the Business API.

    Args:
        api_key (str): Your API Key
        business_id (str): The ID of the business to query

    Returns:
        dict: The JSON response from the request.
    """
    # The business ID is appended directly to the business endpoint path.
    return request(API_HOST, BUSINESS_PATH + business_id, api_key)


def query_api(term, location, category):
    """Page through the Search API and collect every returned business.

    Args:
        term (str): The search term to query
        location (str): The location of the business term to query
        category (str): The category filter to query; also shown in the
            progress bar label

    Returns:
        list: The business records gathered from all pages, or None (after
        printing a notice) when no business was found at all.
    """
    businesses = []

    # progress bar
    bar = progressbar.ProgressBar(
        widgets=[
            'Loading ' + category,
            ' ', progressbar.Percentage(),
            ' ', progressbar.Bar('#'),
            ' ', progressbar.Timer(),
            ' ( ', progressbar.ETA(), ' ) ',
        ]
    )

    # Step by the page size actually requested so pages stay contiguous even
    # if SEARCH_LIMIT is changed.
    for offset in bar(range(0, DEFAULT_OFFSET, SEARCH_LIMIT)):
        response = search(API_KEY, term, location, offset, category)
        # A reply without a 'businesses' entry (e.g. an error body) counts as
        # an empty page instead of crashing extend() with None.
        businesses.extend(response.get('businesses') or [])

    # if found nothing:
    if not businesses:
        print(u'No businesses for {0} in {1} found.'.format(term, location))
        return None

    return businesses

def main(category='all', term="", location=""):
    """Get the query result of the input category from the Yelp API.

    Args:
        category (str): requested category to query in yelp
        term (str): The business term given; DEFAULT_TERM when empty
        location (str): The location of the business term to query;
            DEFAULT_LOCATION when empty

    Returns:
        list: The business records found, or an empty list when the query
        found nothing.
    """
    # Set to default if not declared
    term = term or DEFAULT_TERM
    location = location or DEFAULT_LOCATION

    try:
        found = query_api(term, location, category)
    # NOTE(review): HTTPError here is the urllib class imported at the top of
    # the file, while the request helper goes through ``requests``, which has
    # its own exception types -- confirm this handler can still trigger.
    except HTTPError as error:
        sys.exit(
            'Encountered HTTP error {0} on {1}:\n {2}\nAbort program.'.format(
                error.code,
                error.url,
                error.read(),
            )
        )

    # query_api returns None when nothing was found; extending a list with
    # None raised TypeError, so fall back to an empty result instead.
    return list(found) if found else []

In [47]:
if __name__ == '__main__':
    # One search run per cuisine category, in this order; each name receives
    # the list returned by main() for its category.
    (cn, hal, jap, korean,
     vegan, mexican, italian, mide) = [
        main(category=cuisine)
        for cuisine in ('chinese', 'halal', 'japanese', 'korean',
                        'vegan', 'mexican', 'italian', 'mideastern')
    ]

Loading chinese 100% |################| Elapsed Time: 0:00:19 ( Time: 0:00:19 ) 
Loading halal 100% |##################| Elapsed Time: 0:00:14 ( Time: 0:00:14 ) 
Loading japanese 100% |###############| Elapsed Time: 0:00:18 ( Time: 0:00:18 ) 
Loading korean 100% |#################| Elapsed Time: 0:00:15 ( Time: 0:00:15 ) 
Loading vegan 100% |##################| Elapsed Time: 0:00:14 ( Time: 0:00:14 ) 
Loading mexican 100% |################| Elapsed Time: 0:00:18 ( Time: 0:00:18 ) 
Loading italian 100% |################| Elapsed Time: 0:00:19 ( Time: 0:00:19 ) 
Loading mideastern 100% |#############| Elapsed Time: 0:00:13 ( Time: 0:00:13 ) 


## 2. Data Modification

### (1) transfer data to needed information in json

In [48]:
def compress_DB(data, label):
    """Trim each record in place to the fields we want and tag it.

    Args:
        data: (list) records to trim; every record must carry a 'location'
            mapping with 'address1' and 'zip_code'
        label: (str) the value stored under 'cuisineType' on every record

    Returns:
        list: the same list object, with every record reduced in place
    """
    wanted = {'id', 'name', 'address', 'rating', 'review_count',
              'coordinates', 'zip_code'}

    for entry in data:
        # Decide what to drop before touching the record.
        surplus = [field for field in entry.keys() if field not in wanted]

        # 'location' is among the dropped keys, so grab it first.
        location = entry['location']
        for field in surplus:
            del entry[field]

        # Flatten the two address fields we need and attach the label.
        entry['cuisineType'] = label
        entry['address'] = location['address1']
        entry['zip_code'] = location['zip_code']

    return data

In [49]:
# data combination
# Trim every cuisine's records and gather them into one combined list.
dataDB = []
for records, label in (
        (cn, 'chinese'),
        (hal, 'halal'),
        (jap, 'japanese'),
        (korean, 'korean'),
        (vegan, 'vegan'),
        (mexican, 'mexican'),
        (italian, 'italian'),
        (mide, 'middle-east'),
):
    dataDB += compress_DB(records, label)

In [56]:
# delete duplicates
def delete(all_data):
    """Remove records whose 'id' repeats, keeping the last one of each id.

    Args:
        all_data (list): records, each carrying an 'id' key

    Returns:
        list: the same list object, modified in place, with unique ids
    """
    seen_ids = set()
    survivors = []

    # Walk from the end so the last record of every id is the one kept.
    for record in reversed(all_data):
        record_id = record['id']
        if record_id not in seen_ids:
            seen_ids.add(record_id)
            survivors.append(record)

    # Restore the original ordering and write the result back in place.
    survivors.reverse()
    all_data[:] = survivors

    return all_data

In [66]:
# Drop records that share an id with a later record in the combined list.
dataDB = delete(dataDB)

### (2) Write data to dynamoDB

In [67]:
def write_jsonDB(data, name):
    """Store the data as pretty-printed JSON in ./<name>.json.

    Args:
        data (list): json format data
        name (str): name of file to store, without the .json extension

    Returns:
        None
    """
    target = './{}.json'.format(name)
    with open(target, 'w', encoding='UTF-8') as out_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        serialized = json.dumps(data, indent=2, ensure_ascii=False)
        out_file.write(serialized)

In [None]:
# Dump the combined records to ./dataDB.json.
write_jsonDB(dataDB, name="dataDB")

### (3) Convert to Elastic Search format

In [209]:
def convertES(data):
    """Turn the records into alternating action / document entries for ES.

    Args:
        data (list): original json format data; every record needs the
            'cuisineType' and 'id' keys

    Returns:
        list: two dicts per input record -- an 'index' action naming the
        target index and a 1-based _id, then the document itself
    """
    bulk_entries = []
    for doc_id, entry in enumerate(data, start=1):
        # The cuisine type doubles as the index name.
        action = {'index': {'_index': entry['cuisineType'], "_id": doc_id}}
        document = {'restaurant_id': entry['id']}
        bulk_entries.extend((action, document))

    return bulk_entries

In [210]:
# Build the action/document entry pairs for Elasticsearch from the combined records.
dataES = convertES(dataDB)

### (4) Write data to ES

In [211]:
def write_jsonES(data, name):
    """Write the entries to ./<name>.json, one JSON object per line.

    Args:
        data (list): converted json format data
        name (str): name of file to store, without the .json extension

    Returns:
        None
    """
    # newline='' switches off newline translation. With the default, the '\n'
    # of the explicit '\r\n' terminator is expanded again on Windows and each
    # line ends in '\r\r\n'.
    with open('./{}.json'.format(name), 'w', encoding='UTF-8', newline='') as fp:
        for ele in data:
            # every entry goes on its own line, terminated by CRLF
            fp.write(json.dumps(ele, ensure_ascii=False))
            fp.write('\r\n')

In [212]:
# Write one JSON object per line to ./dataES_test.json.
write_jsonES(dataES, name='dataES_test')

## 3. DynamoDB Data Insert

In [2]:
import json
import boto3
from botocore.exceptions import ClientError

Since we need to upload the data to DynamoDB on AWS, we write a Lambda function below to trigger the upload.

In [164]:
def lambda_handler(event, context):
    """Load the records from all.json and insert them into DynamoDB.

    Args:
        event: unused
        context: unused

    Returns:
        None
    """
    insert_data(read_json(file_name='all.json'))
    return None

def read_json(file_name):
    """Load and return the JSON content of a file.

    Args:
        file_name (str): path of the JSON file to read

    Returns:
        The decoded JSON value (dict, list, ...).
    """
    # Read explicitly as UTF-8: the writers in this file emit UTF-8 with
    # ensure_ascii=False, so relying on the platform default encoding can fail
    # on non-ASCII names.
    with open(file_name, 'r', encoding='utf-8') as load_f:
        return json.load(load_f)


def _floats_to_decimal(value):
    """Return a copy of value with every float, at any depth, as a Decimal."""
    from decimal import Decimal

    if isinstance(value, float):
        # Go through str() so 4.5 becomes Decimal('4.5'), not the long binary
        # expansion of the float.
        return Decimal(str(value))
    if isinstance(value, dict):
        return {k: _floats_to_decimal(v) for k, v in value.items()}
    if isinstance(value, list):
        return [_floats_to_decimal(v) for v in value]
    return value


def insert_data(data_list, db=None, table='6998Demo'):
    """Put every item of data_list into a DynamoDB table.

    Args:
        data_list (list): items (dicts) to store
        db: DynamoDB service resource; created with boto3 when not given
        table (str): name of the target table

    Returns:
        The response of the last put_item call, or None when data_list is
        empty.
    """
    if not db:
        db = boto3.resource('dynamodb')
    table = db.Table(table)

    # Stays None for an empty list; previously the return statement hit an
    # unbound local in that case.
    response = None

    # overwrite if the same index is provided
    for data in data_list:
        # The boto3 DynamoDB serializer rejects Python floats (ratings and
        # coordinates loaded from JSON), so convert them to Decimal first.
        response = table.put_item(Item=_floats_to_decimal(data))

    return response


def lookup_data(key, db=None, table='6998DB'):
    """Tell whether an item with the given key exists in a DynamoDB table.

    Args:
        key (dict): primary key of the item to look up
        db: DynamoDB service resource; created with boto3 when not given
        table (str): name of the table to query

    Returns:
        bool: True when the item was found; False when it is missing or the
        request failed with a ClientError.
    """
    if not db:
        db = boto3.resource('dynamodb')
    table = db.Table(table)
    try:
        response = table.get_item(Key=key)
    except ClientError:
        return False

    # get_item does not raise for a missing key; the reply simply has no
    # 'Item' entry, so success of the call alone does not prove existence.
    return 'Item' in response


def update_item(key, feature, db=None, table='6998DB'):
    """Set the "from" attribute of one item and print the response.

    Args:
        key (dict): primary key of the item to update
        feature: new value stored under the "from" attribute
        db: DynamoDB service resource; created with boto3 when not given
        table (str): name of the table holding the item

    Returns:
        The update_item response, requested with the updated attributes
        (ReturnValues="UPDATED_NEW").
    """
    if not db:
        db = boto3.resource('dynamodb')

    # The attribute name and value are both passed through placeholders
    # (#feature / :f) rather than written into the expression.
    update_args = {
        'Key': key,
        'UpdateExpression': "set #feature=:f",
        'ExpressionAttributeNames': {"#feature": "from"},
        'ExpressionAttributeValues': {':f': feature},
        'ReturnValues': "UPDATED_NEW",
    }
    result = db.Table(table).update_item(**update_args)

    print(result)
    return result

def delete_item(key, db=None, table='6998DB'):
    """Delete one item from a DynamoDB table and print the outcome.

    Args:
        key (dict): primary key of the item to delete
        db: DynamoDB service resource; created with boto3 when not given
        table (str): name of the table holding the item

    Returns:
        The delete_item response, or None when the call failed with a
        ClientError (its message is printed instead).
    """
    if not db:
        db = boto3.resource('dynamodb')
    target = db.Table(table)

    try:
        result = target.delete_item(Key=key)
    except ClientError as err:
        print('Error', err.response['Error']['Message'])
        return None

    print(result)
    return result

## 4. Elastic Search

In [None]:
def es_match(message, number=3):
    """Fetch random restaurant ids for a cuisine from Elasticsearch.

    The cuisine in ``message['cuisine']`` is used as the index name to search.

    Args:
        message (dict): must contain the key 'cuisine'
        number (int): number of records to return

    Returns:
        list: one ``{'id': <restaurant_id>}`` dict per hit, shaped for a
        DynamoDB key lookup; empty when the reply contains no hits (for
        example an error reply).
    """
    # NOTE(review): AWS4Auth is not imported in the visible part of this file
    # (presumably from requests_aws4auth) -- confirm it is in scope.
    region = 'us-east-1'
    service = 'es'
    credentials = boto3.Session().get_credentials()
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

    host = 'https://search-restaurants-hziyyv6c43ou56b6nj7jefglp4.us-east-1.es.amazonaws.com' # OpenSearch domain endpoint
    index = message['cuisine']
    url = host + '/' + index + '/_search'

    # Put the user query into the query DSL for more accurate search results.
    query = {
        "size": number, # number of records to return
        "query": {
            "multi_match": {
                "query": index,      # match word
                "fields": ["_index"] # search field, can be multiple matching
            }
        },
        "sort": {
            "_script": {
                "script": "Math.random()", # shuffle to get random results
                "type": "number",
                "order": "asc"
            }
        }
    }

    # Elasticsearch 6.x requires an explicit Content-Type header
    headers = {"Content-Type": "application/json" }

    # Make the signed HTTP request
    r = requests.get(url, auth=awsauth, headers=headers, data=json.dumps(query))

    # A reply without 'hits' (e.g. an error body) yields an empty result
    # instead of a KeyError.
    result = json.loads(r.text)
    hits = (result.get('hits') or {}).get('hits') or []

    # transfer format 'restaurant_id' to 'id' to fit dynamoDB search
    return [{'id': record['_source']['restaurant_id']} for record in hits]