In [1]:
# setup library imports
import io, time, json
import math
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import requests
from bs4 import BeautifulSoup

## Handle Basic HTTP Requests


In [2]:

def retrieve_html(url):
    """
    Fetch a page over HTTP and hand back its status and body.

    Args:
        url (string): address of the page to download

    Returns:
        status_code (integer): HTTP status code of the response
        raw_html (string): response body as text, decoded by `requests`
    """
    page = requests.get(url)
    status_code, raw_html = page.status_code, page.text
    return status_code, raw_html


## Read the Yelp API key

In [3]:
def read_api_key(filepath="yelp_api_key.txt"):
    """
    Load the Yelp API key stored in a text file.

    Args:
        filepath (string): path of the file holding the API key
    Returns:
        api_key (string): file contents with surrounding whitespace removed
    """

    # Adapt this function if the key is kept somewhere other than a plain text file.
    key_file = Path(filepath)
    raw_contents = key_file.read_text()
    return raw_contents.strip()

## Get Yelp business info

In [4]:
def yelp_search(api_key, query):
    """
    Make an authenticated request to the Yelp business search API.

    Args:
        api_key (string): Yelp API key, sent as a Bearer token
        query (string): Search term; it is passed to Yelp as the `location` parameter

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the query
        businesses (list): list of dicts representing each business

    Raises:
        requests.HTTPError: if Yelp answers with an error status (e.g. a bad API key)
    """
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": "Bearer %s" % (api_key)}
    params = {"location": query}
    response = requests.get(url, params=params, headers=headers)
    # Surface HTTP failures directly; otherwise an error payload would only show up
    # as an opaque KeyError on "total" below.
    response.raise_for_status()
    result = response.json()
    return result["total"], result["businesses"]


## Get all URLs

In [5]:
def all_restaurants(api_key, query, max_results=1000, delay=0.2):
    """
    Retrieve the restaurants on Yelp for a given query, page by page.

    Args:
        api_key (string): Yelp API key, sent as a Bearer token
        query (string): Search term; it is passed to Yelp as the `location` parameter
        max_results (integer): upper bound on how many businesses are requested.
            Yelp's search endpoint documents a ceiling on how far a result set can
            be paged (1000 at the time of writing -- verify against your plan);
            asking beyond it yields an error response instead of businesses.
        delay (float): seconds to wait between two consecutive page requests

    Returns:
        results (list): list of dicts representing each business

    Raises:
        requests.HTTPError: if Yelp answers any page with an error status
    """
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": "Bearer %s" % (api_key)}
    page_size = 40
    businesses = []
    offset = 0

    while offset < max_results:
        params = {
            "location": query,
            "categories": "restaurants",
            # Shrink the final page so offset + limit never exceeds max_results.
            "limit": min(page_size, max_results - offset),
            "offset": offset,
        }
        response = requests.get(url, params=params, headers=headers)
        # Fail with the HTTP error itself rather than a KeyError on the payload.
        response.raise_for_status()
        result = response.json()

        page = result["businesses"]
        businesses += page
        offset += page_size

        # The total comes from the page just fetched, so no separate "count"
        # request is needed. An empty page also ends the loop in case the
        # reported total overstates what the API will actually serve.
        if not page or offset >= min(result["total"], max_results):
            break

        # Pause only when another request is about to follow.
        time.sleep(delay)

    return businesses

---

In [6]:
def parse_api_response(data):
    """
    Collect the restaurant URLs from Yelp API search results.

    Args:
        data (list): business dicts as returned by the Yelp search API;
            each one must have a "url" key.

    Returns:
        (list): the "url" value of every business, in input order.
    """

    urls = []
    for business in data:
        urls.append(business["url"])
    return urls


In [7]:


def parse_page(html):
    """
    Extract the reviews embedded in one page of a Yelp restaurant listing.

    The data is read from the first <script type="application/ld+json"> element
    of the page; any later JSON-LD blocks are ignored.

    Args:
        html (string): raw HTML of the page

    Returns:
        reviews (list): one dict per review in that block, with the keys
            "author", "rating" (float), "date" and "description"
        pages (integer): the reported review count divided by 20, rounded up
            (presumably 20 reviews per page -- confirm against the live site)
    """
    document = BeautifulSoup(html, "html.parser")

    # Only the first JSON-LD script on the page is consulted.
    ld_script = document.find_all("script", attrs={"type": "application/ld+json"})[0]
    listing = json.loads(ld_script.contents[0])

    total_reviews = listing["aggregateRating"]["reviewCount"]
    parsed = [
        {
            "author": entry["author"],
            "rating": float(entry["reviewRating"]["ratingValue"]),
            "date": entry["datePublished"],
            "description": entry["description"],
        }
        for entry in listing["review"]
    ]

    return parsed, math.ceil(total_reviews / 20)

In [8]:

def extract_reviews(url, max_pages=51):
    """
    Retrieve the reviews for a single restaurant on Yelp, following pagination.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.
            It may already carry a query string (URLs returned by the Yelp API do).
        max_pages (integer): maximum number of review pages to download,
            counting the first one.

    Returns:
        reviews (list): list of dictionaries containing extracted review information
    """
    # The first page supplies both its own reviews and the total page count.
    first_page_reviews, pages = parse_page(retrieve_html(url)[1])

    reviews = []
    for i in range(min(pages, max_pages)):
        print(f"parsing - {i} page")
        if i == 0:
            # Already downloaded above; do not fetch it a second time.
            page_reviews = first_page_reviews
        else:
            # 20 matches the page size parse_page uses to compute `pages`.
            page_html = retrieve_html(_page_url(url, 20 * i))[1]
            page_reviews, _ = parse_page(page_html)
        reviews += page_reviews

    return reviews


def _page_url(url, start):
    """Return `url` with its `start` query parameter set, keeping the other parameters."""
    parts = urlsplit(url)
    # Appending "?start=N" verbatim would corrupt a URL that already has a query
    # string, so the query is rebuilt instead.
    query = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "start"
    ]
    query.append(("start", str(start)))
    return urlunsplit(parts._replace(query=urlencode(query)))


In [None]:
# Driver cell: for every place, look up its restaurants through the Yelp API,
# scrape each restaurant's reviews and dump them to "<place>.review".
api_key = read_api_key()
places = ['Shadyside, Pittsburgh']

for place in places:
    print(f"Search restaurants in {place} :")
    # `place` is used as the search location; the result is a list of business dicts.
    businesses = all_restaurants(api_key, place)
    urls = parse_api_response(businesses)

    print("Start extract reviews:")
    # Opening with "w" overwrites any file left by a previous run for this place.
    with open(place+".review","w") as fd:
        for url in urls:
            # An exception raised here propagates and ends the run for the
            # remaining restaurants.
            reviews = extract_reviews(url)
            print(f"reviews number: {len(reviews)}")
            for review in reviews:
                # One JSON object per line (JSON Lines layout).
                fd.write(json.dumps(review)+"\n")


Search restaurants in Shadyside, Pittsburgh :
Start extract reviews:
parsing - 0 page
parsing - 1 page
parsing - 2 page
parsing - 3 page
parsing - 4 page
parsing - 5 page
parsing - 6 page
parsing - 7 page
parsing - 8 page
parsing - 9 page
parsing - 10 page
parsing - 11 page
parsing - 12 page
parsing - 13 page
parsing - 14 page
parsing - 15 page
parsing - 16 page
