# Rating prediction using Bidirectional LSTM - part 1: data preparation

## Introduction

In this notebook, we use BeautifulSoup to parse the html information from the webpages, then we use Yelp Developers API to extract restaurant reviews. We extract around 80K reviews from restaurant located in 4 districts(Shadyside, Point Breeze, Downtown and Strip district) in Pittsburgh. We then combine the 4 sets of reviews together into a single csv file which will be used as a raw dataset for the model training in part2.

## 1. Install and import packages


We use BeautifulSoup to parse the html pages on Yelp website. The other packages are assumed to be installed on the machine.

In [1]:
# setup library imports
import io, time, json
import requests
from pathlib import Path
from bs4 import BeautifulSoup
import math

## 2. Get web page html

In [2]:
def retrieve_html(url):
    """
    Return the raw HTML at the specified URL.

    Args:
        url (string): 

    Returns:
        status_code (integer):
        raw_html (string): the raw HTML content of the response, properly encoded according to the HTTP headers.
    """
    response = requests.get(url)
    return response.status_code, response.text


## 3. Read yelp API key

In [3]:
def read_api_key(filepath="yelp_api_key.txt"):
    """
    Read the Yelp API Key from file.
    
    Args:
        filepath (string): File containing API Key
    Returns:
        api_key (string): The API Key
    """
    
    # Feel free to modify this function if you are storing the API Key differently
    return Path(filepath).read_text().strip()

## 4. Get yelp business Info

In [4]:
def yelp_search(api_key, query):
    """
    Make an authenticated request to the Yelp API.

    Args:
        query (string): Search term

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the query
        businesses (list): list of dicts representing each business
    """
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization" : "Bearer %s" % (api_key)}
    params = {"location" : query}
    response = requests.get(url, params = params, headers = headers)
    result = json.loads(response.text)
    return result["total"], result["businesses"]


## 5. Get all restaurants business information

The important module in this part is collecting all the restaurants based on the filter we set. Since the API has a limit of accessing frequenccy, we should scrap the results with short intervals and page by page.

In [5]:
def all_restaurants(api_key, query):
    """
    Retrieve ALL the restaurants on Yelp for a given query.

    Args:
        query (string): Search term

    Returns:
        results (list): list of dicts representing each business
    """
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization" : "Bearer %s" % (api_key)}
    offset = 0
    params = {"location" : query, "categories": "restaurants", "limit": 40}
    response = requests.get(url, params = params, headers = headers)
    result = json.loads(response.text)
    total = result["total"]
    final = []
    
    while offset<total:
        params["offset"] = offset
        response = requests.get(url, params = params, headers = headers)
        offset += 40
        result = json.loads(response.text)
        final += result["businesses"]
        time.sleep(0.2)
    return final

## 6. Get urls from business information

In [6]:
def parse_api_response(data):
    """
    Parse Yelp API results to extract restaurant URLs.
    
    Args:
        data (string): String of properly formatted JSON.

    Returns:
        (list): list of URLs as strings from the input JSON.
    """
    
    return [ele["url"].split("?")[0] for ele in data]


## 7. Parse html page

In [7]:
def parse_page(html):

    soup = BeautifulSoup(html, "html.parser")
    
    result = []
    reviews = soup.find_all("script",attrs={"type":"application/ld+json"})[0]
    reviews = json.loads(reviews.contents[0])
    reviews_count = reviews["aggregateRating"]["reviewCount"]
    reviews = reviews["review"]
    for review in reviews:
        tmp = {}
        tmp["author"] = review["author"]
        tmp["rating"] = float(review["reviewRating"]["ratingValue"])
        tmp["date"] = review["datePublished"]
        tmp["description"] = review["description"]
        result.append(tmp)
         
    return result, math.ceil(reviews_count/20)

## 8. Extract yelp review from html pages

In [8]:
def extract_reviews(url):
    """
    Retrieve ALL of the reviews for a single restaurant on Yelp.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.

    Returns:
        reviews (list): list of dictionaries containing extracted review information
    """
    tmp, pages = parse_page(retrieve_html(url)[1])
    res = []
    for i in range(pages):
        print(f"parsing - {i} page")
        if i>10:
            break
        if i>0:
            current, tmp_count = parse_page(retrieve_html(url+"?start="+str(20*i))[1])
        else:
            current = tmp
        res+=current[:]
    
    return res


## 9. Start scrap yelp review data

In [12]:
api_key = read_api_key()
places = ['Strip District, Pittsburgh']
# places = ['Shadyside, Pittsburgh','Downtown, Pittsburgh', 'Strip District, Pittsburgh', 'Point Breeze, Pittsburgh']

for place in places:
    print(f"Search restaurants in {place} :")
    businesses = all_restaurants(api_key, place)
    urls = set(parse_api_response(businesses))
    
    print("Start extract reviews:")
    with open(place+".review","w") as fd:
        for url in urls:
            print(url)
            try:
                reviews = extract_reviews(url)
                reviews = [json.dumps(rev) for rev in reviews]
                print(f"reviews number: {len(reviews)}, distinct_reviews: {len(set(reviews))}")
                for review in reviews:
                    fd.write(review+"\n")
            except Exception:
                print("Running into error!")
                print(Exception)
                continue

Search restaurants in Strip District, Pittsburgh :
Start extract reviews:
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Running into error!
<class 'Exception'>
Runnin

KeyboardInterrupt: 

## 10. Transform review data file from json to csv

In [6]:
import csv

file_names = [place+".review" for place in places]

entire_review_data = []

for file_name in file_names:
    with open(file_name, "r") as fd:
        line = fd.readline()
        while line :
            review_dict = json.loads(line)
            text, rating = review_dict["description"], review_dict["rating"]
            text = text.replace("\"","\'").replace("\n", "")
            entire_review_data.append([rating, text])
            line = fd.readline()

def write_csv_file(file_name, review_data):
    with open(file_name, "w") as fd:
        fd.write("rating,text\n")
        for rating, text in review_data:
            fd.write("\""+str(rating)+"\""+","+"\""+text+"\"\n")


## 11. Split dataset to train, dev and test

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(entire_review_data, test_size=0.3, random_state=15618)

write_csv_file("Pittsburgh_review.train", train_data)
write_csv_file("Pittsburgh_review.test", test_data)



## BILSTM Model
