In [1]:
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
import time
import random

In [None]:
"""
    Legal Issues:
        Don't worry about it.
        But keep note, I should respect robots.txt
        But if I do collect the data, DO NOT give it to others.
        Just don't sell or advertise that the data is mine.
        Don't mess with financial or gov data.
        Don't infringe on actual copyright stuff.
        Don't enter anything that requires permission, passwords.
        Don't scrape emails, usernames.
        Don't spam forms.
"""

"""
    Dealing With HTTP Errors:
        Don't move scrapers too quickly.
        Change headers.
        Don't do anything a human wouldn't.
"""

"""
    Scraping Remotely:
        Use TOR browser (bounces IP address)
        DuckDuckGo does not store cookies
        Scrape on Google Cloud. Has access to changing IP addresses.
"""

In [2]:
# One shared HTTP session, reused by the scraping cells below.
session = requests.Session()

# Request headers sent with every session.get() call in this notebook.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

In [None]:
'''
    Go to starting page.
    Gather all the links in that one page.
    Go to next page.
    Repeat.
    Stop when the page does not exist. 
        Problem: the page exists even when it shouldn't
        Example: searching &start=1000 seems to produce the first page
        Solution: need to create a numpy array of links,
            append the new links if the new links are not found
            stop if the links are found
        Current Solution (going to change this): 
            Specify a limit
'''

In [3]:
def gather_Yelp_data(url, cap_num):
    """
        Given a starting url and a page limit,
        gather all the links to businesses.

        Pages 0 through cap_num (inclusive) are requested by appending
        "&start=<page * 10>" to `url`, using the notebook-level `session`
        and `headers`. From each page, the href of every
        <a class="biz-name js-analytics-click"> element is collected.

        A page that fails (network error, HTTP error status, parsing
        problem, link without href) is reported on stdout and skipped,
        so one bad page does not abort the whole crawl and the reason
        for an empty result is visible.

        Parameters:
            url     -- search url that already contains a query string
            cap_num -- index of the last page to visit; a negative value
                       visits no page at all

        Returns a numpy array of all links gathered (empty if none).
    """
    item_number = 0
    # Collect into a list and convert once at the end; growing a numpy
    # array with np.append copies the whole array on every link.
    websites = []
    while item_number <= cap_num:
        page_url = url + "&start=" + str(item_number * 10)
        try:
            req = session.get(page_url, headers=headers)
            # Turn 4xx/5xx answers into an exception instead of parsing
            # an error page that contains no business links.
            req.raise_for_status()
            bs = BeautifulSoup(req.text, 'lxml')
            links = bs.find_all('a', {'class': "biz-name js-analytics-click"})
            for link in links:
                websites.append(link.attrs['href'])
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            print("Skipping page %s (%s): %r" % (item_number, page_url, err))
        # Random pause of up to one second between page requests.
        time.sleep(random.random())
        item_number += 1
    return np.array(websites)
starturl = """https://www.yelp.com/search?find_desc=cafe&find_loc=60637"""

start_time = time.time()
sites = gather_Yelp_data(starturl, 0)
print("--- %s seconds ---" % (time.time() - start_time))
sites

--- 0.9090261459350586 seconds ---


array([], dtype=float64)

In [4]:
"""
    Huge Problem of Inconsistency:
    Sometimes results are returned; other times nothing is returned.
    Why? I don't know. Running the same thing returns different results.
    So how do I know when I'm not getting any result? Label Them
"""
def gather_Yelp_data(url, cap_num):
    """
        Gather business links from Yelp search pages, labelling every
        link with the index of the page it came from.

        This redefines (and therefore replaces) the earlier
        array-returning function of the same name in this notebook.

        Pages 0 through cap_num (inclusive) are requested by appending
        "&start=<page * 10>" to `url`, using the notebook-level `session`
        and `headers`. A page that fails (network error, HTTP error
        status, parsing problem, link without href) is reported on
        stdout and skipped; a page index that is missing from the 'Num'
        column therefore produced no links.

        Parameters:
            url     -- search url that already contains a query string
            cap_num -- index of the last page to visit

        Returns a tuple (DataFrame with columns 'Link' and 'Num', cap_num).
    """
    item_number = 0
    websites = []
    num_label = []
    while item_number <= cap_num:
        page_url = url + "&start=" + str(item_number * 10)
        try:
            req = session.get(page_url, headers=headers)
            # Turn 4xx/5xx answers into an exception instead of parsing
            # an error page that contains no business links.
            req.raise_for_status()
            bs = BeautifulSoup(req.text, 'lxml')
            links = bs.find_all('a', {'class': "biz-name js-analytics-click"})
            for link in links:
                # Read the href first so both lists stay the same length
                # if a link has no href attribute.
                link_string = link.attrs['href']
                websites.append(link_string)
                num_label.append(item_number)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            print("Skipping page %s (%s): %r" % (item_number, page_url, err))
        # Random pause of up to one second between page requests.
        time.sleep(random.random())
        item_number += 1
    return pd.DataFrame({'Link': websites, 'Num': num_label}), cap_num
starturl = """https://www.yelp.com/search?find_desc=cafe&find_loc=60637"""

yelp_page_num = 0
start_time = time.time()
sites, yelp_page_num = gather_Yelp_data(starturl, yelp_page_num)
print("--- %s seconds ---" % (time.time() - start_time))
sites

--- 21.07632803916931 seconds ---


Unnamed: 0,Link,Num


In [6]:
req = session.get(starturl + "&start=" + str(0), headers=headers)

In [7]:
bs = BeautifulSoup(req.text, 'lxml')
links = bs.findAll('a', {'class': "biz-name js-analytics-click"})
for link in links:
    link_string = link.attrs['href']
    print (link_string)
websites = []

/adredir?ad_business_id=JbAkhS96tW5lbOLjVwsQlQ&campaign_id=v9vTEM8xECDPpPldqe7BxQ&click_origin=search_results&placement=above_search&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fedible-arrangements-chicago-11&request_id=6b8306fcab115a64&signature=93ab4e2d026292b1e023ae6e9feff22376ad737cbc631f8d8da30a0c65f79a3c&slot=0
/adredir?ad_business_id=jIUfYtTGz3nEXxGid5kH3w&campaign_id=eBRciaMya6NTx6XfPYxGlw&click_origin=search_results&placement=above_search&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fritas-italian-ice-burbank-2&request_id=6b8306fcab115a64&signature=6fc20c1c0aaa99b8bb589851a40cf41480126a22267621943ffb027c7907be52&slot=1
/biz/plein-air-cafe-and-eatery-chicago-2?osq=cafe
/biz/robust-coffee-lounge-chicago?osq=cafe
/biz/harper-cafe-chicago?osq=cafe
/biz/build-coffee-chicago?osq=cafe
/biz/sanctuary-cafe-chicago?osq=cafe
/biz/greenline-coffee-chicago?osq=cafe
/biz/grounds-of-being-the-divinity-school-coffee-shop-chicago?osq=cafe
/biz/caf%C3%A9-logan-chicago?osq=cafe
/biz/cafe-53

In [None]:
'''
    Going to have to search by zip code.
    There are 42000 zip codes in US.
    How do we find the number of cafes in Yelp?
    Take a random sample (say 30) of zip codes.
    Search Yelp through the Zip Code.
    Find average number of businesses. 
    Multiply that by 42000
'''

'''
import random
for x in range(30):
  print (random.randint(10000,99999)) 
'''

# Cafe counts used for the estimate -- presumably one Yelp search result
# count per sampled zip code (TODO confirm; there are 21 values here).
num_cafes = [
    21, 10, 69, 33, 21, 55, 18,
    39, 16, 47, 16, 24, 16, 37,
    28, 26, 8, 37, 22, 5, 81,
]

# Mean count per sampled search: 29.952380952380953
average_num_Cafe = sum(num_cafes) / len(num_cafes)

# NOTE(review): the planning notes above say 42000 zip codes, but 43000 is
# the multiplier used here -- confirm which one is intended.
# Estimated total: 1287952
num_cafe_in_US = average_num_Cafe * 43000

'''
    About 1.945 seconds per visited webpage (10 businesses per webpage)
    Then about 69.6 hours for visiting all webpages and collecting websites.
    
    About 7.5 seconds per taking all review pages of a single business.
    About 3.3 seconds to take all additional info.
    10.8 seconds total.
    So then 161 days.
    
'''
print (num_cafe_in_US)

In [None]:
"""
    Scraping an individual business:
    Information that I want:
        Address
        Pricing
        Health Score
        Extra info on the right
        Reviews
"""
def getBasicInfo(url):
    """
        Given url of one business,
        gather all basic info except the reviews.

        Collected entries:
            'Address'      -- text of the last <address> element
            'Price Range'  -- text of the first <dd class="nowrap">, if any
            'Health Score' -- text of the second <dd class="nowrap">, if any
            one entry per <dl> inside <div class="short-def-list">
            (if that div exists), keyed by the <dt> text with the
            <dd> text as the value.

        Keys and values are stored exactly as found in the page,
        surrounding whitespace included.

        Raises IndexError if the page has no <address> element; errors
        raised by urlopen propagate unchanged.

        Returns a dictionary.
    """
    dict_of_info = {}
    # The with-block closes the HTTP response once the page is parsed.
    with urlopen(url) as html:
        bs = BeautifulSoup(html.read(), 'lxml')
    dict_of_info['Address'] = bs.find_all('address')[-1].text
    price_and_health = bs.find_all('dd', {'class': 'nowrap'})
    # zip() stops at the shorter sequence, so an entry that is not on the
    # page is simply left out of the result.
    for label, node in zip(('Price Range', 'Health Score'), price_and_health):
        dict_of_info[label] = node.text
    more_biz_info = bs.find('div', {'class': "short-def-list"})
    if more_biz_info is not None:
        for info in more_biz_info.find_all('dl'):
            dict_of_info[info.find('dt').text] = info.find('dd').text
    return dict_of_info

start_time = time.time()
greenline_coffee = """https://www.yelp.com/biz/greenline-coffee-chicago"""
greenline = getBasicInfo(greenline_coffee)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def clear_white_space(i_dict):
    """
        Strip leading and trailing whitespace from every key and every
        string value of i_dict.

        The dictionary is modified in place and the same object is
        returned. The cleaned entries are built in a separate dict first:
        popping and re-inserting keys while iterating over the same dict
        raises RuntimeError or visits keys unpredictably, which is why a
        single pass used to be unreliable. One call is now enough, and
        calling it again on the result changes nothing.

        If two keys become equal after stripping, the later one wins.
        Values that are not strings are kept unchanged.

        Returns i_dict.
    """
    cleaned = {}
    for key, value in i_dict.items():
        if isinstance(value, str):
            value = value.strip()
        cleaned[key.strip()] = value
    i_dict.clear()
    i_dict.update(cleaned)
    return i_dict

greenline = clear_white_space(clear_white_space (greenline)) # clear it twice
greenline

In [None]:
"""
    Now to get the reviews:
    Want:
        star number
        date
        content
"""
def get_reviews_in_single_page(bs):
    """
        Pull the star rating, date and comment text of each review out of
        one parsed review page.

        `bs` is the parsed page. Reviews are read from the
        "review-wrapper" divs inside the
        <ul class="ylist ylist-bordered reviews"> element.

        Returns a df with the columns 'Star Ratings', 'Date' and
        'Comment', one row per review.
    """
    star_class = re.compile('i-stars i-stars--regular-.*')

    review_list = bs.find('ul', {'class': "ylist ylist-bordered reviews"})
    wrappers = review_list.find_all('div', {'class': "review-wrapper"})

    # The first wrapper is skipped -- presumably it is not an actual
    # review (TODO confirm against the page markup).
    rows = [
        (
            # alt text of the star image, e.g. the number of stars
            wrapper.find('div', {'class': star_class}).img.attrs['alt'],
            # review date
            wrapper.find('span', {'class': "rating-qualifier"}).text,
            # review body
            wrapper.find('p', {'lang': "en"}).text,
        )
        for wrapper in wrappers[1:]
    ]

    return pd.DataFrame({
        'Star Ratings': [stars for stars, _, _ in rows],
        'Date': [date for _, date, _ in rows],
        'Comment': [comment for _, _, comment in rows],
    })

In [None]:
def get_all_reviews(url, page_limit):
    """
        Collect the reviews of one business across several review pages.

        Page 0 is `url` itself; page k (k >= 1) is `url` with a
        `start=<k * 20>` query parameter added (20 reviews per page).
        The parameter is joined with '&' when `url` already contains a
        query string (e.g. "...?osq=cafe") and with '?' otherwise, so
        pagination also works for urls copied from search results.

        Each page is parsed and handed to get_reviews_in_single_page;
        the per-page frames are concatenated once at the end and rows
        with a duplicate 'Comment' are dropped.

        Parameters:
            url        -- url of the business page
            page_limit -- index of the last page to fetch (inclusive)

        Returns a df of reviews; an empty df if page_limit is negative.
    """
    separator = '&' if '?' in url else '?'
    page_frames = []
    page_num = 0
    review_url = url
    while page_num <= page_limit:
        # The with-block closes the HTTP response once the page is parsed.
        with urlopen(review_url) as html:
            bs = BeautifulSoup(html.read(), 'lxml')
        page_frames.append(get_reviews_in_single_page(bs))
        page_num += 1
        review_url = url + separator + 'start=' + str(page_num * 20)  # 20 reviews in each page
    if not page_frames:
        return pd.DataFrame()
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    review_df = pd.concat(page_frames)
    return review_df.drop_duplicates(['Comment'])

start_time = time.time()
greenline_coffee_reviews = get_all_reviews(greenline_coffee, 2)
print("--- %s seconds ---" % (time.time() - start_time))
greenline_coffee_reviews

In [None]:
# Test code on a different business
sanc_url = """https://www.yelp.com/biz/sanctuary-cafe-chicago?osq=cafe"""
sanctuary_cafe = getBasicInfo(sanc_url)
sanctuary_cafe = clear_white_space(clear_white_space(sanctuary_cafe))
sanctuary_cafe

In [None]:
sanctuary_cafe_reviews = get_all_reviews(sanc_url, 2)
sanctuary_cafe_reviews

In [None]:
greenline_coffee_reviews.to_csv('Greenline Coffee Reviews.csv')