In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [None]:
'''
    Go to starting page.
    Gather all the links in that one page.
    Go to next page.
    Repeat.
    Stop when the page does not exist. 
        Problem: the page exists even when it shouldn't
        Example: searching &start=1000 seems to produce the first page
        Solution: need to create a numpy array of links,
            append the new links if the new links are not found
            stop if the links are found
        Current Solution (going to change this): 
            Specify a limit
'''

In [10]:
def gather_Yelp_data(url, cap_num):
    """
        Given a starting url and a page limit,
        gather all the links to businesses.
        Go to the next page.
        Gather all links.
        Repeat a specified number of times.
        Return a numpy array of all links gathered.
    """
    item_number = 1
    websites = np.array([])
    while item_number <= cap_num:
        try:
            html = urlopen(url + "&start=" + str(item_number*10))
            bs = BeautifulSoup(html.read(), 'lxml')
            links = bs.findAll('a', {'class': "biz-name js-analytics-click"})
            for link in links:
                link_string = link.attrs['href']
                websites = np.append(websites, np.array([link_string]))
        except:
            pass
        item_number += 1
    return websites
starturl = """https://www.yelp.com/search?find_desc=cafe&find_loc=Chicago,+IL+60637"""
sites = gather_Yelp_data(starturl, 3)
sites

array(['/adredir?ad_business_id=TxXC31nSy249Oe-AJKagTQ&campaign_id=bfAMUIFasowO9I-Qp6u3EA&click_origin=search_results&placement=above_search&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fzanzibar-cafe-chicago&request_id=456b97e9f0fa7815&signature=d79026743d6a2120d924a7e42fe4853160d7286ad27249e2cc30f211c5938151&slot=0',
       '/biz/sip-and-savor-chicago?osq=cafe',
       '/biz/ex-libris-chicago?osq=cafe',
       '/biz/pret-a-manger-chicago-17?osq=cafe',
       '/biz/teamo-boba-bar-chicago-3?osq=cafe',
       '/biz/truenorth-cafe-hyde-park-chicago-2?osq=cafe',
       '/biz/chef-saras-cafe-chicago?osq=cafe',
       '/biz/kusanya-cafe-chicago?osq=cafe',
       '/biz/bridgeport-coffee-chicago-2?osq=cafe',
       '/biz/midway-market-chicago?osq=cafe',
       '/biz/sip-and-savor-chicago-3?osq=cafe',
       '/adredir?ad_business_id=TxXC31nSy249Oe-AJKagTQ&campaign_id=bfAMUIFasowO9I-Qp6u3EA&click_origin=search_results&placement=above_search&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fzanz

In [15]:
"""
    Scraping an individual business:
    Information that I want:
        Address
        Pricing
        Health Score
        Extra info on the right
        Reviews
"""
def getBasicInfo(url):
    """
        Given url of one business,
        gather all basic info except the reviews.
        Returns a dictionary.
    """
    dict_of_info = {}
    html = urlopen(url)
    bs = BeautifulSoup(html.read(), 'lxml')
    dict_of_info['Address'] = bs.find_all('address')[-1].text
    price_and_health = bs.find_all('dd', {'class': 'nowrap'})
    dict_of_info['Price Range'] = price_and_health[0].text
    try:
        dict_of_info['Health Score'] = price_and_health[1].text
    except:
        pass
    more_biz_info = bs.find('div', {'class': "short-def-list"}).find_all('dl',)
    for info in more_biz_info:
        dict_of_info[info.find('dt').text] = info.find('dd').text
    return dict_of_info

greenline_coffee = """https://www.yelp.com/biz/greenline-coffee-chicago"""
greenline = getBasicInfo(greenline_coffee)

In [17]:
def clear_white_space(i_dict):
    for key in i_dict.keys():
        i_dict[key.strip()] = i_dict.pop(key)
    for key in i_dict.keys():
        i_dict[key] = i_dict[key].strip()
    return i_dict

greenline = clear_white_space(clear_white_space (greenline)) # clear it twice
greenline

{'Accepts Apple Pay': 'No',
 'Accepts Credit Cards': 'Yes',
 'Accepts Google Pay': 'No',
 'Address': '501 E 61st StChicago, IL 60637',
 'Bike Parking': 'Yes',
 'Caters': 'No',
 'Gender Neutral Restrooms': 'Yes',
 'Good for Working': 'Yes',
 'Health Score': '95 out of 100',
 'Outdoor Seating': 'Yes',
 'Parking': 'Street, Private Lot',
 'Price Range': 'Inexpensive',
 'Take-out': 'Yes',
 'Wheelchair Accessible': 'Yes',
 'Wi-Fi': 'Free'}

In [19]:
"""
    Now to get the reviews:
    Want:
        star number
        date
        content
"""
def get_reviews_in_single_page(bs):
    """
        Given a parsed url,
        get all review info in a single page.
        Returns a df.
    """
    review_dict = {}
    # this gets the box of info of Yelp review
    review_form = bs.find(
        'ul', {'class': "ylist ylist-bordered reviews"}
        ).find_all(
        'div', {'class': "review-wrapper"})

    rating_list = []
    date_list = []
    comment_list = []
    for review in review_form[1:]:
        # gets number of stars
        rating_list.append(
            review.find('div', {'class': re.compile('i-stars i-stars--regular-.*')}).img.attrs['alt'])
        # gets dates
        date_list.append(
            review.find('span', {'class': "rating-qualifier"}).text)
        # gets comments
        comment_list.append(
            review.find('p', {'lang': "en"}).text)
    review_dict['Star Ratings'] = rating_list
    review_dict['Date'] = date_list
    review_dict['Comment'] = comment_list   
    return pd.DataFrame(review_dict)


In [25]:
def get_all_reviews(url, page_limit):
    page_num = 0
    review_df = pd.DataFrame()
    review_url = url
    while page_num <= page_limit:
        html = urlopen(review_url)
        bs = BeautifulSoup(html.read(), 'lxml')
        review_df = review_df.append(get_reviews_in_single_page(bs))
        page_num += 1
        review_url = url + '?start=' + str(page_num * 20)  # 20 reviews in each page
    review_df = review_df.drop_duplicates(['Comment'])  # .reset_index().drop('index',axis=1)
    return review_df

greenline_coffee_reviews = get_all_reviews(greenline_coffee, 2)
greenline_coffee_reviews

Unnamed: 0,Comment,Date,Star Ratings
0,Excited to find this chill space! Great friend...,\n 7/14/2018\n,5.0 star rating
1,This place is AWESOME!I was so hungry I gobble...,\n 9/8/2017\n,5.0 star rating
2,I love this place and what it's trying to do f...,\n 4/13/2018\n,5.0 star rating
3,This winter season has been very mild in compa...,\n 2/23/2016\n,4.0 star rating
4,"Very good coffee. Affordable and local, cant b...",\n 1/8/2018\n,5.0 star rating
5,I really enjoy bringing my work and sipping an...,\n 6/22/2017\n,5.0 star rating
6,"An oasis in the dessert! This is a nice, clean...",\n 5/19/2018\n,5.0 star rating
7,Pros: Very friendly ambiance especially if you...,\n 3/10/2016\n,5.0 star rating
8,This place is a great local coffee shop locate...,\n 8/9/2017\n,4.0 star rating
9,My coffee (caramel macchiato)tasted exactly th...,\n 8/18/2017\n,1.0 star rating


In [26]:
# Test code on a different business
sanc_url = """https://www.yelp.com/biz/sanctuary-cafe-chicago?osq=cafe"""
sanctuary_cafe = getBasicInfo(sanc_url)
sanctuary_cafe = clear_white_space(clear_white_space(sanctuary_cafe))
sanctuary_cafe

{'Accepts Apple Pay': 'Yes',
 'Accepts Bitcoin': 'No',
 'Accepts Credit Cards': 'Yes',
 'Accepts Google Pay': 'No',
 'Address': '5655 S University AveChicago, IL 60637',
 'Alcohol': 'No',
 'Ambience': 'Casual',
 'Attire': 'Casual',
 'Bike Parking': 'Yes',
 'Cannabis Products': 'No',
 'Caters': 'Yes',
 'Delivery': 'Yes',
 'Dogs Allowed': 'No',
 'Gender Neutral Restrooms': 'Yes',
 'Good For': 'Dessert',
 'Good for Groups': 'Yes',
 'Good for Kids': 'Yes',
 'Good for Working': 'Yes',
 'Has TV': 'No',
 'Noise Level': 'Quiet',
 'Outdoor Seating': 'No',
 'Parking': 'Street',
 'Price Range': 'Under $10',
 'Take-out': 'Yes',
 'Takes Reservations': 'No',
 'Waiter Service': 'No',
 'Wheelchair Accessible': 'Yes',
 'Wi-Fi': 'Free'}

In [27]:
sanctuary_cafe_reviews = get_all_reviews(sanc_url, 2)
sanctuary_cafe_reviews

Unnamed: 0,Comment,Date,Star Ratings
0,Cinnamon roll and cakes are incredible! Their ...,\n 10/10/2018\n,5.0 star rating
1,Popped in here for a quick cup of coffee while...,\n 4/24/2018\n,3.0 star rating
2,A fun cafe with a nice vibe and the perfect pl...,\n 9/28/2018\n,5.0 star rating
3,Love this place and what they stand for. Tried...,\n 9/9/2018\n,5.0 star rating
4,"While looking for an online menu, I found info...",\n 8/16/2017\n,5.0 star rating
5,I was expecting another cafe similar to Fabian...,\n 7/24/2017\n,3.0 star rating
6,"I love this space! Light, airy, clean and a gr...",\n 9/15/2017\n,5.0 star rating
7,"I was so excited to finally try this cafe, as ...",\n 5/8/2018\n,2.0 star rating
8,Sanctuary Cafe is a lovely place to study or g...,\n 1/16/2018\n,5.0 star rating
9,Love this place. Good selection of savory and...,\n 3/6/2018\n,5.0 star rating
