## Yelp Web Scraping Demo

In this notebook, we scrape the restaurant, review, and user information that our improved search engine needs from Yelp's search results and business pages for Hoboken.

In [1]:
from itertools import cycle
from lxml.html import fromstring
import os
import random
import re
import time
import traceback
from urllib.parse import urljoin,urlparse

from bs4 import BeautifulSoup
import numpy as np
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

In [4]:
# Column names for the three scraped tables. The order of each list mirrors the
# order of the values in the rows produced by the scraping functions.

# One row per business page.
resturants_columns = [
    'resturant_id',
    'resturant',
    'resturant_rating',
    'cuisine',
    'resturant_review_count',
    'phone',
    'address',
]

# One row per review; 'useful', 'funny' and 'cool' are the three vote counters.
reviews_columns = [
    'resturant_id',
    'user_id',
    'description',
    'rating',
    'date',
    'useful',
    'funny',
    'cool',
]

# One row per review author.
users_columns = [
    'user_id',
    'name',
    'user_review_count',
]

In [10]:
def get_connection(url):
    """Download ``url``, pausing for a manual captcha solve when one is detected.

    The page is fetched with ``requests`` and its first ``<h2>`` is inspected.
    If that heading contains the word "robot", a visible Firefox window is
    opened on the same URL, the notebook blocks until the user presses enter,
    and the page is requested once more.

    Parameters
    ----------
    url : str
        URL to download.

    Returns
    -------
    requests.Response
        The response of the last request that was made. It is returned as-is:
        neither the status code nor the content of the retried page is checked.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    h2_tag = soup.find('h2')
    if h2_tag is not None and "robot" in h2_tag.text:
        options = Options()
        # The browser has to be visible so a human can solve the captcha.
        options.headless = False

        driver = webdriver.Firefox(executable_path='../webdriver/geckodriver', options=options)
        try:
            driver.get(url)
            input('Hit enter when Captcha is passed')
        finally:
            # Always end the WebDriver session, even if loading the page fails
            # or the prompt is interrupted, so no browser/driver process is
            # left running behind the notebook.
            driver.quit()

        r = requests.get(url)
    return r

In [64]:
def review_scrapper(review):
    """Pull the author and review fields out of a single review element.

    Parameters
    ----------
    review
        Parsed element for one review (as returned by BeautifulSoup's
        ``find_all``); it is queried with ``find`` / ``find_all``.

    Returns
    -------
    tuple of (list, list)
        ``[user_id, name, review_count]`` for the author and
        ``[user_id, text, rating, date, vote, vote, vote]`` for the review,
        where the three votes are the first three "count" spans in document
        order (an empty counter is reported as 0).
    """
    # Author details.
    display_name = review.find('a', {'class': "user-display-name"}).text
    profile_href = review.find('a', {'class': "js-analytics-click"}).get('href')
    # The id is whatever sits between the first and second "=" of the link.
    reviewer_id = profile_href.split('=')[1]
    count_item = review.find('li', {'class': "review-count responsive-small-display-inline-block"})
    reviewer_review_count = int(count_item.find('b').text)

    # Review content.
    body = review.find("p").text
    stars_title = review.find('div', {'class': "i-stars"}).get('title')
    # Only the first three characters of the title are parsed as the rating.
    stars = float(stars_title[:3])
    qualifier_text = review.find('span', {'class': "rating-qualifier"}).text
    # Keep only the first line of the stripped text as the date.
    posted_on = qualifier_text.strip().split("\n")[0]

    vote_counts = [
        0 if counter.text == '' else int(counter.text)
        for counter in review.find_all('span', {'class': "count"})[:3]
    ]

    user_row = [reviewer_id, display_name, reviewer_review_count]
    review_row = [reviewer_id, body, stars, posted_on, *vote_counts]
    return user_row, review_row

In [137]:
def resturant_scrapper(url):
    """Scrape one business page together with every page of its reviews.

    Parameters
    ----------
    url : str
        Absolute URL of the business page. The text after ``"biz/"`` is used
        as the restaurant id.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(users_df, resturants_df, reviews_df)``, all with default integer
        column labels:

        * ``users_df``: one row per review author,
          ``[user_id, name, review_count]``;
        * ``resturants_df``: a single row,
          ``[id, name, rating, cuisines (list), review count, phone, address]``;
        * ``reviews_df``: one row per review,
          ``[resturant_id, user_id, text, rating, date, vote, vote, vote]``.

        ``users_df`` and ``reviews_df`` are empty frames when the page has no
        matching review elements.
    """
    r = get_connection(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    resturant_id = url.split("biz/")[1]

    # The name is the text of every <h1> on the page, joined with spaces.
    resturant = " ".join(h1.text for h1 in soup.find_all('h1')).strip()

    # Only the first three characters of the stars title are parsed as the rating.
    resturant_rating = float(soup.find('div', {'class': 'i-stars'}).get('title')[0:3])

    reviews_count = int(
        soup.find("span", {"class": "review-count rating-qualifier"}).text.strip().split(' ')[0]
    )

    phone_tag = soup.find('span', {'class': "biz-phone"})
    phone_number = '' if phone_tag is None else phone_tag.text.strip()

    # An address is only assembled when the <address> element holds exactly
    # four <span> parts; any other layout is recorded as an empty string.
    address_lst = soup.find('address').find_all('span')
    if len(address_lst) == 4:
        address = (
            address_lst[0].text + " " + address_lst[1].text + ", "
            + address_lst[2].text + " " + address_lst[3].text
        )
    else:
        address = ''

    cuisine_lst = soup.find("span", {"class": "category-str-list"}).find_all('a')
    cuisines = [cuisine.text for cuisine in cuisine_lst]
    resturant_lst = [resturant_id, resturant, resturant_rating, cuisines,
                     reviews_count, phone_number, address]
    resturants_df = pd.DataFrame([resturant_lst])

    # Collect plain rows and build each frame once at the end, instead of
    # concatenating a one-row frame per review (which copies the whole
    # accumulated frame every time).
    user_rows = []
    review_rows = []
    while True:
        for review_input in soup.find_all('div', {'class': 'review review--with-sidebar'}):
            user_lst, review_lst = review_scrapper(review_input)
            user_rows.append(user_lst)
            review_rows.append([resturant_id, *review_lst])

        next_page = soup.find('a', {'class': 'u-decoration-none next pagination-links_anchor'})
        if next_page is None:
            break

        # Resolve the link against the current page so both absolute and
        # site-relative hrefs produce a URL that can be requested.
        url = urljoin(url, next_page.get('href'))
        time.sleep(2 + random.random())
        r = get_connection(url)
        soup = BeautifulSoup(r.content, 'html.parser')

    users_df = pd.DataFrame(user_rows)
    reviews_df = pd.DataFrame(review_rows)
    return users_df, resturants_df, reviews_df

In [115]:
def get_resturants(url):
    """Walk every page of a search and collect the paths of the result links.

    Parameters
    ----------
    url : str
        Absolute URL of the first search-results page.

    Returns
    -------
    list of str
        URL path (query string and host removed) of every link carrying the
        result-link CSS classes, in the order first encountered. A path that
        appears on several pages is returned only once.
    """
    result_link_class = "lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5"
    next_link_class = "lemon--a__373c0__IEZFH link__373c0__29943 next-link navigation-button__373c0__1D3Ug link-color--blue-dark__373c0__1mhJo link-size--default__373c0__1skgq"

    r = get_connection(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    resturants_to_search = []
    seen_paths = set()
    while True:
        for resturant_tag in soup.find_all("a", {"class": result_link_class}):
            path = urlparse(resturant_tag.get('href')).path
            # De-duplicate across all pages while keeping page order, so the
            # same business is not queued for scraping more than once.
            if path not in seen_paths:
                seen_paths.add(path)
                resturants_to_search.append(path)

        next_page = soup.find("a", {"class": next_link_class})
        if next_page is None:
            break

        # Resolve the link against the page it was found on rather than a
        # notebook-level global, so the function works regardless of cell
        # execution order.
        url = urljoin(url, next_page.get('href'))

        time.sleep(1 + random.random())
        r = get_connection(url)
        soup = BeautifulSoup(r.content, 'html.parser')
    print("[INFO] Complete")
    print("[INFO] Resturant Count: %i" % len(resturants_to_search))

    return resturants_to_search

In [8]:
# Proxy rotation is currently switched off; the calls are kept for reference.
#proxies = get_proxies()
#proxy_pool = cycle(proxies)

# Site root; the paths collected below are relative to it.
base_url = 'http://www.yelp.com'
# Search-results page for Hoboken with an empty search term.
hoboken_yelp = urljoin(base_url, "/search?find_desc=&find_loc=Hoboken")
resturant_urls = get_resturants(hoboken_yelp)

[INFO] Complete
[INFO] Resturant Count: 390


In [98]:
# Start each of the three result tables as its own empty frame.
users, resturants, reviews = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [138]:
# Scrape every collected business page and append its rows to the three tables.
for resturant_url in resturant_urls:
    # Pause 15-16 seconds (randomised) before each business page.
    time.sleep(15 + random.random())
    new_users, new_resturant, new_reviews = resturant_scrapper(base_url + resturant_url)
    users = pd.concat([users, new_users])
    resturants = pd.concat([resturants, new_resturant])
    reviews = pd.concat([reviews, new_reviews])

/biz/live-more-adventures-hoboken-3
/biz/hudson-coffee-company-hoboken
/biz/cadillac-cantina-hoboken-2
/biz/spa-diner-hoboken-2
/biz/9-11-tribute-museum-new-york
/biz/city-of-saints-coffee-roasters-hoboken
/biz/chipotle-mexican-grill-hoboken
/biz/hudson-river-waterfront-walkway-hoboken
/biz/village-pourhouse-hoboken-5
/biz/gold-roast-cafe-hoboken-2
/biz/the-shannon-hoboken
/biz/the-ale-house-hoboken-2
/biz/the-winston-hoboken
/biz/prato-bakery-hoboken-2
/biz/the-little-grocery-hoboken-3
/biz/the-roost-outpost-hoboken-3
/biz/qdoba-mexican-eats-hoboken-2
/biz/corkscrew-bar-jersey-city
/biz/disos-italian-sandwich-society-new-york-2
/biz/birch-hoboken-hoboken
/biz/cluck-u-chicken-hoboken
/biz/tornas-pizzeria-hoboken
/biz/coco-havana-hoboken-4
/biz/the-nags-head-hoboken
/biz/madd-hatter-hoboken-3
/biz/insomnia-cookies-hoboken
/biz/rice-shop-hoboken
/biz/aroy-d-the-thai-elephant-truck-hoboken
/biz/johnny-rockets-hoboken
/biz/8th-street-tavern-hoboken
/biz/grubbs-take-away-hoboken
/biz/the-br

In [27]:
# Replace the default integer column labels with the declared column names.
users.columns, resturants.columns, reviews.columns = (
    users_columns,
    resturants_columns,
    reviews_columns,
)

In [267]:
# Remove exact duplicate rows, then keep only the first row seen for each user_id.
users = (
    users
    .drop_duplicates()
    .drop_duplicates(subset=['user_id'], keep='first')
)

In [271]:
# The `cuisine` column holds Python lists, which are unhashable, so a plain
# drop_duplicates() raises "TypeError: unhashable type: 'list'" on a freshly
# scraped frame. Compare rows on their string form instead; as before, only
# rows that repeat in every column are dropped and the first one is kept.
resturants = resturants.loc[~resturants.astype(str).duplicated()]

In [278]:
# Keep the first occurrence of every review row and discard exact repeats.
reviews = reviews.loc[~reviews.duplicated(keep='first')]

In [279]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing.
os.makedirs('../data', exist_ok=True)
users.to_csv('../data/users.csv', index=False)
reviews.to_csv('../data/reviews.csv', index=False)

In [34]:
# Fetch each business page again and record one image URL per restaurant.
#
# A separate name is used for the "/biz/" prefix so the site-root `base_url`
# that other cells rely on is not overwritten.
biz_base_url = 'http://www.yelp.com/biz/'

photo_urls = []
for resturant_id in tqdm(resturants['resturant_id']):
    r = get_connection(biz_base_url + resturant_id)
    soup = BeautifulSoup(r.content, 'html.parser')
    images = soup.find_all("img")
    # The fourth <img> on the page is taken as the photo (position-based;
    # presumably the main business photo - verify if the page layout changes).
    # Pages with fewer images get None instead of raising IndexError.
    photo_urls.append(images[3].get('src') if len(images) > 3 else None)

# Assign by position. Label-based `.loc[i, 'photo']` is wrong here: the frame
# built by concatenating one-row frames does not have labels 0..n-1, so it
# would overwrite several rows at once and append new, mostly empty rows.
resturants['photo'] = photo_urls

100%|██████████| 390/390 [26:00<00:00,  4.00s/it]


In [36]:
# Write the restaurant table (now including the photo column) without the index.
resturants.to_csv(path_or_buf='../data/resturants.csv', index=False)