## Yelp Web Scraping Demo

In this notebook, we scrape all the info needed from a page of yelp for our improved search engine

In [54]:
from itertools import cycle
from lxml.html import fromstring
import random
import re
import time
import traceback
from urllib.parse import urljoin,urlparse

from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [85]:
reviews_df = pd.DataFrame()
user_df = pd.DataFrame()
resturant_df = pd.DataFrame()

resturants_columns = ['resturant_id','resturant','resturant_rating','cuisine','resturant_review_count']
reviews_columns = ['resturant_id','user_id','description','rating','date','useful','funny','cool']
users_columns = ['user_id','name','user_review_count']

In [15]:
def get_proxies():
    options = Options()
    options.headless = True

    webdriver_name = 'geckodriver'
    driver = webdriver.Firefox(executable_path = 'webdriver/geckodriver',   options=options)

    url = 'https://free-proxy-list.net/'
    driver.get(url)

    https_button = driver.find_element_by_xpath('/html/body/section[1]/div/div[2]/div/div[2]/div/table/thead/tr/th[7]')
    https_button.click()
    https_button.click()

    soup = BeautifulSoup(driver.page_source,'html.parser')
    possible_proxies = soup.find_all('tbody')[0].find_all('tr')
    proxies = set()
    for possible_proxy in possible_proxies:
        possible_proxy_info = possible_proxy.find_all('td')
        if possible_proxy_info[6].text == 'yes':
            proxy = possible_proxy_info[0].text + ":" + possible_proxy_info[1].text
            proxies.add(proxy)

    driver.close()
    return(proxies)

In [30]:
def get_connection(url):
    proxy = next(proxy_pool)
    flag = 0
    while not flag:
        try:
            r = requests.get(url,proxies={"http": proxy, "https": proxy})
            flag = 1
        except:
            print(r.json())
            print("Skipping. Connnection error")

    return(r)

In [56]:
def review_scrapper(review):
    name = review.find('a',{'class':"user-display-name"}).text
    user_id = review.find('a',{'class':"js-analytics-click"}).get('href').split('=')[1]    
    review_count = int(review.find('li',{'class':"review-count responsive-small-display-inline-block"}).find('b').text)
    
    review_text = review.find("p").text
    rating = float(review.find('div',{'class':"i-stars"}).get('title')[0:3])
    review_date = review.find('span',{'class':"rating-qualifier"}).text.strip().split("\n")[0]
    
    vote_buttons = review.find_all('span',{'class':"count"})[0:3]
    votes = []
    for button in vote_buttons:
        if button.text == '':
            votes.append(0)
        else:
            votes.append(int(button.text))
    
    user_lst = [user_id,name,review_count]

    review_lst = [user_id,review_text,rating,review_date]
    review_lst.extend(votes)
    return(user_lst,review_lst)

In [77]:
def resturant_scrapper(url):
    r =  requests.get(url)
    soup = BeautifulSoup(r.content,'html.parser')
    
    resturant_id = url.split("biz/")[1]

    resturant_name_lst = soup.find_all('h1')
    resturant = ''
    for resturant_name in resturant_name_lst:
        resturant += resturant_name.text + " "
    resturant = resturant.strip()
    
    resturant_rating = float(soup.find('div',{'class':'i-stars'}).get('title')[0:3])

    reviews_count = int(soup.find("span",{"class":"review-count rating-qualifier"}).text.strip().split(' ')[0])
    
    cuisine_lst = soup.find("span",{"class":"category-str-list"}).find_all('a')
    cuisines = [cuisine.text for cuisine in cuisine_lst]
    resturant_lst = [resturant_id,resturant,resturant_rating,cuisines,reviews_count]
    resturants_df = pd.DataFrame([resturant_lst])

    reviews_df = pd.DataFrame()
    users_df = pd.DataFrame()
    while True:
        reviews = soup.find_all('div',{'class':'review review--with-sidebar'})
        for review_input in reviews:
            user_lst,review_lst = review_scrapper(review_input)

            review_data = [resturant_id]
            review_data.extend(review_lst)
            
            review_df = pd.DataFrame([review_data])
            reviews_df = pd.concat([reviews_df,review_df],axis = 0)
            
            user_df = pd.DataFrame([user_lst])
            users_df = pd.concat([users_df,user_df],axis = 0)
            
        next_page = soup.find('a',{'class':'u-decoration-none next pagination-links_anchor'})
        if next_page is None:
            break

        url = next_page.get('href')
        time.sleep(4 + random.random())
        r =  requests.get(url)
        soup = BeautifulSoup(r.content,'html.parser')

    return(users_df,resturants_df,reviews_df)

In [78]:
def get_resturants(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content,'html.parser')
    
    resturants_to_search = []
    while True:
        resturant_tags = soup.find_all("a",{"class":"lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5"})
        resturants_to_add = set([urlparse(resturant_tag.get('href')).path for resturant_tag in resturant_tags])        
        resturants_to_search.extend(resturants_to_add)
        
        next_page = soup.find("a",{"class":"lemon--a__373c0__IEZFH link__373c0__29943 next-link navigation-button__373c0__1D3Ug link-color--blue-dark__373c0__1mhJo link-size--default__373c0__1skgq"})
        
        if next_page is None:
            break
            
        url = base_url + next_page.get('href')
        
        time.sleep(2+random.random())
        r =  requests.get(url)
        soup = BeautifulSoup(r.content,'html.parser')
    print("[INFO] Complete")
    print("[INFO] Resturant Count: %i" %len(resturants_to_search))
    
    return(resturants_to_search)

In [63]:
#proxies = get_proxies()
#proxy_pool = cycle(proxies)

base_url = 'http://www.yelp.com'
hoboken_yelp = base_url + "/search?find_desc=&find_loc=Hoboken"
resturant_urls = get_resturants(hoboken_yelp)

[INFO] Complete
[INFO] Resturant Count: 389


In [79]:
users = pd.DataFrame()
resturants = pd.DataFrame()
reviews = pd.DataFrame()
for resturant_url in resturant_urls:
    print(resturant_url)
    time.sleep(15+random.random())
    users_df,resturants_df,reviews_df = resturant_scrapper(base_url + resturant_url)
    users = pd.concat([users,users_df],axis = 0)
    resturants = pd.concat([resturants,resturants_df],axis = 0)
    reviews = pd.concat([reviews,reviews_df],axis = 0)

/biz/la-isla-restaurant-hoboken
/biz/mamouns-falafel-restaurant-hoboken
/biz/the-cuban-restaurant-and-bar-hoboken-2
/biz/fiore-deli-of-hoboken-hoboken
/biz/bw%C3%A8-kafe-hoboken
/biz/karma-kafe-hoboken
/biz/m-and-p-biancamano-hoboken
/biz/amandas-restaurant-hoboken-2
/biz/vitos-italian-deli-hoboken
/biz/anthony-davids-hoboken
/biz/tutta-pesca-hoboken
/biz/benny-tudinos-pizzeria-hoboken
/biz/choc-o-pain-hoboken-2
/biz/sweet-hoboken
/biz/empire-coffee-and-tea-company-hoboken
/biz/old-german-bakery-hoboken
/biz/satay-malaysian-cuisine-hoboken-2
/biz/elysian-cafe-hoboken
/biz/grimaldis-hoboken


AttributeError: 'NoneType' object has no attribute 'get'

In [86]:
users.columns = users_columns
resturants.columns = resturants_columns
reviews.columns = reviews_columns

In [90]:
users.to_csv('data/users.csv')
resturants.to_csv('data/resturants.csv')
reviews.to_csv('data/reviews.csv')