### Extracting the menu

In [3]:
import re
import urllib.request
from bs4 import BeautifulSoup

#selenium imports
#install geckodriver first and make sure you have selenium installed and firefox is a version > 53.0
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
class Restaurant:
    """
    Represents a restaurant with all the menu.
    Used for scraping before covnerting everything to csv.
    """
    def __init__(self, url):
        self.url = url
        
    def scrape(self):
        self.soup = BeautifulSoup(urllib.request.urlopen(url).read(), "lxml")
        self.scrape_menu()
        self.scrape_info()
        self.clean_scraped()
        
        
    def scrape_menu(self):
        self.menu_items, self.menu_prices = map(self.scrape_all_class, 
                                                ["cardCategory-itemTitle","cardCategory-itemPrice"])

    def scrape_info(self):
        self.avg_price = self.scrape_all_class("pull-left restaurantSummary-price", True)
        self.tags = self.scrape_all_class("restaurantTag")
        self.location = self.scrape_all_class("restaurantSummary-address", True)
        self.restaurant_name = self.scrape_all_class("restaurantSummary-name", True)
        self.glob_rating = self.scrape_all_class("rating-ratingValue", True)
        
    def scrape_all_class(self, name, first=False):
        if first:
            return self.soup.find(class_=name)
        else:
            return self.soup.find_all(class_=name)
        
    def clean_scraped(self):
        def get_clean_text(string):
            return string.text.strip()
        
        def prices_as_numbers(price):
            return float(replace_dict(price, {"€": "", "\xa0": "", ",": ".", " ": ""}))
        
        def replace_dict(text, changes):
            for old, new in changes.items():
                text = text.replace(old, new)
            return text
        
        self.menu_items = map(get_clean_text, self.menu_items,)
        self.tags = map(get_clean_text, self.tags,)
        self.location = get_clean_text(self.location)
        self.restaurant_name = get_clean_text(self.restaurant_name)
        self.menu_prices = map(get_clean_text, self.menu_prices)
        self.menu_prices = map(prices_as_numbers, self.menu_prices)
        self.avg_price = prices_as_numbers(get_clean_text(self.avg_price)[-8:])
        self.glob_rating = prices_as_numbers(get_clean_text(self.glob_rating))
    
    def iter_menu(self):
        for item, price in zip(self.menu_items, self.menu_prices):
            yield (self.restaurant_name, item, price, self.url, self.glob_rating, self.avg_price,
                   self.location, list(self.tags))

#### Example

In [14]:
url = "https://www.lafourchette.com/restaurant/le-23-clauzel-julie-riviere/6999"
rest = Restaurant(url)
rest.scrape()


### Extracting the list of restaurants?

In [1]:
def wait(dr, x, t,i):
    element = WebDriverWait(dr, t).until(EC.text_to_be_present_in_element((By.XPATH, x),i))
    return element

In [8]:
dr = webdriver.Firefox()
dr.get("https://www.lafourchette.com/restaurant+paris#sort=QUALITY_DESC&page=105")
wait(dr,"//li[@class='active']",10,"105")
soup = BeautifulSoup(dr.page_source,"lxml")
base_url = "https://www.lafourchette.com/"
pattern = re.compile("restaurant/[a-z-]+/[0-9]+$")
restaurants = []
restaurants.extend(base_url + rest["href"] for rest in soup.find_all(href=pattern))
restaurants

['https://www.lafourchette.com//restaurant/la-charrette-creole/47259',
 'https://www.lafourchette.com//restaurant/la-charrette-creole/47259',
 'https://www.lafourchette.com//restaurant/les-domaines-qui-montent/265887',
 'https://www.lafourchette.com//restaurant/les-domaines-qui-montent/265887',
 'https://www.lafourchette.com//restaurant/dochilak-batignolles/314091',
 'https://www.lafourchette.com//restaurant/dochilak-batignolles/314091',
 'https://www.lafourchette.com//restaurant/le-safari/18559',
 'https://www.lafourchette.com//restaurant/le-safari/18559',
 'https://www.lafourchette.com//restaurant/le-cafe-de-mars/29923',
 'https://www.lafourchette.com//restaurant/le-cafe-de-mars/29923',
 'https://www.lafourchette.com//restaurant/laduree-paris-royale/5141',
 'https://www.lafourchette.com//restaurant/laduree-paris-royale/5141',
 'https://www.lafourchette.com//restaurant/laduree-paris-royale/5141',
 'https://www.lafourchette.com//restaurant/flora-danica/25988',
 'https://www.lafourchett

In [21]:
#Buggy: see fb chat 
# Need to implement: 
# https://stackoverflow.com/questions/37953182/how-do-i-wait-through-a-wait-page-and-then-download-a-pdf-using-python

#introducting the functions for wait times




def find_restaurants():
    
    #dr = webdriver.PhantomJS()
    base_url = "https://www.lafourchette.com/"
    search_url = base_url + "restaurant+paris#sort=QUALITY_DESC&page={}"
    pattern = re.compile("restaurant/[a-z-]+/[0-9]+$")
    restaurants = []
    for i in range(1, 4):
        if i % 1 == 0:
            print("Handled {} pages, have {} restaurant urls".format(i, len(restaurants)))
        search_page = search_url.format(i)
        soup = BeautifulSoup(urllib.request.urlopen(search_page).read(), "lxml")
        restaurants.extend(base_url + rest["href"] for rest in soup.find_all(href=pattern))
    return restaurants


In [23]:
restaurants = find_restaurants()

restaurants

Handled 1 pages, have 0 restaurant urls
Handled 2 pages, have 24 restaurant urls
Handled 3 pages, have 48 restaurant urls


['https://www.lafourchette.com//restaurant/nem-hanoi/336555',
 'https://www.lafourchette.com//restaurant/la-grande-cascade-frederic-robert/122',
 'https://www.lafourchette.com//restaurant/o-friendly/338159',
 'https://www.lafourchette.com//restaurant/aarchna/15964',
 'https://www.lafourchette.com//restaurant/carte-blanche/4616',
 'https://www.lafourchette.com//restaurant/copenhague/25985',
 'https://www.lafourchette.com//restaurant/ao-izakaya/323811',
 'https://www.lafourchette.com//restaurant/suraj/340043',
 'https://www.lafourchette.com//restaurant/hao-long/215199',
 'https://www.lafourchette.com//restaurant/bistrot-wattignies/330201',
 'https://www.lafourchette.com//restaurant/upper-cafe-les-halles/342631',
 'https://www.lafourchette.com//restaurant/les-rupins/295875',
 'https://www.lafourchette.com//restaurant/perbacco/313673',
 'https://www.lafourchette.com//restaurant/mayfair-garden/332645',
 'https://www.lafourchette.com//restaurant/chez-petit-hugo/335287',
 'https://www.lafourc