### Extracting the menu

In [None]:
import re
import urllib.request
from bs4 import BeautifulSoup

In [99]:
class Restaurant:
    """
    Represents a restaurant together with its whole menu.

    Used for scraping a restaurant page before converting everything to csv.
    Call ``scrape()`` once, then iterate ``iter_menu()`` to get one row per
    menu item.
    """

    # Single-pass character mapping used to turn a scraped price / rating
    # such as "12,50\xa0€" into something ``float`` accepts ("12.50").
    _NUMBER_TRANSLATION = str.maketrans({"€": None, "\xa0": None, ",": ".", " ": None})

    def __init__(self, url):
        """Remember the page ``url``; nothing is downloaded until ``scrape()``."""
        self.url = url

    def scrape(self):
        """Download ``self.url``, parse it and fill in all scraped attributes."""
        # Use the instance's own url (not a module-level global) and make sure
        # the HTTP response is closed once its body has been read.
        with urllib.request.urlopen(self.url) as response:
            html = response.read()
        self.soup = BeautifulSoup(html, "lxml")
        self.scrape_menu()
        self.scrape_info()
        self.clean_scraped()

    def scrape_menu(self):
        """Collect the raw elements holding menu item titles and prices."""
        self.menu_items = self.scrape_all_class("cardCategory-itemTitle")
        self.menu_prices = self.scrape_all_class("cardCategory-itemPrice")

    def scrape_info(self):
        """Collect the raw elements describing the restaurant itself."""
        self.avg_price = self.scrape_all_class("pull-left restaurantSummary-price", True)
        self.tags = self.scrape_all_class("restaurantTag")
        self.location = self.scrape_all_class("restaurantSummary-address", True)
        self.restaurant_name = self.scrape_all_class("restaurantSummary-name", True)
        self.glob_rating = self.scrape_all_class("rating-ratingValue", True)

    def scrape_all_class(self, name, first=False):
        """
        Look up elements of ``self.soup`` by CSS class ``name``.

        Returns the result of ``find`` (a single match) when ``first`` is
        true, otherwise the result of ``find_all`` (every match).
        """
        if first:
            return self.soup.find(class_=name)
        return self.soup.find_all(class_=name)

    @staticmethod
    def _clean_text(element):
        """Return the stripped ``.text`` of a scraped element."""
        return element.text.strip()

    @classmethod
    def _as_number(cls, text):
        """Convert a price-like string (currency sign, decimal comma) to float."""
        return float(text.translate(cls._NUMBER_TRANSLATION))

    def clean_scraped(self):
        """
        Replace the raw scraped elements with plain strings and floats.

        Sequences are materialized as lists (not lazy ``map`` iterators) so
        they can be read more than once, e.g. the tags for every menu row.
        """
        self.menu_items = [self._clean_text(item) for item in self.menu_items]
        self.tags = [self._clean_text(tag) for tag in self.tags]
        self.location = self._clean_text(self.location)
        self.restaurant_name = self._clean_text(self.restaurant_name)
        self.menu_prices = [self._as_number(self._clean_text(price))
                            for price in self.menu_prices]
        # Only the last 8 characters are parsed; presumably the element text
        # starts with a label before the amount -- verify against the page.
        self.avg_price = self._as_number(self._clean_text(self.avg_price)[-8:])
        self.glob_rating = self._as_number(self._clean_text(self.glob_rating))

    def iter_menu(self):
        """
        Yield one tuple per menu item:
        (restaurant_name, item, price, url, glob_rating, avg_price,
        location, tags), where tags is a fresh list for every row.
        """
        for item, price in zip(self.menu_items, self.menu_prices):
            yield (self.restaurant_name, item, price, self.url, self.glob_rating, self.avg_price,
                   self.location, list(self.tags))

#### Example

In [66]:
# Example: scrape a single restaurant page.
# NOTE: rest.scrape() performs a live HTTP request when this cell runs.
url = "https://www.lafourchette.com/restaurant/le-23-clauzel-julie-riviere/6999"
rest = Restaurant(url)
rest.scrape()

### Extracting the list of restaurants?

In [97]:
# FIXME: buggy, see fb chat.
# Still needs to implement (waiting through a wait page before downloading):
# https://stackoverflow.com/questions/37953182/how-do-i-wait-through-a-wait-page-and-then-download-a-pdf-using-python

def find_restaurants(pages=range(1, 255)):
    """
    Collect restaurant page urls from the Paris search result pages.

    Parameters:
        pages: iterable of search page numbers to fetch
               (defaults to pages 1 through 254).

    Returns:
        A list of absolute restaurant urls, in the order they were found.
        Duplicates are not removed.
    """
    # Local import so this function does not depend on the file's import cell.
    from urllib.parse import urljoin

    base_url = "https://www.lafourchette.com/"
    # NOTE(review): everything after '#' is a URL fragment, which is not sent
    # to the server, so each request below asks for the same document
    # regardless of the page number. The real paging mechanism still has to
    # be worked out.
    search_url = base_url + "restaurant+paris#sort=QUALITY_DESC&page={}"
    pattern = re.compile(r"restaurant/[a-z-]+/[0-9]+$")
    restaurants = []
    for i in pages:
        if i % 10 == 0:
            print("Handled {} pages, have {} restaurant urls".format(i, len(restaurants)))
        search_page = search_url.format(i)
        # Close the HTTP response as soon as its body has been read.
        with urllib.request.urlopen(search_page) as response:
            soup = BeautifulSoup(response.read(), "lxml")
        # urljoin copes with relative, root-relative ("/restaurant/...") and
        # absolute hrefs; plain concatenation produced "//" or garbage urls.
        restaurants.extend(urljoin(base_url, link["href"])
                           for link in soup.find_all(href=pattern))
    return restaurants


# restaurants = find_restaurants()

KeyboardInterrupt: 