### Extracting the menu

In [4]:
import re
import urllib.request
from bs4 import BeautifulSoup

#selenium imports
#install geckodriver first and make sure you have selenium installed and firefox is a version > 53.0
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [5]:
class Restaurant:
    """
    Represents a restaurant with all the menu.
    Used for scraping before covnerting everything to csv.
    """
    def __init__(self, url):
        self.url = url
        
    def scrape(self):
        self.soup = BeautifulSoup(urllib.request.urlopen(url).read(), "lxml")
        self.scrape_menu()
        self.scrape_info()
        self.clean_scraped()
        
        
    def scrape_menu(self):
        self.menu_items, self.menu_prices = map(self.scrape_all_class, 
                                                ["cardCategory-itemTitle","cardCategory-itemPrice"])

    def scrape_info(self):
        self.avg_price = self.scrape_all_class("pull-left restaurantSummary-price", True)
        self.tags = self.scrape_all_class("restaurantTag")
        self.location = self.scrape_all_class("restaurantSummary-address", True)
        self.restaurant_name = self.scrape_all_class("restaurantSummary-name", True)
        self.glob_rating = self.scrape_all_class("rating-ratingValue", True)
        
    def scrape_all_class(self, name, first=False):
        if first:
            return self.soup.find(class_=name)
        else:
            return self.soup.find_all(class_=name)
        
    def clean_scraped(self):
        def get_clean_text(string):
            return string.text.strip()
        
        def prices_as_numbers(price):
            return float(replace_dict(price, {"€": "", "\xa0": "", ",": ".", " ": ""}))
        
        def replace_dict(text, changes):
            for old, new in changes.items():
                text = text.replace(old, new)
            return text
        
        self.menu_items = map(get_clean_text, self.menu_items,)
        self.tags = map(get_clean_text, self.tags,)
        self.location = get_clean_text(self.location)
        self.restaurant_name = get_clean_text(self.restaurant_name)
        self.menu_prices = map(get_clean_text, self.menu_prices)
        self.menu_prices = map(prices_as_numbers, self.menu_prices)
        self.avg_price = prices_as_numbers(get_clean_text(self.avg_price)[-8:])
        self.glob_rating = prices_as_numbers(get_clean_text(self.glob_rating))
    
    def iter_menu(self):
        for item, price in zip(self.menu_items, self.menu_prices):
            yield (self.restaurant_name, item, price, self.url, self.glob_rating, self.avg_price,
                   self.location, list(self.tags))

#### Example

In [6]:
url = "https://www.lafourchette.com/restaurant/le-23-clauzel-julie-riviere/6999"
rest = Restaurant(url)
rest.scrape()
rest.restaurant_name

'Le 23 Clauzel - Julie Rivière -'

### Extracting the list of restaurants?

In [7]:
#introducting the functions for wait times
def wait(dr, x, t,i):
    element = WebDriverWait(dr, t).until(EC.text_to_be_present_in_element((By.XPATH, x),i))
    return element

In [8]:
######## check code to make sure the driver works, don't run it each time##########
#dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
dr = webdriver.Firefox()
dr.get("https://www.lafourchette.com/restaurant+paris#sort=QUALITY_DESC&page=105")
wait(dr,"//li[@class='active']",10,"105")

#restaurants

True

In [33]:
####### check code to see if it works with beautifulsoup::::: NOT parsing all the strings but still does a good work###########
soup = BeautifulSoup(dr.page_source,'html.parser')
base_url = "https://www.lafourchette.com"
pattern = re.compile("/restaurant/[a-z-]+/[0-9]+$")
restaurants = []
restaurants.extend(base_url + rest["href"] for rest in soup.find_all(href=pattern))

In [18]:
with open("temp_page_source.html","w") as f:
    f.write(dr.page_source)

In [34]:
len(list(set(restaurants)))

23

In [9]:
#Buggy: see fb chat 
# Need to implement: 
# https://stackoverflow.com/questions/37953182/how-do-i-wait-through-a-wait-page-and-then-download-a-pdf-using-python

###### FIXED the find_restaurants #########3


def find_restaurants():
    
    base_url_search = "https://www.lafourchette.com"
    base_url = "https://www.lafourchette.com/"
    search_url = base_url + "restaurant+paris#sort=QUALITY_DESC&page={}"
    pattern = re.compile("restaurant/[a-z-]+/[0-9]+$")
    restaurants = []
    for i in range(1, 255):
        dr = webdriver.Firefox()
        if i % 10 == 0:
            print("Handled {} pages, have {} restaurant urls".format(i, len(restaurants)))
        search_page = search_url.format(i)
        dr.get(search_page)
        wait(dr,"//li[@class='active']",10,str(i))
        soup = BeautifulSoup(dr.page_source, "lxml")
        temp = []
        temp.extend(base_url_search + rest["href"] for rest in soup.find_all(href=pattern))
        temp = list(set(temp))
        #restaurants.extend(base_url + rest["href"] for rest in soup.find_all(href=pattern))
        restaurants.extend(temp)
        dr.quit()
    
    return restaurants


In [10]:
### Will take a lot of time to complete ~1-2 hrs depending on the internet speed. Couldn't get phantomJS to work 
#### or would  have been faster ####

restaurants = find_restaurants()

len(restaurants)

Handled 10 pages, have 218 restaurant urls
Handled 20 pages, have 456 restaurant urls
Handled 30 pages, have 694 restaurant urls
Handled 40 pages, have 930 restaurant urls
Handled 50 pages, have 1161 restaurant urls
Handled 60 pages, have 1399 restaurant urls
Handled 70 pages, have 1636 restaurant urls
Handled 80 pages, have 1875 restaurant urls
Handled 90 pages, have 2117 restaurant urls
Handled 100 pages, have 2358 restaurant urls
Handled 110 pages, have 2600 restaurant urls
Handled 120 pages, have 2841 restaurant urls


WebDriverException: Message: Reached error page: about:neterror?e=dnsNotFound&u=https%3A//www.lafourchette.com/restaurant+paris%23sort%3DQUALITY_DESC%26page%3D122&c=UTF-8&f=regular&d=Firefox%20can%E2%80%99t%20find%20the%20server%20at%20www.lafourchette.com.


In [44]:
len(set(restaurants))

5135

In [2]:
restaurants[0:10]

NameError: name 'restaurants' is not defined

In [1]:
''' object rest contains all the restaurants which were able to be parsed. Could be made more robust to introduce
    the try and catch in the class restaurant to make sure the ones which have the important urls are kept
'''
obj_rests = []
for url in restaurants:
    try:
        rest = Restaurant(url)
        rest.scrape()
        obj_rests.append(rest)
        print('Succesfully parsed' , rest.restaurant_name)
    except:
        print('Not able to parse' , rest.restaurant_name)


NameError: name 'restaurants' is not defined