### Extracting the menu

In [3]:
import re
import csv
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
#selenium imports
#install geckodriver first and make sure you have selenium installed and firefox is a version > 53.0
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [11]:
class Restaurant:
    """
    Represents a restaurant with all the menu.
    Used for scraping before covnerting everything to csv.
    """
    def __init__(self, url):
        self.url = url
        
    def scrape(self):
        self.html = urllib.request.urlopen(url).read()
        self.soup = BeautifulSoup(self.html, "lxml")
        self.scrape_menu()
        self.scrape_info()
        self.clean_scrape()
        with open("restaurants_pages/{}.html".format(self.restaurant_name), "w") as htmlfile:
            htmlfile.write(str(self.html))
        # Cleanup to avoid using too much memo 
        del self.html
        del self.soup
        
        
    def scrape_menu(self):
        self.menu_items, self.menu_prices = map(self.scrape_all_class, 
                                                ["cardCategory-itemTitle","cardCategory-itemPrice"])

    def scrape_info(self):
        self.avg_price = self.scrape_all_class("pull-left restaurantSummary-price", True)
        self.tags = self.scrape_all_class("restaurantTag")
        self.location = self.scrape_all_class("restaurantSummary-address", True)
        self.restaurant_name = self.scrape_all_class("restaurantSummary-name", True)
        self.glob_rating = self.scrape_all_class("rating-ratingValue", True)
        
    def scrape_all_class(self, name, first=False):
        if first:
            return self.soup.find(class_=name)
        else:
            return list(self.soup.find_all(class_=name))
        
    def clean_scrape(self):
        def get_clean_text(string):
            return string.text.strip()
        
        def prices_as_numbers(price):
            return float(replace_dict(price, {"€": "", "\xa0": "", ",": ".", " ": ""}))
        
        def replace_dict(text, changes):
            for old, new in changes.items():
                text = text.replace(old, new)
            return text
        
        self.menu_items = map(get_clean_text, self.menu_items)
        self.tags = map(get_clean_text, self.tags)
        self.location = get_clean_text(self.location)
        self.restaurant_name = get_clean_text(self.restaurant_name)
        self.menu_prices = map(get_clean_text, self.menu_prices)
        self.menu_prices = map(prices_as_numbers, self.menu_prices)
        self.avg_price = prices_as_numbers(get_clean_text(self.avg_price)[-8:])
        self.glob_rating = prices_as_numbers(get_clean_text(self.glob_rating))
    
    def iter_menu(self):
        for item, price in zip(self.menu_items, self.menu_prices):
            yield (self.restaurant_name, item, price, self.url, self.glob_rating, self.avg_price,
                   self.location, list(self.tags))

#### Example

In [12]:
url = "https://www.lafourchette.com/restaurant/le-23-clauzel-julie-riviere/6999"
rest = Restaurant(url)
rest.scrape()
rest.restaurant_name

'Le 23 Clauzel - Julie Rivière -'

### Extracting the list of restaurants

In [None]:
#introducting the functions for wait times
def wait(dr, x, t,i):
    element = WebDriverWait(dr, t).until(EC.text_to_be_present_in_element((By.XPATH, x),i))
    return element

In [5]:
######## check code to make sure the driver works, don't run it each time##########
#dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
dr = webdriver.Firefox()
dr.get("https://www.lafourchette.com/restaurant+paris#sort=QUALITY_DESC&page=105")
wait(dr,"//li[@class='active']",10,"105")

True

In [6]:
####### check code to see if it works with beautifulsoup::::: NOT parsing all the strings but still does a good work###########
soup = BeautifulSoup(dr.page_source, 'lxml')
base_url = "https://www.lafourchette.com"
pattern = re.compile("/restaurant/[a-z-]+/[0-9]+$")
restaurants = []
restaurants.extend(base_url + rest["href"] for rest in soup.find_all(href=pattern))

In [7]:
with open("temp_page_source.html","w") as f:
    f.write(dr.page_source)

In [8]:
len(list(set(restaurants)))

24

In [9]:
###### FIXED the find_restaurants #########

def find_restaurants():
    
    base_url_search = "https://www.lafourchette.com"
    base_url = "https://www.lafourchette.com/"
    search_url = base_url + "restaurant+paris#sort=QUALITY_DESC&page={}"
    pattern = re.compile("restaurant/[a-z-]+/[0-9]+$")
    restaurants = []
    for i in range(1, 255):
        dr = webdriver.Firefox()
        if i % 10 == 0:
            print("Handled {} pages, have {} restaurant urls".format(i, len(restaurants)))
        search_page = search_url.format(i)
        dr.get(search_page)
        wait(dr,"//li[@class='active']",10,str(i))
        soup = BeautifulSoup(dr.page_source, "lxml")
        temp = []
        temp.extend(base_url_search + rest["href"] for rest in soup.find_all(href=pattern))
        temp = list(set(temp))
        #restaurants.extend(base_url + rest["href"] for rest in soup.find_all(href=pattern))
        restaurants.extend(temp)
        dr.quit()
    
    return restaurants


In [10]:
### Will take a lot of time to complete ~1-2 hrs depending on the internet speed. Couldn't get phantomJS to work 
#### or would  have been faster ####

restaurants = find_restaurants()

len(restaurants)

Handled 10 pages, have 218 restaurant urls
Handled 20 pages, have 458 restaurant urls
Handled 30 pages, have 695 restaurant urls
Handled 40 pages, have 930 restaurant urls
Handled 50 pages, have 1163 restaurant urls
Handled 60 pages, have 1401 restaurant urls
Handled 70 pages, have 1640 restaurant urls
Handled 80 pages, have 1877 restaurant urls
Handled 90 pages, have 2119 restaurant urls
Handled 100 pages, have 2361 restaurant urls
Handled 110 pages, have 2602 restaurant urls
Handled 120 pages, have 2843 restaurant urls
Handled 130 pages, have 3080 restaurant urls
Handled 140 pages, have 3322 restaurant urls
Handled 150 pages, have 3563 restaurant urls
Handled 160 pages, have 3801 restaurant urls
Handled 170 pages, have 4040 restaurant urls
Handled 180 pages, have 4280 restaurant urls
Handled 190 pages, have 4519 restaurant urls
Handled 200 pages, have 4764 restaurant urls
Handled 210 pages, have 5007 restaurant urls
Handled 220 pages, have 5135 restaurant urls
Handled 230 pages, have

5135

### Saving in case we need to reload...

In [3]:
pd.Series(restaurants).to_csv("rest.csv", index=False)

In [13]:
restaurants = pd.read_csv("rest.csv")

In [None]:
### We want to make sure we save periodically given all the issues
import csv

fails = 0
obj_rests = []
failed = []


with open("all_items.csv", "w") as csvfile:
    # Use tab because "," or ";" might be used in description
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerow(["rest_name", "item", "item_price", "rest_url", "rest_global_rating",
                     "rest_avg_price", "rest_location", "rest_tags"])

    
for i, url in enumerate(restaurants.restaurants):
    if not (i+1) % 50:
        print("Parsed {} restaurants".format(i+1))
        print("Saving menu...")
        
        with open("all_items.csv", "a+") as csvfile:
        # Use tab because "," or ";" might be used in description
            writer = csv.writer(csvfile, delimiter='\t')
            for restaurant_menu in obj_rests[i-50:]:
                try:
                    writer.writerows(restaurant_menu.iter_menu())
                except:
                    fails += 1
                    
    try:
        rest = Restaurant(url)
        rest.scrape()
        obj_rests.append(rest)
    except:
        failed.append(rest)

Parsed 50 restaurants
Saving menu...
Parsed 100 restaurants
Saving menu...
Parsed 150 restaurants
Saving menu...
Parsed 200 restaurants
Saving menu...
Parsed 250 restaurants
Saving menu...
Parsed 300 restaurants
Saving menu...
Parsed 350 restaurants
Saving menu...
Parsed 400 restaurants
Saving menu...
Parsed 450 restaurants
Saving menu...
Parsed 500 restaurants
Saving menu...
Parsed 550 restaurants
Saving menu...
Parsed 600 restaurants
Saving menu...
Parsed 650 restaurants
Saving menu...
Parsed 700 restaurants
Saving menu...
Parsed 750 restaurants
Saving menu...
Parsed 800 restaurants
Saving menu...
Parsed 850 restaurants
Saving menu...
Parsed 900 restaurants
Saving menu...
Parsed 950 restaurants
Saving menu...
Parsed 1000 restaurants
Saving menu...
Parsed 1050 restaurants
Saving menu...
Parsed 1100 restaurants
Saving menu...
Parsed 1150 restaurants
Saving menu...
Parsed 1200 restaurants
Saving menu...
Parsed 1250 restaurants
Saving menu...
Parsed 1300 restaurants
Saving menu...
Parsed

In [20]:
import pickle
output = open("restaurants.pkl", "wb")
pickle.dump(obj_rests, output)

RecursionError: maximum recursion depth exceeded while pickling an object

In [6]:
test_2 = pd.read_csv("all_items.csv", sep="\t")