# Projet 4

In [359]:
import requests
from bs4 import BeautifulSoup

Ecrire une fonction `get_prices_from_url()` qui extrait des informations à partir des 2 pages ci-dessous.

Exemple `URL_PAGE2` doit retourner :

<pre>{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
  'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
  'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}
</pre>

In [360]:
URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"

def get_prices_from_url(url):
    prices = {}
    
    r = requests.get(url)
    soup = BeautifulSoup(r.content)

    for div_tag in soup.findAll('div', {'class': re.compile('pure-u-1 pure-u-md-1-[3,4]')}):

        row={}

        # Name
        prices[div_tag.find('h2').text] = row

        # Price
        price = div_tag.find('span', {'class':'pricing-table-price'}).text.strip()
        price = re.split(' ',price)[0]
        row['price'] = price


        ul = div_tag.find('ul', {'class':'pricing-table-list'})

        # Storage
        storage = ul.find('li').find_next_sibling().find_next_sibling().find_next_sibling().text
        storage = re.split(' ',storage)[0]
        row['storage'] = storage

        # Databases
        databases = ul.find('li').find_next_sibling().find_next_sibling().find_next_sibling().find_next_sibling().text
        databases = re.split(' ',databases)[0]
        row['databases'] = int(databases)
    
    return prices

In [361]:
prices = get_prices_from_url(URL_PAGE2)
prices

{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}

In [362]:
prices3 = get_prices_from_url(URL_PAGE3)
prices3

{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25},
 'Privilege': {'price': '$99', 'storage': '1TB', 'databases': 100}}

Ecrire une fonction qui extrait des informations sur une bière de beowulf.

Exemple d'URL: https://www.beerwulf.com/fr-fr/p/bieres/melusine-bio.33 

La fonction doit retourner :
<pre>
{'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33}
</pre>

In [363]:
def extract_beer_infos(url):
    
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    div = soup.find('div', {'class':'product-detail-info-row mobile-header-details'})

    name = div.find('h1').text
    volume = div.find('span').text.split('|')[2].strip().split(' ')[0]
    note = div.find('div', {'class':'stars'})['data-percent']
    price = soup.find('span',{'class': "price"}).text.split(' ')[0].replace(',','.')

    infos = {
        'name': name,
        'note': int(note),
        'price': float(price),
        'volume': int(volume),
    }
    return infos


In [364]:
info = extract_beer_infos('https://www.beerwulf.com/fr-fr/p/bieres/melusine-bio.33')
info

{'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33}

Cette URL retourne un JSON avec une liste de bières :

In [365]:
URL_BEERLIST_FRANCE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille"

Ecrire une fonction qui prend l'argument cet URL retourne les informations sur une liste de bière via l'API de beowulf.

Cette fonction doit retourner la liste des informations obtenues par la fonction `extract_beer_infos()` définie ci-dessus.

Chercher comment optimiser cette fonction en utilisant multiprocessing.Pool pour paralléliser les accès web.

Exemple de retour :

<pre>[{'name': 'Gallia East IPA', 'note': 80, 'price': 42.99, 'volume': 33},
    {'name': 'La Lager Sans Gluten de Vézelay', 'note': 60, 'price': 38.99, 'volume': 25},
    {'name': 'Brasserie De Sutter Brin de Folie', 'note': 70, 'price': 44.99, 'volume': 33},
    {'name': 'La Cristal IPA du Mont Blanc', 'note': 70, 'price': 44.99, 'volume': 33},
    {'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33},
    {'name': 'La Parisienne Le Titi Parisien', 'note': 70, 'price': 38.99, 'volume': 33},
    {'name': 'Gallia Session IPA', 'note': 70, 'price': 42.99, 'volume': 33},
    {'name': 'Ninkasi Brut IPA', 'note': 70, 'price': 44.99, 'volume': 33},
    {'name': 'Pietra', 'note': 60, 'price': 38.99, 'volume': 33},
    {'name': 'Desperados', 'note': 60, 'price': 35.99, 'volume': 33},
    {'name': 'Gallia West IPA', 'note': 70, 'price': 42.99, 'volume': 33}]
</pre>

In [380]:
def extract_beer_list_infos(url):
    # Collecter les pages de bières à partir du JSON
    response = requests.get(url)
    
    beer_pages = response.json()['items']
    
    # Sequential version (slow):
    beers = [extract_beer_infos('https://www.beerwulf.com'+beer['contentReference']) for beer in beer_pages]

    # Parallel version (faster):
    #beers = list(map(lambda x : extract_beer_infos('https://www.beerwulf.com'+x['contentReference']), beer_pages))
    
    return beers

In [381]:
%%time
url = 'https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille'
beers = extract_beer_list_infos(url)
beers

CPU times: user 998 ms, sys: 69.6 ms, total: 1.07 s
Wall time: 3.11 s


[{'name': 'Gallia East IPA', 'note': 80, 'price': 42.99, 'volume': 33},
 {'name': 'La Lager Sans Gluten de Vézelay',
  'note': 60,
  'price': 38.99,
  'volume': 25},
 {'name': 'Brasserie De Sutter Brin de Folie',
  'note': 70,
  'price': 44.99,
  'volume': 33},
 {'name': 'La Cristal IPA du Mont Blanc',
  'note': 70,
  'price': 44.99,
  'volume': 33},
 {'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33},
 {'name': 'La Parisienne Le Titi Parisien',
  'note': 70,
  'price': 38.99,
  'volume': 33},
 {'name': 'Gallia Session IPA', 'note': 70, 'price': 42.99, 'volume': 33},
 {'name': 'Ninkasi Brut IPA', 'note': 70, 'price': 44.99, 'volume': 33},
 {'name': 'Pietra', 'note': 60, 'price': 38.99, 'volume': 33},
 {'name': 'Desperados', 'note': 60, 'price': 35.99, 'volume': 33},
 {'name': 'Gallia West IPA', 'note': 70, 'price': 42.99, 'volume': 33}]

In [387]:
def extract_beer_list_infos2(url):
    # Collecter les pages de bières à partir du JSON
    response = requests.get(url)
    
    beer_pages = response.json()['items']
    
    # Sequential version (slow):
    #beers = [extract_beer_infos('https://www.beerwulf.com'+beer['contentReference']) for beer in beer_pages]

    # Parallel version (faster):
    beers = list(map(lambda x : extract_beer_infos('https://www.beerwulf.com'+x['contentReference']), beer_pages))
    
    return beers

In [388]:
%%time
url = 'https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille'
beers2 = extract_beer_list_infos2(url)
beers2

CPU times: user 913 ms, sys: 48.2 ms, total: 961 ms
Wall time: 2.86 s


[{'name': 'Gallia East IPA', 'note': 80, 'price': 42.99, 'volume': 33},
 {'name': 'La Lager Sans Gluten de Vézelay',
  'note': 60,
  'price': 38.99,
  'volume': 25},
 {'name': 'Brasserie De Sutter Brin de Folie',
  'note': 70,
  'price': 44.99,
  'volume': 33},
 {'name': 'La Cristal IPA du Mont Blanc',
  'note': 70,
  'price': 44.99,
  'volume': 33},
 {'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33},
 {'name': 'La Parisienne Le Titi Parisien',
  'note': 70,
  'price': 38.99,
  'volume': 33},
 {'name': 'Gallia Session IPA', 'note': 70, 'price': 42.99, 'volume': 33},
 {'name': 'Ninkasi Brut IPA', 'note': 70, 'price': 44.99, 'volume': 33},
 {'name': 'Pietra', 'note': 60, 'price': 38.99, 'volume': 33},
 {'name': 'Desperados', 'note': 60, 'price': 35.99, 'volume': 33},
 {'name': 'Gallia West IPA', 'note': 70, 'price': 42.99, 'volume': 33}]

In [389]:
import unittest

class Lesson4Tests(unittest.TestCase):
    def test_01_get_prices_from_url_page2(self):
        prices = get_prices_from_url(URL_PAGE2)
        # We should have found 3 products:
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 3)
        self.assertIn('Personal', prices)
        self.assertIn('Small Business', prices)
        self.assertIn('Enterprise', prices)
        
        personal = prices['Personal']
        self.assertIn('price', personal)
        self.assertIn('storage', personal)
        self.assertIn('databases', personal)
        self.assertEqual(personal['price'], '$5')
        self.assertEqual(personal['storage'], '1GB')
        self.assertEqual(personal['databases'], 1)
        
    def test_02_get_prices_from_url_page3(self):
        prices = get_prices_from_url(URL_PAGE3)
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 4)
        self.assertEqual(
            prices['Privilege'],
            {'databases': 100, 'price': '$99', 'storage': '1TB'}
        )
    
    def test_03_extract_beer_list_infos(self):
        infos = extract_beer_list_infos2(URL_BEERLIST_FRANCE)
        # We should have 11 French beers:
        self.assertIsInstance(infos, list)
        self.assertEqual(len(infos), 11)
        # All of them are 25cl or 33cl:
        for beer in infos:
            self.assertIn(beer['volume'], [25, 33])

            
def run_tests():
    test_suite = unittest.makeSuite(Lesson4Tests)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(test_suite)

In [390]:
if __name__ == '__main__':
    run_tests()

test_01_get_prices_from_url_page2 (__main__.Lesson4Tests) ... ok
test_02_get_prices_from_url_page3 (__main__.Lesson4Tests) ... ok
test_03_extract_beer_list_infos (__main__.Lesson4Tests) ... ok

----------------------------------------------------------------------
Ran 3 tests in 3.207s

OK
