# Projet 4

In [1]:
# Imports
import re
import sys
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests
from bs4 import BeautifulSoup

Écrire une fonction `get_prices_from_url()` qui extrait des informations à partir des 2 pages ci-dessous.

Exemple `URL_PAGE2` doit retourner :

<pre>{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
  'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
  'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}
</pre>

In [2]:
# Pricing pages passed to get_prices_from_url() in the cells and tests below.
URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"

In [3]:
def _first_token_before(texts, keyword):
    """Return the first whitespace-separated token that precedes `keyword`
    in the first text containing it, or None when no text qualifies.
    """
    for text in texts:
        head, found, _ = text.partition(keyword)
        tokens = head.split()
        if found and tokens:
            return tokens[0]
    return None


def get_prices_from_url(url):
    """Returns a dictionary containing the price, the storage and the number
    of databases of each offer from a given pricing page.

    Arguments:
    url (type: str): the url of the pricing page.

    Returns:
    dict: maps each offer name (text of the card's <h2>) to a dict with the
    keys 'price' (str), 'storage' (str) and 'databases' (int).

    Raises:
    requests.HTTPError: if the server answers with an error status.
    ValueError: if an offer card has no storage line or no databases line.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Fail loudly instead of parsing an error page

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    # Offer cards carry one of two class strings depending on the page;
    # fall back to the second one when the first matches nothing.
    cards = (
        soup.find_all('div', attrs={'class': 'pure-u-1 pure-u-md-1-3'})
        or soup.find_all('div', attrs={'class': 'pure-u-1 pure-u-md-1-4'})
    )

    prices = {}
    for card in cards:
        offer = card.find('h2').text
        # Work on the text of each <li> rather than on the HTML of the list,
        # so attributes or nested tags inside an item do not break matching.
        features = [item.get_text() for item in card.find_all('li')]

        storage = _first_token_before(features, 'file storage')
        databases = _first_token_before(features, 'data')
        if storage is None or databases is None:
            raise ValueError(
                'Offer %r at %s has no storage or databases line' % (offer, url)
            )

        prices[offer] = {
            'price': card.find('span').text.split()[0],
            'storage': storage,
            'databases': int(databases),
        }

    return prices

In [4]:
# Scrape the offers of page 2; the returned dict is displayed as the cell output.
get_prices_from_url(URL_PAGE2)

{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}

In [5]:
# Same scraper applied to page 3.
get_prices_from_url(URL_PAGE3)

{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25},
 'Privilege': {'price': '$99', 'storage': '1TB', 'databases': 100}}

Écrire une fonction qui extrait des informations sur une bière du site Beerwulf.

Exemple d'URL: https://www.beerwulf.com/fr-fr/p/bieres/melusine-bio.33 

La fonction doit retourner :
<pre>
{'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33}
</pre>

In [6]:
# Sample beer product page passed to extract_beer_infos() in the cell below.
url = 'https://www.beerwulf.com/fr-fr/p/bieres/melusine-bio.33'

In [7]:
def extract_beer_infos(url):
    """Returns a dictionary containing the name, the note, the price
    and the volume of a beer.

    Arguments:
    url (type: str): the url of the beer web page.

    Returns:
    dict: {'name': str, 'note': int, 'price': float, 'volume': int}.

    Raises:
    requests.HTTPError: if the server answers with an error status.
    ValueError: if the expected header block, JSON-LD script or price
    cannot be found in the page.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Fail loudly instead of parsing an error page

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    # Block holding the name (<h1>), the rating (div.stars) and a <span>
    # whose last ' | '-separated part is read as the volume.
    header = soup.find('div', {'class': "product-detail-info-row mobile-header-details"})
    # The price is read from the JSON-LD <script> of the page.
    ld_json = soup.find('script', {'type': 'application/ld+json'})
    if header is None or ld_json is None:
        raise ValueError('Unexpected page layout for %s' % url)

    # Stop at the closing quote of the value: a greedy "anything but '<'"
    # group would run on to the last '",' of the JSON document.
    match_price = re.search(r'"price":\s*"([^"]+)"', ld_json.string or '')
    if match_price is None:
        raise ValueError('No price found in the JSON-LD data of %s' % url)

    # The last part of the span text ends with ' cl'; drop that suffix
    # before converting the remainder to an int.
    volume_text = header.find('span').text.split(' | ')[-1]

    infos = {
        'name': header.find('h1').text,
        'note': int(header.find('div', {'class': 'stars'}).attrs['data-percent']),
        'price': float(match_price.group(1)),
        'volume': int(volume_text[:-len(' cl')]),
    }

    return infos

In [8]:
# Extract the details of the sample beer page; the dict is displayed as the cell output.
extract_beer_infos(url)

{'name': 'Mélusine Bio', 'note': 70, 'price': 2.89, 'volume': 33}

Cette URL retourne un JSON avec une liste de bières :

In [9]:
# Search API endpoint queried with country=France and container=Bouteille; it returns the beer list as JSON.
URL_BEERLIST_FRANCE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille"

Écrire une fonction qui prend en argument cette URL et retourne les informations sur une liste de bières via l'API de Beerwulf.

Cette fonction doit retourner la liste des informations obtenues par la fonction `extract_beer_infos()` définie ci-dessus.

Chercher comment optimiser cette fonction en utilisant multiprocessing.Pool pour paralléliser les accès web.

Exemple de retour :

<pre>[{'name': 'Gallia East IPA', 'note': 80, 'price': 42.99, 'volume': 33},
    {'name': 'La Lager Sans Gluten de Vézelay', 'note': 60, 'price': 38.99, 'volume': 25},
    {'name': 'Brasserie De Sutter Brin de Folie', 'note': 70, 'price': 44.99, 'volume': 33},
    {'name': 'La Cristal IPA du Mont Blanc', 'note': 70, 'price': 44.99, 'volume': 33},
    {'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33},
    {'name': 'La Parisienne Le Titi Parisien', 'note': 70, 'price': 38.99, 'volume': 33},
    {'name': 'Gallia Session IPA', 'note': 70, 'price': 42.99, 'volume': 33},
    {'name': 'Ninkasi Brut IPA', 'note': 70, 'price': 44.99, 'volume': 33},
    {'name': 'Pietra', 'note': 60, 'price': 38.99, 'volume': 33},
    {'name': 'Desperados', 'note': 60, 'price': 35.99, 'volume': 33},
    {'name': 'Gallia West IPA', 'note': 70, 'price': 42.99, 'volume': 33}]
</pre>

In [10]:
def extract_beer_list_infos(url):
    """Returns a list of dictionaries containing the name, the note,
    the price and the volume of all beers listed by the given API url.

    Arguments:
    url (type: str): the url of the JSON search API listing the beers.

    Returns:
    list: one dict per beer, as returned by extract_beer_infos(), in the
    same order as the items of the API response.

    Raises:
    requests.HTTPError: if the API answers with an error status.
    """
    # Query the url that was passed in, not a module-level constant.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data = response.json()

    # Collect the beer pages from the JSON: each item carries the
    # site-relative path of its product page in 'contentReference'.
    beer_pages = [
        'https://www.beerwulf.com' + item['contentReference']
        for item in data['items']
    ]

    # The work is network-bound, so a thread pool parallelises it without
    # requiring the worker function to be importable from a separate module
    # (which a process pool needs on platforms that spawn workers).
    with ThreadPool() as pool:
        beers = pool.map(extract_beer_infos, beer_pages)

    return beers

In [11]:
# Fetch the details of every beer returned by the France search API.
extract_beer_list_infos(URL_BEERLIST_FRANCE)

[{'name': 'Gallia East IPA', 'note': 80, 'price': 3.09, 'volume': 33},
 {'name': 'La Lager Sans Gluten de Vézelay',
  'note': 60,
  'price': 2.79,
  'volume': 25},
 {'name': 'Brasserie De Sutter Brin de Folie',
  'note': 70,
  'price': 2.85,
  'volume': 33},
 {'name': 'La Cristal IPA du Mont Blanc',
  'note': 70,
  'price': 2.89,
  'volume': 33},
 {'name': 'Mélusine Bio', 'note': 70, 'price': 2.89, 'volume': 33},
 {'name': 'La Parisienne Le Titi Parisien',
  'note': 70,
  'price': 2.99,
  'volume': 33},
 {'name': 'Gallia Session IPA', 'note': 70, 'price': 2.99, 'volume': 33},
 {'name': 'Ninkasi Brut IPA', 'note': 70, 'price': 3.29, 'volume': 33},
 {'name': 'Pietra', 'note': 60, 'price': 2.69, 'volume': 33},
 {'name': 'Desperados', 'note': 60, 'price': 1.99, 'volume': 33},
 {'name': 'Gallia West IPA', 'note': 70, 'price': 2.99, 'volume': 33}]

In [12]:
import unittest

class Lesson4Tests(unittest.TestCase):
    def test_01_get_prices_from_url_page2(self):
        prices = get_prices_from_url(URL_PAGE2)
        # We should have found 3 products:
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 3)
        self.assertIn('Personal', prices)
        self.assertIn('Small Business', prices)
        self.assertIn('Enterprise', prices)
        
        personal = prices['Personal']
        self.assertIn('price', personal)
        self.assertIn('storage', personal)
        self.assertIn('databases', personal)
        self.assertEqual(personal['price'], '$5')
        self.assertEqual(personal['storage'], '1GB')
        self.assertEqual(personal['databases'], 1)
        
    def test_02_get_prices_from_url_page3(self):
        prices = get_prices_from_url(URL_PAGE3)
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 4)
        self.assertEqual(
            prices['Privilege'],
            {'databases': 100, 'price': '$99', 'storage': '1TB'}
        )
    
    def test_03_extract_beer_list_infos(self):
        infos = extract_beer_list_infos(URL_BEERLIST_FRANCE)
        # We should have 11 French beers:
        self.assertIsInstance(infos, list)
        self.assertEqual(len(infos), 11)
        # All of them are 25cl or 33cl:
        for beer in infos:
            self.assertIn(beer['volume'], [25, 33])

            
def run_tests():
    """Run every test of Lesson4Tests and print a verbose report."""
    # unittest.makeSuite() is deprecated since Python 3.11 and removed in
    # 3.13; the loader method below is the documented replacement.
    test_suite = unittest.defaultTestLoader.loadTestsFromTestCase(Lesson4Tests)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(test_suite)

In [13]:
# Run the test suite when this cell is executed as the main module.
if __name__ == '__main__':
    run_tests()

test_01_get_prices_from_url_page2 (__main__.Lesson4Tests) ... ok
test_02_get_prices_from_url_page3 (__main__.Lesson4Tests) ... ok
test_03_extract_beer_list_infos (__main__.Lesson4Tests) ... ok

----------------------------------------------------------------------
Ran 3 tests in 1.246s

OK
