# Projet maison n° 4

In [8]:
# imports
import json
import requests
from bs4 import BeautifulSoup

**Partie A**

Ecrire une fonction `get_prices_from_url()` qui extrait des informations à partir des 2 pages ci-dessous.

```python
URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"
```

Avec `URL_PAGE2`, la fonction doit retourner :

```python
{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}
```

In [9]:
def make_request(url, headers=None, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        headers: Optional mapping of HTTP headers to send with the request.
            ``None`` (the default) sends only the headers ``requests`` adds
            on its own.
        timeout: Number of seconds to wait for the server. ``requests``
            applies no timeout unless one is given, so without it a stalled
            server would block the notebook indefinitely.

    Returns:
        The ``requests.Response`` object when the status code is 200.

    Raises:
        ValueError: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Propagated unchanged when the
            request itself fails (connection error, timeout, ...).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Anything but a plain 200 is treated as a failure by the callers.
    if response.status_code != 200:
        raise ValueError(f"Error while making request to {url}")
    return response

In [10]:
# Part A: the two pricing pages that get_prices_from_url() must handle
URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"

def get_prices_from_url(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36', 'Accept-Language': 'fr,fr-FR;'}):
    """Scrape the pricing plans published on a pricing page.

    Args:
        url: Address of the pricing page.
        headers: HTTP headers intended for the request.
            NOTE(review): this argument is accepted but not sent; the page
            is fetched with ``make_request(url)``, which is called with the
            URL only.

    Returns:
        A dict keyed by plan name; each value is a dict with the keys
        ``'price'`` (str), ``'storage'`` (str) and ``'databases'`` (int).

    Raises:
        ValueError: If the request fails (raised by ``make_request``) or if
            the page holds no ``pricing-tables pure-g`` container.
    """
    # Make request to server
    response = make_request(url)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed and emits a warning, so the parse tree may differ
    # from one machine to another.
    soup = BeautifulSoup(response.content, 'html.parser')

    pricing_tables = soup.find('div', class_='pricing-tables pure-g')
    if pricing_tables is None:
        raise ValueError(f"No pricing tables found at {url}")

    prices = {}
    for table in pricing_tables.find_all('div', class_='pure-u-1'):
        plan_name = table.find('h2').text.strip()
        # Keep only the first whitespace-separated token of the price text.
        price = table.find('span', attrs={'class': 'pricing-table-price'}).text.strip().split()[0]

        # Storage and database count are read from the 4th and 5th <li>
        # of the plan; only the leading token of each is kept.
        features = table.find_all('li')
        storage = features[3].text.strip().split()[0]
        databases = int(features[4].text.strip().split()[0])

        prices[plan_name] = {'price': price, 'storage': storage, 'databases': databases}

    return prices

# Demo: scrape page 2; the bare name on the last line makes the notebook
# display the returned dict as the cell output.
prices = get_prices_from_url(url=URL_PAGE2)
prices

{'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}

**Partie B**

*L'abus d'alcool est dangereux pour la santé, à consommer avec modération.*

1) Ecrire une fonction `extract_beer_infos()` qui extrait des informations sur une bière du site de bières *beerwulf*.

Exemple d'URL: https://www.beerwulf.com/fr-fr/p/bieres/cuvee-des-trolls.33 

La fonction doit retourner :
```python
{'Nom': 'Cuvée des Trolls',
 'Style': 'Bière Blonde',
 'Contenu': 25,
 'Degré d’alcool': 7.0,
 'Origine': 'Belgique',
 'Brasseur': 'Brasserie Dubuisson Freres'}
```

2) L'URL ci-après retourne un JSON avec une liste de bières :

```python
URL_BEERLIST_FRANCE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille"
```

Ecrire une fonction `extract_beer_list_infos(url)` qui prend en argument cette URL et retourne les informations sur une liste de bières du site *beerwulf*.

Cette fonction doit retourner la liste des informations obtenues par la fonction précédemment définie `extract_beer_infos()`.

Exemple de retour :

```python
[{'Nom':'Desperados','Style':'Lager','Contenu':33,'Degré d’alcool':5.9,'Origine':'France','Brasseur':'Desperados'},
{'Nom':'La Lager Sans Gluten de Vézelay','Style':'Lager','Contenu':25,'Degré d’alcool':4.0,'Origine':'France','Brasseur':'Brasserie de Vézelay'},
{'Nom':'Mélusine Bio','Style':'Pale Ale','Contenu': 33,'Degré d’alcool': 5.0,'Origine':'France','Brasseur':'Mélusine'},
{'Nom':'La Parisienne Le Titi Parisien','Style':'IPA','Contenu':33,'Degré d’alcool':5.5,'Origine':'France','Brasseur': 'Brasserie la Parisienne'},
{'Nom':'Brasserie De Sutter Brin de Folie','Style':'Bière Blonde','Contenu': 33,'Degré d’alcool':6.5,'Origine':'France','Brasseur':'Brasserie de Sutter'}]
```

**Facultatif**

Chercher comment optimiser cette fonction en utilisant `multiprocessing.Pool()` pour paralléliser les accès web.

In [11]:
# partie B-1
def extract_beer_infos(url):
    """Scrape the description of a single beer from its product page.

    Args:
        url: Address of the beer's product page.

    Returns:
        A dict with the keys ``'Nom'`` (str), ``'Style'`` (str),
        ``'Contenu'`` (int), ``'Degré d’alcool'`` (float), ``'Origine'``
        (str) and ``'Brasseur'`` (str).

    Raises:
        ValueError: If the request fails (raised by ``make_request``) or if
            the page holds no ``small-12 content-column`` container.
    """
    # Make request
    response = make_request(url)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed and emits a warning, so the parse tree may differ
    # from one machine to another.
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', class_='small-12 content-column')
    if content is None:
        raise ValueError(f"No beer description found at {url}")

    # Searching with the full class string matches only <dd> cells whose
    # class attribute is exactly that string, so the volume and country
    # cells (which carry an extra js-* class) are skipped here. The code
    # relies on the first three such cells being, in document order:
    # style, alcohol content, brewer.
    plain_cell = 'small-6 medium-9 columns'
    style_dd = content.find('dd', class_=plain_cell)
    alcohol_dd = style_dd.find_next('dd', class_=plain_cell)
    brewer_dd = alcohol_dd.find_next('dd', class_=plain_cell)

    volume_dd = content.find('dd', class_='small-6 medium-9 columns js-beer-volume')
    country_dd = content.find('dd', class_='small-6 medium-9 columns js-beer-country')

    return {
        'Nom': content.find('h1').text.strip(),
        'Style': style_dd.find('a').text.strip(),
        # Leading token of the volume text, converted to an integer.
        'Contenu': int(volume_dd.text.strip().split()[0]),
        # Decimal comma -> decimal point, and drop the percent sign.
        'Degré d’alcool': float(alcohol_dd.text.strip().replace(',', '.').replace('%', '')),
        'Origine': country_dd.text.strip(),
        'Brasseur': brewer_dd.text.strip(),
    }

# Demo on a sample product page; the bare name on the last line makes the
# notebook display the returned dict as the cell output.
beer_infos = extract_beer_infos(url='https://www.beerwulf.com/fr-fr/p/bieres/desperados-fut-2l')
beer_infos

{'Nom': 'Desperados - Fût 2L The SUB',
 'Style': 'Lager',
 'Contenu': 2,
 'Degré d’alcool': 5.9,
 'Origine': 'France',
 'Brasseur': 'Desperados'}

In [12]:
# Part B-2: search endpoint whose JSON response lists the beers to scrape
URL_BEERLIST_FRANCE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille"

def extract_beer_list_infos(url, URL_HOME='https://www.beerwulf.com/'):
    """Scrape every beer referenced by a JSON search result.

    Args:
        url: Address returning a JSON document with an ``'items'`` list;
            each item carries a ``'contentReference'`` entry.
        URL_HOME: Prefix put in front of each ``'contentReference'`` to
            build the address of the beer's page.

    Returns:
        A list holding the result of ``extract_beer_infos()`` for each
        item, in the order of the JSON listing.
    """
    # Collect the beer pages from the JSON listing
    listing = make_request(url).json()
    beer_pages = []
    for item in listing['items']:
        beer_pages.append("{}{}".format(URL_HOME, item['contentReference']))

    # Sequential version (slow): the pages are fetched one after another.
    # Optional parallel version (faster): see projet4.py
    #   Mode 'sequential' took 1.0801212787628174 seconds
    #   Mode 'parallel' took 0.7547028064727783 seconds
    beers = []
    for beer_page in beer_pages:
        beers.append(extract_beer_infos(url=beer_page))

    return beers

# Demo: scrape every beer of the search listing; the bare name on the last
# line makes the notebook display the returned list as the cell output.
beer_list_infos = extract_beer_list_infos(url=URL_BEERLIST_FRANCE)
beer_list_infos

[{'Nom': 'La Cristal IPA du Mont Blanc',
  'Style': 'IPA',
  'Contenu': 33,
  'Degré d’alcool': 4.7,
  'Origine': 'France',
  'Brasseur': 'Mont-Blanc'},
 {'Nom': 'Ninkasi Pale Ale',
  'Style': 'Bière Ambrée',
  'Contenu': 33,
  'Degré d’alcool': 4.5,
  'Origine': 'France',
  'Brasseur': 'Ninkasi Fabriques'},
 {'Nom': 'Desperados',
  'Style': 'Lager',
  'Contenu': 33,
  'Degré d’alcool': 5.9,
  'Origine': 'France',
  'Brasseur': 'Desperados'},
 {'Nom': 'Pietra',
  'Style': 'Lager',
  'Contenu': 33,
  'Degré d’alcool': 6.0,
  'Origine': 'France',
  'Brasseur': 'Brasserie Pietra'}]

In [13]:
import unittest

class Session4Tests(unittest.TestCase):
    def test_01_get_prices_from_url_page2(self):
        prices = get_prices_from_url(URL_PAGE2)
        # We should have found 3 products:
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 3)
        self.assertIn('Personal', prices)
        self.assertIn('Small Business', prices)
        self.assertIn('Enterprise', prices)
        
        personal = prices['Personal']
        self.assertIn('price', personal)
        self.assertIn('storage', personal)
        self.assertIn('databases', personal)
        self.assertEqual(personal['price'], '$5')
        self.assertEqual(personal['storage'], '1GB')
        self.assertEqual(personal['databases'], 1)
        
    def test_02_get_prices_from_url_page3(self):
        prices = get_prices_from_url(URL_PAGE3)
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 4)
        self.assertEqual(
            prices['Privilege'],
            {'databases': 100, 'price': '$99', 'storage': '1TB'}
        )
    
    def test_03_extract_beer_list_infos(self):
        infos = extract_beer_list_infos(URL_BEERLIST_FRANCE)
        self.assertIsInstance(infos, list)
        self.assertGreater(len(infos), 1)
        # Contenu = int
        # Degré d’alcool = float
        for beer in infos:
            self.assertIsInstance(beer['Nom'], str)
            self.assertIsInstance(beer['Style'], str)
            self.assertIsInstance(beer['Contenu'], int)
            self.assertIsInstance(beer['Degré d’alcool'], float)
            self.assertEqual(beer['Origine'], "France")
            self.assertIsInstance(beer['Brasseur'], str)

            
def run_tests():
    """Run every test of ``Session4Tests`` and print a verbose text report.

    Returns:
        None. The outcome is only reported through the runner's output.
    """
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # Python 3.13; loading through the default loader builds the same suite
    # on every supported Python version.
    test_suite = unittest.defaultTestLoader.loadTestsFromTestCase(Session4Tests)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(test_suite)

In [14]:
# Launch the test suite only when this code runs as the main program,
# not when it is imported from another module.
if __name__ == '__main__':
    run_tests()

test_01_get_prices_from_url_page2 (__main__.Session4Tests) ... ok
test_02_get_prices_from_url_page3 (__main__.Session4Tests) ... ok
test_03_extract_beer_list_infos (__main__.Session4Tests) ... ok

----------------------------------------------------------------------
Ran 3 tests in 1.636s

OK
