In [None]:
import re
import unittest
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Pricing pages scraped by get_prices_from_url() in exercise 1.
URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"

1) Écrire une fonction get_prices_from_url() qui extrait des informations à partir des 2 pages ci-dessus. Exemple : get_prices_from_url(URL_PAGE2) doit retourner : {'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1}, 'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5}, 'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}

In [None]:
def get_prices_from_url(url, timeout=10):
    """Scrape the pricing plans listed on one of the course pages.

    Args:
        url: Address of the pricing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping each plan name (the text of an ``<h2>``) to a dict
        with the keys ``'price'`` (str such as ``'$5'``), ``'storage'``
        (str such as ``'1GB'``) and ``'databases'`` (int).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page holds fewer prices or feature lists than
            plan titles, or if a text lacks the expected pattern.
        AttributeError: If a feature list has no storage or database item.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(response.content, "html.parser")

    plan_titles = soup.find_all("h2")
    # Positional scraping: prices are read from every second <span>, and the
    # first <ul> is skipped (presumably it is not a plan feature list --
    # confirm if the page markup changes).
    price_spans = soup.find_all("span")[::2]
    feature_lists = soup.find_all("ul")[1:]

    # bs4 applies compiled patterns with search(), so no ".*" padding needed.
    database_re = re.compile("database")
    storage_re = re.compile("storage")

    prices = {}
    for index, title in enumerate(plan_titles):
        features = feature_lists[index]
        database_item = features.find("li", string=database_re)
        storage_item = features.find("li", string=storage_re)
        prices[title.text] = {
            "price": re.findall(r"\$\d+", price_spans[index].text)[0],
            "storage": re.findall(r"\d+.B", storage_item.text)[0],
            "databases": int(re.findall(r"\d+", database_item.text)[0]),
        }

    return prices

In [None]:
# Manual check: scrape page 3 and display the entry for the "Personal" plan.
dico = get_prices_from_url(URL_PAGE3)
dico["Personal"]


2) Écrire une fonction qui extrait des informations sur une bière de Beerwulf. Exemple d'URL : https://www.beerwulf.com/fr-fr/p/bieres/brouwerij-t-verzet-super-noah.33

In [None]:
# Sample Beerwulf product page used for exercise 2.
URL_BEER = "https://www.beerwulf.com/fr-fr/p/bieres/brouwerij-t-verzet-super-noah.33"

In [None]:
# Exploratory cell: walk from the page title through the next three <span>
# elements to see which one carries which piece of information.
response = requests.get(URL_BEER, timeout=10)
response.raise_for_status()  # fail loudly instead of parsing an error page
# Explicit parser keeps the parse tree identical across machines.
soup = BeautifulSoup(response.content, "html.parser")
beer = soup.find("h1")
info = beer.find_next("span")
note = info.find_next("span")
price = note.find_next("span")

info

In [None]:
def extract_beer_infos(url, timeout=10):
    """Scrape name, rating, price and volume from a beer product page.

    Args:
        url: Address of the product page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``'name'`` (str, text of the ``<h1>``),
        ``'note'`` (float), ``'price'`` (float) and ``'volume'`` (int,
        the number found in front of ``" cl"``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError: If the page has no ``<h1>`` or too few ``<span>``.
        TypeError: If no ``"<digits> cl"`` volume is found.
        ValueError: If the rating or price text is not a number.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(response.content, "html.parser")

    beer = soup.find("h1")
    # The three <span> elements that follow the title in document order are
    # read as: description (holds the volume), rating, price.
    info = beer.find_next("span")
    note = info.find_next("span")
    price = note.find_next("span")

    # Numbers use a decimal comma; the rating is wrapped in parentheses and
    # the price carries a euro sign. Outer whitespace is dropped first so
    # padding around the markers does not break the conversion.
    note_text = note.text.strip().strip("()").replace(",", ".")
    price_text = price.text.strip().strip(" €").replace(",", ".")

    return {
        "name": beer.text,
        "note": float(note_text),
        "price": float(price_text),
        # Capture only the digits rather than stripping a character set.
        "volume": int(re.search(r"(\d+) cl", info.text)[1]),
    }

In [None]:
# Manual check: run the scraper on the sample product page.
extract_beer_infos(URL_BEER)

In [None]:
# Beerwulf search endpoint, queried with country=Autriche and container=Bouteille.
URL_BEERLIST_AUTRICHE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=Autriche&container=Bouteille"

In [None]:

# Exploratory cell: look at the JSON payload and show the page reference of
# the first item returned by the search endpoint.
response = requests.get(URL_BEERLIST_AUTRICHE, timeout=10)
response.raise_for_status()  # fail loudly instead of decoding an error page
data = response.json()
data["items"][0]["contentReference"]


3) Écrire une fonction qui prend l'argument "url" et retourne les informations sur une liste de bières via l'API de Beerwulf. Cette fonction doit retourner la liste des informations obtenues par la fonction extract_beer_infos() définie ci-dessus. Chercher comment optimiser cette fonction en utilisant multiprocessing.Pool pour paralléliser les accès web.

In [None]:
def extract_beer_list_infos(url, processes=8, timeout=10):
    """Fetch a beer list from the search API and scrape every product page.

    The product pages are downloaded concurrently. The work is network-bound,
    so a thread-backed pool from ``multiprocessing.pool`` is used: it offers
    the same ``map`` API as ``multiprocessing.Pool`` but does not need to
    pickle or re-import functions defined in the notebook, which process
    pools cannot do reliably under the "spawn" start method.

    Args:
        url: Search API address returning JSON with an ``"items"`` list whose
            entries carry a ``"contentReference"`` path.
        processes: Maximum number of concurrent downloads.
        timeout: Seconds to wait for the search API before giving up.

    Returns:
        A list of the dicts produced by ``extract_beer_infos()``, in the
        same order as the items returned by the API.

    Raises:
        requests.HTTPError: If the search API answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    base_url = "https://www.beerwulf.com"
    beer_pages = [base_url + item["contentReference"] for item in data["items"]]
    if not beer_pages:
        # A pool cannot be created with zero workers.
        return []

    worker_count = max(1, min(processes, len(beer_pages)))
    with ThreadPool(worker_count) as pool:
        # map() blocks until every page is scraped and preserves input order.
        return pool.map(extract_beer_infos, beer_pages)

In [None]:
class Lesson3Tests(unittest.TestCase):
    def test_01_get_prices_from_url_page2(self):
        prices = get_prices_from_url(URL_PAGE2)
        # We should have found 3 products:
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 3)
        self.assertIn('Personal', prices)
        self.assertIn('Small Business', prices)
        self.assertIn('Enterprise', prices)

        personal = prices['Personal']
        self.assertIn('price', personal)
        self.assertIn('storage', personal)
        self.assertIn('databases', personal)
        self.assertEqual(personal['price'], '$5')
        self.assertEqual(personal['storage'], '1GB')
        self.assertEqual(personal['databases'], 1)

    def test_02_get_prices_from_url_page3(self):
        prices = get_prices_from_url(URL_PAGE3)
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 4)
        self.assertEqual(
            prices['Privilege'],
            {'databases': 100, 'price': '$99', 'storage': '1TB'}
        )

    def test_03_extract_beer_list_infos(self):
        infos = extract_beer_list_infos(URL_BEERLIST_AUTRICHE)
        # >Il y a 9 bières autrichiennes :
        self.assertIsInstance(infos, list)
        self.assertEqual(len(infos), 9)
        # toutes ont 33cl :
        for beer in infos:
            self.assertEqual(beer['volume'], 33)

        

In [None]:

def run_tests():
    """Run every test of Lesson3Tests with verbose output.

    Returns:
        The ``unittest.TestResult`` produced by the runner, so callers can
        inspect failures programmatically.
    """
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13;
    # the loader method below is the supported replacement.
    test_suite = unittest.defaultTestLoader.loadTestsFromTestCase(Lesson3Tests)
    runner = unittest.TextTestRunner(verbosity=2)
    return runner.run(test_suite)

if __name__ == '__main__':
    run_tests()