In [None]:
# coding: utf-8

import unittest
import requests
from bs4 import BeautifulSoup

# imports
import pandas as pd
from lxml import etree

# options d'affichage
pd.set_option("display.min_rows", 16)

In [None]:

URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"


In [None]:
## Make sure we are allowed to scrape the page (a 200 status code means the request was accepted)
requests.get(URL_PAGE2).status_code

In [None]:

# 1) Ecrire une fonction get_prices_from_url() qui extrait des informations à partir des 2 pages ci-dessus.
# Exemple get_prices_from_url(URL_PAGE2) doit retourner :
# {'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
#  'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
#  'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}

In [None]:
# Download page 2 and decode the raw response bytes as UTF-8 text.
response = requests.get(URL_PAGE2)
content = response.content.decode('utf-8')

# Parse the HTML and immediately narrow `soup` down to the
# <div class="pricing-tables pure-g"> container: a first look at the document
# shows that all the interesting data lives inside it. Isolating the area of
# interest is not strictly needed on a page this small, but it breaks the
# problem down.
soup = BeautifulSoup(content, 'html.parser').find('div', class_='pricing-tables pure-g')

# Pretty-print the isolated fragment to inspect its structure.
print(soup.prettify())


# Step 1 : Isolate data one by one in one single offer

In [None]:
## Isolate offer type
offer_type = soup.find('h2').text
# type d'offre = soup : on trouve la première balise <h2> puis on en extrait le texte correspondant
offer_type

In [None]:
## Isolate price

offer_price = soup.find('span', class_ = 'pricing-table-price').text.split()[0]
#  prix service = soup.find(lignes <span> ayant la classe 'pricing-table-price'). on sort le texte 
# et on le splitte. On prend ensuite le premier élément [0] qui correspond à $5

offer_price

In [None]:
## Isolate storage and database size: both live in <li> entries of
# <ul class="pricing-table-list">, so fetch that list of items once.
feature_items = soup.find('ul', class_="pricing-table-list").find_all('li')

# Storage is the item at index 3; the first whitespace-separated token of its
# text is the size we are looking for.
offer_storage = feature_items[3].text.split()[0]

# Databases follow the same pattern, at index 4.
offer_database = feature_items[4].text.split()[0]

print("Storage found : ", offer_storage)
print("Database found : ", offer_database)


# Step 2: Now that we have successfully extracted the elements we wanted from a single offer, we write a for loop to collect the same information from the other offers

In [None]:

# Every offer sits in its own element carrying the "pricing-table" class, so
# find_all() gives us one element per offer (Personal, Small Business, ...).
pricing_table = soup.find_all(class_="pricing-table")

output = {}

for offer in pricing_table:
    # The <li> entries of the offer's feature list hold storage (index 3)
    # and databases (index 4).
    offer_features = offer.find('ul', class_="pricing-table-list").find_all('li')

    offer_type = offer.find('h2').text
    offer_price = offer.find('span', class_='pricing-table-price').text.split()[0]
    offer_storage = offer_features[3].text.split()[0]
    offer_database = int(offer_features[4].text.split()[0])

    output[offer_type] = dict(
        price=offer_price,
        storage=offer_storage,
        databases=offer_database,
    )

output

# Step 3: Put all these operations in a function

In [None]:
def get_prices_from_url(url, timeout=10):
    """Scrape the pricing offers published on one of the course pages.

    Args:
        url: Address of a page containing a
            ``<div class="pricing-tables pure-g">`` block.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot hang the notebook.

    Returns:
        A dict keyed by offer name (the text of each offer's ``<h2>``). Each
        value is a dict with:
            'price'     -- str, first token of the "pricing-table-price" span
            'storage'   -- str, first token of the 4th ``<li>`` of the offer
            'databases' -- int, first token of the 5th ``<li>`` of the offer

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no pricing container.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    page = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    container = page.find('div', class_='pricing-tables pure-g')
    if container is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than crashing later with an AttributeError on None.
        raise ValueError(
            "no 'pricing-tables pure-g' container found at {!r}".format(url)
        )

    prices = {}
    for offer in container.find_all(class_="pricing-table"):
        # Look the feature list up once; storage and databases both read it.
        features = offer.find('ul', class_="pricing-table-list").find_all('li')

        offer_type = offer.find('h2').text.strip()
        prices[offer_type] = {
            'price': offer.find('span', class_='pricing-table-price').text.split()[0],
            'storage': features[3].text.split()[0],
            'databases': int(features[4].text.split()[0]),
        }

    return prices


# Step 4 : test it !

In [None]:
get_prices_from_url(URL_PAGE3)

# EXO 2 
### Ecrire une fonction qui extrait des informations sur une bière de Beerwulf
### Exemple URL: https://www.beerwulf.com/fr-fr/p/bieres/brouwerij-t-verzet-super-noah.33

## Step 1: retrieve data, select what we want of it and print it 

In [None]:
# Product page used to prototype the extraction.
URL = 'https://www.beerwulf.com/fr-fr/p/bieres/brouwerij-t-verzet-super-noah.33'

# Start by downloading the page and decoding its bytes as UTF-8 text.
response = requests.get(URL)
content = response.content.decode('utf-8')

# Parse the HTML and keep only the interesting part of the document:
# the <div class="product-detail-info-row"> element.
soup = BeautifulSoup(content, 'html.parser').find('div', class_='product-detail-info-row')

print(soup.prettify())

## Step 2: Isolate requested values for function definition

In [None]:
# find out name
name = soup.find('h1').text
name

In [None]:
# find out note
note = int(soup.find(class_="stars").attrs['data-percent'])
note

In [None]:
# find out price
price = float(soup.find('span',class_="price").text.split()[0].replace(",","."))
price

In [None]:
# find out volume
# volume = float(soup.find('div',class_="product-subtext").find('span').text.split()[5].replace("%","").replace(",","."))
volume = float(soup.find('div',class_="product-subtext").find('span').text.split()[-2])

volume, type(volume)

## Step 3: pieces of code are ready to be put in the function

In [None]:
# write the full fonction containing data reading and values identification

def extract_beer_infos(url, timeout=10):
    """Scrape name, rating, price and volume from a Beerwulf product page.

    Args:
        url: Address of a single product page.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot hang the notebook.

    Returns:
        A dict with:
            'name'   -- str, text of the ``<h1>`` (surrounding whitespace removed)
            'note'   -- int, the ``data-percent`` attribute of the element
                        with class "stars"
            'price'  -- float, first token of the "price" span, with the
                        decimal comma turned into a dot
            'volume' -- float, second-to-last token of the first ``<span>``
                        inside the "product-subtext" div (presumably the
                        container size in cl -- confirm against the page)

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no "product-detail-info-row" block.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    page = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    product = page.find('div', class_='product-detail-info-row')
    if product is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than crashing later with an AttributeError on None.
        raise ValueError(
            "no 'product-detail-info-row' block found at {!r}".format(url)
        )

    # split() already discards every kind of whitespace (newlines included),
    # so the extracted tokens need no further cleaning before conversion.
    price_token = product.find('span', class_="price").text.split()[0]
    subtext_tokens = product.find('div', class_="product-subtext").find('span').text.split()

    return {
        'name': product.find('h1').text.strip(),
        # int() tolerates surrounding whitespace in the attribute value.
        'note': int(product.find(class_="stars").attrs['data-percent']),
        'price': float(price_token.replace(",", ".")),
        'volume': float(subtext_tokens[-2]),
    }

## Step 4 : Test on a page

In [None]:
# test on a product page to check that it scrapes the data correctly

extract_beer_infos('https://beerwulf.com/fr-fr/p/bieres/engelzell-trappisten-weibe-bottle-.33')

# EXO 3

In [None]:


# 3) Ecrire une fonction qui prend l'argument "url" et retourne les informations sur une liste de bières via l'API de Beerwulf.
# Cette fonction doit retourner la liste des informations obtenues par la fonction extract_beer_infos() définie ci-dessus.
# Chercher comment optimiser cette fonction en utilisant multiprocessing.Pool pour paralléliser les accès web.
#
# Exemple de retour :
# [{'name': 'Engelszell Benno', 'note': 70, 'price': 4.29, 'volume': 33}
#  {'name': 'Engelszell Trappisten Weiße', 'note': 70, 'price': 3.39, 'volume': 33}
#  {'name': 'Engelszell Gregorius', 'note': 70, 'price': 4.49, 'volume': 33}
#  {'name': 'Bevog Rudeen Black IPA', 'note': 80, 'price': 4.49, 'volume': 33}
#  {'name': 'Bevog Tak Pale Ale', 'note': 70, 'price': 2.79, 'volume': 33}
#  {'name': 'Brew Age Affenkönig', 'note': 70, 'price': 3.49, 'volume': 33}
#  {'name': 'Stiegl Goldbraü', 'note': 70, 'price': 2.49, 'volume': 33}
#  {'name': 'Stiegl Columbus 1492', 'note': 70, 'price': 2.49, 'volume': 33}
#  {'name': 'Brew Age Hopfenauflauf', 'note': 70, 'price': 2.99, 'volume': 33}]

# This URL returns a JSON document holding a list of beers
# (search restricted to bottled beers from Austria).
URL_BEERLIST_AUTRICHE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=Autriche&container=Bouteille"


In [None]:
# how to load data from json ? 
content = requests.get(URL_BEERLIST_AUTRICHE)
content = content.json()
content,type(content)
# turns out that content is a dictionary

In [None]:
# Figure out how to isolate beer page
content['items'][1]['contentReference']

In [None]:
# Build the list of beer page URLs from the JSON: each item carries a
# site-relative 'contentReference' that is appended to the site root.
beer_pages = [
    "https://beerwulf.com" + item['contentReference']
    for item in content['items']
]

beer_pages

In [None]:
import os
type(os.cpu_count())
from multiprocessing import Pool
import os

In [None]:
import os
type(os.cpu_count())
from multiprocessing import Pool
import os

## CELLE CI MARCHE EN SEQUENTIEL !!!

def extract_beer_list_infos(url, workers=1, timeout=10):
    """Return the details of every beer listed by a Beerwulf search API URL.

    The JSON answer is expected to hold an 'items' list whose entries carry a
    site-relative 'contentReference'; each referenced product page is scraped
    with extract_beer_infos().

    Args:
        url: Search API address returning the JSON listing.
        workers: Number of pages fetched concurrently. 1 (the default) scrapes
            the pages one after the other; any other value uses a thread pool
            of that size, and None lets the pool pick its default size.
        timeout: Seconds to wait for the listing request before giving up.

    Returns:
        A list with one extract_beer_infos() result per listed beer, in the
        order of the listing.

    Raises:
        requests.HTTPError: the listing request answered with an error status.
    """
    # Threads rather than processes: the work is network-bound, and worker
    # processes may be unable to import a function defined in a notebook.
    from multiprocessing.pool import ThreadPool

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    listing = response.json()

    # Collect the product page URLs from the JSON listing.
    beer_pages = [
        "https://beerwulf.com" + item['contentReference']
        for item in listing['items']
    ]

    # Sequential version (slow); also used when a pool would bring nothing.
    if workers == 1 or len(beer_pages) < 2:
        return [extract_beer_infos(page) for page in beer_pages]

    # Parallel version (faster): map() blocks until every page is scraped and
    # returns the results in input order.
    with ThreadPool(processes=workers) as pool:
        return pool.map(extract_beer_infos, beer_pages)



In [None]:
%%time
extract_beer_list_infos(URL_BEERLIST_AUTRICHE)

In [None]:

class Lesson3Tests(unittest.TestCase):
    def test_01_get_prices_from_url_page2(self):
        prices = get_prices_from_url(URL_PAGE2)
        # We should have found 3 products:
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 3)
        self.assertIn('Personal', prices)
        self.assertIn('Small Business', prices)
        self.assertIn('Enterprise', prices)

        personal = prices['Personal']
        self.assertIn('price', personal)
        self.assertIn('storage', personal)
        self.assertIn('databases', personal)
        self.assertEqual(personal['price'], '$5')
        self.assertEqual(personal['storage'], '1GB')
        self.assertEqual(personal['databases'], 1)

    def test_02_get_prices_from_url_page3(self):
        prices = get_prices_from_url(URL_PAGE3)
        self.assertIsInstance(prices, dict)
        self.assertEqual(len(prices), 4)
        self.assertEqual(
            prices['Privilege'],
            {'databases': 100, 'price': '$99', 'storage': '1TB'}
        )

    def test_03_extract_beer_list_infos(self):
        infos = extract_beer_list_infos(URL_BEERLIST_AUTRICHE)
        # >Il y a 9 bières autrichiennes :
        self.assertIsInstance(infos, list)
        self.assertEqual(len(infos), 9)
        # toutes ont 33cl :
        for beer in infos:
            self.assertEqual(beer['volume'], 33)


def run_tests():
    """Run every Lesson3Tests test method and print a verbose report.

    The suite is built with the default test loader: unittest.makeSuite() is
    deprecated since Python 3.11 and no longer exists in Python 3.13.
    """
    test_suite = unittest.defaultTestLoader.loadTestsFromTestCase(Lesson3Tests)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(test_suite)


if __name__ == '__main__':
    run_tests()
