In [9]:
import codecs
import json
import os
import time
import requests
import re
import pandas as pd

from bs4 import BeautifulSoup
from tqdm import tqdm

from IPython.display import Image
# beautifulsoup4>=4.10.0
# requests>=2.26.0

# Web Scraping

In this notebook we'll use the *requests* and *BeautifulSoup* modules to access *Guía Peñín* directly and automatically extract all the wine data sheets from the following DOs (denominations of origin):

       'Cava  D.O.  / D.O.P.',
       'Terra Alta  D.O.  / D.O.P.', 
       'Costers del Segre  D.O.  / D.O.P.',
       'Conca de Barberà  D.O.  / D.O.P.',
       'Priorat  D.O.  Ca.  / D.O.P.', 
       'Tarragona  D.O.  / D.O.P.',
       'Penedès  D.O.  / D.O.P.',
       'Empordà  D.O.  / D.O.P.',
       'Catalunya  D.O.  / D.O.P.',
       'Alella  D.O.  / D.O.P.',
       'Montsant  D.O.  / D.O.P.',
       'Pla de Bages  D.O.  / D.O.P.'

We have detected that not all the wine data sheets are the same; some of them have more information than others. For example:
- **Care Finca Marimú 2019 Barrica**: We only have some basic attributes:

<img src="Care.png" width=900 height=500 />

- **Parés Baltà Blanca Cusiné 2013 Gran Reserva Brut Nature**: We find more attributes, such as the wine style, when the wine was tasted, and the percentage of each variety:

<img src="Pares.png" width=900 height=500 />

- **Poco a Poco Envejecido en Barrica 2019 Crianza**: In this case we don't find attributes like color, smell and taste, only the wine style. This can be really useful for the recommender:

<img src="Poco.png" width=900 height=500 />



Given this variety of wine data sheets, we need an example of each kind to test our single-page scraper, which, given a URL, returns a dictionary containing all the wine attributes:

In [6]:
# EXAMPLES
# Every wine data sheet shares one URL pattern and differs only in its slug.
_WINE_SHEET_URL = 'https://guiapenin.wine/guide/wine/{}/es'

simple_wine = _WINE_SHEET_URL.format('care_finca_marimu_2019_t_ba')
two_years_wine = _WINE_SHEET_URL.format('scala_dei_masdeu_2013_t')
rel_quality_wine = _WINE_SHEET_URL.format('poco_a_poco_envejecido_en_barrica_2019_t_c')
final_boss_wine = _WINE_SHEET_URL.format('pares_balt_blanca_cusine_2013_be_gr_bn')
martinet_bru = _WINE_SHEET_URL.format('martinet_bru_2018_t')

# Search-page URL: the `zona[]` filter is a comma-separated list of
# "<id>-zona" tokens, built here from the bare zone ids.
_ZONE_IDS = (
    2513, 2532, 2535, 2519, 2545, 2581, 2517,
    2512, 2544, 2515, 2503, 2534, 2506, 2573,
    2523, 2557, 2597, 2679, 2580, 2558, 2533,
)
_ZONE_FILTER = ','.join(f'{zone_id}-zona' for zone_id in _ZONE_IDS)

master_wine_list = (
    'https://guiapenin.wine/guide/wines/es?vino=1&winename=&bodega[]='
    f'&zona[]={_ZONE_FILTER}'
    '&puntuacionmin=&puntuacionmax=&preciomin=&preciomax=&anyomin=&anyomax='
    '&minestrellas=&maxestrellas=&pais[]=69&&take=15&orden=desc'
)

## Single Wine Page info scraper

In [7]:
# Session cookie sent with every request to guiapenin.wine.
# SECURITY: a session id should not be committed in a shared notebook. Set the
# GUIAPENIN_LARAVEL_SESSION environment variable to supply your own value; the
# literal below is kept only as a backward-compatible fallback.
cookies = {
    "laravel_session": os.environ.get(
        "GUIAPENIN_LARAVEL_SESSION",
        "852ef98a3a69b274b98c520086ebbd47e6b45d93",
    )
}


def get_wine_data_from_url(wine_url, timeout=30):
    """Scrape one wine data sheet and return its attributes as a dict.

    Parameters
    ----------
    wine_url : str
        URL of the wine data sheet to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict
        Always holds 'name' and 'points' (text as shown on the page) and
        'ecologic' (bool). Each dt/dd pair of the info block is stored under
        the lower-cased dt text: 'tipo' and 'variedades' as lists of
        lower-cased strings, everything else as stripped text. Each
        `.properties .property` entry is stored as a list of lower-cased,
        comma-separated values. 'price' (float) is present only when the
        tasting block contains a price element.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    IndexError
        If the page has no name header, points element or info block.
    """

    response = requests.get(wine_url, cookies=cookies, timeout=timeout)
    # Fail with an explicit HTTP error instead of parsing an error page.
    response.raise_for_status()

    # NOTE(review): no parser is named, so bs4 picks the best one installed
    # and results may differ between environments. Pin the parser the data was
    # collected with once that is confirmed.
    bs = BeautifulSoup(response.text)
    wine_data = {}

    wine_data['name'] = bs.select('.wine__header__name')[0].text

    wine_data['points'] = bs.select('.js-points')[0].text

    # All wine info
    wine_info = bs.select('.wine__info')[0]

    # basic wine info: children alternate between <dt> (title) and <dd> (value)
    wine_info_title = None

    for el in wine_info:
        if el.name == 'dt':
            wine_info_title = el.text.lower()
            continue

        # Skip text nodes/other tags, and any <dd> seen before the first <dt>.
        if el.name != 'dd' or wine_info_title is None:
            continue

        if wine_info_title in ['tipo', 'variedades']:
            # Multivalued: one entry per non-empty child of the <dd>
            prop_val = []
            for prop_el in el:
                text = prop_el.text.strip()
                if text:
                    # Variety percentages (e.g. '80% xarel.lo') are kept
                    # as-is; they are handled when importing into the DB.
                    prop_val.append(text.replace('  ', ' ').lower())
            wine_data[wine_info_title] = prop_val
        else:
            wine_data[wine_info_title] = el.text.strip()

    wine_data['ecologic'] = bs.find(src='/images/ic-ecologico.png') is not None

    # --------------------------------

    wine_props = bs.select('.properties .property')

    for el in wine_props:
        prop_name = el.find(class_='tag').text.strip().lower()
        prop_value = el.find(class_='value').text.strip().lower().split(',')
        wine_data[prop_name] = [i.strip() for i in prop_value]

    # --------------------------------

    # Tasting data

    wine_tasting = bs.select('.wine__data.js-datosCata')

    for el in wine_tasting:
        price_el = el.find(class_='price')
        if price_el is not None:
            # Drop the euro sign and '.' thousands separators, then turn the
            # decimal comma into a point so float() can parse the value.
            wine_data['price'] = float(
                price_el.text.replace('€', '').replace('.', '').replace(',', '.'))

    return wine_data


In [8]:
# Try the scraper on the Martinet Bru example page: print the URL, then let
# the returned dict render as the cell output.
print(martinet_bru)
get_wine_data_from_url(martinet_bru)

https://guiapenin.wine/guide/wine/martinet_bru_2018_t/es


{'name': 'Martinet Bru 2018',
 'points': '92',
 'tipo': ['tinto'],
 'bodega': 'Mas Martinet Viticultors',
 'zona de producción': 'Priorat  D.O.  Ca.  / D.O.P.',
 'país': 'España',
 'fecha de cata': '01-07-2020',
 'ecologic': False,
 'estilo': ['silvestre', 'frutal'],
 'color': ['cereza', 'borde violáceo'],
 'aroma': ['fruta roja', 'floral', 'especiado'],
 'boca': ['sabroso', 'frutoso', 'buena acidez', 'largo', 'fácil de beber']}

Let's see how it works for a simple wine with few attributes:

In [4]:
# Same check on the example sheet that only carries the basic attributes.
print(simple_wine)
get_wine_data_from_url(simple_wine)

https://guiapenin.wine/guide/wine/care_finca_marimu_2019_t_ba/es


{'name': 'Care Finca Marimú 2019 Barrica',
 'points': '91',
 'tipo': ['tinto barrica'],
 'variedades': ['cariñena'],
 'bodega': 'Bodegas Care',
 'zona de producción': 'Cariñena  D.O.  / D.O.P.',
 'país': 'España',
 'ecologic': False,
 'color': ['cereza intenso'],
 'aroma': ['hierbas secas',
  'roble cremoso',
  'fruta negra',
  'especias dulces',
  'violetas'],
 'boca': ['fruta madura', 'especiado', 'sabroso', 'estructurado']}

A more complex wine, but without color, aroma and boca:

In [5]:
# Same check on the example sheet that has a style and a price but no
# color/aroma/boca properties.
print(rel_quality_wine)
get_wine_data_from_url(rel_quality_wine)

https://guiapenin.wine/guide/wine/poco_a_poco_envejecido_en_barrica_2019_t_c/es


{'name': 'Poco a Poco Envejecido en Barrica 2019 Crianza',
 'points': '89',
 'tipo': ['tinto crianza'],
 'variedades': ['tempranillo', 'syrah'],
 'bodega': 'Bodegas Gardel',
 'zona de producción': 'Castilla Vino de la Tierra / I.G.P.',
 'país': 'España',
 'ecologic': True,
 'estilo': ['agradable', 'tostado', 'sabroso', 'maduro'],
 'price': 8.0}

And finally, a wine with a large number of attributes, including whether the wine is ecologic as well as its price:

In [9]:
# Same check on the richest example sheet (variety percentages, tasting date,
# ecologic flag and price).
print(final_boss_wine)
get_wine_data_from_url(final_boss_wine)

https://guiapenin.wine/guide/wine/pares_balt_blanca_cusine_2013_be_gr_bn/es


{'name': 'Parés Baltà Blanca Cusiné 2013 Gran Reserva Brut Nature',
 'points': '95',
 'tipo': ['blanco espumoso brut nature gran reserva'],
 'variedades': ['80% xarel.lo', '10% chardonnay', '10% pinot noir'],
 'bodega': 'Parés Baltà',
 'zona de producción': 'Cava  D.O.  / D.O.P.',
 'país': 'España',
 'fecha de cata': '27-01-2020',
 'ecologic': True,
 'estilo': ['con personalidad'],
 'color': ['dorado brillante'],
 'aroma': ['lías finas',
  'hierbas de tocador',
  'con carácter',
  'fruta madura',
  'frutos secos',
  'notas de cereal'],
 'boca': ['potente',
  'sabroso',
  'buena acidez',
  'burbuja fina',
  'fino amargor'],
 'price': 18.0}

## Master Wine List API reader

Now that we know how to extract and structure the data from a given URL, we need a function to retrieve the URLs of all the wine data sheets:


In [12]:
def get_catalan_urls(pages=35, take=150, delay=5,
                     output_path='catalan_wine_list.json', timeout=30):
    """Download the wine search listing and save it to a JSON file.

    Issues `pages` requests to the guideSearch endpoint, collects the
    ``['wines']['resultados']`` list of each response, and writes the list of
    lists to `output_path`.

    Author's sizing note: the site shows about 260 pages of 15 wines each, so
    150 wines per request needs about 27 requests. The process also had to be
    repeated for Pla de Bages and Montsant.

    Parameters
    ----------
    pages : int, optional
        Number of requests to issue (default 35).
    take : int, optional
        Number of wines requested per call (default 150).
    delay : float, optional
        Seconds to sleep after each request, to keep the request rate low
        (default 5).
    output_path : str, optional
        Destination JSON file (default 'catalan_wine_list.json').
    timeout : float, optional
        Seconds to wait for each response before giving up (default 30).

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    """

    wine_list_api = "https://guiapenin.wine/guide/guideSearch"
    wine_list_params = "?pais=69&zona[]=2513-zona,2532-zona,2535-zona,2519-zona,2545-zona,2581-zona,2517-zona,2512-zona,2544-zona,2515-zona,2503-zona,2534-zona,2506-zona,2573-zona,2523-zona,2557-zona,2597-zona,2679-zona,2580-zona,2558-zona,2533-zona"
    wine_list = []

    for i in tqdm(range(pages)):
        # NOTE(review): `skip` advances by 1 per request while `take` is 150.
        # If the API treats `skip` as a record offset, not a page index,
        # consecutive responses overlap -- confirm against the API.
        response = requests.get(
            wine_list_api + wine_list_params + f"&&take={take}&skip={i}",
            cookies=cookies,
            timeout=timeout,
        )
        response.raise_for_status()
        wine_list.append(response.json()['wines']['resultados'])
        time.sleep(delay)  # Decreasing the request frequency to the website

    # Save the complete list to a json file; `with` guarantees the handle is
    # flushed and closed.
    with open(output_path, 'w') as fh:
        json.dump(wine_list, fh)

In [13]:
# Fetch the listing and write catalan_wine_list.json. Network-bound: the
# function sleeps between requests.
get_catalan_urls()

100%|██████████| 35/35 [07:21<00:00, 12.61s/it]


## Read Wine list and get info on all wines

In [21]:
# Load the saved listing (a list of per-request result lists); the context
# manager closes the file handle once the JSON has been parsed.
with codecs.open('catalan_wine_list.json', 'r', 'utf-8') as wine_list_file:
    wine_list = json.load(wine_list_file)

In [22]:
# Optional reset: uncomment the two lines below to empty wine_errors.log
# before a fresh run (opening a file in "w" mode truncates it).
# not_found_wines = open("wine_errors.log","w")
# not_found_wines.close()

In [23]:
WINE_INFO_DIRECTORY = 'wine_data_raw_2'

# Make sure the output directory exists; otherwise every write would fail and
# be reported as a scraping error.
os.makedirs(WINE_INFO_DIRECTORY, exist_ok=True)

# Flatten the per-request result lists into one list of wine slugs
# (the 'url' field is the slug used in the data-sheet URL below).
wine_url_list = [
    wine_item['url']
    for wine_list_list in wine_list
    for wine_item in wine_list_list
]

limit = 100000  # maximum number of download attempts in this run

# The log is opened in append mode so failures from earlier runs are kept;
# `with` closes it even if the run is interrupted.
with open("wine_errors.log", "a") as not_found_wines:
    for wine in wine_url_list:
        wine_filename = f'./{WINE_INFO_DIRECTORY}/{wine}.json'

        # Already scraped in a previous run: skip without using the budget.
        if os.path.exists(wine_filename):
            continue

        wine_url = f'https://guiapenin.wine/guide/wine/{wine}/es'

        print(wine)
        try:
            wine_info = get_wine_data_from_url(wine_url)
            # Serialize before opening the file, so a failure cannot leave an
            # empty file that later runs would skip as already done.
            serialized = json.dumps(wine_info, indent=4, ensure_ascii=False)
            with codecs.open(wine_filename, 'w', 'utf-8') as fh:
                fh.write(serialized)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still stops the
            # run, and report the reason: not every failure is a missing page.
            print(f"NOT FOUND: {wine} ({type(exc).__name__}: {exc})")
            not_found_wines.write(wine_url + "\n")

        time.sleep(1)  # keep the request rate low

        limit -= 1
        if limit == 0:
            break

torrent_negre_seleccio_privada_cabernet_2014_t_c
NOT FOUND: torrent_negre_seleccio_privada_cabernet_2014_t_c
mim_natura_pinot_noir_rosado_2019_re_r_br
NOT FOUND: mim_natura_pinot_noir_rosado_2019_re_r_br
mim_natura_blanc_de_noirs_2016_be_gr_bn
NOT FOUND: mim_natura_blanc_de_noirs_2016_be_gr_bn
clos_gelida_4_heretats_2017_be_gr_bn
NOT FOUND: clos_gelida_4_heretats_2017_be_gr_bn
clot_del_roure_2021_b
NOT FOUND: clot_del_roure_2021_b
gr_5_senders_2020_b
NOT FOUND: gr_5_senders_2020_b
clot_del_roure_xarello_brisat_2021_b
NOT FOUND: clot_del_roure_xarello_brisat_2021_b
pla_del_bosc_xarello_vermell_2021_b
NOT FOUND: pla_del_bosc_xarello_vermell_2021_b
gr_5_senders_2020_t
NOT FOUND: gr_5_senders_2020_t
torre_de_capmany_viejas_soleras_b__d_6
NOT FOUND: torre_de_capmany_viejas_soleras_b__d_6
pago_de_tharsys_2017_be_gr_bn
NOT FOUND: pago_de_tharsys_2017_be_gr_bn
rimarts_2017_be_gr_bn_1
NOT FOUND: rimarts_2017_be_gr_bn_1
porprat_2016_t
lolivera_reserva_superior_2018_be__bn
NOT FOUND: lolivera_res