In [76]:
import requests
import pandas as pd
import json
import time
import re

from bs4 import BeautifulSoup

def scraper_web(filename = ""):
    '''
    Load the previously scraped product URLs from a JSON file.

    Parameters:
        filename: path of the JSON document to read.

    Returns:
        The decoded JSON content, or an empty dict when the file cannot be
        opened or does not contain valid JSON (the URL scraper that should
        rebuild it is not wired in yet, see the TODO below).
    '''
    # Bound up front so the function always has something to return; it used
    # to fail with UnboundLocalError whenever the file could not be loaded.
    ulabox = {}
    try:
        # JSON is UTF-8 by specification; reading it explicitly keeps accented
        # category names independent of the platform's default encoding.
        with open(filename, 'r', encoding='utf-8') as f:
            ulabox = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or bytes
        # that are not UTF-8 (JSONDecodeError and UnicodeDecodeError).
        print("The document '" + filename + "' is not available.\n")
        print("The scraper will retrieve all the products url.\n")
        # TODO: call the URL scraper here and use the dict it returns.
    else:
        print("The document '" + filename + "' is available.\n")
        print("Scraped url products loaded succesfuly.\n")

    return ulabox

def scraper_get_all_products(dictionary_url):
    '''
    Scrape every product page listed in the URL dictionary and write the
    collected information to 'dades_productes_ulabox_test.csv'.

    Parameters:
        dictionary_url: nested mapping category -> subcategory -> iterable of
            product paths; each path is appended to the module-level baseUrl.

    Returns:
        None. The result is the CSV file written to the working directory.

    Note: only the first category is processed (see the break below).
    '''
    # Column order of the CSV. The names must match the keys of the product
    # dicts exactly; 'Valor Energètic KC' used to be spelled without the
    # accent here, which produced one empty and one extra column.
    columns = ['Id', 'Categoria', 'Subcategoria', 'Enllaç',
               'Nom Producte', 'Preu', 'PreuBase', 'Ingredients',
               'Valor Energètic Kj', 'Valor Energètic KC', 'Grases', 'Hidrats',
               'Sucre', 'Proteines', 'Sal', 'Fabricant']

    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    rows = []
    product_id = 0
    for category in dictionary_url:
        for subcategory in dictionary_url[category]:
            for productsUrl in dictionary_url[category][subcategory]:

                # Full URL of the product page
                product_link = baseUrl + productsUrl
                print("Enllaç complet al producte: ", product_link)

                # Download the page and extract the product information
                soup = scraper_get_soup(product_link)
                rows.append(scraper_get_products(soup, product_id, category, subcategory))
                print("Added product\n\n")

                # Next id, and a pause between two consecutive requests
                product_id += 1
                time.sleep(1)
        print("Finished! \n*Doing just one category")
        # Deliberately stop after the first category (test run).
        break

    # Saving the results of all products
    df_products = pd.DataFrame(rows, columns=columns)
    df_products.to_csv('dades_productes_ulabox_test.csv', index = False , encoding='utf-8-sig')

    return

def scraper_get_soup(url, timeout=30):
    '''
    Download a page and return it parsed with BeautifulSoup.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block the whole
            scraping run indefinitely.

    Returns:
        A BeautifulSoup object built from the response body with the
        'html.parser' parser.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    '''
    # Browser-like headers sent with every request
    UserAgent =  ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})
    web = requests.get(url, headers = UserAgent, timeout = timeout)
    soup = BeautifulSoup(web.content, 'html.parser')

    return soup
    
def get_new_product(id, category, subcategory):
    '''
    Build a blank product record.

    The identifying fields are filled from the arguments; every field that is
    scraped later starts with the "Not Available" placeholder.
    '''
    scraped_fields = (
        'Enllaç',
        'Nom Producte',
        'Preu',
        'PreuBase',
        'Ingredients',
        'Valor Energètic Kj',
        'Valor Energètic KC',
        'Grases',
        'Hidrats',
        'Sucre',
        'Proteines',
        'Sal',
        'Fabricant',
    )

    record = {'Id': id, 'Categoria': category, 'Subcategoria': subcategory}
    for field in scraped_fields:
        record[field] = "Not Available"
    return record

def scraper_get_product_title(soup, product):
    '''
    Store the text of the page's first <h1> as the product name.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict; 'Nom Producte' is left untouched when the page
        has no <h1>.
    '''
    try:
        product['Nom Producte'] = soup.find("h1").get_text()
    except AttributeError:
        # find() returned None: no <h1> on the page. Only this case is
        # handled so that Ctrl-C and real bugs are no longer swallowed.
        print("Product name not found\n")

    return product
    
def scraper_get_product_link(soup, product):
    '''
    Store the page's canonical URL (<link rel="canonical" href=...>) as the
    product link.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict; 'Enllaç' is left untouched when the page has
        no canonical link.
    '''
    try:
        link_soup = soup.find('link', {'href': True, 'rel': 'canonical'})
        product['Enllaç'] = link_soup['href']
    except (TypeError, KeyError):
        # TypeError: find() returned None. KeyError: tag without href.
        # Anything else (Ctrl-C, real bugs) is allowed to propagate.
        print("Product link not found\n")

    return product

def scraper_get_product_price(soup, product):
    '''
    Store the content of <meta itemprop="price"> as the product price.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict; 'Preu' is left untouched when the page has no
        price meta tag.
    '''
    try:
        price_soup = soup.find("meta", { 'itemprop' : 'price'})
        product['Preu'] = price_soup['content']
    except (TypeError, KeyError):
        # TypeError: find() returned None. KeyError: tag without content.
        # Anything else (Ctrl-C, real bugs) is allowed to propagate.
        print("Product price not found\n")

    return product

def scraper_get_product_base_price(soup, product):
    '''
    Store the base price text (quantity and price per unit) of the product.

    The text is taken from every <p> whose second CSS class is listed in
    classFilter; the matching texts are joined with a single space.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict; 'PreuBase' is left untouched when no paragraph
        matches.
    '''
    # NOTE(review): "jss470" looks like a generated class name; confirm it is
    # stable across pages and site deployments.
    classFilter = ("jss470",)

    fragments = []
    for paragraph in soup.find_all("p"):
        # Paragraphs with no class, or a single class, are skipped. Indexing
        # them directly used to raise and abort the search on the first such
        # paragraph, hiding base prices that appeared later in the page.
        classes = paragraph.get("class") or []
        if len(classes) > 1 and classes[1] in classFilter:
            fragments.append(paragraph.text)

    if fragments:
        product['PreuBase'] = " ".join(fragments)
    else:
        # Keep the existing placeholder instead of storing an empty string.
        print("Product base price not found\n")

    return product

def scraper_get_product_nutritional_table(soup, product):
    '''
    Fill the nutritional fields of the product from the page's table.

    The elements that follow the <h6> titles "Nutrients" and "Measures" are
    read in document order; the first number found in each one is assigned,
    by position, to the fields listed in nutrient_fields.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict. Fields for which no value was found keep their
        previous content.

    NOTE(review): only English section titles are matched; the original notes
    wondered about Catalan/Spanish layouts, confirm whether they exist.
    NOTE(review): values are mapped purely by position; confirm that the row
    order of the page matches nutrient_fields (an extra row such as
    saturated fat would shift every following field).
    '''
    table_sections = ("Nutrients", "Measures")
    nutrient_fields = (
        'Valor Energètic Kj',
        'Valor Energètic KC',
        'Grases',
        'Hidrats',
        'Sucre',
        'Proteines',
        'Sal',
    )
    # First number in the text, keeping a decimal part written with '.' or
    # ',': 'Energetic valueAprox.3700 KJ' -> '3700', 'Salt 0,5 g' -> '0,5'.
    number_pattern = r'\d+(?:[.,]\d+)?'

    values = []
    for heading in soup.find_all("h6"):
        if heading.text not in table_sections:
            continue
        for sibling in heading.find_next_siblings():
            found = re.search(number_pattern, sibling.text)
            if found is None:
                # A row without a number would shift the positional mapping,
                # so the table is treated as unreadable and nothing is stored.
                print("Product nutritional table not found\n")
                return product
            values.append(found.group())

    # zip() stops at the shorter sequence: extra values are ignored and
    # missing ones leave the remaining fields untouched.
    product.update(zip(nutrient_fields, values))
    if len(values) < len(nutrient_fields):
        print("Product nutritional table not found\n")

    return product

def scraper_get_product_factory(soup, product):
    '''
    Store the manufacturer found in the "Additional information" section.

    The section body is the first <div> after the <h6> title; the
    manufacturer is the text between the first and the second ": " of that
    body. When several sections match, the last usable one wins.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict; 'Fabricant' is left untouched when no
        manufacturer is found.
    '''
    factory = None
    for titol in soup.find_all("h6"):
        if titol.text != "Additional information":
            continue
        section = titol.find_next('div')
        if section is None:
            continue
        parts = section.text.split(": ")
        # A section without ": " carries no manufacturer; skip it instead of
        # discarding a value found in an earlier section.
        if len(parts) > 1:
            factory = parts[1]

    if factory is None:
        print("Product factory name not found\n")
    else:
        product['Fabricant'] = factory

    return product

def scraper_get_products_ingredients(soup, product):
    '''
    Store the text of the "Ingredients" section of the product page.

    The section body is the first <div> after the <h6> title. When several
    sections match, the last usable one wins.

    Parameters:
        soup: parsed product page.
        product: product dict, updated in place.

    Returns:
        The same product dict; 'Ingredients' is left untouched when the
        section is not present.
    '''
    ingredients = None
    for titol in soup.find_all("h6"):
        if titol.text != "Ingredients":
            continue
        section = titol.find_next('div')
        # A title with no following <div> has no body to read.
        if section is not None:
            ingredients = section.text

    if ingredients is None:
        print("Product ingredients not found\n")
    else:
        product['Ingredients'] = ingredients

    return product

def scraper_get_products(soup, id, category, subcategory):
    '''
    Build the complete record of one product from its parsed page.

    A blank record is created first and then handed to each field scraper in
    turn; the finished record is printed and returned.
    '''
    record = get_new_product(id, category, subcategory)

    # Field scrapers, applied in this order; each one receives the page and
    # the record and returns the record.
    field_scrapers = (
        scraper_get_product_title,
        scraper_get_product_link,
        scraper_get_product_price,
        scraper_get_product_base_price,
        scraper_get_product_nutritional_table,
        scraper_get_product_factory,
        scraper_get_products_ingredients,
    )
    for scrape_field in field_scrapers:
        record = scrape_field(soup, record)

    print(record)

    return record
    
    
# Site root; scraper_get_all_products prefixes every product path with it.
baseUrl = "https://www.ulabox.com"

# Load the stored product URLs, then scrape the products they point to.
dictionary_url = scraper_web("productsUrls.json")
scraper_get_all_products(dictionary_url)

The document 'productsUrls.json' is available.

Scraped url products loaded succesfuly.

Enllaç complet al producte:  https://www.ulabox.com/ca/producte/aceite-de-oliva-suave-borges-2l/28350?ula_src=front_category_show&ula_mdm=product_list
Category:  Olis i Condiments
Subcategory:  Oli dÂ´Oliva
{'Id': 0, 'Categoria': 'Olis i Condiments', 'Subcategoria': 'Oli dÂ´Oliva', 'Enllaç': 'https://www.ulabox.com/en/product/aceite-de-oliva-suave-borges-2l/28350', 'Nom Producte': 'Soft Oil Borges 2L', 'Preu': '11.99', 'PreuBase': '2 l 6,00\xa0€ / l.', 'Ingredients': "Oli d'oliva refinat i oli d'oliva verge extra.", 'Valor Energètic Kj': '3700', 'Valor Energètic KC': '900', 'Grases': '100', 'Hidrats': '16', 'Sucre': '0', 'Proteines': '0', 'Sal': '0', 'Fabricant': 'Borges Branded Foods S.L.U.'}
Added product


Enllaç complet al producte:  https://www.ulabox.com/ca/producte/aceite-virgen-extra-priordei-500ml/131244?ula_src=front_category_show&ula_mdm=product_list
Category:  Olis i Condiments
Subcateg

KeyboardInterrupt: 