In [96]:
import re

import pandas as pd
import numpy as np

#
import sys
sys.path.append("..")

# import data extraction support function
from src.support.data_extraction_support import extract_table_from_link, extract_productnames_links, extract_categorynames_links, extract_supermarkets

# import data transformation support functions
from src.support.data_transformation_support import extract_quantity_from_product_name, sanitize_filename

# 1. Introduction to this notebook

The purpose of this notebook is to explain the decision making in the different cleaning procedures applied to the data scraped during the extraction phase.

# 2. Data transformation

The scraping of historical data poses no major problem, as the only transformations to be made for the data to be ready for use are:
- The replacement of a comma by the correct floating point character.
- Transformation to datetime and float data types.
- Cleaning of characters incompatible with filepath writing

However, the available html fields for extraction do not explicitly provide other information about the products, such as:
- Quantity
- Unit of measure
- Volume/weight
- Brand
- Packaging
- Subcategory of product (Milk for babies, extra quality olive oil, protein milk, with or without lactose)

This information is extremely valuable for the analysis, especially the quantities, units of measure and volume/weight, as they can multiply differences in prices inadvertedly. As said, this data is not available through picky extraction of the relevant fields but from the product name itself. Thus, a processing of the extracted product name is necessary to obtain these valuable fields for the database.

An extraction of a base dataframe has been made to enable the exploration necessary for the cleaning process.

## 2.1 Extract quantity, unit of measure and volume/weight

The structures in product names accross brands, supermarkets and products is erratic. The approach to extract quantity, unit of measure and volume or weight is therefore through complex regular expression patterns.

In [135]:
names_list = list()
category_list = list()
supermarket_list = list()

supermarket_links = extract_supermarkets("https://super.facua.org/")

for supermarket_link in supermarket_links:

    category_links = extract_categorynames_links(supermarket_link)

    for category_link in category_links:

        product_names, product_links = extract_productnames_links(category_link)

        names_list.extend(product_names)
        supermarket_names = [product_link.split("/")[3].replace("-","_") for product_link in product_links]
        supermarket_list.extend(supermarket_names)
        category_names = [product_link.split("/")[4].replace("-","_") for product_link in product_links]
        category_list.extend(category_names)


In [137]:
products = pd.DataFrame(zip(names_list,category_list, supermarket_list), columns=["product_name","category","supermarket"])
products.head()

Unnamed: 0,product_name,category,supermarket
0,"Aceite De Girasol Refinado 0,2º Hacendado 1 L.",aceite_de_girasol,mercadona
1,"Aceite De Girasol Refinado 0,2º Hacendado 5 L.",aceite_de_girasol,mercadona
2,"Aceite De Oliva 0,4º Hacendado 1 L.",aceite_de_oliva,mercadona
3,Aceite De Oliva 1º Hacendado 1 L.,aceite_de_oliva,mercadona
4,Aceite De Oliva Intenso Hacendado 3 L.,aceite_de_oliva,mercadona


In [110]:
products["product_name2"] = products["product_name"].apply(lambda x: sanitize_filename(x))

### Exploration of different strings

In [130]:
# IF YOU WANT TO HAVE THIS OUTPUT DISPLAYED: UNCOMMENT THIS CELL

for name in products["product_name2"]:
    print(name)

Aceite De Girasol Refinado 0,2º Hacendado 1 L.
Aceite De Girasol Refinado 0,2º Hacendado 5 L.
Aceite De Oliva 0,4º Hacendado 1 L.
Aceite De Oliva 1º Hacendado 1 L.
Aceite De Oliva Intenso Hacendado 3 L.
Aceite De Oliva Suave Hacendado 3 L.
Aceite De Oliva Virgen Extra Hacendado 0.2 L.
Aceite De Oliva Virgen Extra Hacendado 1 L.
Aceite De Oliva Virgen Extra Hacendado 3 L.
Aceite De Oliva Virgen Extra Hacendado Gran Selección 0.75 L.
Aceite De Oliva Virgen Extra Picual Casa Juncal 0.5 L.
Aceite De Oliva Virgen Hacendado 1 L.
Aceite De Oliva Virgen Hacendado 3 L.
Leche +Proteínas Desnatada Hacendado 6 L.
Leche Desnatada Calcio Hacendado 6 L.
Leche Desnatada Hacendado 1 L.
Leche Desnatada Hacendado 1.5 L.
Leche Desnatada Hacendado 6 L.
Leche Desnatada Hacendado 9 L.
Leche Desnatada Sin Lactosa Hacendado 1 L.
Leche Desnatada Sin Lactosa Hacendado 6 L.
Leche Entera Calcio Hacendado 1 L.
Leche Entera Calcio Hacendado 6 L.
Leche Entera Fresca Hacendado 1 L.
Leche Entera Hacendado 1 L.
Leche En

### General exploration

In [None]:
cadena = """'6 l', '1 l', '1.5 l', '9 l', '1.2 l', '250 ml', '450 g', '210 g',
       '387 g', '370 g', '740 g', '400 ml', '500 ml', '800 g', '1200 g',
       '200 ml', '1,5 l', '9 x 1 l', '6 x 200 ml', '750 ml', '525 g',
       '10 x 7,5 g', '2 x 210 g', '2 x 160 g', '265 ml', '4 x 120 g',
       '6 x 100 g', '14 x 100 g', '270 ml', '2,2 l', '50 cl', '400 g',
       '6x200 ml', '3x210 g', '10x7,5 g', '6x188 ml', '3x200 ml',
       '6 x 1 l', '2.2 l', '6 x 1.5 l', '6 x 2.2 l', '3 x 200 ml',
       '6 x 188 ml', '600 g', '1,5 ml', '500 g', '188 ml', '20 cl.',
       '2 l', '6x1 l', '6x 1 l', '6 x 1l', '4 x 1.5 l', '6 x 500 ml',
       '1.5l', '1l', '6x 1l'"""

In [None]:
re.findall(r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|mililitros?))", cadena.lower())[0]

'1 l'

In [None]:
re.findall(r"\d\s?(\w{1,2})$", "6x 1 g")

['g']

In [None]:
re.findall(r"(?:\d\s?x\s?)?(\d?\.?\d+)\s?\w{1,2}?", cadena.replace(",","."))

['6',
 '1',
 '1.5',
 '9',
 '1.2',
 '250',
 '450',
 '210',
 '387',
 '370',
 '740',
 '400',
 '500',
 '800',
 '1200',
 '200',
 '1.5',
 '1',
 '200',
 '750',
 '525',
 '10',
 '7.5',
 '210',
 '160',
 '265',
 '120',
 '100',
 '14',
 '100',
 '270',
 '2.2',
 '50',
 '400',
 '200',
 '210',
 '10',
 '7.5',
 '188',
 '200',
 '1',
 '2.2',
 '1.5',
 '2.2',
 '200',
 '188',
 '600',
 '1.5',
 '500',
 '188',
 '200',
 '2',
 '1',
 '1',
 '1',
 '1.5',
 '500',
 '1.5',
 '1',
 '1']

In [None]:
re.findall(r"(?:\d\s?x\s?)?(\d?\.?\d+)\s?\w{1,2}?", cadena.replace(",","."))[-10:]

['20', '2', '1', '1', '1', '1.5', '500', '1.5', '1', '1']

In [None]:
re.findall(r"(\d+)\s?x", cadena)

['9',
 '6',
 '10',
 '2',
 '2',
 '4',
 '6',
 '14',
 '6',
 '3',
 '10',
 '6',
 '3',
 '6',
 '6',
 '6',
 '3',
 '6',
 '6',
 '6',
 '6',
 '4',
 '6',
 '6']

### EXTRACT liters and quantities - leche

In [111]:
names = products.loc[products["category"] == "leche","product_name"].str.lower().str.replace(" unidades de ", " x ").str.replace(" uds. x ", " x ").str.replace(" uds. ", " x ").str.replace(" briks de ", " x ")
names.head()

13    leche +proteínas desnatada hacendado 6 l.
14        leche desnatada calcio hacendado 6 l.
15               leche desnatada hacendado 1 l.
16             leche desnatada hacendado 1.5 l.
17               leche desnatada hacendado 6 l.
Name: product_name, dtype: object

In [112]:
names = names.str.extract(r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|g|gr|cl|g)|\d+\s?(?:uds\.?|botes|x)\s?\d+(?:[.,]\d+)?\s?(?:l|ml|g|gr|cl|g))")
names.iloc[:,0].unique()

array(['6 l', '1 l', '1.5 l', '9 l', '1.2 l', '250 ml', '450 g', '210 g',
       '387 g', '370 g', '740 g', '400 ml', '500 ml', '800 g', '1200 g',
       '200 ml', '1,5 l', '9 x 1 l', nan, '6 x 200 ml', '750 ml', '525 g',
       '10 x 7,5 g', '2 x 210 g', '2 x 160 g', '265 ml', '4 x 120 g',
       '6 x 100 g', '14 x 100 g', '270 ml', '2,2 l', '50 cl', '400 g',
       '6x200 ml', '3x210 g', '10x7,5 g', '6x188 ml', '3x200 ml',
       '6 x 1 l', '2.2 l', '6 x 1.5 l', '6 x 2.2 l', '3 x 200 ml',
       '6 x 188 ml', '600 g', '1,5 ml', '500 g', '188 ml', '200 cl',
       '2 l', '6x1 l', '6x 1 l', '6 x 1l', '4 x 1.5 l', '6 x 500 ml',
       '1.5l', '1l', '6x 1l'], dtype=object)

In [113]:
names.isna().sum() / names.shape[0]

0    0.003464
dtype: float64

Visual inspection of the not converted:

In [None]:
#######

Improvement

In [129]:
#####

### EXTRACT liters and quantities - Aceite de girasol

In [117]:
names = products.loc[products["category"] == "aceite_de_girasol","product_name"].str.lower()
names.head()

0     aceite de girasol refinado 0,2º hacendado 1 l.
1     aceite de girasol refinado 0,2º hacendado 5 l.
41            aceite de girasol capicua garrafa 5 l.
42         aceite de girasol carrefour classic' 1 l.
43          aceite de girasol carrefour garrafa 5 l.
Name: product_name, dtype: object

In [118]:
names = names.str.extract(r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|mililitros?))")
names.iloc[:,0].unique()

array(['1 l', '5 l', '3 l', '150 ml', '50 ml', '200 ml'], dtype=object)

In [119]:
names.isna().sum() / names.shape[0]

0    0.0
dtype: float64

### EXTRACT liters and quantities - Aceite de oliva

In [93]:
names = products.loc[products["category"] == "aceite_de_oliva","product_name"].str.lower()
names

2                     aceite de oliva 0,4º hacendado 1 l.
3                       aceite de oliva 1º hacendado 1 l.
4                  aceite de oliva intenso hacendado 3 l.
5                    aceite de oliva suave hacendado 3 l.
6           aceite de oliva virgen extra hacendado 0.2 l.
                              ...                        
1414    ybarra aceite de oliva virgen extra  botella d...
1415    ybarra aceite de oliva virgen extra botella 50...
1416    ybarra aceite de oliva virgen extra botella de...
1417    ybarra aceite de oliva virgen extra botella de...
1418      ybarra aceite de oliva virgen extra garrafa 5 l
Name: product_name, Length: 693, dtype: object

In [94]:
names = names.str.extract(r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|mililitros?))")
names.iloc[:,0].unique()

array(['1 l', '3 l', '0.2 l', '0.75 l', '0.5 l', '5 l', '750 ml',
       '250 ml', '500 ml', '10 ml', '200 ml', '2,5 l', nan, '300 ml',
       '400 ml', '150 ml', '20 ml', '2 l', '280 ml', '100 ml', '4 l',
       '1l'], dtype=object)

In [95]:
names.isna().sum() / names.shape[0]

0    0.132756
dtype: float64

### Final function

From the extracted quantity_magnitude_unit, divide each into its single field to be input as a separate column value.

In [None]:
import re
def extract_quantity_from_product_name(product_name, category_name):
    patterns = {
        "aceite-de-oliva" : r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|mililitros?))",
        "aceite-de-girasol": r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|mililitros?))",
        "leche" : r"(\d+(?:[.,]\d+)?\s?(?:l|litros?|ml|g|gr|cl|g)|\d+\s?(?:uds\.?|botes|x)\s?\d+(?:[.,]\d+)?\s?(?:l|ml|g|gr|cl|g))"
    }

    conversions_magnitude = {'g': 1, 'kg': 1000, 'mg': 0.001, 'l': 1, 'ml': 0.001, 'cl': 0.01}
    conversions_unit = {'g': 'g', 'kg': 'g', 'mg': 'g', 'l': 'l', 'ml': 'l', 'cl': 'l'}

    try:
        quantity_magnitude_unit = re.findall(patterns[category_name], product_name.lower())[0]
        quantity = re.findall(r"(\d+)\s?x", quantity_magnitude_unit)[0]
    except:
        quantity = np.nan

    try:
        units = re.findall(r"\d\s?(\w{1,2})$", quantity_magnitude_unit)[0]
    except:
        units = np.nan

    try:
        magnitude = re.findall(r"(?:\d\s?x\s?)?(\d?\.?\d+)\s?\w{1,2}?", quantity_magnitude_unit.replace(",","."))[0]
    except:
        magnitude = 1

    magnitude = float(magnitude) * conversions_magnitude.get(units, np.nan)
    units = conversions_unit.get(units, np.nan)

    return quantity, magnitude, units

### Extract brand names

In [143]:
for x in products[(products["category"]=="aceite_de_girasol") & (products["supermarket"]=="alcampo")]["product_name"]:
    print(x)

Campomar Nature Aceite De Girasol Ecológico Capomar 1 L.
Capicua Aceite De Girasol Alto Oleico Botella De 1 L.
Coosol Aceite De Girasol Botella De 1 L.
Coosol Premium Aceite De Girasol Botella De 1 L.
Girasol Cexasol 1 L.
Koipe Fritos Aceite De Girasol Especial Para Frituras Botella De 1 L.
Koipe Sol Aceite De Girasol En Spray, 150 Ml.
Koipesol Aceite De Girasol  Garrafa De 5 L.
Koipesol Aceite De Girasol Botella De 1 L.
La Española Soy Plus  Aceite De Girasol Y Soja Con Omega 3 Botella De 1 L.
La Masia Aceite De Girasol Botella 1 L.
La Masia Aceite De Girasol Garrafa De 5 L.
Ondosol Aceite De Girasol Botella 1 L.
Producto Alcampo Aceite De Girasol Botella De 1 L.
Producto Alcampo Aceite De Girasol Garrafa De 5 L
Producto Alcampo Aceite De Refinado De Girasol Alto Oleico Botella De 1 L.
Ucasol Aceite Refinado De Girasol  Garrafa De 5 L.



- aceite girasol
- aceite oliva: extra suave, suave, virgen o virgen extra
- leche: leche entera, leche semi (desnatada), leche desnatada, leche condensada | + sin lactosa
        leche de cabra. si no otras

In [None]:
- aceite girasol
- aceite oliva

First clean types:
- aceite girasol
- aceite oliva
- leche: "Sin lactosa"

Rules:
- If "hacendado" in name: "Hacendado".
- If alcampo:
    - aceite_girasol: whatever comes before aceite (later clean and unify brands by brand name keyword)
    - aceite_oliva: same
    - leche: whatever comes before leche

- IF carrefour:
    - aceite girasol: whatever comes after freir o ecologico, otherwise, girasol. And before space garrafa, botella, lata or digit
    - aceite oliva: whatever comes after extra or virgen, otherwise oliva. And before space garrafa, botella, lata or digit
    - leche: whatever comes after

Unify brand names:



In [274]:
brands = ["hacendado", "casa juncal", "carrefour", "campomar nature", "la masia", "ybarra", "carbonell", "koipe",
          "la espanola", "natursoy", "dcoop", "k arginano", "oro bailen", "capricho andaluz", "coosur", "de nuestra tierra",
          "oleum", "maestros de hojiblanca", "jaencoop", "guillen", "la laguna", "senorio de segura", "puleva",
          "asturiana", "kaiku", "alcampo", "pascual", "president", "santa teresa", "nivea", "flora", "mustela",
          "babaria", "babybio", "cantero de letur", "ultzama", "movit", "rianxeira", "el buen pastor", "eroski",
          "natura ecologica", "la colmenarena", "larsa", "ram", "vega de oro", "leyma natura", "priegola", "pravia",
          "llet nostra", "mantequilla bujia", "comansi", "montecelio", "caprea", "ecobasic", "artinata", "caprilait",
          "pasqualet", "fageda", "granja noe", "mimosa", "aiguafreda", "lacturale", "el castillo", "rio", "villacorona",
          "arla", "elosol", "diazol", "sveltesse", "ideal", "saha", "etnia", "leyenda", "bove", "valdezarza", "duc",
          "aires de jaen", "cambil", "olea espana", "cuatro esquinas", "quinta aldea", "arroyo de jaen", "mueloliva",
          "finca penamoucho", "coop solera", "beneo", "picualia", "pure bios", "les gallines", "dominus", "cortijo spiritu",
          "al-tabwa", "dos lunas", "la redonda", "quesos casario", "arcos", "aguilar de la frontera", "olivar de segura",
          "tierra de sabor", "coosol", "capicua", "fontasol", "ozolife", "abaco", "aromas del sur", "marques de grinon",
          "nunez de prado", "retama", "ondoliva", "verde segura", "suroliva", "saeta", "oro", "celta", "l.r.", "nestle",
          "president", "lauki", "montbelle", "oleoestepa", "aceites de ardales", "abril", "fuenroble", "olivar del sur",
          "olibeas", "oliva verde", "oleodiel", "oleaurum", "somontano", "oleo cazorla", "mar de olivos", "carbonel",
          "ucasol", "borges", "ondosol", "la masia", "cexasol", "granja noe", "lar", "letona", "lilibet", "lletera",
          "madriz", "unicla", "valles unidos", "auchan", "dia", "hipercor", "danone", "maeva", "santa teresa",
          "ecran sunnique", "nectar of bio", "denenes", "covap", "lanisol", "urzante", "olilan", "palacio de los olivos",
          "nekeas", "carapelli", "hojiblanca", "cazorliva", "arrolan", "saqura", "mil olivas", "don arroniz", "elizondo",
          "beyena", "bomilk", "euskal herria", "bizkaia esnea", "gaza", "el corte ingles", "agus", "alhema de queiles",
          "aljibes", "almaoliva", "amarga y pica", "arboleda", "casas de hualdo", "castillo de canena", "changlot real",
          "conde de benalua", "ester sole", "ferrarini", "flor de arana", "germanor", "go vegg", "hacienda el palo",
          "iznaoliva", "jacoliva", "k arguinano", "l'estornell", "la almazara de canjayar", "la boella", "la organic cuisine",
          "merula", "miro", "molino de olivas de bolea", "pago baldios san carlos", "parqueoliva", "reales almazaras de alcaniz",
          "romanico", "santiveri", "tresces", "unio", "valroble", "venta del baron", "altamira", "ato", "clesa", "ecomil",
          "feiraco", "la yerbera", "el lagar del soto", "el molino d gines", "fruto del sur", "giralda", "karlos arguinano",
          "monegros", "oleocazorla", "laban", "santa gadea", "k. arguinano", "lactebal"]



brands_lower = [brand.lower() for brand in brands]
brands_normalized = [unidecode(brand).lower() for brand in brands]

In [264]:
def extract_brand(product_name):
    for brand in brands_lower:
        if brand in product_name:
            return brand
    else:
        return np.nan
    



In [265]:
products["brands"] = products["product_name"].apply(lambda product_name: extract_brand(unidecode(product_name.lower())))

In [268]:
product_names_filtered = products[(products["brands"].isna())]

print(f"There are {product_names_filtered['brands'].isna().sum()} products without brand\n\n")

for product_name in product_names_filtered["product_name"]:
    print(product_name)

There are 38 products without brand


Aceite De Orujo De Oliva 1 L.
Anchoa Con Aceite De Oliva Virgen Extra Lorea Gourmet 125 G.
Anchoas En Aceite De Oliva Consorcio 50 G.
Anchoas En Aceite De Oliva Consorcio 53 G.
Anchoas En Aceite De Oliva Consorcio 78 G.
Anchoas En Aceite De Oliva Consorcio 87 G.
Anchoas En Aceite De Oliva Virgen Extra Ecológico Consorcio 38 G.
Boquerones En Vinagre Con Aceite De Oliva Virgen Extra Lorea 55 G.
Caballa De Andalucía En Aceite De Oliva Tejero 60 G.
Filete De Caballa En Aceite De Oliva Isabel 85 G.
Jabón De Manos En Pastilla De Aceite De Oliva 125 G.
Mojicones Aceite De Oliva Luna 600 G.
Piadina Sfogliatissima Con Aceite De Oliva Virgen Extra Loriana 350 G
Pimientos Asados Con Aceite De Oliva Ibsa Sin Gluten 295 G.
Leche Condensada De Coco Ecológica Naturgreen 210 G.
Leche De Continuación Desde Los 6 Meses Protech 2 Optimum Blemil 800 G.
Leche De Crecimiento Hipp Combiotik 3 800 G.
Leche De Inicio Blemil Plus 1 Forte 1200 G.
Leche Desnatada Brik 1 L.
Le

### Extract subcategories


- aceite girasol
- aceite oliva: extra suave, suave, virgen o virgen extra
- leche: leche entera, leche semi (desnatada), leche desnatada, leche condensada | + sin lactosa
        leche de cabra. si no otras

In [367]:
def extract_distinction_eco(product_name, category):
    distinction = np.nan

    if category == "leche":
        if "semidesnatada" in product_name:
            distinction = "semidesnatada"
        elif "desnatada" in product_name:
            distinction = "desnatada"
        elif "entera" in product_name:
            distinction = "entera"

        if not pd.isna(distinction) and "lactosa" in product_name:
            distinction += " sin lactosa"
        if not pd.isna(distinction) and "calcio" in product_name:
            distinction += " calcio"       
        if not pd.isna(distinction) and "calcio" in product_name:
            distinction += " proteinas"

    if " eco " in product_name or "ecologic" in product_name:
        eco = 1
    else:
        eco = 0

    return distinction, eco

def extract_subcategory(product_name, category, distinction):
    if category == "aceite_de_girasol":
        if "freir" in product_name:
            subcategory = "freir"
        else:
            subcategory = "normal"


    elif category == "aceite_de_oliva" and "en aceite" not in product_name and "con aceite" not in product_name:
        if "virgen extra" in product_name:
            subcategory = "virgen extra"
        elif "virgen"  in product_name:
            subcategory = "virgen"
        elif "intenso"  in product_name:
            subcategory = "intenso"
        else:
            subcategory = "suave"

    elif category == "leche": # leche
        if "cabra" in product_name:
            subcategory = "leche cabra"
        elif "vaca" in product_name:
            subcategory = "leche vaca"
        elif "condensada" in product_name:
            subcategory = "leche condensada"
        elif "leche" in product_name:
            subcategory = "leche vaca"
        else: 
            subcategory = np.nan

    else:
        subcategory = np.nan

    return subcategory

In [368]:
def apply_subcategory_distinction(product_name, category):
    distinction, eco = extract_distinction_eco(product_name, category)
    subcategory = extract_subcategory(product_name, category, distinction)

    return subcategory, distinction, eco

In [371]:
products[["subcategory","distinction","eco"]] = products[["product_name", "category"]].apply(
    lambda row: apply_subcategory_distinction(unidecode(row["product_name"].lower()), row["category"]),
    axis=1,
    result_type="expand"  # Ensures that the tuple output is expanded across columns
)


In [372]:
products[products["subcategory"].isna()]

Unnamed: 0,product_name,category,supermarket,brands,subcategory,distinction,eco
156,Anchoa Con Aceite De Oliva Virgen Extra Lorea ...,aceite_de_oliva,carrefour,,,,0.0
157,Anchoas En Aceite De Oliva Carrefour 55 G.,aceite_de_oliva,carrefour,carrefour,,,0.0
158,Anchoas En Aceite De Oliva Consorcio 50 G.,aceite_de_oliva,carrefour,,,,0.0
159,Anchoas En Aceite De Oliva Consorcio 53 G.,aceite_de_oliva,carrefour,,,,0.0
160,Anchoas En Aceite De Oliva Consorcio 78 G.,aceite_de_oliva,carrefour,,,,0.0
...,...,...,...,...,...,...,...
1385,Oro Virgen Aceite De Oliva Virgen Extra Botell...,aceite_de_oliva,alcampo,oro,,,0.0
1386,Oro Virgen Aceite De Oliva Virgen Extra Botell...,aceite_de_oliva,alcampo,oro,,,0.0
1582,Puleva Bebida Láctea Con Extractos Vegetales Y...,leche,alcampo,puleva,,,0.0
1609,"Puleva Omega 3 Preparado Lacteo Desnatado, Si...",leche,alcampo,puleva,,,0.0


In [375]:
products[550:600]

Unnamed: 0,product_name,category,supermarket,brands,subcategory,distinction,eco
550,"Leche Semidesnatada Asturiana, Brik 50 Cl",leche,eroski,asturiana,leche vaca,semidesnatada,0.0
551,"Leche Semidesnatada Beyena, Brik 1 Litro",leche,eroski,beyena,leche vaca,semidesnatada,0.0
552,"Leche Semidesnatada Bomilk, Brik 1 Litro",leche,eroski,bomilk,leche vaca,semidesnatada,0.0
553,"Leche Semidesnatada Calcio Asturiana, Brik 1 L...",leche,eroski,asturiana,leche vaca,semidesnatada,0.0
554,"Leche Semidesnatada Calcio Eroski, Brik 1 Litro",leche,eroski,eroski,leche vaca,semidesnatada,0.0
555,"Leche Semidesnatada Calcio Kaiku, Brik 1 Litro",leche,eroski,kaiku,leche vaca,semidesnatada,0.0
556,"Leche Semidesnatada Calcio Pascual, Brik 1 Litro",leche,eroski,pascual,leche vaca,semidesnatada,0.0
557,"Leche Semidesnatada Calcio Puleva, Brik 1 Litro",leche,eroski,puleva,leche vaca,semidesnatada,0.0
558,"Leche Semidesnatada Celta, Brik 1,5 Litros",leche,eroski,celta,leche vaca,semidesnatada,0.0
559,"Leche Semidesnatada De Cabra Puleva, Brik 1 Litro",leche,eroski,puleva,leche cabra,semidesnatada,0.0


In [None]:
product_names_filtered = products[(products["subcategory"].isna())]

print(f"There are {product_names_filtered['brands'].isna().sum()} products without brand\n\n")

for product_name in product_names_filtered["product_name"]:
    print(product_name)