In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from tqdm import tqdm
import csv
import re

In [2]:
# Base URL for the Alnatura product search API. The literal text 'categoryid' is a
# placeholder replaced per category; the page number is appended to the end.
url = 'https://www.alnatura.de/api/sitecore/products/SearchProducts?query=&querydatasource={3DB15B16-A7BB-4613-82E9-C68A201C00C8}&category=categoryid&page='

# List to store all product links
all_links = []

# List of category IDs to scrape products from
categoryids = ['2001000000', '2002000000', '2004000000', '2005000000', '2006000000', '2007000000', '2008000000', '2009000000', '2011000000', '2012000000']

# Upper bound on the number of result pages requested per category
MAX_PAGES = 100

# Seconds to wait for the API before a request is abandoned (avoids hanging forever)
REQUEST_TIMEOUT = 30

# A single session lets all requests reuse the same HTTP connection
with requests.Session() as session:
    # Loop through each category ID
    for categoryid in tqdm(categoryids):
        # Loop through pages 1..MAX_PAGES (inclusive) of the category
        for p in range(1, MAX_PAGES + 1):
            # Request the current category ID and page number from the API
            resp = session.get(url.replace('categoryid', categoryid) + str(p), timeout=REQUEST_TIMEOUT)

            # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
            resp.raise_for_status()

            # Parse the JSON response
            data = resp.json()

            # Extract product links from the response
            links = ['https://www.alnatura.de' + a['TargetUrl'] for a in data['Payload']['Products']]

            # An empty page means the category is exhausted
            if not links:
                break

            # Add the links to the all_links list
            all_links.extend(links)


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [02:17<00:00, 13.80s/it]


In [3]:
# Exception types that mean "the expected element is not on this page":
#   AttributeError - soup.find(...) returned None and an attribute was read from it
#   TypeError      - None was subscripted (e.g. no <img> inside the image container)
#   KeyError       - a tag does not carry the requested HTML attribute
#   IndexError     - the URL or a result list has fewer parts than expected
# Anything else (typos, bugs) is deliberately NOT swallowed.
MISSING_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError)

# Seconds to wait for a product page before giving up on it
REQUEST_TIMEOUT = 30

# Information entries left out of 'Artikelinformationen'
# (presumably because id / price / unit-Price capture them separately - TODO confirm)
SKIPPED_INFO_KEYS = ['GTIN', 'Preis UVP', 'Grundpreis']


def _squash(text):
    """Collapse every run of whitespace in `text` into a single space."""
    return re.sub(r'[\s]+', ' ', text)


def _extract(attrs, key, getter):
    """Store `getter()` under `attrs[key]`; leave the key untouched if the element is missing."""
    try:
        attrs[key] = getter()
    except MISSING_FIELD_ERRORS:
        pass


def _nutriments(soup):
    """Pair nutrition labels with their values as 'label: value' joined by '- '."""
    n_label = [li.text for li in soup.find(class_='nutrition-group__labels').find_all('li')]
    n_value = [_squash(li.text.strip()) for li in soup.find(class_='nutrition-group__list').find_all('li')]
    return "- ".join([f"{k}: {v}" for k, v in zip(n_label, n_value)])


def _description(soup):
    """Join all long-description sections, dropping each section's <h2> heading text."""
    product_descriptions = []
    for section in soup.find_all(class_='product__description--long'):
        h2 = section.find('h2')
        if h2:
            cleaned_text = section.text.replace(h2.text.strip(), '').strip()
        else:
            cleaned_text = section.text.strip()
        product_descriptions.append(cleaned_text)
    return ' '.join(_squash(desc) for desc in product_descriptions)


def _article_info(soup):
    """Format the <dt>/<dd> information list as 'key: value' entries separated by '- '."""
    box = soup.find(class_='product__information-items')
    keys = [dt.text for dt in box.find_all('dt')]
    values = [_squash(dd.text.strip()) for dd in box.find_all('dd')]
    info = ""
    for k, v in zip(keys, values):
        if k not in SKIPPED_INFO_KEYS:
            # Same accumulation as before: add the entry with separators on both
            # sides, then trim newlines and the leading/trailing '-' / ' ' characters.
            info += "- " + k + ": " + v + "- "
            info = info.replace('\n', '').rstrip('- ').lstrip('- ')
    return info


def parse_product(link, html):
    """Build the attribute dict for one product.

    Parameters:
        link: URL of the product page; category and sub-category are taken
              from its 7th and 8th '/'-separated parts.
        html: HTML text of the product page.

    Returns:
        dict of scraped attributes. A field whose element is not found on the
        page is simply absent from the dict. 'Artikelinformationen' and
        'labels' are always present (possibly empty) and 'link' is always set.
    """
    # An explicit parser keeps results independent of which optional parsers are installed
    soup = BeautifulSoup(html, 'html.parser')

    # Inserted first so it keeps its position as the first key of the record
    attrs = {'Artikelinformationen': ""}

    # Product ID (GTIN), name and brand
    _extract(attrs, 'id', lambda: soup.find("dd", class_="product__gtin").text)
    _extract(attrs, 'name', lambda: soup.find('h1').text.strip())
    _extract(attrs, 'brand', lambda: soup.find("div", class_="product-stage__product-brand").text.strip())

    # Ingredients
    _extract(attrs, 'ingredients', lambda: _squash(soup.find(class_='product__ingredients').text.strip()))

    # Category and sub-category come from the URL path
    url_parts = link.split('/')
    _extract(attrs, 'category', lambda: url_parts[6].replace('-', ' ').title())
    _extract(attrs, 'sub-category', lambda: url_parts[7].replace('-', ' ').title())

    # Allergens, without the 'Allergene' heading text
    _extract(attrs, 'allergens', lambda: _squash(soup.find(class_='product__allergens').text.replace('Allergene', '').strip()))

    # Labels are not scraped; the key is kept as an empty placeholder
    attrs['labels'] = ""

    # Nutrition table, image link and long description
    _extract(attrs, 'nutriments', lambda: _nutriments(soup))
    _extract(attrs, 'image-Link', lambda: soup.find(class_='product-stage__image').find('img')['src'])
    _extract(attrs, 'description', lambda: _description(soup))

    # The first 'product__usage' entry is stored as price, the second as unit price
    usage = [dd.text for dd in soup.find_all("dd", class_="product__usage")]
    _extract(attrs, 'price', lambda: usage[0])
    _extract(attrs, 'unit-Price', lambda: usage[1])

    # Key benefits list
    _extract(attrs, 'Properties', lambda: "- ".join([_squash(li.text.strip()) for li in soup.find(class_='product__key-benefits').find_all('li')]))

    # Additional product information (stays "" when the list is missing)
    _extract(attrs, 'Artikelinformationen', lambda: _article_info(soup))

    # NOTE: a previous, disabled experiment fetched review counts with Selenium by
    # waiting for the element with class 'bv_numReviews_text'; ratings are not
    # part of this scrape.

    # Store the product link
    attrs['link'] = link
    return attrs


# List to store all product attributes
all_attrs = []

# Links whose page could not be downloaded (kept so they can be retried later)
failed_links = []

# A single session lets all requests reuse the same HTTP connection
with requests.Session() as session:
    for l in tqdm(all_links):
        try:
            resp = session.get(l, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole (long) scrape
            print(f"Skipping {l}: {exc}")
            failed_links.append(l)
            continue

        all_attrs.append(parse_product(l, resp.text))

# Show a small sample instead of dumping every record into the notebook
all_attrs[:3]

  0%|          | 0/1255 [00:00<?, ?it/s]

100%|██████████| 1255/1255 [29:43<00:00,  1.42s/it]


[{'Artikelinformationen': 'Name: Hirse-Getreidebrei- Inhalt: 250 g- Aufbewahrung: Bitte trocken lagern und vor Wärme schützen.- Zertifizierung: DE-ÖKO-001',
  'id': '4104420253551',
  'name': 'Hirse-Getreidebrei',
  'brand': 'Alnatura',
  'ingredients': 'Hirsevollkornmehl*, Thiamin (Vitamin B1) *aus biologischer Landwirtschaft',
  'category': 'Baby Kind',
  'sub-category': 'Baby Kindernahrung',
  'allergens': 'Kann Spuren von Senf enthalten.',
  'labels': '',
  'nutriments': 'kcal: 392 kcal- kJ: 1660 kJ- Fett: 4 g- Davon gesättigte Fettsäuren: 0,60 g- Kohlenhydrate: 75,80 g- Davon Zucker: 0,40 g- Ballaststoffe: 3,50 g- Eiweiß: 11,50 g- Salz: 0,02 g- Vitamin B1: 1,20 mg- Natrium: 0,01 g',
  'image-Link': 'https://mediaservice.alnatura.de/image/501324310981/image_to0fsif2t53up8sfp59o8hb90h/-FJPG-S300x300',
  'description': 'Ein Bio-Vollkornbrei für die Säuglingsernährung ab dem 5. Lebensmonat. Der Alnatura Hirse-Getreidebrei ist gluten- sowie milchfrei und wird ohne Zuckerzusatz hergeste

In [4]:
################################# MOVE "Artikelinformationen" (ITEM INFO) INTO THE "description" KEY ############################
for item in all_attrs:
    # Records that no longer carry the key (e.g. on a re-run of this cell) are left alone
    if 'Artikelinformationen' not in item:
        continue

    # Remove the key and keep its text
    info = item.pop('Artikelinformationen')

    # Join only the non-empty pieces: the info text is initialised to "" for every
    # product and the description may be missing, so an unconditional ' - ' would
    # leave a dangling separator at the start or end of the description.
    pieces = [item.get('description', ''), info]
    item['description'] = ' - '.join(piece for piece in pieces if piece)

# Show a small sample instead of dumping every record into the notebook
all_attrs[:3]

[{'id': '4104420253551',
  'name': 'Hirse-Getreidebrei',
  'brand': 'Alnatura',
  'ingredients': 'Hirsevollkornmehl*, Thiamin (Vitamin B1) *aus biologischer Landwirtschaft',
  'category': 'Baby Kind',
  'sub-category': 'Baby Kindernahrung',
  'allergens': 'Kann Spuren von Senf enthalten.',
  'labels': '',
  'nutriments': 'kcal: 392 kcal- kJ: 1660 kJ- Fett: 4 g- Davon gesättigte Fettsäuren: 0,60 g- Kohlenhydrate: 75,80 g- Davon Zucker: 0,40 g- Ballaststoffe: 3,50 g- Eiweiß: 11,50 g- Salz: 0,02 g- Vitamin B1: 1,20 mg- Natrium: 0,01 g',
  'image-Link': 'https://mediaservice.alnatura.de/image/501324310981/image_to0fsif2t53up8sfp59o8hb90h/-FJPG-S300x300',
  'description': 'Ein Bio-Vollkornbrei für die Säuglingsernährung ab dem 5. Lebensmonat. Der Alnatura Hirse-Getreidebrei ist gluten- sowie milchfrei und wird ohne Zuckerzusatz hergestellt (Zutaten enthalten von Natur aus Zucker). Er kann vielseitig verwendet werden: als Halbmilchbrei, mit Säuglingsmilchnahrung oder ab dem 7. Monat als Getr

In [5]:
################################# MOVE "Properties" INTO THE "description" KEY ############################
for item in all_attrs:
    # Products without a key-benefits list never received a 'Properties' key
    if 'Properties' not in item:
        continue

    # Remove the key and keep its text
    properties = item.pop('Properties')

    # Join only the non-empty pieces so that a missing description or an empty
    # properties string does not leave a dangling ' - ' separator behind.
    pieces = [item.get('description', ''), properties]
    item['description'] = ' - '.join(piece for piece in pieces if piece)

# Show a small sample instead of dumping every record into the notebook
all_attrs[:3]

[{'id': '4104420253551',
  'name': 'Hirse-Getreidebrei',
  'brand': 'Alnatura',
  'ingredients': 'Hirsevollkornmehl*, Thiamin (Vitamin B1) *aus biologischer Landwirtschaft',
  'category': 'Baby Kind',
  'sub-category': 'Baby Kindernahrung',
  'allergens': 'Kann Spuren von Senf enthalten.',
  'labels': '',
  'nutriments': 'kcal: 392 kcal- kJ: 1660 kJ- Fett: 4 g- Davon gesättigte Fettsäuren: 0,60 g- Kohlenhydrate: 75,80 g- Davon Zucker: 0,40 g- Ballaststoffe: 3,50 g- Eiweiß: 11,50 g- Salz: 0,02 g- Vitamin B1: 1,20 mg- Natrium: 0,01 g',
  'image-Link': 'https://mediaservice.alnatura.de/image/501324310981/image_to0fsif2t53up8sfp59o8hb90h/-FJPG-S300x300',
  'description': 'Ein Bio-Vollkornbrei für die Säuglingsernährung ab dem 5. Lebensmonat. Der Alnatura Hirse-Getreidebrei ist gluten- sowie milchfrei und wird ohne Zuckerzusatz hergestellt (Zutaten enthalten von Natur aus Zucker). Er kann vielseitig verwendet werden: als Halbmilchbrei, mit Säuglingsmilchnahrung oder ab dem 7. Monat als Getr

In [6]:
# ---------------------------------------------------------------------------
# Persist every scraped product to 'alnatura.json'.
# Non-ASCII characters (umlauts etc.) are written as-is in UTF-8, and the
# output is pretty-printed with a four-space indent.
# ---------------------------------------------------------------------------

with open('alnatura.json', mode='w', encoding='utf-8') as handle:
    json.dump(obj=all_attrs, fp=handle, indent=4, ensure_ascii=False)

In [7]:
############################################# SAVING ALL PRODUCTS IN A .CSV FILE ################################################

# Records only contain the fields that were found on their page, so the header
# must be the union of the keys of ALL records. Taking the first record's keys
# alone makes DictWriter raise ValueError on any later record with an extra key.
# dict.fromkeys de-duplicates while keeping first-seen order, so the first
# record's column order is preserved and new columns are appended after it.
fieldnames = list(dict.fromkeys(key for record in all_attrs for key in record))

# Save the list of dictionaries to a CSV file; fields missing from a record are
# written as empty cells (DictWriter's default restval is '')
with open("alnatura.csv", 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_attrs)

In [8]:
######################################## THIS IS A TRIAL FOR A SINGLE PRODUCT ###############################################

# l = "https://www.alnatura.de/de-de/produkte/alle-produkte/tiefkuehl/bio-fisch/black-tiger-garnelen-tk-204479/"
# all_attrs = []
# attrs = {}
# attrs['Artikelinformationen'] = ""
# ################################################### FOR RATINGS #################################################################
# # from selenium import webdriver
# # from selenium.webdriver.common.by import By
# # from selenium.webdriver.support.ui import WebDriverWait
# # from selenium.webdriver.support import expected_conditions as EC

# # driver = webdriver.Chrome()
# # driver.get(l)

# # wait = WebDriverWait(driver, 10) # Wait for the element to be present on the page

# # try:
# #     ratings_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'bv_numReviews_text')))
# #     ratings_text = ratings_element.text # Get the text content of the element
# #     driver.quit() # Close the browser
# # except:
# #     ratings_text = None
# #     pass
# # attrs['Rating'] = int(ratings_text.strip('()'))
# ################################################### FOR RATINGS #################################################################

# resp = requests.get(l)
# soup = BeautifulSoup(resp.text)
# attrs['id'] = soup.find("dd", class_="product__gtin").text
# attrs['name'] = soup.find('h1').text.strip()
# attrs['brand'] = soup.find("div", class_="product-stage__product-brand").text.strip()
# product_descriptions = []
# for h in soup.find_all(class_='product__description--long'):
#     h2 = h.find('h2')
#     if h2:
#         cleaned_text = h.text.replace(h2.text.strip(), '').strip()
#     else:
#         cleaned_text = h.text.strip()
#     product_descriptions.append(cleaned_text)

# # Join the cleaned descriptions and replace multiple spaces with a single space
# attrs['description'] = ' '.join(re.sub(r'[\s]+', ' ', desc) for desc in product_descriptions)

# attrs['category'] = l.split('/')[6].replace('-', ' ').title()
# attrs['sub-category'] = l.split('/')[7].replace('-', ' ').title()
# attrs['price'] = [a.text for a in soup.find_all("dd", class_="product__usage")][0]
# attrs['unit-Price'] = [a.text for a in soup.find_all("dd", class_="product__usage")][1]
# attrs['Properties'] = "- ".join([re.sub(r'[\s]+', ' ', a.text.strip()) for a in soup.find(class_='product__key-benefits').find_all('li')])

# try:
#     attrs['ingredients'] = re.sub(r'[\s]+', ' ', soup.find(class_='product__ingredients').text.strip())
# except:
#     pass

# try:
#     attrs['allergens'] = re.sub(r'[\s]+', ' ', soup.find(class_='product__allergens').text.replace('Allergene', '').strip())
# except:
#     pass

# key = [a.text for a in soup.find(class_='product__information-items').find_all('dt')]
# value = [re.sub(r'[\s]+', ' ', a.text.strip()) for a in soup.find(class_='product__information-items').find_all('dd')]
# for k, v in zip(key, value):
#     if k not in ['GTIN', 'Preis UVP', 'Grundpreis']:
#         attrs['Artikelinformationen'] += "- " + k + ": " + v + "- "
#         attrs['Artikelinformationen'] = attrs['Artikelinformationen'].replace('\n', '')
#         attrs['Artikelinformationen'] = attrs['Artikelinformationen'].rstrip('- ')
#         attrs['Artikelinformationen'] = attrs['Artikelinformationen'].lstrip('- ')
# try:
#     n_label = [a.text for a in soup.find(class_='nutrition-group__labels').find_all('li')]
#     n_value = [re.sub(r'[\s]+', ' ', a.text.strip()) for a in soup.find(class_='nutrition-group__list').find_all('li')]
#     attrs['nutriments'] = "- ".join([f"{k}: {v}" for k, v in zip(n_label, n_value)])
# except:
#     pass

# attrs['image-Link'] = soup.find(class_='product-stage__image').find('img')['src']
# attrs['link'] = l
# all_attrs.append(attrs)

# all_attrs

[{'Artikelinformationen': 'Name: Black Tiger Garnelen (TK)- Inhalt: 180 g- Aufbewahrung: ***-Fach (-18 °C): mindestens haltbar bis: siehe Seitenlasche; **-Fach (-12 °C): 6 Tage; *-Fach (-6 °C): 2 Tage; Kühlschrank: 1 Tag Nach dem Auftauen nicht wieder einfrieren!- Zertifizierung: BE-BIO-02',
  'id': '4104420253056',
  'name': 'Black Tiger Garnelen (TK)',
  'brand': 'Alnatura Origin',
  'description': 'Unsere Naturland-zertifizierten Alnatura Origin Black-Tiger-Garnelen stammen von einer Gruppe traditionell wirtschaftender Garnelenfarmen aus den dörflichen Strukturen der Inseln Java und Sulawesi (Indonesien). Schon seit über 30 Jahren werden dort Garnelen und andere Fischarten in Polykultur erzeugt. Maximal 10 Garnelen pro Quadratmeter leben in den naturnahen Teichanlagen, deren Ufer mit Pflanzen bewachsen sind und Lebensraum für viele Tierarten bieten. Die Bio-Garnelen sind roh und mit aufgeschnittener Schale – ideal für Rezeptideen rund um Grill und Pfanne. Die Garnelen nur im durchge