# Aldi Nord

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import json
import time
import re


In [4]:
'''Aldi-Nord Script'''

main_url = "https://www.aldi-nord.de/"
print('Main Website Page', main_url)

def request_to_aldiNord(url):
    # Function to make a request to the given URL and return the BeautifulSoup object
    first_response = requests.get(url)
    if first_response.status_code == 200:
        html_content = first_response.text
    else:
        print("Error:", first_response.status_code)
    soup = BeautifulSoup(html_content, "html.parser")
    return soup

def access_main_page(soup):
    # Function to access the main page and extract the link to the 'Produkte' section
    what_searching = 'Produkte'
    anchor_element = soup.find('a', {'class': 'mod-main-navigation__action', 'data-attr-value': what_searching})
    # Check if the anchor element is found
    if anchor_element:
        # Get the value of the href attribute
        href_value = anchor_element.get('href')
        print('    Modified Url:', main_url + href_value[1:])
        return main_url + href_value[1:]
    else:
        print("Anchor element not found.")
        return None

def access_category_1(soup):
    # Function to access the first level of categories
    h4_elements = soup.find_all('h4', class_='mod-content-tile__title')

    for h4_element in h4_elements[17:23]:  # Considering a specific range of elements for demonstration
        h4_ele = h4_element.string.replace_with(h4_element.text)
        print('    List of Main Categories', h4_ele)
        anchor_element = soup.find('a', {'class': 'mod-content-tile__action', 'data-attr-value': h4_ele})
        href_value = anchor_element.get('href')
        print("href:", href_value)
        print('        Links to Main Categories', main_url + href_value[1:])
        # Requesting for the next level of categories
        product_list_soup = request_to_aldiNord(main_url + href_value[1:])
        try:
            anchor_ele_product_list_soup = product_list_soup.find_all('a', {'class': 'link link--primary'})
            if anchor_ele_product_list_soup:
                AA = access_cateogry_2(anchor_ele_product_list_soup)
            else:
                raise Exception("Anchor element not found")
        except Exception as e:
            print("An error occurred:", e)
            AA = access_category_3(product_list_soup)

def access_cateogry_2(anchor_elements):
    # Function to access the second level of categories
    for anchor_element in anchor_elements:
        href_value = anchor_element.get('href')
        print('        Link to Sub Categories', main_url + href_value[1:])
        product_list_soup2 = request_to_aldiNord(main_url + href_value[1:])
        try:
            anchor_ele_product_list_soup2 = product_list_soup2.find_all('a', {'class': 'link link--primary'})
            if anchor_ele_product_list_soup2:
                AA = access_category_n(anchor_ele_product_list_soup2)
            else:
                raise Exception("Anchor element not found")
        except Exception as e:
            print("An error occurred:", e)
            AA = access_category_3(product_list_soup2)

def access_category_n(anchor_elements):
    # Function to access the nth level of categories
    for anchor_element in anchor_elements:
        href_value = anchor_element.get('href')
        print('        Link to Sub-Sub Categories', main_url + href_value[1:])
        product_list_soup2 = request_to_aldiNord(main_url + href_value[1:])
        AA = access_category_3(product_list_soup2)

def access_category_3(product_list_soup2):
    # Function to access the third level of categories and extract product information
    count = 1
    product_list_div_elements = product_list_soup2.find_all('div', {'class': 'mod-article-tile-placeholder'})
    for product_list_div_element in product_list_div_elements:
        html_links_to_products = product_list_div_element.get('data-tile-url')
     #   print('            Unloaded Link to the Product', main_url + html_links_to_products[1:])
        product_list_soup3 = request_to_aldiNord(main_url + html_links_to_products[1:])
        product_anchor_element = product_list_soup3.find('a', {'class': 'mod-article-tile__action'})
        href_value = product_anchor_element.get('href')
    #    print('            Loaded Link to the Product', main_url + href_value[1:])
        product_soup4 = request_to_aldiNord(main_url + href_value[1:])
        time.sleep(0.5)
        print('Extracting Product : ', count)
        try:
            extracted_data = product_extraction(product_soup4)
            save_json(extracted_data)  # Save data immediately after successful extraction
        except Exception as e:
            print("An error occurred while extracting product data:", e)
            print("Skipping to the next product.")
        count += 1
        print(100 * '*')

def product_extraction(final_soup):
    # Function to extract product information from the final soup
    data_to_save = {}  # Dictionary to store extracted data
    # Find the script tag containing JSON-like data
    script_tags = final_soup.find_all('script', type='application/ld+json')
    for script_tag in script_tags:
        # Extract the text content of the script tag
        json_data = script_tag.string
        # Check if the script tag contains JSON-like data
        if json_data:
            # Parse JSON-like data
            data = json.loads(json_data)
            # Extract description if available
            description = data.get('description')
            Publication_date = data.get('datePublished')
            if description:
                data_to_save['Description'] = description
                data_to_save['Publication_date'] = Publication_date
    # Find the script tag containing digitalData
    script_tag = final_soup.find('script', text=re.compile(r'var digitalData ='))
    # Extract the JavaScript code containing digitalData
    js_code = script_tag.string
    # Use regular expressions to extract the digitalData variable
    match = re.search(r'var digitalData = (\{.*?\});', js_code)
    # Check if a match is found
    if match:
        # Extract the digitalData JSON object
        digital_data_json = match.group(1)
        # Convert the JSON object to a Python dictionary
        digital_data_dict = eval(digital_data_json)
        data_to_save['Primary_category'] = digital_data_dict['page']['pageCategory']['primaryCategory']
        data_to_save['1sub_category'] = digital_data_dict['page']['pageCategory']['subCategory1']
        data_to_save['2sub_category'] = digital_data_dict['page']['pageCategory']['subCategory2']
        data_to_save['3sub_category'] = digital_data_dict['page']['pageCategory']['subCategory3']
    else:
        print("digitalData variable not found.")
    digital_data_dict_2 = final_soup.find('div', {'class': 'mod-article-intro'})

    second_json = json.loads(digital_data_dict_2.get('data-article'))
    data_to_save['Product_name'] = second_json['productInfo']['productName']
    data_to_save['Product_id'] = second_json['productInfo']['productID']
    data_to_save['Product_brand'] = second_json['productInfo']['brand']
    data_to_save['Product_price'] = second_json['productInfo']['priceWithTax']

    # extract product image
    product_image = final_soup.find('a', {'class': 'mod-gallery-article__media'})
    data_to_save['Product_Image'] = main_url + product_image.get('href')[1:]

    certificate_images = final_soup.find_all('img')
    count_certificate = 1
    for certificate_image in certificate_images:
        # Extract the data-srcset attribute value
        data_srcset = certificate_image.get('data-srcset')
        if data_srcset is not None:
            data_srcset_split = data_srcset.split(', ')[0]
            data_to_save['certification_' + str(count_certificate)] = main_url + data_srcset_split.split(' ')[0][1:]
            count_certificate += 1
    return data_to_save

def save_json(data):
    # Function to save data to JSON file, appending new data to existing content
    try:
        with open('scraped_data_3.json', 'r') as json_file:
            existing_data = json.load(json_file)
    except FileNotFoundError:
        existing_data = []

    existing_data.append(data)

    with open('scraped_data_3.json', 'w') as json_file:
        json.dump(existing_data, json_file, indent=4)

    print("Data appended to scraped_data_3.json")

# Main program execution starts here
soup = request_to_aldiNord(main_url)
Produkte_link = access_main_page(soup)
product_soup = request_to_aldiNord(Produkte_link)
Items_link = access_category_1(product_soup)

Main Website Page https://www.aldi-nord.de/
    Modified Url: https://www.aldi-nord.de/sortiment.html
    List of Main Categories Kosmetik und Pflege
href: /sortiment/kosmetik-pflege.html
        Links to Main Categories https://www.aldi-nord.de/sortiment/kosmetik-pflege.html
        Link to Sub Categories https://www.aldi-nord.de/sortiment/kosmetik-pflege/hygiene-selbsttest-produkte.html
An error occurred: Anchor element not found
Extracting Product :  1
Data appended to scraped_data_3.json
****************************************************************************************************


  script_tag = final_soup.find('script', text=re.compile(r'var digitalData ='))


Extracting Product :  2
Data appended to scraped_data_3.json
****************************************************************************************************
Extracting Product :  3
Data appended to scraped_data_3.json
****************************************************************************************************
Extracting Product :  4
Data appended to scraped_data_3.json
****************************************************************************************************
        Link to Sub Categories https://www.aldi-nord.de/sortiment/kosmetik-pflege/haarpflege-styling.html
An error occurred: Anchor element not found
Extracting Product :  1
Data appended to scraped_data_3.json
****************************************************************************************************
Extracting Product :  2
Data appended to scraped_data_3.json
****************************************************************************************************
Extracting Product :  3
Data appended to

Extracting Product :  3
Data appended to scraped_data_3.json
****************************************************************************************************
Extracting Product :  4
Data appended to scraped_data_3.json
****************************************************************************************************


KeyboardInterrupt: 