In [1]:
from apify_token import *
import json
from apify_client import ApifyClient
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
import os
from tqdm import tqdm
import datetime
from glob import glob
import re

In [2]:
DATA_PATH = "../../Data.nosync/" # Relative path to the data folder

### 1. Get Subcategories
- In order to run the Zalando Scraper from Apify, one should have the urls of the various sub-categories listed on Zalando. 
- Of course, one could also fetch everything from one category (e.g. dresses) directly. 
- The problem with this approach is that the number of items to be scraped is very high and the scraper might not be able to handle it. 
- Therefore, it is recommended to scrape sub-categories separately. 


- The code below first retrieves the names and urls of the categoreis from Zalando and then retrieves the sub-categories of each category. 
- Subcategoreis for the category dress are e.g. "cocktail dress", "maxi dress", "shirt dress", etc. 

In [3]:
def fetch_url_content(url):
    # Use the requests library to fetch the content of the URL
    response = requests.get(url)
    
    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch content. HTTP status code: {response.status_code}")

In [4]:
def get_subcategories(cat_url):
    response = fetch_url_content(cat_url)
    soup = BeautifulSoup(response, 'lxml')
    subcats = soup.find('ul', class_ = 'ODGSbs').find('ul', class_ = 'ODGSbs')
    subcats = subcats.find_all('li')
    subcats = {elem.text: elem.find('a')['href'] for elem in subcats}
    return subcats

In [5]:
# Only run this code if the categories dict does not exist already
if not os.path.exists('categories_dict.json'):
    print("Fetching categories")

    # Get the toplevel categories from the Zalando website
    response = fetch_url_content("https://en.zalando.de/womens-clothing/")
    soup = BeautifulSoup(response, 'lxml')
    subcats = soup.find('ul', class_ = 'ODGSbs')
    subcats = subcats.find_all('li')
    garment_categories = {elem.text: elem.find('a')['href'] for elem in subcats}

    # Remove the 'Sale' category since this is very different from the other categories
    if 'Sale' in garment_categories:
        del garment_categories['Sale']

    # Get the subcategories for each toplevel category
    all_categories = {}
    for cat_name, cat_url in tqdm(garment_categories.items(), desc='Fetching sub-categories'):
        try:
            subcats = get_subcategories(cat_url)
        except:
            print(f"Failed to fetch subcategories for {cat_name}")
            subcats = {}
        all_categories[cat_name] = {
            'url': cat_url,
            'subcategories': subcats
        }
        # Sleep for a random amount of time to avoid getting blocked
        time.sleep(np.random.uniform(1, 5))

    # Save the categories dict to a JSON file
    with open('categories_dict.json', 'w') as f:
        json.dump(all_categories, f)
    print("Categories saved to categories_dict.json")
else:
    # Load the categories dict from the JSON file
    with open('categories_dict.json', 'r') as f:
        all_categories = json.load(f)
    print("categories_dict.json exists; skipping fetching categories")
    print("Categories loaded from categories_dict.json")

categories_dict.json exists; skipping fetching categories
Categories loaded from categories_dict.json


### 2. Scrape the Article Data from Zalando

In [6]:
def zalando_scraper(start_url, out_name, max_items=None,):
    # Initialize the ApifyClient with your API token
    client = ApifyClient(API_KEY)

    # Prepare the Actor input
    run_input = {
        "startUrls": [
            start_url
        ],
        "maxItems": max_items,
        "proxy": {
            "useApifyProxy": True,
            #"apifyProxyCountry": "DE",
            "apifyProxyGroups": ["RESIDENTIAL"],
        },
        }
    if max_items == None:
        del run_input["maxItems"]

    # Run the Actor and wait for it to finish
    run = client.actor("wPoILN4JczGRGC1xe").call(run_input=run_input)

    # Concat all items to list and dump to JSON
    articles = []
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        articles.append(item)
    
    with open(out_name, 'w') as f:
        json.dump(articles, f)

In [7]:
all_categories.keys()

dict_keys(['Dresses', 'T-shirts & tops', 'Trousers', 'Jeans', 'Shirts & Blouses', 'Jackets & Blazers', 'Swimwear', 'Sweatshirts & Hoodies', 'Skirts', 'Knitwear & Cardigans', 'Sportswear', 'Shorts', 'Jumpsuits', 'Coats', 'Underwear', 'Nightwear & Loungewear', 'Socks & Tights'])

In [8]:
def scrape_whole_category(cat_name):
    # Make sure the folder exists
    folder_name = cat_name.replace('&', 'and').replace(' ', '_').replace('-', '_').lower()
    folder_path = f"{DATA_PATH}Zalando_Germany_Dataset/metadata_dicts/{folder_name}"
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # Get subcategory URLs
    subcat_urls = all_categories[cat_name]['subcategories']

    # Scrape each subcategory
    for subcat in subcat_urls.keys():
        category_name_clean = subcat.replace('&', 'and').replace(' ', '_').replace('-', '_').lower()
        out_path = f"{folder_path}/{category_name_clean}.json"

        if not os.path.exists(out_path):
            print(f"{datetime.datetime.now()}: Scraping {subcat}")
            zalando_scraper(subcat_urls[subcat], out_path, max_items=None)
            print(f"{datetime.datetime.now()}: Finished scraping {subcat}")
        else:
            print(f"Skipping {subcat} since it already exists")

In [9]:
scrape_whole_category('Dresses')

Skipping Casual Dresses since it already exists
Skipping Evening Dresses since it already exists
Skipping Occasion Dresses since it already exists
Skipping Shirt Dresses since it already exists
Skipping Jersey Dresses since it already exists
Skipping Shift Dresses since it already exists
Skipping Maxi Dresses since it already exists
Skipping Denim Dresses since it already exists
Skipping Knitted Dresses since it already exists
Skipping Dirndl Dresses since it already exists


### 3. Scrape the Packshot Images from Zalando

### Steps: 
1. Read in metadata
2. Loop over all articles
3. If article HTML does not exist in the folder, request and save it there
4. Load the HTML and parse using BeautifulSoup
5. Save all image data in the image_dicts folder

In [10]:
existing_metadata_folders = glob(f"{DATA_PATH}Zalando_Germany_Dataset/metadata_dicts/*/")
print("Existing Gamrent Categories:")
display(existing_metadata_folders)

print("Existing Metadata Files for selected Category:")
folder_to_scrape = existing_metadata_folders[0]
existing_metadata_files = glob(f"{folder_to_scrape}*.json")
existing_metadata_files

Existing Gamrent Categories:


['../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/']

Existing Metadata Files for selected Category:


['../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/jersey_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/dirndl_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/shift_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/occasion_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/shirt_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/evening_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/knitted_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/casual_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/denim_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/maxi_dresses.json']

In [11]:
def fetch_and_save_url_content(url, file_path):
    # Use the requests library to fetch the content of the URL
    response = requests.get(url)
    
    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200:
        # Open a file for writing in binary mode (to accommodate all content types)
        with open(file_path, 'wb') as file:
            file.write(response.content)
        #print(f"Content saved to {file_path}")
    else:
        print(f"Failed to fetch content. HTTP status code: {response.status_code}") 
        # This will print if the request was unsuccessful so that the scraper can be stopped immediately. 
        # This might prevent you from being blocked by the website.

In [12]:
def extract_packshot_link(html_content):
    product_soup = BeautifulSoup(html_content, 'html.parser')

    # find list of images
    product_img_thumbs = product_soup.find('ul', attrs={"aria-label": "Product media gallery"})

    # in some page designs, the images are wrapped in a div
    if product_img_thumbs is None:
        product_img_thumbs = product_soup.find('div', attrs={"class": "I7OI1O C3wGFf L5YdXz _0xLoFW _7ckuOK mROyo1 _5qdMrS"})

    # find all image objects
    try:
        thumb_links = product_img_thumbs.find_all('img')
        frontal_img_link = None
        for thumb in thumb_links:
            # get images links from source code
            thumb = re.findall(r'src=".+?"', str(thumb))
            thumb = str(thumb)[7:-3]
            # packshot signifies that this image shows the frontal view of the product with no model
            if "packshot" in thumb:
                replacement_res = "500"
                # bring image link into the right format
                frontal_img_link = re.sub(r'(?<=imwidth=).+?(?=&)', replacement_res, thumb, flags=re.S)
    except:
        frontal_img_link = None

    return frontal_img_link     

In [13]:
def scrape_image_data(metadata_file):
    # Create folder to save the image dicts
    folder_to_save_images = metadata_file.replace('metadata_dicts', 'image_dicts')
    folder_to_save_images = '/'.join(folder_to_save_images.split('/')[:-1])+ '/'
    if not os.path.exists(folder_to_save_images):
        os.makedirs(folder_to_save_images)
    dict_save_path = f"{folder_to_save_images}{metadata_file.split('/')[-1]}"
    if os.path.exists(dict_save_path):
        print(f"Skipping {metadata_file.split('/')[-1]} since it already exists")
        return
    

    print(f"Scraping images for {metadata_file.split('/')[-1]}")
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    # Create a folder to save the HTML files
    folder_to_save = metadata_file.replace('metadata_dicts', 'html_files')
    folder_to_save = '/'.join(folder_to_save.split('/')[:-1])+ '/'
    if not os.path.exists(folder_to_save):
        os.makedirs(folder_to_save)
    else: 
        print(f"Current number of files in folder: {len(os.listdir(folder_to_save))}")

    # Create dictionary with all image links
    image_dict = {}

    for article in tqdm(metadata, desc='Fetching images'):
        # Fetch the URL and save the content
        url = article['url']
        save_path = f"{folder_to_save}{article['sku']}.html"

        if not os.path.exists(save_path):
            fetch_and_save_url_content(url, save_path)
            time.sleep(np.random.uniform(0, 1))

        # Load the HTML file
        try:
            with open(save_path, 'r') as f:
                html_content = f.read()

            # Extract the packshot link
            packshot_link = extract_packshot_link(html_content)
            image_dict[article['sku']] = {'images':article['images'], 
                                        'packshot_link':packshot_link, 
                                        'thumbnail':article['thumbnail']}
        except:
            image_dict[article['sku']] = {'images':article['images'], 
                                        'packshot_link':None, 
                                        'thumbnail':article['thumbnail']}

        # Save the image dict to a JSON file
    
    with open(dict_save_path, 'w') as f:
        json.dump(image_dict, f)
        

In [14]:
for metadata_file in existing_metadata_files:
    scrape_image_data(metadata_file)

Skipping jersey_dresses.json since it already exists
Skipping dirndl_dresses.json since it already exists
Skipping shift_dresses.json since it already exists
Skipping occasion_dresses.json since it already exists
Skipping shirt_dresses.json since it already exists
Skipping evening_dresses.json since it already exists
Skipping knitted_dresses.json since it already exists
Skipping casual_dresses.json since it already exists
Skipping denim_dresses.json since it already exists
Skipping maxi_dresses.json since it already exists


### Code to test current IP Adress in use: 


In [15]:
import requests 
 
def get_public_ip(): 
    try: 
        response = requests.get('https://httpbin.org/ip') 
        if response.status_code == 200: 
            ip_data = response.json() 
            public_ip = ip_data.get('origin') 
            return public_ip 
        else: 
            print(f"Failed to retrieve IP (Status code: {response.status_code})") 
    except Exception as e: 
        print(f"Error: {e}") 
 
# Get and print the public IP address 
public_ip = get_public_ip() 
print(f"Your public IP address is: {public_ip}") 

Your public IP address is: 185.104.138.53
