In [1]:
from apify_token import *
import json
from apify_client import ApifyClient
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
import os
from tqdm import tqdm
import datetime

### 1. Get Subcategories
- To run the Zalando Scraper from Apify, you need the URLs of the various sub-categories listed on Zalando.
- Of course, one could also fetch everything from one category (e.g. dresses) directly. 
- The problem with this approach is that the number of items to be scraped is very high and the scraper might not be able to handle it. 
- Therefore, it is recommended to scrape sub-categories separately. 


- The code below first retrieves the names and URLs of the categories from Zalando and then retrieves the sub-categories of each category.
- Sub-categories of the category "Dresses" are, e.g., "cocktail dress", "maxi dress", "shirt dress", etc.

In [2]:
def fetch_url_content(url, timeout=30):
    """Fetch ``url`` with an HTTP GET request and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (the status code is printed in that case).

    Raises:
        Any exception raised by ``requests.get`` (e.g. on connection failure
        or timeout) propagates to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Only a plain 200 counts as success; every other status is reported and
    # signalled to the caller with an explicit None.
    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch content. HTTP status code: {response.status_code}")
    return None

In [3]:
def get_subcategories(cat_url):
    """Return the sub-categories listed on a category page.

    The page is expected to contain a ``<ul class="ODGSbs">`` with a second
    ``<ul class="ODGSbs">`` nested inside it; the ``<li>`` entries of the
    nested list are the sub-categories.
    NOTE(review): 'ODGSbs' looks like an auto-generated CSS class and may
    change when the site is redeployed -- confirm before relying on it.

    Args:
        cat_url: URL of the category page.

    Returns:
        dict mapping the text of each ``<li>`` to the ``href`` of its link.
        Entries without a link (or without an ``href``) are skipped.

    Raises:
        RuntimeError: if the page could not be downloaded.
        ValueError: if the nested sub-category list is not present.
    """
    html = fetch_url_content(cat_url)
    if html is None:
        # fetch_url_content signals a non-200 answer with None.
        raise RuntimeError(f"Could not download category page: {cat_url}")

    soup = BeautifulSoup(html, 'lxml')
    outer_list = soup.find('ul', class_='ODGSbs')
    inner_list = outer_list.find('ul', class_='ODGSbs') if outer_list is not None else None
    if inner_list is None:
        raise ValueError(f"No sub-category list found on page: {cat_url}")

    subcats = {}
    for item in inner_list.find_all('li'):
        link = item.find('a')
        # One malformed entry should not discard the whole category.
        if link is None or not link.get('href'):
            continue
        subcats[item.text] = link['href']
    return subcats

In [4]:
# Build `all_categories` (category name -> {'url', 'subcategories'}) once and
# cache it in categories_dict.json; later runs just load the cached file.
# Only run this code if the categories dict does not exist already
if not os.path.exists('categories_dict.json'):
    print("Fetching categories")

    # Get the toplevel categories from the Zalando website
    response = fetch_url_content("https://en.zalando.de/womens-clothing/")
    if response is None:
        # fetch_url_content returns None on a non-200 answer; stop with a
        # clear message instead of handing None to BeautifulSoup.
        raise RuntimeError("Could not fetch the Zalando women's clothing landing page")
    soup = BeautifulSoup(response, 'lxml')
    subcats = soup.find('ul', class_ = 'ODGSbs')
    if subcats is None:
        raise RuntimeError("Category list not found on the Zalando landing page")
    subcats = subcats.find_all('li')
    garment_categories = {elem.text: elem.find('a')['href'] for elem in subcats}

    # Remove the 'Sale' category since this is very different from the other categories
    garment_categories.pop('Sale', None)

    # Get the subcategories for each toplevel category
    all_categories = {}
    for cat_name, cat_url in tqdm(garment_categories.items(), desc='Fetching sub-categories'):
        try:
            subcats = get_subcategories(cat_url)
        except Exception as exc:
            # Best effort: keep going with the remaining categories, but do not
            # swallow KeyboardInterrupt/SystemExit and do report what went wrong.
            # NOTE: a category that fails here is cached with no sub-categories;
            # delete categories_dict.json to retry.
            print(f"Failed to fetch subcategories for {cat_name}: {exc!r}")
            subcats = {}
        all_categories[cat_name] = {
            'url': cat_url,
            'subcategories': subcats
        }
        # Sleep for a random amount of time to avoid getting blocked
        time.sleep(np.random.uniform(1, 5))

    # Save the categories dict to a JSON file
    with open('categories_dict.json', 'w') as f:
        json.dump(all_categories, f)
    print("Categories saved to categories_dict.json")
else:
    # Load the categories dict from the JSON file
    with open('categories_dict.json', 'r') as f:
        all_categories = json.load(f)
    print("categories_dict.json exists; skipping fetching categories")
    print("Categories loaded from categories_dict.json")

categories_dict.json exists; skipping fetching categories
Categories loaded from categories_dict.json


### 2. Scrape the Article Data from Zalando

In [5]:
def zalando_scraper(start_url, out_name, max_items=None):
    """Run the Apify Zalando actor for one start URL and dump the items to JSON.

    Args:
        start_url: Listing URL handed to the actor as its only start URL.
        out_name: Path of the JSON file that receives the list of scraped items.
        max_items: Optional upper bound forwarded to the actor as ``maxItems``;
            when ``None`` the key is left out of the actor input entirely.

    The file is written to ``<out_name>.tmp`` first and then moved into place,
    so an interrupted write never leaves a truncated ``out_name`` behind.

    Raises:
        RuntimeError: if the actor call returns no run object.
    """
    # Initialize the ApifyClient with your API token
    client = ApifyClient(API_KEY)

    # Prepare the Actor input
    run_input = {
        "startUrls": [
            start_url
        ],
        "proxy": {
            "useApifyProxy": True,
            #"apifyProxyCountry": "DE",
            "apifyProxyGroups": ["RESIDENTIAL"],
        },
    }
    if max_items is not None:
        run_input["maxItems"] = max_items

    # Run the Actor and wait for it to finish
    run = client.actor("wPoILN4JczGRGC1xe").call(run_input=run_input)
    if run is None:
        raise RuntimeError(f"Apify actor returned no run for {start_url}")
    # NOTE(review): the run's final status is not inspected here; a failed or
    # aborted run may yield a partial dataset -- confirm whether that matters.

    # Collect all items of the run's default dataset
    articles = list(client.dataset(run["defaultDatasetId"]).iterate_items())

    # Write to a temporary file and rename it, so out_name only ever appears
    # once it is complete.
    tmp_name = f"{out_name}.tmp"
    try:
        with open(tmp_name, 'w') as f:
            json.dump(articles, f)
        os.replace(tmp_name, out_name)
    finally:
        # Only still present when the dump or the rename failed.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)

In [6]:
# List the top-level category names that can be passed to scrape_whole_category
all_categories.keys()

dict_keys(['Dresses', 'T-shirts & tops', 'Trousers', 'Jeans', 'Shirts & Blouses', 'Jackets & Blazers', 'Swimwear', 'Sweatshirts & Hoodies', 'Skirts', 'Knitwear & Cardigans', 'Sportswear', 'Shorts', 'Jumpsuits', 'Coats', 'Underwear', 'Nightwear & Loungewear', 'Socks & Tights'])

In [7]:
def _clean_name(name):
    """Turn a display name into a lowercase, underscore-separated file name part."""
    return name.replace('&', 'and').replace(' ', '_').replace('-', '_').lower()


def scrape_whole_category(cat_name, base_dir="../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts"):
    """Scrape every sub-category of ``cat_name`` into one JSON file each.

    Args:
        cat_name: Key of ``all_categories`` naming the top-level category.
        base_dir: Directory under which a folder for the category is created.
            Defaults to the dataset location used by this project.

    Sub-categories whose output file already exists are skipped, so the
    function can be re-run to resume after an interruption.

    Raises:
        KeyError: if ``cat_name`` is not a key of ``all_categories``.
    """
    # Make sure the folder exists
    folder_path = f"{base_dir}/{_clean_name(cat_name)}"
    os.makedirs(folder_path, exist_ok=True)

    # Get subcategory URLs
    subcat_urls = all_categories[cat_name]['subcategories']

    # Scrape each subcategory
    for subcat, subcat_url in subcat_urls.items():
        out_path = f"{folder_path}/{_clean_name(subcat)}.json"

        if os.path.exists(out_path):
            print(f"Skipping {subcat} since it already exists")
            continue

        print(f"{datetime.datetime.now()}: Scraping {subcat}")
        zalando_scraper(subcat_url, out_path, max_items=None)
        print(f"{datetime.datetime.now()}: Finished scraping {subcat}")

In [8]:
# Scrape all sub-categories of 'Dresses'. Sub-categories whose JSON file already
# exists are skipped, so this cell can be re-run after an interruption.
scrape_whole_category('Dresses')

2024-03-28 20:16:25.272350: Scraping Casual Dresses
2024-03-28 20:37:46.996890: Finished scraping Casual Dresses
2024-03-28 20:37:46.997214: Scraping Evening Dresses
2024-03-28 20:42:58.779386: Finished scraping Evening Dresses
2024-03-28 20:42:58.779678: Scraping Occasion Dresses
2024-03-28 20:45:52.303153: Finished scraping Occasion Dresses
2024-03-28 20:45:52.303515: Scraping Shirt Dresses
2024-03-28 20:46:17.273311: Finished scraping Shirt Dresses
2024-03-28 20:46:17.274075: Scraping Jersey Dresses
2024-03-28 20:52:24.163466: Finished scraping Jersey Dresses
2024-03-28 20:52:24.163779: Scraping Shift Dresses
2024-03-28 20:56:26.080296: Finished scraping Shift Dresses
2024-03-28 20:56:26.080589: Scraping Maxi Dresses


KeyboardInterrupt: 