In [None]:
!pip install parsel

Collecting parsel
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=1.2.0 (from parsel)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting jmespath (from parsel)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting w3lib>=1.19.0 (from parsel)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Downloading parsel-1.9.1-py2.py3-none-any.whl (17 kB)
Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Downloading w3lib-2.2.1-py3-none-any.whl (21 kB)
Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Installing collected packages: w3lib, jmespath, cssselect, parsel
Successfully installed cssselect-1.2.0 jmespath-1.0.1 parsel-1.9.1 w3lib-2.2.1


## Introduction

### Workshop Overview

#### Meet Danny, a Freelancing Web Scraping Developer
Danny leads the data acquisition team, handling scraping projects for major e-commerce clients. Despite years of experience with traditional scraping tools, his team faces mounting challenges:

Current Pain Points:

1. Their scrapers break frequently due to JavaScript-heavy sites.
2. Anti-bot measures are becoming more sophisticated.
3. Handling infinite scroll and dynamic loading is a constant battle.
4. Maintaining proxy infrastructure is eating up development time.
5. Browser automation scripts are fragile and resource-intensive.
6. Regional data access requires complex proxy management.


### Moving Beyond Traditional Scraping with Zyte API
This workshop shows you how to overcome these common developer frustrations. You'll learn to build resilient scraping systems that:

Architecture Improvements:
- Replace brittle Selenium scripts with stable API-based solutions
- Eliminate proxy management overhead
- Handle modern web technologies efficiently
- Scale without infrastructure headaches

Technical Challenges You'll Solve

- Extract data from JavaScript-rendered content without using explicit browser automation tools.
- Navigate infinite scroll without complex scroll simulations
- Access geo-restricted content through simple API parameters
- Handle anti-bot measures without maintaining proxy pools
- Extract structured data from dynamic tables and listings

#### Workshop Approach: Building Real-World Scraping Solutions
##### Learning Through Production Scenarios

In this workshop, we'll build scrapers for actual e-commerce websites, progressing from basic extraction to complex automation. Each concept is taught through practical examples, followed by hands-on implementation.


#### Prerequisites
- Zyte API account and key
- Python programming experience
- Basic understanding of web scraping concepts

In [None]:
import os

# Read the key from the ZYTE_API_KEY environment variable so it never has to be
# saved inside the notebook. Falls back to an empty string when the variable is
# unset; paste a key as the default only for throwaway local experiments.
Zyte_Api_Key = os.environ.get("ZYTE_API_KEY", "")

## Module 1: Foundation & Setup

- Chapter 1: Getting Started with Zyte API
- Chapter 2: Browser Rendering Fundamentals

### Chapter 1: Extract Product Details with Zyte API

#### Problem Statement:
We will use amazon.in to extract the details of a product.


#### Objective:
Scrape structured data for products, including names, prices, ratings, and availability, and store it in a usable format.

#### How it works:


In [None]:
from base64 import b64decode

import requests

# Request automatic product extraction ("product": True) for one product page.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "https://www.amazon.in/gp/product/B0CX2L28HH/",
        "product": True,
    },
)
# Surface HTTP failures (for example an empty or invalid API key) as a clear
# requests.HTTPError instead of a confusing KeyError on 'product' below.
api_response.raise_for_status()

product = api_response.json()['product']
print(product)

{'name': 'OnePlus 12R (Iron Gray, 8 GB RAM, 256 GB Storage)', 'price': '42999.0', 'currency': 'INR', 'currencyRaw': '₹', 'availability': 'InStock', 'sku': 'B0CX2L28HH', 'mpn': '\u200eCPH2585', 'brand': {'name': 'OnePlus'}, 'breadcrumbs': [{'name': 'Electronics', 'url': 'https://www.amazon.in/electronics/b/ref=dp_bc_aui_C_1/261-5124165-2520305?ie=UTF8&node=976419031'}, {'name': 'Mobiles & Accessories', 'url': 'https://www.amazon.in/mobile-phones/b/ref=dp_bc_aui_C_2/261-5124165-2520305?ie=UTF8&node=1389401031'}, {'name': 'Smartphones & Basic Mobiles', 'url': 'https://www.amazon.in/smartphones-basic-mobiles/b/ref=dp_bc_aui_C_3/261-5124165-2520305?ie=UTF8&node=1389432031'}, {'name': 'Smartphones', 'url': 'https://www.amazon.in/Smartphones/b/ref=dp_bc_aui_C_4/261-5124165-2520305?ie=UTF8&node=1805560031'}], 'mainImage': {'url': 'https://m.media-amazon.com/images/I/71XNeka-BRL._SX679_.jpg'}, 'images': [{'url': 'https://m.media-amazon.com/images/I/71XNeka-BRL._SX679_.jpg'}, {'url': 'https://m.

💡 **You cannot combine multiple automatic extraction request fields (e.g. product and productList) on the same request.**

In [None]:
from base64 import b64decode

import requests

# Request automatic product-list extraction ("productList": True) for a
# category listing page.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "https://www.tatacliq.com/wall-lights/c-msh2216106/?q=:relevance:category:MSH2216106:inStockFlag:true",
        "productList": True,
    },
)
# Stop here with the HTTP error if the call failed; otherwise the lookup below
# would raise a KeyError that hides the real cause.
api_response.raise_for_status()

products = api_response.json()['productList']['products']
print(products)
print(len(products))


[{'url': 'https://www.tatacliq.com/homesake-yellow-copper-snowflake-led-fairy-lights/p-mp000000019256916', 'mainImage': {'url': 'https://www.tatacliq.com/src/general/components/img/green-filled-star.svg'}, 'metadata': {'probability': 0.3808705364427567}}, {'url': 'https://www.tatacliq.com/fos-lighting-palatial-golden-aluminium-five-light-candelabra-wall-sconce/p-mp000000017030563', 'name': 'FOS LIGHTING Palatial Golden Aluminium Five Light Candelabra Wall Sconce', 'metadata': {'probability': 0.1121336614578361}}, {'url': 'https://www.tatacliq.com/fos-lighting-black-mild-steel-modern-mesh-band-box-outdoor-wall-sconce/p-mp000000020717443', 'name': 'FOS LIGHTING Black Mild Steel Modern Mesh Band Box Outdoor Wall Sconce', 'price': '3275.0', 'currencyRaw': '₹', 'currency': 'INR', 'metadata': {'probability': 0.4457084815804535}}, {'url': 'https://www.tatacliq.com/kapoor-lamp-shades-brass-glass-hollis-wall-light/p-mp000000023870448', 'name': 'Kapoor Lamp Shades Brass & Glass Hollis Wall Light

36


### Chapter 2: Browser Rendering Fundamentals

In [None]:
from base64 import b64decode
import requests

# Same listing page as before, but a browser action clicks the
# "show more" button before the product list is requested.
# Authentication uses the Zyte_Api_Key variable defined near the top of the notebook.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "https://www.tatacliq.com/wall-lights/c-msh2216106/?q=:relevance:category:MSH2216106:inStockFlag:true",
        "productList": True,
        "actions": [
            {
                "action": "click",
                "selector": {
                    "type": "css",  # or "xpath" depending on the selector you choose
                    "value": "button.ShowMoreButtonPlp__button"
                },
            },
        ],
    },
)
# Fail with the HTTP error rather than a KeyError when the request did not succeed.
api_response.raise_for_status()

# Extracting the product list from the response
products = api_response.json()['productList']['products']
print(products)
print(len(products))

[{'url': 'https://www.tatacliq.com/homesake-yellow-copper-snowflake-led-fairy-lights/p-mp000000019256916', 'mainImage': {'url': 'https://www.tatacliq.com/src/general/components/img/green-filled-star.svg'}, 'metadata': {'probability': 0.3939786827250167}}, {'url': 'https://www.tatacliq.com/fos-lighting-palatial-golden-aluminium-five-light-candelabra-wall-sconce/p-mp000000017030563', 'name': 'FOS LIGHTING Palatial Golden Aluminium Five Light Candelabra Wall Sconce', 'metadata': {'probability': 0.10266314872968474}}, {'url': 'https://www.tatacliq.com/fos-lighting-black-mild-steel-modern-mesh-band-box-outdoor-wall-sconce/p-mp000000020717443', 'name': 'FOS LIGHTING Black Mild Steel Modern Mesh Band Box Outdoor Wall Sconce', 'price': '3275.0', 'currencyRaw': '₹', 'currency': 'INR', 'metadata': {'probability': 0.41981525143391885}}, {'url': 'https://www.tatacliq.com/kapoor-lamp-shades-brass-glass-hollis-wall-light/p-mp000000023870448', 'name': 'Kapoor Lamp Shades Brass & Glass Hollis Wall Lig

In [None]:

# Reference URL (kept as a comment so this cell is valid Python):
# https://www.nike.com/in/w/running-37v7j

In [None]:
from base64 import b64decode
import requests

# Product-list request for the Nike India home page with four browser actions:
# wait, click the "Running" link, wait again, then scroll to the bottom.
# The two waits use onError="return".
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "https://www.nike.com/in/",
        "productList": True,
        "actions": [
            {"action": "waitForTimeout", "timeout": 5, "onError": "return"},
            {
                "action": "click",
                "selector": {"type": "css", "value": "a[aria-label='Running']"},
            },
            {"action": "waitForTimeout", "timeout": 5, "onError": "return"},
            {"action": "scrollBottom"},
        ],
    },
)



In [None]:

# Show the HTTP status code of the most recent Zyte API call.
print(api_response.status_code)


200


In [None]:
# Print the whole decoded response for inspection. The per-product lines stay
# disabled: in the recorded run the returned 'productList' had no 'products' key.
# products = api_response.json()['productList']['products']
print(api_response.json())
# print(len(products))

{'url': 'https://www.nike.com/in/', 'statusCode': 200, 'productList': {'url': 'https://www.nike.com/in/', 'metadata': {'dateDownloaded': '2024-12-31T06:57:47Z'}, 'categoryName': 'Classics Spotlight'}, 'actions': [{'action': 'waitForTimeout', 'elapsedTime': 4.999, 'status': 'success'}, {'action': 'click', 'elapsedTime': 1.738, 'status': 'success'}, {'action': 'waitForTimeout', 'elapsedTime': 5, 'status': 'success'}, {'action': 'scrollBottom', 'elapsedTime': 1.503, 'status': 'success'}]}


## Module 2: Basic Extraction Techniques

- Chapter 3: Static vs Dynamic Content
- Chapter 4: Geolocation & Regional Data



### Chapter 3: Static vs Dynamic Content

In [None]:
from base64 import b64decode

import requests

# Static-content example: automatic product extraction without extra options.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "https://www.spaceposters.co/",
        "product": True,
    },
)
# Raise the HTTP error on failure instead of hitting a KeyError on 'product'.
api_response.raise_for_status()

product = api_response.json()['product']
print(product)

{'name': 'Minimalist Space Posters', 'mainImage': {'url': 'https://cdn.prod.website-files.com/5c6c5d537c647f619adce5fc/5eea68fcdbde355322ab077a_Screen%20Shot%202020-06-17%20at%2012.01.45%20PM.png'}, 'description': "Minimalist space posters showcasing cosmological topography and the greatest of humanity's space endeavors. Starting at $49.99", 'url': 'https://www.spaceposters.co/', 'canonicalUrl': 'https://www.spaceposters.co//', 'metadata': {'probability': 0.011791097931563854, 'dateDownloaded': '2025-01-20T05:36:37Z'}}
{'name': "Women's Fashion Clothing", 'breadcrumbs': [{'name': 'Home', 'url': 'https://us.vestiairecollective.com/'}, {'name': "Women's Fashion Clothing"}], 'description': "Discover pre-owned Women's Clothing, Luxury and Fashion Designer brands at up to 70% off", 'aggregateRating': {'ratingValue': 4.5, 'reviewCount': 44885}, 'url': 'https://us.vestiairecollective.com/women/', 'canonicalUrl': 'https://us.vestiairecollective.com/women/', 'metadata': {'probability': 0.012635

In [None]:
# dynamic
from base64 import b64decode

import requests

# Dynamic-content example: request browser-rendered HTML ("browserHtml": True)
# together with automatic product extraction.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "browserHtml": True,
        "url": "https://www.vestiairecollective.com/women/",
        "product": True,
    },
)
# Raise the HTTP error on failure instead of hitting a KeyError on 'product'.
api_response.raise_for_status()

product = api_response.json()['product']
print(product)

### Chapter 4: Geolocation & Regional Data

In [None]:
from base64 import b64decode

import requests

# Browser-rendered request that first runs a setLocation action with a postal
# code; onError="return" is set on that action.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "https://www.amazon.com/b/?ie=UTF8&node=11058691",
        "actions": [
            {
                "action": "setLocation",
                "onError": "return",
                "address": {"postalCode": "11001"},
            },
        ],
        "browserHtml": True,
    },
)

# Despite its name, `product` holds the entire decoded response here.
product = api_response.json()
print(product)

NameError: name 'Zyte_Api_Key' is not defined

## Module 3: Dynamic Content Handling

- Chapter 5: Infinite Scroll Mastery
- Chapter 6: Interactive Elements



### Chapter 5: Infinite Scroll Mastery



In [None]:
from base64 import b64decode
from parsel import Selector
import requests

# Render the infinite-scroll page in a browser and scroll to the bottom so the
# returned HTML contains the quotes loaded by scrolling.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json={
        "url": "http://quotes.toscrape.com/scroll",
        "browserHtml": True,
        "actions": [
            {
                "action": "scrollBottom",
            },
        ]

    },
)

if api_response.status_code == 200:
    html_response = api_response.json().get('browserHtml', "")

    # Parse HTML with Parsel
    selector = Selector(html_response)

    # Extract data using CSS selectors
    for quote in selector.css(".quote"):
        # default="" keeps the slice below from failing with a TypeError when a
        # quote block has no .text node (get() would return None).
        raw_text = quote.css(".text::text").get(default="")
        data = {
            "author": quote.css(".author::text").get(),
            "tags": quote.css(".tag::text").getall(),
            # Drop the first and last character (the surrounding quotation marks).
            "text": raw_text[1:-1],
        }
        print(data)
else:
    print(f"Failed to fetch data: {api_response.status_code}")

{'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world'], 'text': 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.'}
{'author': 'J.K. Rowling', 'tags': ['abilities', 'choices'], 'text': 'It is our choices, Harry, that show what we truly are, far more than our abilities.'}
{'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles'], 'text': 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.'}
{'author': 'Jane Austen', 'tags': ['aliteracy', 'books', 'classic', 'humor'], 'text': 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.'}
{'author': 'Marilyn Monroe', 'tags': ['be-yourself', 'inspirational'], 'text': "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring."}
{'author':

### Chapter 6: Interactive Elements

## Module 4: Data Extraction Strategies

- Chapter 7: Dynamic Tables & Structured Data
- Chapter 8: Pagination Strategies



### Chapter 7: Dynamic Tables & Structured Data

### Chapter 8: Pagination Strategies

## Module 5: Advanced Interaction Patterns

- Chapter 9: Complex User Behavior Simulation
- Chapter 10: Form Handling & Data Submission



### Chapter 9: Complex User Behavior Simulation

### Chapter 10: Form Handling & Data Submission

In [None]:
from parsel import Selector
import requests

# Zyte API request body: render the search form in a browser and drive it with
# a sequence of actions.
payload = {
    "url": "http://quotes.toscrape.com/search.aspx",
    "browserHtml": True,
    "actions": [
        # 1. Pick the author in the #author dropdown.
        {
            "action": "select",
            "selector": {"type": "css", "value": "#author"},
            "values": ["Albert Einstein"],
        },
        # 2. Wait until an element with value="world" is attached to the page.
        {
            "action": "waitForSelector",
            "selector": {"type": "css", "value": '[value="world"]', "state": "attached"},
        },
        # 3. Pick the tag in the #tag dropdown.
        {
            "action": "select",
            "selector": {"type": "css", "value": "#tag"},
            "values": ["world"],
        },
        # 4. Submit the form.
        {
            "action": "click",
            "selector": {"type": "css", "value": "[type='submit']"},
        },
        # 5. Wait for at least one .quote element to appear.
        {
            "action": "waitForSelector",
            "selector": {"type": "css", "value": ".quote"},
        },
    ],
}

# POST the payload to the extract endpoint, authenticating with the API key.
response = requests.post(
    url="https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json=payload,
)

# Check for a successful response
if response.status_code == 200:
    # Get the HTML content from the Zyte API response
    html_response = response.json().get('browserHtml', "")

    # Parse the HTML content
    selector = Selector(html_response)

    # Extract and print data using CSS selectors
    for quote in selector.css(".quote"):
        # default="" keeps the slice below from failing with a TypeError when a
        # result has no .content node (get() would return None).
        raw_text = quote.css(".content::text").get(default="")
        data = {
            "author": quote.css(".author::text").get(),
            "tags": quote.css(".tag::text").getall(),
            # Drop the first and last character (the surrounding quotation marks).
            "text": raw_text[1:-1],
        }
        print(data)
else:
    print(f"Failed to fetch data: {response.status_code}")


{'author': 'Albert Einstein', 'tags': ['world'], 'text': 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.'}


## Module 6: Advanced Technical Extraction

- Chapter 11: API & Network Requests
- Chapter 12: Hidden Content & iframes

### Chapter 11: API & Network Requests

In [None]:
import json
from base64 import b64decode
import requests


# Request body: render the scroll page, scroll to the bottom, and capture the
# bodies of network responses whose URL contains "/api/".
payload = {
    "url": "http://quotes.toscrape.com/scroll",
    "browserHtml": True,
    "actions": [{"action": "scrollBottom"}],
    "networkCapture": [
        {
            "filterType": "url",
            "httpResponseBody": True,
            "value": "/api/",
            "matchType": "contains",
        },
    ],
}

# POST the payload to the extract endpoint, authenticating with the API key.
response = requests.post(
    url="https://api.zyte.com/v1/extract",
    auth=(Zyte_Api_Key, ""),
    json=payload,
)

# Anything other than HTTP 200 is reported and skipped.
if response.status_code != 200:
    print(f"Failed to fetch data: {response.status_code}")
else:
    api_response = response.json()
    network_captures = api_response.get("networkCapture", [])

    for capture in network_captures:
        http_response_body = capture.get("httpResponseBody", "")
        # Captures without a body carry nothing to decode.
        if not http_response_body:
            continue

        # The body arrives Base64-encoded; decode it to text, then parse as JSON.
        decoded_text = b64decode(http_response_body).decode()
        data = json.loads(decoded_text)

        # Print one flat record per quote found in the captured payload.
        for quote in data.get("quotes", []):
            extracted_data = {
                "author": quote["author"]["name"],
                "tags": quote["tags"],
                "text": quote["text"],
            }
            print(extracted_data)


{'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world'], 'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'}
{'author': 'J.K. Rowling', 'tags': ['abilities', 'choices'], 'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”'}
{'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles'], 'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”'}
{'author': 'Jane Austen', 'tags': ['aliteracy', 'books', 'classic', 'humor'], 'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”'}
{'author': 'Marilyn Monroe', 'tags': ['be-yourself', 'inspirational'], 'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”"}


### Chapter 12: Hidden Content & iframes

## Session Management


In [None]:
import requests
from scrapy.http import HtmlResponse

# NOTE(review): `category_URLs` and `Zapi_url` are not defined in this cell;
# they presumably come from an earlier cell — confirm before running.

# The same browser actions are sent for every category, so build them once.
actions = [
    {
        "action": "waitForSelector",
        "selector": {"type": "css", "value": ".product-card"},
        "timeout": 10,
        "onError": "continue",
    },
    {
        "action": "scrollBottom",
        "timeout": 5,
        "maxScrollDelay": 5,
        "onError": "continue",
        "maxScrollCount": 3000,
    },
]

results = []
for category_link in category_URLs:
    print(f"Processing label: {category_link}")

    label_response = requests.post(
        Zapi_url,
        auth=(Zyte_Api_Key, ""),
        json={
            "url": category_link,
            "actions": actions,
            "productList": True,
        },
    )

    if label_response.status_code == 200:
        # Either 'productList' or its 'products' key may be missing (the Nike
        # response earlier in this notebook has no 'products'); treat both as
        # an empty result instead of raising.
        product_list = label_response.json().get("productList") or {}
        data = product_list.get("products", [])
        print(f"Fetched {len(data)} items for label '{category_link}'.")
        results.extend(data)
    else:
        print(f"Failed to scrape label '{category_link}'. Status code: {label_response.status_code}")

print("results:", results)

## Creating and Maintaining Sessions

Zyte API's session management allows you to maintain state across multiple requests. This is useful for:

- Login workflows that require authentication
- Multi-step processes like checkout flows
- Persistent cookies and preferences
- Stateful interactions with websites

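Let's start by creating a new session. Note that `create_session` relies on the `zyte_request` helper, which is defined in a setup cell further down this notebook — run that cell first: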

In [None]:
def create_session(url, initial_actions=None):
    """Request `url` with browser HTML and ask for a new session.

    Builds a payload with "sessionId": "create", adds `initial_actions` as
    "actions" when given, and sends it through `zyte_request`.

    Returns:
        The response dict when it contains a "sessionId" key, otherwise None.
        A status message is printed in both cases.
    """
    # NOTE(review): verify the "sessionId": "create" field against the current
    # Zyte API reference — the documented session field may be `session.id`.
    payload = {"url": url, "browserHtml": True, "sessionId": "create"}
    if initial_actions:
        payload["actions"] = initial_actions

    response = zyte_request(payload)

    # Guard clause: no response at all, or a response without a session ID.
    if not response or "sessionId" not in response:
        print("Failed to create session")
        return None

    print(f"Created new session with ID: {response['sessionId']}")
    return response

# Open a session on a simple demo site.
session_response = create_session("https://books.toscrape.com/")

# Keep the session ID for later cells; None signals that creation failed.
session_id = session_response["sessionId"] if session_response else None

# When the page HTML came back, print its <title> as a sanity check.
if session_response and "browserHtml" in session_response:
    selector = Selector(session_response["browserHtml"])
    title = selector.css("title::text").get("")
    print(f"Page title: {title}")

Now that we have a session, let's use it for subsequent requests to maintain state:

In [None]:
def use_existing_session(session_id, url=None, actions=None):
    """Send a browser-HTML request that carries an existing session ID.

    `url` and `actions` are added to the payload only when they are truthy.

    Returns:
        Whatever `zyte_request` returns, or None (after printing a message)
        when `session_id` is empty.
    """
    if not session_id:
        print("No session ID provided")
        return None

    # NOTE(review): verify the "sessionId" field name against the current
    # Zyte API reference — confirm.
    payload = {"sessionId": session_id, "browserHtml": True}

    # Optional fields, appended in a fixed order when provided.
    optional_fields = (("url", url), ("actions", actions))
    payload.update({key: value for key, value in optional_fields if value})

    return zyte_request(payload)

# Skip the whole step when session creation failed earlier.
if session_id:
    # Browser actions: wait for the sidebar, click the first sub-category link
    # (Travel), then wait for the category heading.
    navigate_actions = [
        {
            "action": "waitForSelector",
            "selector": {"type": "css", "value": ".side_categories"},
            "timeout": 5,
        },
        {
            "action": "click",
            "selector": {
                "type": "css",
                "value": ".side_categories ul.nav-list > li > ul > li:first-child > a",
            },
            "onError": "return",
        },
        {
            "action": "waitForSelector",
            "selector": {"type": "css", "value": ".page_inner h1"},
            "timeout": 5,
        },
    ]

    # Reuse the session; no URL is passed, only the actions.
    category_response = use_existing_session(session_id, actions=navigate_actions)

    if not (category_response and "browserHtml" in category_response):
        print("Failed to navigate to category page using the session")
    else:
        # Read the heading and count the book tiles to confirm where we landed.
        selector = Selector(category_response["browserHtml"])
        category_title = selector.css(".page_inner h1::text").get("")
        book_count = len(selector.css(".product_pod"))

        print(f"Navigated to category: {category_title}")
        print(f"Found {book_count} books in this category")
        print(f"Current URL: {category_response.get('url', 'N/A')}")

In [None]:
# Import required libraries and set up API key
import os
import requests
import json
import time
from dotenv import load_dotenv
from parsel import Selector

# Load API key from the environment (a .env file is honoured via load_dotenv).
load_dotenv()
ZYTE_API_KEY = os.getenv("ZYTE_API_KEY", "")
if not ZYTE_API_KEY:
    # Local import keeps this cell self-contained. getpass does not echo the
    # typed key, so the secret is not left in the notebook output.
    from getpass import getpass

    ZYTE_API_KEY = getpass("Enter your Zyte API key: ")

# Helper function for API requests
def zyte_request(payload, max_retries=3, timeout=30):
    """POST `payload` to the Zyte API extract endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body.
        max_retries: Maximum number of attempts.
        timeout: Seconds passed to `requests.post` as its timeout.

    Returns:
        The decoded JSON response, or None when every attempt failed, the
        failure is a client error that a retry cannot fix, or `max_retries`
        is less than 1.
    """
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    for attempt in range(max_retries):
        try:
            response = requests.post(
                "https://api.zyte.com/v1/extract",
                auth=(ZYTE_API_KEY, ""),
                headers=headers,
                json=payload,
                timeout=timeout
            )
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException covers network and HTTP errors; ValueError covers
            # a body that is not valid JSON. Anything else is a programming
            # error and is allowed to propagate instead of being retried.
            print(f"Request failed on attempt {attempt + 1}: {e}")

            # 4xx responses other than 429 (rate limiting) will fail the same
            # way again, so stop instead of burning the remaining attempts.
            failed_response = getattr(e, "response", None)
            status = getattr(failed_response, "status_code", None)
            if status is not None and 400 <= status < 500 and status != 429:
                print("Client error; retrying will not help.")
                return None

            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("All retry attempts failed.")
                return None
    # Reached only when max_retries < 1 and no attempt was made.
    return None

## Login Workflows

One of the most common use cases for session management is handling login workflows. Let's create a function to log in to a website and maintain the authenticated session:

In [None]:
def login_to_website(url, username, password, username_selector, password_selector, submit_selector, success_indicator=None):
    """Submit a login form through browser actions while requesting a new session.

    The action sequence waits for the username field, fills the username and
    password fields, and clicks the submit element. When `success_indicator`
    is given, a wait for that selector is added with onError="continue". A
    final 2-unit `waitForTimeout` is always appended.

    Returns:
        Whatever `zyte_request` returns for the assembled payload.
    """
    # NOTE(review): confirm that the "fill" action, the click-level
    # "waitForNavigation" flag and "sessionId": "create" are accepted by the
    # current Zyte API schema.

    def css(value):
        # Wrap a CSS selector string in the selector structure used by actions.
        return {"type": "css", "value": value}

    login_actions = [
        {"action": "waitForSelector", "selector": css(username_selector), "timeout": 10},
        {"action": "fill", "selector": css(username_selector), "value": username},
        {"action": "fill", "selector": css(password_selector), "value": password},
        {"action": "click", "selector": css(submit_selector), "waitForNavigation": True},
    ]

    if success_indicator:
        # A missing indicator must not abort the request, hence "continue".
        login_actions.append({
            "action": "waitForSelector",
            "selector": css(success_indicator),
            "timeout": 5,
            "onError": "continue",
        })

    # Short trailing pause after the login steps.
    login_actions.append({"action": "waitForTimeout", "timeout": 2})

    return zyte_request({
        "url": url,
        "browserHtml": True,
        "sessionId": "create",
        "actions": login_actions,
    })

# Example: log in to quotes.toscrape.com with demo credentials.
login_url = "https://quotes.toscrape.com/login"
username = "admin"
password = "admin"

login_response = login_to_website(
    login_url,
    username,
    password,
    "input[name='username']",  # username field
    "input[name='password']",  # password field
    "input[type='submit']",    # submit button
    ".header-box .icon"        # element expected after a successful login
)

# Start from "not logged in" and upgrade only when evidence is found.
login_success = False
login_session_id = login_response.get("sessionId") if login_response else None

if not login_response:
    print("Login request failed.")
elif not login_session_id:
    print("Failed to create session.")
    login_session_id = None
else:
    print(f"Login created session with ID: {login_session_id}")

    # Inspect the returned page for success or failure markers.
    html = login_response.get("browserHtml", "")
    selector = Selector(html)
    error_message = selector.css(".error::text").get("")
    logout_link = selector.css("a[href='/logout']").get()

    if logout_link:
        print("Login successful! Found logout link.")
        login_success = True
    elif error_message:
        print(f"Login failed. Error: {error_message}")
    elif login_response.get("url", "") != login_url:
        # No explicit marker, but the final URL differs from the login page.
        print("Login appears successful based on URL redirection.")
        login_success = True
    else:
        print("Login status unclear. Could not find confirmation indicators.")

If login is successful, we can use the session to access protected content:

In [None]:
def access_protected_content(session_id, protected_url):
    """Request `protected_url` as browser HTML, sending `session_id` with it.

    Returns:
        Whatever `zyte_request` returns, or None (after printing a message)
        when `session_id` is empty.
    """
    if session_id:
        return zyte_request({
            "url": protected_url,
            "browserHtml": True,
            "sessionId": session_id,
        })

    print("No session ID provided")
    return None

# Runs only after a successful login that produced a session ID.
if login_success and login_session_id:
    # The site's main page, requested with the logged-in session.
    protected_url = "https://quotes.toscrape.com/"
    protected_response = access_protected_content(login_session_id, protected_url)

    if not (protected_response and "browserHtml" in protected_response):
        print("Failed to access protected content")
    else:
        selector = Selector(protected_response["browserHtml"])

        # A logout link on the page is taken as proof the session is still authenticated.
        logout_link = selector.css("a[href='/logout']").get()

        if not logout_link:
            print("Session appears to have expired or logged out")
        else:
            print("Successfully accessed protected content with authenticated session")

            # Pull a piece of text from the header box as sample content.
            user_content = selector.css(".header-box::text").get("").strip()
            print(f"User content: {user_content}")

## Stateful Scraping with Multi-Step Workflows

Let's implement a more complex stateful scraping workflow that involves multiple steps:

In [None]:
def run_multi_step_workflow(start_url, workflow_steps, step_delay=1):
    """Execute a multi-step workflow using a persistent session.

    A session is created for ``start_url`` and every step is then run through
    ``use_existing_session`` with that session's ID. Execution stops at the
    first step that returns a falsy response; the responses collected up to
    that point are still returned.

    Args:
        start_url: URL passed to ``create_session`` to open the session.
        workflow_steps: Sized sequence of step dicts. Each needs a
            ``'description'`` (used for progress output) and ``'actions'``;
            ``'url'`` is optional and defaults to ``None``.
        step_delay: Seconds to pause between consecutive steps. Defaults to 1,
            the pause that was previously hard-coded. No pause is made after
            the final step.

    Returns:
        ``None`` if the session could not be created; otherwise a dict with
        ``session_id``, ``results`` (the session-creation response followed by
        one response per completed step) and ``completed_steps``.
    """
    # Create a new session
    session_response = create_session(start_url)

    if not session_response or "sessionId" not in session_response:
        print("Failed to create session for workflow")
        return None

    session_id = session_response["sessionId"]
    results = [session_response]  # index 0 is the session creation, not a step
    total_steps = len(workflow_steps)

    # Execute each step in the workflow
    for i, step in enumerate(workflow_steps, 1):
        print(f"\nExecuting workflow step {i}/{total_steps}: {step['description']}")

        # Execute the step using the existing session (the URL is optional per step)
        step_response = use_existing_session(session_id, step.get('url'), step['actions'])

        if not step_response:
            print(f"Step {i} failed. Workflow cannot continue.")
            break

        results.append(step_response)
        print(f"Step {i} completed successfully")

        # Pause only between steps; sleeping after the last one just delays the return
        if i < total_steps and step_delay > 0:
            time.sleep(step_delay)

    return {
        "session_id": session_id,
        "results": results,
        "completed_steps": len(results) - 1  # Subtract 1 for initial session creation
    }

# Example: multi-step workflow on books.toscrape.com
# 1. Navigate to a category
# 2. Select a book
# 3. Add the book to the basket
# 4. View the basket
#
# Each step is a description plus the list of browser actions to run, one action per line.

books_workflow = [
    {
        "description": "Navigate to Travel category",
        "actions": [
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".side_categories"}, "timeout": 5},
            {"action": "click", "selector": {"type": "css", "value": ".side_categories ul.nav-list > li > ul > li:first-child > a"}, "onError": "return"},
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".page_inner h1"}, "timeout": 5},
        ],
    },
    {
        "description": "Select first book in category",
        "actions": [
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".product_pod h3 a"}, "timeout": 5},
            {"action": "click", "selector": {"type": "css", "value": ".product_pod:first-child h3 a"}, "onError": "return"},
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".product_page"}, "timeout": 5},
        ],
    },
    {
        "description": "Add book to basket",
        "actions": [
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".btn-add-to-basket"}, "timeout": 5},
            {"action": "click", "selector": {"type": "css", "value": ".btn-add-to-basket"}, "onError": "return"},
            # The confirmation banner is optional: a missing banner does not abort the step
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".alert-success"}, "timeout": 5, "onError": "continue"},
        ],
    },
    {
        "description": "View basket",
        "actions": [
            {"action": "click", "selector": {"type": "css", "value": ".btn-group a.btn-default"}, "onError": "return"},
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".basket-items"}, "timeout": 5},
        ],
    },
]

# Run the workflow
workflow_result = run_multi_step_workflow("https://books.toscrape.com/", books_workflow)

if workflow_result:
    print(f"\nWorkflow completed {workflow_result['completed_steps']} of {len(books_workflow)} steps")

    # The basket page is only the last result when every step completed
    if workflow_result['completed_steps'] == len(books_workflow) and workflow_result['results']:
        final_response = workflow_result['results'][-1]

        if "browserHtml" in final_response:
            selector = Selector(final_response["browserHtml"])

            # One matched row is assumed to be a header row. Clamp at zero so that
            # an empty match is reported as 0 items instead of -1.
            basket_items = selector.css(".basket-items tr")
            print(f"\nBasket contains {max(len(basket_items) - 1, 0)} items:")

            for item in selector.css(".basket-items .row"):
                title = item.css(".col-sm-4 a::text").get("").strip()
                price = item.css(".price_color::text").get("").strip()
                quantity = item.css(".form-control::attr(value)").get("")

                # Rows without both a title and a price are not product lines
                if title and price:
                    print(f"- {title}: {price} (Quantity: {quantity})")

            # Get total price
            total = selector.css(".total::text").get("").strip()
            print(f"Total: {total}")

## Session Timeout and Reuse

Sessions in Zyte API have a limited lifetime and may expire after a period of inactivity. Here's how to check if a session is still valid and reuse it if possible:

## Stateful Scraping with Multi-Step Workflows (repeated)

The next cell repeats the multi-step workflow from the section above; the session validity check introduced under "Session Timeout and Reuse" comes in the cell after it.

In [None]:
def run_multi_step_workflow(start_url, workflow_steps, step_delay=1):
    """Execute a multi-step workflow using a persistent session.

    A session is created for ``start_url`` and every step is then run through
    ``use_existing_session`` with that session's ID. Execution stops at the
    first step that returns a falsy response; the responses collected up to
    that point are still returned.

    Args:
        start_url: URL passed to ``create_session`` to open the session.
        workflow_steps: Sized sequence of step dicts. Each needs a
            ``'description'`` (used for progress output) and ``'actions'``;
            ``'url'`` is optional and defaults to ``None``.
        step_delay: Seconds to pause between consecutive steps. Defaults to 1,
            the pause that was previously hard-coded. No pause is made after
            the final step.

    Returns:
        ``None`` if the session could not be created; otherwise a dict with
        ``session_id``, ``results`` (the session-creation response followed by
        one response per completed step) and ``completed_steps``.
    """
    # Create a new session
    session_response = create_session(start_url)

    if not session_response or "sessionId" not in session_response:
        print("Failed to create session for workflow")
        return None

    session_id = session_response["sessionId"]
    results = [session_response]  # index 0 is the session creation, not a step
    total_steps = len(workflow_steps)

    # Execute each step in the workflow
    for i, step in enumerate(workflow_steps, 1):
        print(f"\nExecuting workflow step {i}/{total_steps}: {step['description']}")

        # Execute the step using the existing session (the URL is optional per step)
        step_response = use_existing_session(session_id, step.get('url'), step['actions'])

        if not step_response:
            print(f"Step {i} failed. Workflow cannot continue.")
            break

        results.append(step_response)
        print(f"Step {i} completed successfully")

        # Pause only between steps; sleeping after the last one just delays the return
        if i < total_steps and step_delay > 0:
            time.sleep(step_delay)

    return {
        "session_id": session_id,
        "results": results,
        "completed_steps": len(results) - 1  # Subtract 1 for initial session creation
    }

# Example: multi-step workflow on books.toscrape.com
# 1. Navigate to a category
# 2. Select a book
# 3. Add the book to the basket
# 4. View the basket
#
# Each step is a description plus the list of browser actions to run, one action per line.

books_workflow = [
    {
        "description": "Navigate to Travel category",
        "actions": [
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".side_categories"}, "timeout": 5},
            {"action": "click", "selector": {"type": "css", "value": ".side_categories ul.nav-list > li > ul > li:first-child > a"}, "onError": "return"},
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".page_inner h1"}, "timeout": 5},
        ],
    },
    {
        "description": "Select first book in category",
        "actions": [
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".product_pod h3 a"}, "timeout": 5},
            {"action": "click", "selector": {"type": "css", "value": ".product_pod:first-child h3 a"}, "onError": "return"},
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".product_page"}, "timeout": 5},
        ],
    },
    {
        "description": "Add book to basket",
        "actions": [
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".btn-add-to-basket"}, "timeout": 5},
            {"action": "click", "selector": {"type": "css", "value": ".btn-add-to-basket"}, "onError": "return"},
            # The confirmation banner is optional: a missing banner does not abort the step
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".alert-success"}, "timeout": 5, "onError": "continue"},
        ],
    },
    {
        "description": "View basket",
        "actions": [
            {"action": "click", "selector": {"type": "css", "value": ".btn-group a.btn-default"}, "onError": "return"},
            {"action": "waitForSelector", "selector": {"type": "css", "value": ".basket-items"}, "timeout": 5},
        ],
    },
]

# Run the workflow
workflow_result = run_multi_step_workflow("https://books.toscrape.com/", books_workflow)

if workflow_result:
    print(f"\nWorkflow completed {workflow_result['completed_steps']} of {len(books_workflow)} steps")

    # The basket page is only the last result when every step completed
    if workflow_result['completed_steps'] == len(books_workflow) and workflow_result['results']:
        final_response = workflow_result['results'][-1]

        if "browserHtml" in final_response:
            selector = Selector(final_response["browserHtml"])

            # One matched row is assumed to be a header row. Clamp at zero so that
            # an empty match is reported as 0 items instead of -1.
            basket_items = selector.css(".basket-items tr")
            print(f"\nBasket contains {max(len(basket_items) - 1, 0)} items:")

            for item in selector.css(".basket-items .row"):
                title = item.css(".col-sm-4 a::text").get("").strip()
                price = item.css(".price_color::text").get("").strip()
                quantity = item.css(".form-control::attr(value)").get("")

                # Rows without both a title and a price are not product lines
                if title and price:
                    print(f"- {title}: {price} (Quantity: {quantity})")

            # Get total price
            total = selector.css(".total::text").get("").strip()
            print(f"Total: {total}")

In [None]:
def check_session_validity(session_id):
    """Probe a session by sending a minimal request that carries its ID.

    Args:
        session_id: Identifier of the session to probe. A falsy value is
            reported as invalid without making a request.

    Returns:
        ``True`` only when ``zyte_request`` returns a response that contains a
        ``"sessionId"`` equal to ``session_id``. Returns ``False`` otherwise,
        including when any exception is raised along the way (the error is
        printed).
    """
    if not session_id:
        return False

    # Smallest request used as a probe: a single short wait action
    probe = {
        "sessionId": session_id,
        "browserHtml": True,
        "actions": [{"action": "waitForTimeout", "timeout": 1}],
    }

    try:
        response = zyte_request(probe)

        if response is None or "sessionId" not in response:
            return False
        # The echoed ID must match the one that was sent
        return response["sessionId"] == session_id
    except Exception as exc:
        print(f"Error checking session validity: {str(exc)}")
        return False

# Probe the session only if an earlier cell left a non-empty `session_id` behind
if 'session_id' in locals() and session_id:
    is_valid = check_session_validity(session_id)
    print(f"Session validity check: {'Valid' if is_valid else 'Invalid or expired'}")

    # A valid session is reused for one more request to the home page
    if is_valid:
        home_response = use_existing_session(session_id, "https://books.toscrape.com/")

        if not home_response or "browserHtml" not in home_response:
            print("Failed to reuse session")
        else:
            selector = Selector(home_response["browserHtml"])
            title = selector.css("title::text").get("")
            print(f"Successfully reused session. Page title: {title}")

In [None]:
import requests

import os

# SECURITY: the API key used to be committed here in plain text. It is now read
# from the ZYTE_API_KEY environment variable; the key that was published with
# this notebook should be revoked. An unset variable yields an empty string.
ZYTE_API_KEY = os.environ.get("ZYTE_API_KEY", "")

def fetch_page(page_num, timeout=180):
    """Fetch one page of the Kaufland "milch" category through Zyte API.

    Args:
        page_num: Value for the ``page`` query parameter of the category URL.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error. The default is generous because the
            request asks for browser rendering plus extraction.

    Returns:
        A ``(products, next_page)`` tuple: the list under
        ``productList.products`` (empty when absent or null) and the
        ``nextPage`` entry of ``productNavigation`` (``None`` when absent).
        A non-2xx response is reported on stdout and yields ``([], None)``.
    """
    url = f"https://www.kaufland.de/c/milch/~1951/?page={page_num}"

    response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(ZYTE_API_KEY, ""),
        json={
            "url": url,
            "browserHtml": True,
            "productList": True,
            "productNavigation": True
        },
        # Without a timeout, requests can block indefinitely on a stalled connection
        timeout=timeout,
    )

    # Report API errors explicitly instead of treating them as an empty page
    if not response.ok:
        print(f"Zyte API request for page {page_num} failed with HTTP {response.status_code}")
        return [], None

    data = response.json()
    # `or {}` also covers keys that are present but null
    product_list = data.get("productList") or {}
    navigation = data.get("productNavigation") or {}
    # The pagination link is read from "nextPage"; the previous "next" key always gave None
    return product_list.get("products") or [], navigation.get("nextPage")

# Loop through the first 3 pages
# (`next_page` is returned by fetch_page but not needed here: the page range is fixed)
for page in range(1, 4):
    print(f"--- Page {page} ---")
    products, next_page = fetch_page(page)
    for product in products:
        price = product.get('price')
        if price is None:
            # Some entries come back without a price; avoid printing "None None"
            print(f"{product.get('name')} - price unavailable")
        else:
            print(f"{product.get('name')} - {price} {product.get('currency')}")


--- Page 1 ---
Kefirpilz bestehend aus 10g Knollen - 14.99 EUR
Weihenstephan Barista 1l, Milch + Milchkännchen aus Edelstahl Milk Pitcher 350ml/12oz Milchkanne Milchschaumkännchen Milch Aufschäumen für Cappuccino und Latte Art, Silber + Turbo-Mixer - 24.9 EUR
Andechser Natur Ziegen-H-Milch 3,0% -- 1l - 3.39 EUR
Natumi Cashew Drink Natural 1 l - 2.41 EUR
Sucofin Magermilchpulver 250g - 2.9 EUR
Bärenmarke Der Milch Schaum luftig locker in der Sprühflasche 250ml - 4.38 EUR
Kefirpilz bestehend aus 10g Knollen - 14.99 EUR
Deals - None None
Berchtesgadener Land HaltbareAlpenmilch 3,5%, 1 l - 2.29 EUR
Nestlé Nido Instant Vollmilchpulver 400g | Fettgehalt mind. 26% | Full Cream Milk Powder | Milchpulver - 8.06 EUR
Softeis Pulver Frozen Joghurt 1 Kg Ice Bär Cremig und Lecker 1:3,5 Verhältnis - 9.95 EUR
5kg Vollmilchpulver sprühgetrocknet Backen Milchpulver 5 kg - 43.99 EUR
Nestle Milchmädchen 8% 400g - 2.75 EUR
Golden Turtle Sojaöl 1L | Soja Öl zum braten, backen und frittieren | Sojabohnenöl -