In [58]:
%pip install aiohttp nest_asyncio
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Imports

In [59]:
import os
import json
import asyncio
import aiohttp
from tqdm.notebook import tqdm
import nest_asyncio
from datetime import datetime

# Constants

Number of pages to scrape from SteamSpy (each page contains ~1,000 games).

In [60]:
# Number of 'all' pages to request; passed to scrape_all() as max_pages in the run cell.
STEAMSPY_PAGES = 1

# Setting up data folder to save to

In [61]:
# Root folder for all scraped data, and today's date as YYYY-MM-DD.
DATA_DIR = "Data"
TODAY_DATE = f"{datetime.now():%Y-%m-%d}"

# Each run writes into a per-day sub-folder, created on demand.
DAILY_DATA_DIR = os.path.join(DATA_DIR, TODAY_DATE)
os.makedirs(DAILY_DATA_DIR, exist_ok=True)


def _daily_path(filename):
    """Return the path of `filename` inside today's data folder."""
    return os.path.join(DAILY_DATA_DIR, filename)


# Files produced by a run: scraped records, resume log, error log, run summary.
OUTPUT_FILE = _daily_path("steamspy_data.jsonl")
PROGRESS_LOG = _daily_path("scraped_appids.txt")
ERROR_LOG = _daily_path("steamspy_errors.log")
METADATA_FILE = _daily_path("daily_metadata.json")


## Async setup & steamspy definition

In [62]:

# NOTE(review): presumably applied so asyncio code can run inside the notebook's
# already-running event loop — confirm it is still needed, since the run cell
# uses a top-level `await`.
nest_asyncio.apply()
# Single endpoint used for both the 'all' and 'appdetails' requests.
BASE_URL = "https://steamspy.com/api.php"
ALL_REQUEST_DELAY = 60  # seconds between 'all' page requests
APPDETAILS_RATE_INTERVAL = 1.0  # seconds between appdetails requests

# Async class

In [63]:
class RateLimiter:
    """Async context manager that spaces successive entries at least `interval` seconds apart.

    Entering the context waits (if necessary) until `interval` seconds have
    passed since the previous entry was granted. Leaving the context does
    nothing, so the guarded body itself is not serialised.
    """

    def __init__(self, interval):
        # Minimum number of seconds between two granted entries.
        self.interval = interval
        # Serialises the wait-and-stamp step so concurrent callers queue up.
        self.lock = asyncio.Lock()
        # Event-loop timestamp of the most recent granted entry (0 = never).
        self.last_called = 0

    async def __aenter__(self):
        async with self.lock:
            loop = asyncio.get_event_loop()
            elapsed = loop.time() - self.last_called
            remaining = self.interval - elapsed
            # Sleep only for the part of the interval that has not passed yet.
            if remaining > 0:
                await asyncio.sleep(remaining)
            self.last_called = loop.time()

    async def __aexit__(self, exc_type, exc, tb):
        # Nothing to release; exceptions from the body propagate unchanged.
        return None

# Module-level limiter shared by every fetch_app_details() call, so that
# appdetails requests start at least APPDETAILS_RATE_INTERVAL seconds apart.
appdetails_rate_limiter = RateLimiter(APPDETAILS_RATE_INTERVAL)


# Logging functions

In [64]:
def log_error(message):
    """Write `message` as one new line at the end of the error log (ERROR_LOG)."""
    entry = message + '\n'
    with open(ERROR_LOG, mode='a', encoding='utf-8') as error_log:
        error_log.write(entry)

def save_progress(appid):
    """Record `appid` as done by appending it, on its own line, to PROGRESS_LOG."""
    with open(PROGRESS_LOG, mode='a', encoding='utf-8') as progress_log:
        print(appid, file=progress_log)

def load_scraped_ids():
    """Return the set of appids recorded in PROGRESS_LOG (empty if the file is missing)."""
    if not os.path.exists(PROGRESS_LOG):
        return set()

    scraped = set()
    with open(PROGRESS_LOG, mode='r', encoding='utf-8') as progress_log:
        for raw_line in progress_log:
            entry = raw_line.strip()
            # Blank lines carry no id and are ignored.
            if entry:
                scraped.add(int(entry))
    return scraped


# Metadata Generation

In [65]:
def save_daily_metadata(start_time, end_time, attempted_count, successful_count, failed_count, error_log_path, metadata_file_path, max_pages):
    """Summarise a scrape run and write the summary to a JSON file.

    Args:
        start_time: Value stored as "scrape_start_time".
        end_time: Value stored as "scrape_end_time".
        attempted_count: Value stored as "total_appids_attempted".
        successful_count: Value stored as "total_appids_successful".
        failed_count: Value stored as "total_appids_failed".
        error_log_path: Path of the error log to classify; if the file does
            not exist, every error counter stays at 0.
        metadata_file_path: Destination JSON file (overwritten).
        max_pages: Stored under "scrape_parameters" -> "max_pages".

    Each line of the error log is classified case-insensitively: lines
    mentioning "http" or "connection" count as connection errors, lines
    mentioning "timeout" as timeout errors, everything else as other errors.
    """
    scrape_date = datetime.now().strftime("%Y-%m-%d")
    error_counts = {"connection_error": 0, "timeout_error": 0, "other_errors": 0}
    if os.path.exists(error_log_path):
        with open(error_log_path, 'r', encoding='utf-8') as f:
            for line in f:
                lowered = line.lower()
                # The needles must be lower-case too: an upper-case "HTTP" can
                # never be found in an already lower-cased line.
                if "http" in lowered or "connection" in lowered:
                    error_counts["connection_error"] += 1
                elif "timeout" in lowered:
                    error_counts["timeout_error"] += 1
                else:
                    error_counts["other_errors"] += 1

    metadata = {
        "scrape_date": scrape_date,
        "scrape_start_time": start_time,
        "scrape_end_time": end_time,
        "total_appids_attempted": attempted_count,
        "total_appids_successful": successful_count,
        "total_appids_failed": failed_count,
        "error_summary": error_counts,
        "api_endpoints_used": [BASE_URL + "?request=all", BASE_URL + "?request=appdetails&appid="],
        "scrape_parameters": {"max_pages": max_pages},
        "scraper_version": "1.0", # Update if you make changes
        "notes": "Daily rescrape of all available pages."
    }

    with open(metadata_file_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Metadata saved to: {metadata_file_path}")

# Async querying

In [66]:
async def get_all_apps(session, page=0):
    """Fetch one page of the 'all' endpoint.

    Args:
        session: Object whose ``get(url, params=...)`` returns an async
            context manager yielding a response with ``status`` and an
            awaitable ``json()`` (an aiohttp client session in this notebook).
        page: Page index sent as the ``page`` query parameter.

    Returns:
        The decoded JSON object (a dict) on success. An empty dict if the
        HTTP status is not 200, the payload is not a JSON object, or the
        request raised; each of those failures is written to the error log.
    """
    params = {"request": "all", "page": page}
    try:
        async with session.get(BASE_URL, params=params) as response:
            if response.status != 200:
                log_error(f"Error on all page {page}: HTTP {response.status}")
                return {}
            data = await response.json()
    except Exception as e:
        # Log the exception type as well: exceptions raised without a message
        # (e.g. a bare asyncio.TimeoutError) stringify to '' and would
        # otherwise leave an uninformative log line.
        log_error(f"Error on all page {page}: {type(e).__name__}: {e}")
        return {}

    # Callers iterate over the keys, so anything but a dict is unusable.
    if not isinstance(data, dict):
        log_error(f"Error on all page {page}: unexpected payload type {type(data).__name__}")
        return {}
    return data

async def fetch_app_details(session, appid):
    """Fetch the 'appdetails' record for one app, paced by the shared rate limiter.

    Args:
        session: Object whose ``get(url, params=...)`` returns an async
            context manager yielding a response with ``status`` and an
            awaitable ``json()`` (an aiohttp client session in this notebook).
        appid: App id sent as the ``appid`` query parameter.

    Returns:
        The decoded JSON dict on success. ``None`` if the HTTP status is not
        200, the payload is not a JSON object, the payload's ``appid`` is
        999999 (treated as "developer has hidden the data"), or the request
        raised. Failures other than the hidden-data case are written to the
        error log.
    """
    params = {"request": "appdetails", "appid": appid}

    # Entering the limiter waits until the next request slot is available.
    async with appdetails_rate_limiter:
        try:
            async with session.get(BASE_URL, params=params) as response:
                if response.status != 200:
                    log_error(f"App {appid}: HTTP {response.status}")
                    return None
                data = await response.json()
        except Exception as e:
            # Log the exception type as well: exceptions raised without a
            # message (e.g. a bare asyncio.TimeoutError) stringify to ''.
            log_error(f"App {appid}: {type(e).__name__}: {e}")
            return None

    if not isinstance(data, dict):
        log_error(f"App {appid}: unexpected payload type {type(data).__name__}")
        return None
    # Filter out if the developer has hidden the data
    if data.get("appid") == 999999:
        return None
    return data


# Scraping data

In [67]:
async def scrape_appdetails_for_list(session, app_ids):
    """Scrape details for every not-yet-scraped app id and append them to OUTPUT_FILE.

    Ids already present in the progress log, and ids repeated within
    `app_ids`, are skipped. Each successfully fetched record is written as
    one JSON line and only then recorded in the progress log.

    Args:
        session: Session object forwarded to fetch_app_details().
        app_ids: Iterable of app ids to process, in order.

    Returns:
        The number of apps newly scraped by this call.
    """
    scraped_ids = load_scraped_ids()
    new_scraped = 0

    # Open output file in append mode
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        # Use tqdm to monitor progress
        for appid in tqdm(app_ids, desc="Scraping appdetails"):
            # Skip if already processed
            if appid in scraped_ids:
                continue
            data = await fetch_app_details(session, appid)
            if data:
                # Serialise first so a failure cannot leave a half-written line.
                f.write(json.dumps(data) + '\n')
                # Flush before recording progress: the progress log is written
                # to disk immediately, so the data must not lag behind it in a
                # buffer or a hard stop would mark the app done without data.
                f.flush()
                save_progress(appid)
                # Remember the id so a duplicate later in app_ids is skipped.
                scraped_ids.add(appid)
                new_scraped += 1
    return new_scraped

async def scrape_all(max_pages=2):
    """Scrape `max_pages` pages of the 'all' endpoint and the details of every listed app.

    For each page the app list is fetched, apps already recorded in the
    progress log are set aside as skipped, and details are scraped for the
    rest. ALL_REQUEST_DELAY seconds are awaited before every 'all' request
    after the first, including when the previous page returned nothing.
    When all pages are done, the run summary is written via
    save_daily_metadata().

    Counters: "attempted" counts only the apps actually requested in this
    run, so apps skipped because they were scraped earlier are not reported
    as failures.

    Args:
        max_pages: Number of 'all' pages to process, starting at page 0.
    """
    start_time = datetime.now().isoformat()
    total_attempted = 0
    total_successful = 0
    total_failed = 0

    async with aiohttp.ClientSession() as session:
        for page in range(max_pages):
            # Pace the 'all' requests up front so the delay also applies after
            # a page that came back empty.
            if page > 0:
                print(f"Waiting {ALL_REQUEST_DELAY} seconds before fetching next page...")
                await asyncio.sleep(ALL_REQUEST_DELAY)

            print(f"\nFetching app list from page {page}...")
            apps = await get_all_apps(session, page=page)
            if not apps:
                print(f"Warning: No apps returned for page {page}")
                continue

            # Extract app IDs from keys of the JSON result
            app_ids = [int(appid) for appid in apps.keys()]

            # Separate out apps finished in an earlier run; they are neither
            # attempted nor failed in this one.
            already_scraped = load_scraped_ids()
            pending_ids = [appid for appid in app_ids if appid not in already_scraped]
            skipped_on_page = len(app_ids) - len(pending_ids)

            attempted_on_page = len(pending_ids)
            successful_on_page = await scrape_appdetails_for_list(session, pending_ids)
            failed_on_page = attempted_on_page - successful_on_page
            print(
                f"Page {page} - Attempted: {attempted_on_page}, Successful: {successful_on_page}, "
                f"Failed: {failed_on_page}, Skipped (already scraped): {skipped_on_page}"
            )

            total_attempted += attempted_on_page
            total_successful += successful_on_page
            total_failed += failed_on_page

    end_time = datetime.now().isoformat()

    # Save the daily metadata
    save_daily_metadata(start_time, end_time, total_attempted, total_successful, total_failed, ERROR_LOG, METADATA_FILE, max_pages)

    print(f"\nScraping complete for {TODAY_DATE}.")
    print(f"Data saved to: {OUTPUT_FILE}")
    print(f"Errors logged in: {ERROR_LOG}")


Scraping time

In [68]:
# Run the scrape for STEAMSPY_PAGES pages; top-level `await` is used because
# this runs in a notebook cell.
await scrape_all(max_pages=STEAMSPY_PAGES)


Fetching app list from page 0...


Scraping appdetails:   0%|          | 0/1000 [00:00<?, ?it/s]

Page 0 - Attempted: 1000, Successful: 2, Failed: 998
Metadata saved to: Data\2025-05-19\daily_metadata.json

Scraping complete for 2025-05-19.
Data saved to: Data\2025-05-19\steamspy_data.jsonl
Errors logged in: Data\2025-05-19\steamspy_errors.log
