In [175]:
from selectolax.parser import HTMLParser
from playwright.async_api import async_playwright
import pandas as pd
from httpx import get
import json

In [176]:
CONFIG='config-steam.json'

def get_config(path=None):
    """Load the scraper configuration from a JSON file.

    Args:
        path: Location of the JSON config file. When omitted, the
            module-level ``CONFIG`` path is used (looked up at call time).

    Returns:
        The decoded JSON document (a dict for the config used here).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    if path is None:
        path = CONFIG
    # JSON is UTF-8 by specification; be explicit so that non-ASCII text in the
    # config is not decoded with the platform's locale encoding.
    with open(path, encoding='utf-8') as c:
        return json.load(c)

# Load the scraper settings once and expose the pieces that later cells use.
config = get_config()

# Pull the four top-level entries out of the config in one step:
#   url        -> page to scrape
#   container  -> selector spec for the per-item wrapper
#   selectors  -> list of per-field selector specs (stored under 'item')
#   out_file   -> destination path for the CSV
url, container, selectors, out_file = (
    config[key] for key in ('url', 'container', 'item', 'output_file')
)

# Echo the loaded configuration as the cell output.
config

{'url': 'https://store.steampowered.com/specials',
 'output_file': 'steam_games.csv',
 'container': {'name': 'store_sale_divs',
  'selector': 'div[class*="StoreSaleWidgetContainer"]',
  'match': 'all',
  'type': 'node'},
 'item': [{'name': 'title',
   'selector': 'div[class*="salepreviewwidgets_StoreSaleWidgetTitle"]',
   'match': 'first',
   'type': 'text'},
  {'name': 'thumbnail_link',
   'selector': 'img[class*="CapsuleImage"]',
   'match': 'first',
   'type': 'src'},
  {'name': 'category_tags',
   'selector': 'div[class*="StoreSaleWidgetTags"] > a',
   'match': 'all',
   'type': 'text'},
  {'name': 'rating',
   'selector': 'div[class*="ReviewScoreValue"] > div',
   'match': 'first',
   'type': 'text'},
  {'name': 'nr_reviews',
   'selector': 'div[class*="ReviewScoreValue"] > div[class*="ReviewScoreCount"]',
   'match': 'first',
   'type': 'text'},
  {'name': 'original_price',
   'selector': 'div[class*="StoreOriginalPrice"]',
   'match': 'first',
   'type': 'text'},
  {'name': 'dis

In [192]:
container_selector = container['selector']

async def playwright_get_html(page_url=None, wait_selector=None):
    """Render a page in headless Chromium and return the HTML of its <body>.

    Args:
        page_url: Address to open. Defaults to the module-level ``url``.
        wait_selector: CSS selector that must be present before the HTML is
            captured. Defaults to the module-level ``container_selector``.

    Returns:
        The inner HTML of the page's ``body`` element as a string.

    Raises:
        playwright errors (e.g. a timeout) if navigation fails or the
        selector never appears.
    """
    if page_url is None:
        page_url = url
    if wait_selector is None:
        wait_selector = container_selector

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(page_url)

            await page.wait_for_load_state("networkidle")
            # Jump to the bottom of the page before waiting for the items
            # (presumably to trigger lazily loaded content -- confirm).
            await page.evaluate("() => window.scroll(0, document.body.scrollHeight)")
            await page.wait_for_selector(wait_selector)

            return await page.inner_html('body')
        finally:
            # Always release the browser, including when a wait times out.
            await browser.close()

In [193]:
# Fetch the rendered page body once and keep it as a string in `html`.
# This is a top-level `await`, which relies on the notebook kernel running
# the cell inside an event loop; a plain script would need asyncio.run(...).
html = await playwright_get_html()

In [194]:
def get_containers():
    """Select the item containers from the downloaded page.

    Reads the module-level ``html``, ``container`` and ``container_selector``.

    Returns:
        A list of matching nodes when ``container['type']`` is ``'node'``, or
        a list of their text when it is ``'text'``. The list is empty when
        nothing matches.

    Raises:
        ValueError: if ``container['match']`` is not ``'all'``/``'first'`` or
            ``container['type']`` is not ``'node'``/``'text'``.
    """
    tree = HTMLParser(html)

    match_mode = container['match']
    if match_mode == 'all':
        nodes = tree.css(container_selector)
    elif match_mode == 'first':
        # css_first() returns None when nothing matches; do not pass that on
        # as a one-element list containing None.
        first = tree.css_first(container_selector)
        nodes = [] if first is None else [first]
    else:
        raise ValueError(f"unsupported container match mode: {match_mode!r}")

    result_type = container['type']
    if result_type == 'node':
        return nodes
    if result_type == 'text':
        return [found.text() for found in nodes]
    raise ValueError(f"unsupported container type: {result_type!r}")

containers = get_containers()

In [195]:
from selectolax.parser import Node

def get_parsed_info(node:Node, selectors:list=selectors):
    """Extract the configured fields from one container node.

    Args:
        node: The container node to search within.
        selectors: Field specs; each is a dict with the keys ``'name'``,
            ``'selector'``, ``'match'`` (``'all'`` or ``'first'``) and
            ``'type'`` (``'text'`` or the name of an attribute to read).

    Returns:
        A dict mapping each spec's ``'name'`` to the extracted value:
        a single value (or None when nothing matched) for ``'first'`` specs,
        and always a list for ``'all'`` specs.

    Raises:
        ValueError: if a spec has an unsupported ``'match'`` value.
    """
    parsed = {}

    for s in selectors:
        match_mode = s['match']
        if match_mode == 'all':
            elems = node.css(s['selector'])
        elif match_mode == 'first':
            # css_first() returns None when the element is absent.
            first = node.css_first(s['selector'])
            elems = [] if first is None else [first]
        else:
            raise ValueError(
                f"unsupported match mode {match_mode!r} for {s['name']!r}"
            )

        if s['type'] == 'text':
            values = [el.text() for el in elems]
        else:
            # A missing attribute yields None instead of raising KeyError.
            values = [el.attributes.get(s['type']) for el in elems]

        # The result shape follows the spec, not the number of hits, so an
        # 'all' field with exactly one hit stays a list rather than becoming
        # a bare string.
        if match_mode == 'first':
            parsed[s['name']] = values[0] if values else None
        else:
            parsed[s['name']] = values

    return parsed

In [196]:
# Run the field extraction over every container, in page order.
parsed = list(map(get_parsed_info, containers))

# Show the first record as a spot check.
parsed[0]

{'title': 'Forza Horizon 5',
 'thumbnail_link': 'https://cdn.cloudflare.steamstatic.com/steam/apps/1551360/header.jpg?t=1692199916',
 'category_tags': ['Racing',
  'Open World',
  'Driving',
  'Multiplayer',
  'Automobile Sim',
  'Adventure',
  'Realistic',
  'Simulation',
  'Exploration',
  'Arcade',
  'First-Person',
  'Atmospheric',
  'Sports',
  'Co-op',
  'PvP',
  'Online Co-Op',
  'Singleplayer',
  'Beautiful',
  'Action',
  'Third Person'],
 'rating': 'Very Positive',
 'nr_reviews': '| 124,131 User Reviews',
 'original_price': '59,99€',
 'discounted_price': '29,99€',
 'discount': '-50%',
 'release_date': 'Nov 9, 2021'}

In [197]:
from datetime import datetime
import re

def reformat_date(date, from_format, out_format):
    """Re-render a date string in a different strftime/strptime format.

    Args:
        date: The date text to convert.
        from_format: Format that ``date`` is currently written in.
        out_format: Format to produce.

    Returns:
        ``date`` rewritten according to ``out_format``.

    Raises:
        ValueError: if ``date`` does not match ``from_format``.
    """
    return datetime.strptime(date, from_format).strftime(out_format)

def get_price(price):
    """Extract the numeric part of a price string.

    The currency symbol may come before or after the amount, and the amount
    may use ``,`` or ``.`` as separators, e.g. ``'59,99€'`` -> ``'59,99'``
    and ``'$59.99'`` -> ``'59.99'``.

    Args:
        price: Price text as scraped from the page, or None.

    Returns:
        The first number found in ``price`` as a string (separators kept as
        written), or None when ``price`` is empty or contains no digits.
    """
    if not price:
        return None
    # search() rather than match(): the amount need not start the string.
    # Raw string so that \d is a regex class, not a string escape.
    found = re.search(r'\d+(?:[.,]\d+)*', price)
    return found.group(0) if found else None

def transform(parsed:dict):
    """Normalise one scraped record in place and return it.

    Changes made to ``parsed``:
      * ``category_tags``: trimmed to the first five tags (a single bare
        string is treated as a one-tag list).
      * ``nr_reviews``: reduced to the number it contains, e.g.
        ``'| 124,131 User Reviews'`` -> ``'124,131'``; None if no number.
      * ``release_date``: converted from ``'%b %d, %Y'`` to ``'%Y-%m-%d'``.
      * ``currency``: new key holding whatever is left of ``original_price``
        after removing digits, separators and whitespace; None if nothing
        is left.
      * ``original_price`` / ``discounted_price``: passed through
        ``get_price``.

    Args:
        parsed: Record as produced by the parsing step; modified in place.

    Returns:
        The same ``parsed`` dict, for convenience.

    Raises:
        KeyError: if an expected field is missing from ``parsed``.
        ValueError: if ``release_date`` is not in ``'%b %d, %Y'`` form.
    """
    tags = parsed['category_tags']
    if tags is None:
        tags = []
    elif isinstance(tags, str):
        # Slicing a bare string would keep its first five characters.
        tags = [tags]
    parsed['category_tags'] = tags[:5]

    # Pick the number out directly instead of relying on it being the second
    # space-separated token.
    count = re.search(r'\d[\d.,]*', parsed['nr_reviews'] or '')
    parsed['nr_reviews'] = count.group(0) if count else None

    parsed['release_date'] = reformat_date(parsed['release_date'], from_format='%b %d, %Y', out_format='%Y-%m-%d')

    # The symbol may be a prefix or a suffix, so strip the amount rather than
    # taking the last character.
    price_text = parsed['original_price']
    parsed['currency'] = re.sub(r'[\d.,\s]', '', price_text or '') or None
    parsed['original_price'] = get_price(price_text)
    parsed['discounted_price'] = get_price(parsed['discounted_price'])
    return parsed

In [198]:
# transform() is called for its in-place effect on each record, so use a plain
# loop rather than building a throwaway list.
# NOTE: transform() expects raw scraped strings (e.g. release_date in
# '%b %d, %Y' form); re-run the parsing cell before re-running this one.
for record in parsed:
    transform(record)
parsed

[{'title': 'Forza Horizon 5',
  'thumbnail_link': 'https://cdn.cloudflare.steamstatic.com/steam/apps/1551360/header.jpg?t=1692199916',
  'category_tags': ['Racing',
   'Open World',
   'Driving',
   'Multiplayer',
   'Automobile Sim'],
  'rating': 'Very Positive',
  'nr_reviews': '124,131',
  'original_price': '59,99',
  'discounted_price': '29,99',
  'discount': '-50%',
  'release_date': '2021-11-09',
  'currency': '€'},
 {'title': 'F1® 23',
  'thumbnail_link': 'https://cdn.cloudflare.steamstatic.com/steam/apps/2108330/header.jpg?t=1688987889',
  'category_tags': ['VR', 'Racing', 'Driving', 'Sports', 'Multiplayer'],
  'rating': 'Very Positive',
  'nr_reviews': '4,358',
  'original_price': '69,99',
  'discounted_price': '41,99',
  'discount': '-40%',
  'release_date': '2023-06-16',
  'currency': '€'},
 {'title': 'Crusader Kings III',
  'thumbnail_link': 'https://cdn.cloudflare.steamstatic.com/steam/apps/1158310/header.jpg?t=1692714098',
  'category_tags': ['Strategy',
   'Simulation',


In [200]:
# Persist the transformed records as CSV at the path taken from the config.
# `final` is another name for the same list object as `parsed`, not a copy.
final = parsed

df = pd.DataFrame(data=final)
df.to_csv(out_file, index=False)