# Steam Bundle Scraping

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [119]:
# Load the sample CSV and shorten its column names to the identifiers used below.
fp = Path("data", "bundle10.csv")
column_aliases = {"AppID": "id", "Game Name": "title", "Bundle List URL": "url"}
df = pd.read_csv(fp).rename(columns=column_aliases)
df.head(3)

Unnamed: 0,id,title,url
0,2379780,Balatro,https://store.steampowered.com/bundlelist/2379...
1,1794680,Vampire Survivors,https://store.steampowered.com/bundlelist/1794...
2,1466390,Kathy Rain: Director's Cut,https://store.steampowered.com/bundlelist/1466...


In [120]:
# Extract the bundle-list URL column as a NumPy array so it can be iterated over later.
urls = df.loc[:, "url"].to_numpy()
urls

array(['https://store.steampowered.com/bundlelist/2379780/Balatro',
       'https://store.steampowered.com/bundlelist/1794680/Vampire_Survivors',
       'https://store.steampowered.com/bundlelist/1466390/Kathy_Rain_Directors_Cut',
       'https://store.steampowered.com/bundlelist/475550/Beholder',
       'https://store.steampowered.com/bundlelist/904570/TOK',
       'https://store.steampowered.com/bundlelist/892760/Seed_of_Evil',
       'https://store.steampowered.com/bundlelist/1419750/Apostle_Rebellion',
       'https://store.steampowered.com/bundlelist/2820820/Jotunnslayer_Hordes_of_Hel',
       'https://store.steampowered.com/bundlelist/701160/Kingdom_Two_Crowns',
       'https://store.steampowered.com/bundlelist/1642220/FIND_ALL'],
      dtype=object)

Selenium is used here because the bundle HTML is not present until after the page has finished loading.

In [121]:
# %pip install selenium

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

# Chrome options object; nothing is configured on it, so Chrome starts with its defaults.
# `options` is reused by the later cells that create a new browser session.
options = Options()
# NOTE(review): this session is not used by any cell shown before `driver` is rebound
# further down, and it is never quit() — presumably safe to drop; confirm.
driver = webdriver.Chrome(options=options)

# Step 1 - Individual Game Store Pages

In [7]:
def load_single_game_html(url, class_name, driver, wait_time=1):
    """Navigate to ``url`` and return the page source once a marker element exists.

    Parameters
    ----------
    url : str
        Page to open in the browser.
    class_name : str
        CSS class of an element whose presence signals that the content of
        interest has been added to the DOM.
    driver : selenium WebDriver
        Browser session used for the navigation.
    wait_time : int or float, default 1
        Maximum number of seconds to wait for the marker element.

    Returns
    -------
    str
        ``driver.page_source`` captured after the marker element was found.

    Raises
    ------
    Whatever ``WebDriverWait.until`` raises when the element does not show up
    within ``wait_time`` seconds.
    """
    driver.get(url)

    marker_locator = (By.CLASS_NAME, class_name)
    marker_present = EC.presence_of_element_located(marker_locator)
    WebDriverWait(driver, wait_time).until(marker_present)

    return driver.page_source

# Step 2 - Bundle URLs for Individual Games 

In [8]:
def get_bundle_urls(url, html, driver):
    """Collect the bundle page links listed on a game's bundle-list page.

    Parameters
    ----------
    url : str
        Bundle-list page URL. It is only fetched when ``html`` is empty.
    html : str or None
        Already-loaded page source for ``url``. When provided it is parsed
        directly, which avoids loading the same page a second time; pass
        ``None`` (or an empty string) to have the page loaded here.
    driver : selenium WebDriver
        Browser session used if the page has to be loaded.

    Returns
    -------
    list of str
        The ``href`` of the parent element of every ``StoreSaleWidgetTitle``
        div, in document order. Titles whose parent carries no ``href`` are
        skipped so callers never receive ``None`` as a URL.
    """
    if not html:
        html = load_single_game_html(url, "StoreSaleWidgetTitle", driver)

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    bundle_urls = []
    for title in soup.find_all("div", class_="StoreSaleWidgetTitle"):
        parent = title.parent
        link = parent.get("href") if parent is not None else None
        if link:
            bundle_urls.append(link)

    return bundle_urls

# Step 3 - Retrieve Metadata (bundle item names, genres)

In [125]:
# %pip install webdriver_manager

In [9]:
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Build a Chrome Service from the driver path returned by webdriver_manager's install().
service = Service(ChromeDriverManager().install())
# NOTE(review): this rebinds `driver` without quitting the session created in the earlier
# cell, and it does not pass `options` — confirm whether both sessions are intended.
driver = webdriver.Chrome(service=service)

In [10]:
from selenium.webdriver.common.action_chains import ActionChains

def load_age_restricted_html(url, class_name, driver):
    """Submit the age-check form shown for ``url`` and then load the real page.

    Parameters
    ----------
    url : str
        Page that is expected to show an age-verification form.
    class_name : str
        CSS class to wait for once the form has been submitted; forwarded to
        ``load_single_game_html``.
    driver : selenium WebDriver
        Browser session to drive.

    Returns
    -------
    str
        Page source returned by ``load_single_game_html`` for ``url``.

    Raises
    ------
    Whatever Selenium raises if the ``ageYear`` element does not appear within
    5 seconds or the ``view_product_page_btn`` element cannot be found.
    """
    driver.get(url)

    year_locator = (By.ID, "ageYear")
    year_select = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located(year_locator)
    )
    # Write the birth year straight into the element via JavaScript so the
    # dropdown registers the update.
    driver.execute_script("arguments[0].value = 2000;", year_select)

    submit_button = driver.find_element(By.ID, "view_product_page_btn")
    ActionChains(driver).move_to_element(submit_button).click().perform()

    # Navigate to the page again now that the age form has been submitted.
    return load_single_game_html(url, class_name, driver)


In [11]:
def get_meta_data(bundle_urls, html, driver, id=0):
    """Scrape item names and genre text for every bundle page in ``bundle_urls``.

    Parameters
    ----------
    bundle_urls : iterable of str
        URLs of individual bundle pages.
    html : str
        Not read; kept for call compatibility. Each bundle page is loaded
        fresh inside the loop.
    driver : selenium WebDriver
        Browser session used to load the pages.
    id : int, default 0
        Identifier assigned to the first bundle. It is decremented by one
        after each bundle that is processed (callers pass negative values).

    Returns
    -------
    (list of dict, int)
        The collected records, each with keys ``"name"``, ``"genres"`` and
        ``"bundle_id"``, plus the next unused id. If a bundle page cannot be
        loaded even through the age-check fallback, an empty list is returned
        together with the id reached so far (records gathered for earlier
        bundles in this call are dropped, as before).
    """
    class_name = "tab_item_content"
    meta_data = list()
    for url in bundle_urls:
        try:
            html = load_single_game_html(url, class_name, driver)
        except Exception as e:
            print("failed inside 1")
            print(e)
            try:
                # The fallback needs the class to wait for; omitting it made
                # every call raise TypeError, so the age-check path never ran.
                html = load_age_restricted_html(url, class_name, driver)
            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C still
                # interrupts a long scrape.
                print("not an age restriction issue.")
                print(url)
                return list(), id

        soup = BeautifulSoup(html, "html.parser")

        # Reset per bundle so a parsing failure can never pair this bundle's
        # id with names or genres left over from the previous iteration.
        titles, genres = [], []
        try:
            titles = [node.text for node in soup.find_all("div", class_="tab_item_name")]
        except Exception as e:
            print("failed inside 2")
            print(e)
            print(url)

        try:
            genres = [node.text for node in soup.find_all("div", class_="tab_item_details")]
        except Exception as e:
            print("failed inside 3")
            print(e)
            print(url)

        # zip() stops at the shorter list, so unmatched names/genres are dropped.
        for title, genre in zip(titles, genres):
            meta_data.append({
                "name": title,
                "genres": genre,
                "bundle_id": id,
            })
        id -= 1 # already using negative indexing

    # meta_data contains all bundle items for the given game (indicated by bundle_urls)
    return meta_data, id

# Step 4 - Bypass Age Verification + Combine Data

In [12]:
def build_current_games(urls, driver):
    """Scrape bundle contents for every bundle-list URL and return them as a table.

    For each URL the page is loaded, the bundle links on it are extracted, and
    the items of each bundle are collected. A URL that fails at any of these
    steps is reported on stdout and skipped.

    Parameters
    ----------
    urls : iterable of str
        Bundle-list page URLs, one per game.
    driver : selenium WebDriver
        Browser session shared by all page loads.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``get_meta_data``.
    """
    widget_class = "StoreSaleWidgetTitle"
    records = []
    next_id = 0

    for position, game_url in enumerate(urls):
        print(f"Currently visiting URL #{position}")

        try:
            page_html = load_single_game_html(game_url, widget_class, driver)
        except Exception as err:
            print("failed at 1")
            print(err)
            continue

        try:
            bundle_links = get_bundle_urls(game_url, page_html, driver)
        except Exception as err:
            print("failed at 2")
            print(err)
            continue

        try:
            # Bundle ids are negative to avoid conflicts with the old dataset;
            # -(n + 1) is the same value as ~n.
            game_records, last_id = get_meta_data(
                bundle_links, page_html, driver, id=-(next_id + 1)
            )
        except Exception as err:
            print("failed at 3")
            print(err)
            continue

        # Keep the counter positive and growing so later ids stay negative.
        next_id = -last_id
        records.extend(game_records)
        # Pause between games; only reached when all three steps succeeded.
        time.sleep(3)

    return pd.DataFrame(records)

In [130]:
# Open a fresh browser session on the store domain so cookies can be attached to it.
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://store.steampowered.com/")

    # add cookies to avoid age check inputs
    # (946684800 is the Unix timestamp for 2000-01-01 00:00:00 UTC)
    age_gate_cookies = (
        {'name': 'wants_mature_content', 'value': '1', 'domain': 'store.steampowered.com', 'path': '/'},
        {'name': 'birthtime', 'value': '946684800', 'domain': 'store.steampowered.com', 'path': '/'},
        {"name": "lastagecheckage", "value": "1-1-2000", "domain":"store.steampowered.com", "path": "/"},
    )
    for cookie in age_gate_cookies:
        driver.add_cookie(cookie)

    current_games = build_current_games(urls, driver)
finally:
    # Always close the browser, even if scraping raises, so Chrome sessions do not pile up.
    driver.quit()

current_games.shape

Currently visiting URL #0
Currently visiting URL #1
Currently visiting URL #2
Currently visiting URL #3
Currently visiting URL #4
Currently visiting URL #5
Currently visiting URL #6
Currently visiting URL #7
Currently visiting URL #8
Currently visiting URL #9


(723, 3)

In [None]:
# Size of the smallest bundle. `.size()` counts rows per group directly and avoids the
# warning that `groupby(...).apply(len)` emits on recent pandas versions.
current_games.groupby("bundle_id").size().min() # all groups created are valid

  current_games.groupby("bundle_id").apply(len).min()


np.int64(2)

# Retrieving New Data

In [151]:
# Read the headerless URL list and keep its first column as a NumPy array.
fp = Path("data", "bundle1000.csv")
many_urls = pd.read_csv(fp, header=None).loc[:, 0].to_numpy()
many_urls

array(['https://store.steampowered.com/bundlelist/485590/Nioh_Complete_Edition',
       'https://store.steampowered.com/bundlelist/1325200/Nioh_2__The_Complete_Edition',
       'https://store.steampowered.com/bundlelist/378540/The_Surge', ...,
       'https://store.steampowered.com/bundlelist/262060/Darkest_Dungeon',
       'https://store.steampowered.com/bundlelist/1940340/Darkest_Dungeon_II',
       'https://store.steampowered.com/bundlelist/1092790/Inscryption'],
      shape=(1082,), dtype=object)

In [154]:
# Open a fresh browser session on the store domain so cookies can be attached to it.
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://store.steampowered.com/")

    # add cookies to avoid age check inputs
    # (946684800 is the Unix timestamp for 2000-01-01 00:00:00 UTC)
    age_gate_cookies = (
        {'name': 'wants_mature_content', 'value': '1', 'domain': 'store.steampowered.com', 'path': '/'},
        {'name': 'birthtime', 'value': '946684800', 'domain': 'store.steampowered.com', 'path': '/'},
        {"name": "lastagecheckage", "value": "1-1-2000", "domain":"store.steampowered.com", "path": "/"},
    )
    for cookie in age_gate_cookies:
        driver.add_cookie(cookie)

    test_data = build_current_games(many_urls, driver)
finally:
    # Always close the browser, even if scraping raises, so Chrome sessions do not pile up.
    driver.quit()

test_data.shape

Currently visiting URL #0
failed at 1
Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7a412a235
	0x7ff7a3e82630
	0x7ff7a3c116dd
	0x7ff7a3c6a27e
	0x7ff7a3c6a58c
	0x7ff7a3cbed77
	0x7ff7a3cbbaba
	0x7ff7a3c5b0ed
	0x7ff7a3c5bf63
	0x7ff7a4155d60
	0x7ff7a414fe8a
	0x7ff7a4171005
	0x7ff7a3e9d71e
	0x7ff7a3ea4e1f
	0x7ff7a3e8b7c4
	0x7ff7a3e8b97f
	0x7ff7a3e718e8
	0x7ffcd3a7e8d7
	0x7ffcd4f2c53c

Currently visiting URL #1
Currently visiting URL #2
Currently visiting URL #3
Currently visiting URL #4
failed at 1
Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7a412a235
	0x7ff7a3e82630
	0x7ff7a3c116dd
	0x7ff7a3c6a27e
	0x7ff7a3c6a58c
	0x7ff7a3cbed77
	0x7ff7a3cbbaba
	0x7ff7a3c5b0ed
	0x7ff7a3c5bf63
	0x7ff7a4155d60
	0x7ff7a414fe8a
	0x7ff7a4171005
	0x7ff7a3e9d71e
	0x7ff7a3ea4e1f
	0x7ff7a3e8b7c4
	0x7ff7a3e8b97f
	0x7ff7a3e718e8
	0x7ffcd3a7e8d7
	0x7ffcd4f2c53c

Currently visiting URL #5
failed at 1
Message: 
Stacktrace:
Symbols not available. Dum

(11683, 3)

In [None]:
# Save the scraped rows to test_data.csv (path is relative to the working directory),
# leaving out the DataFrame index.
test_data.to_csv("test_data.csv", index=False)

# Selenium Video

In [18]:
# Re-read the URL list and keep only the first five entries for a short demo run.
fp = Path("data", "bundle1000.csv")
url_column = pd.read_csv(fp, header=None).loc[:, 0]
many_urls = url_column.to_numpy()[:5]
many_urls

array(['https://store.steampowered.com/bundlelist/485590/Nioh_Complete_Edition',
       'https://store.steampowered.com/bundlelist/1325200/Nioh_2__The_Complete_Edition',
       'https://store.steampowered.com/bundlelist/378540/The_Surge',
       'https://store.steampowered.com/bundlelist/644830/The_Surge_2',
       'https://store.steampowered.com/bundlelist/265300/Lords_of_the_Fallen_2014'],
      dtype=object)

In [19]:
# Open a fresh browser session on the store domain so cookies can be attached to it.
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://store.steampowered.com/")

    # add cookies to avoid age check inputs
    # (946684800 is the Unix timestamp for 2000-01-01 00:00:00 UTC)
    age_gate_cookies = (
        {'name': 'wants_mature_content', 'value': '1', 'domain': 'store.steampowered.com', 'path': '/'},
        {'name': 'birthtime', 'value': '946684800', 'domain': 'store.steampowered.com', 'path': '/'},
        {"name": "lastagecheckage", "value": "1-1-2000", "domain":"store.steampowered.com", "path": "/"},
    )
    for cookie in age_gate_cookies:
        driver.add_cookie(cookie)

    test_data = build_current_games(many_urls, driver)
finally:
    # Always close the browser, even if scraping raises, so Chrome sessions do not pile up.
    driver.quit()

test_data.shape

Currently visiting URL #0
failed at 1
Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff68c75a235
	0x7ff68c4b2630
	0x7ff68c2416dd
	0x7ff68c29a27e
	0x7ff68c29a58c
	0x7ff68c2eed77
	0x7ff68c2ebaba
	0x7ff68c28b0ed
	0x7ff68c28bf63
	0x7ff68c785d60
	0x7ff68c77fe8a
	0x7ff68c7a1005
	0x7ff68c4cd71e
	0x7ff68c4d4e1f
	0x7ff68c4bb7c4
	0x7ff68c4bb97f
	0x7ff68c4a18e8
	0x7ffcd3a7e8d7
	0x7ffcd4f2c53c

Currently visiting URL #1
Currently visiting URL #2
Currently visiting URL #3
Currently visiting URL #4
failed at 1
Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff68c75a235
	0x7ff68c4b2630
	0x7ff68c2416dd
	0x7ff68c29a27e
	0x7ff68c29a58c
	0x7ff68c2eed77
	0x7ff68c2ebaba
	0x7ff68c28b0ed
	0x7ff68c28bf63
	0x7ff68c785d60
	0x7ff68c77fe8a
	0x7ff68c7a1005
	0x7ff68c4cd71e
	0x7ff68c4d4e1f
	0x7ff68c4bb7c4
	0x7ff68c4bb97f
	0x7ff68c4a18e8
	0x7ffcd3a7e8d7
	0x7ffcd4f2c53c



(7, 3)