In [1]:
%config Completer.use_jedi = False
%load_ext autoreload

# Imports & Loads

In [160]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from requests.compat import urljoin
from datetime import datetime
import re
from time import sleep


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

In [7]:
LOGIN_USERNAME_FIELD = '//*[@id="inputUsername"]'
LOGIN_PASSWORD_FIELD = '//*[@id="inputPassword"]'
LOGIN_BUTTON = '//*[@id="mainbody"]/div/div/gg-login-page/div[1]/div/gg-login-form/form/fieldset/div[3]/button[1]'

with open("/home/msnow/config.json", "r") as fp:
    secrets = json.load(fp)
USERNAME = secrets["bgg_crawler"]["username"]
PASSWORD = secrets["bgg_crawler"]["password"]

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
cookies = {}

In [162]:
driver = webdriver.Chrome(service=Service('/usr/lib/chromium-browser/chromedriver'), options=chrome_options)
driver.get("https://boardgamegeek.com/login")
login = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, LOGIN_USERNAME_FIELD))
)
password = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, LOGIN_PASSWORD_FIELD))
)

login_button = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, LOGIN_BUTTON))
)

login.send_keys(USERNAME)
password.send_keys(PASSWORD)

login_button.click()
sleep(1)
selenium_cookies = driver.get_cookies()
for cookie in selenium_cookies:
    cookies[cookie["name"]] = cookie["value"]

# Functions

In [164]:
def browse_games(page_num, cookies):
    bs_url = "https://boardgamegeek.com/browse/boardgame/page/"
    pg_url = f"{bs_url}{page_num}"
    if page_num <= 10:
        pg = requests.get(pg_url)
    else:
        pg = requests.get(pg_url, cookies=cookies)
    soup = BeautifulSoup(pg.content, "html.parser")
    return soup

In [16]:
def extract_game_ids(soup):
    bs_pg = "https://boardgamegeek.com/"
    all_games = soup.find_all("td", {"class": "collection_objectname"})
    game_ids = [x.find("a")["href"].split("/")[-2] for x in all_games]
    game_pages = [urljoin(bs_pg, x.find("a")["href"]) for x in all_games]
    return game_ids, game_pages

In [121]:
def extract_xml(soup, game_links):
    item_list = []
    items = soup.find_all("item")
    for idx, item in enumerate(items):
        item_list.append(extract_item(item, game_links[idx]))
    return item_list

In [110]:
def extract_item(game_item, game_url):
    game_dict = {"name": game_item.find("name")["value"], "game_id": game_item["id"]}
    values_int = [
        "yearpublished",
        "minplayers",
        "maxplayers",
        "playingtime",
        "minplaytime",
        "maxplaytime",
        "minage",
    ]
    for vals in values_int:
        game_dict[vals] = game_item.find(vals)["value"]
    link_categ = [
        "boardgamecategory",
        "boardgamemechanic",
        "boardgamefamily",
        "boardgameexpansion",
        "boardgameartist",
        "boardgamecompilation",
        "boardgameimplementation",
        "boardgamedesigner",
        "boardgamepublisher",
        "boardgameintegration",
    ]
    for categ in link_categ:
        game_dict[categ] = [
            x["value"] for x in game_item.find_all("link", {"type": categ})
        ]
    stats_float = ["average", "bayesaverage", "stddev", "median", "averageweight"]
    for stat in stats_float:
        game_dict[stat] = float(game_item.find(stat)["value"])
    stats_int = [
        "usersrated",
        "owned",
        "trading",
        "wanting",
        "wishing",
        "numcomments",
        "numweights",
    ]
    for stat in stats_int:
        game_dict[stat] = int(game_item.find(stat)["value"])
    for game_cat in game_item.find_all("rank"):
        cat_name = re.sub("\W", "", game_cat["friendlyname"])
        game_dict[cat_name] = int(game_cat["value"])
    # Player count recommendations
    player_count_poll = game_item.find("poll", attrs={"name": "suggested_numplayers"})
    result_dict = {"total_votes": int(player_count_poll.attrs["totalvotes"])}
    player_count_results = player_count_poll.findAll("results")
    game_dict["player_count_recs"] = {}
    for player_count in player_count_results:
        num_players = player_count.attrs["numplayers"]
        player_count_values = {
            x.attrs["value"]: int(x.attrs["numvotes"])
            for x in player_count.findAll("result")
        }
        play_count_rec = max(player_count_values, key=player_count_values.get)
        if play_count_rec in game_dict["player_count_recs"]:
            game_dict["player_count_recs"][play_count_rec].append(num_players)
        else:
            game_dict["player_count_recs"][play_count_rec] = [num_players]
        result_dict[num_players] = player_count_values
        result_dict[num_players]["total_votes"] = sum(
            int(x.attrs["numvotes"]) for x in player_count.findAll("result")
        )
    game_dict["suggested_numplayers"] = result_dict
    return game_dict

In [120]:
def game_data():
    xml_bs = "https://www.boardgamegeek.com/xmlapi2/thing?type=boardgame&stats=1&ratingcomments=1&page=1&pagesize=10&id="
    all_items = []
    for pg in range(1, 51):
        pg_items = []
        ct = 0
        soup_pg = browse_games(pg)
        pg_ids, pg_links = extract_game_ids(soup_pg)
        print(f"page number {pg} attempt number {ct}")
        while len(pg_items) != 100 and ct < 20:
            xml_fl = requests.get(f'{xml_bs}{",".join(pg_ids)}')
            soup_xml = BeautifulSoup(xml_fl.content, "xml")
            pg_items = extract_xml(soup_xml, pg_links)
            ct += 1
            if ct > 1:
                print(f"page number {pg} attempt number {ct}")
                print(len(pg_items))
        all_items += pg_items
    return all_items

# Scrape Main Pages

In [8]:
xml_bs = "https://www.boardgamegeek.com/xmlapi2/thing?type=boardgame&stats=1&ratingcomments=1&page=1&pagesize=10&id="
all_items = []

In [126]:
pg = 2

In [136]:
len(all_items)

300

In [140]:
xml_fl = requests.get(f'{xml_bs}{",".join(pg_ids)}')

In [141]:
soup_xml = BeautifulSoup(xml_fl.content, "xml")

In [144]:
all_items += pg_items

In [143]:
pg_items = extract_xml(soup_xml, pg_links)
print(pg, len(pg_items))
if len(pg_items)< 100:
    break
else:
    all_items += pg_items

4 100


SyntaxError: 'break' outside loop (1099662947.py, line 4)

In [150]:
BeautifulSoup(xml_fl.content, "xml")

<?xml version="1.0" encoding="utf-8"?>
<items termsofuse="https://boardgamegeek.com/xmlapi/termsofuse"/>

In [154]:
soup_pg

<!DOCTYPE html>

<html lang="en-US">
<head>
<link href="https://cf.geekdo-static.com/icons/touch-icon180.png" rel="apple-touch-icon"/>
<link href="https://cf.geekdo-static.com/icons/favicon2.ico" rel="shortcut icon" type="image/ico"/>
<link href="https://cf.geekdo-static.com/icons/favicon2.ico" rel="icon" type="image/ico"/>
<link href="/game-opensearch.xml" rel="search" title="BGG Game Search" type="application/opensearchdescription+xml"/>
<meta content="BGG" name="apple-mobile-web-app-title"/>
<meta content="#2e2b47" name="theme-color"/>
<link href="https://api.geekdo.com" rel="preconnect"/>
<meta content="board game, boardgames, boardgame, board, games, game, hobby, boardgamegeek, geek, geekdo" name="keywords"/>
<script>window.AdSlots = window.AdSlots || {
	cmd: [],
	disableScripts: ['gpt'],
	renderOnFirstLoad: false,
	divCheck: false
};</script>
<title>BoardGameGeek</title><link crossorigin="" href="https://use.typekit.net" rel="preconnect"/>
<meta charset="utf-8">
<!-- Load the Pro

In [182]:
for pg in range(40,51):
    pg_items = []
    ct = 0
    soup_pg = browse_games(pg, cookies=cookies)
    pg_ids, pg_links = extract_game_ids(soup_pg)
    xml_fl = requests.get(f'{xml_bs}{",".join(pg_ids)}')
    soup_xml = BeautifulSoup(xml_fl.content, "xml")
    pg_items = extract_xml(soup_xml, pg_links)
    print(pg, len(pg_items))
    if len(pg_items)< 100:
        break
    else:
        all_items += pg_items
        print(len(all_items))

40 100
4100
41 100
4200
42 100
4300
43 100
4400
44 100
4500
45 100
4600
46 100
4700
47 100
4800
48 100
4900
49 100
5000
50 100
5100


In [171]:
df_bgg = pd.DataFrame(all_items)
df_bgg.head()

Unnamed: 0,name,game_id,yearpublished,minplayers,maxplayers,playingtime,minplaytime,maxplaytime,minage,boardgamecategory,...,player_count_recs,suggested_numplayers,ThematicRank,WarGameRank,FamilyGameRank,CustomizableRank,AbstractGameRank,PartyGameRank,ChildrensGameRank,RPGItemRank
0,Brass: Birmingham,224517,2018,2,4,120,60,120,14,"[Age of Reason, Economic, Industry / Manufactu...",...,"{'Not Recommended': ['1', '4+'], 'Recommended'...","{'total_votes': 994, '1': {'Best': 0, 'Recomme...",,,,,,,,
1,Pandemic Legacy: Season 1,161936,2015,2,4,60,60,60,13,"[Environmental, Medical]",...,"{'Not Recommended': ['1', '4+'], 'Recommended'...","{'total_votes': 836, '1': {'Best': 19, 'Recomm...",1.0,,,,,,,
2,Gloomhaven,174430,2017,1,4,120,60,120,14,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,"{'Recommended': ['1', '2', '4'], 'Best': ['3']...","{'total_votes': 1492, '1': {'Best': 143, 'Reco...",2.0,,,,,,,
3,Ark Nova,342942,2021,1,4,150,90,150,14,"[Animals, Economic, Environmental]",...,"{'Recommended': ['1', '3'], 'Best': ['2'], 'No...","{'total_votes': 1538, '1': {'Best': 141, 'Reco...",,,,,,,,
4,Twilight Imperium: Fourth Edition,233078,2017,3,6,480,240,480,14,"[Civilization, Economic, Exploration, Negotiat...",...,"{'Not Recommended': ['1', '2', '6+'], 'Recomme...","{'total_votes': 524, '1': {'Best': 0, 'Recomme...",3.0,,,,,,,


In [172]:
!pwd

/home/msnow/git/bgg/notebooks


In [184]:
df_bgg = pd.DataFrame(all_items)
df_bgg.to_csv(
    f"../data/outputs/{str(datetime.now().date())}_bgg_top{len(all_items)}_pipes.csv",
    index=False,
    sep="|"
)

In [175]:
df_bgg.to_csv("../data/outputs/bgg_top_1900.csv", index=False, sep=",")

In [129]:
all_items += pg_items

In [130]:
len(all_items)

200

In [20]:
pg = 1
ct = 0
pg_items = []

In [18]:
bs_url = "https://boardgamegeek.com/browse/boardgame/page/"
pg_url = f"{bs_url}{pg}"
pg = requests.get(pg_url)

In [21]:
soup_pg = browse_games(pg)

In [23]:
pg_ids, pg_links = extract_game_ids(soup_pg)

In [25]:
xml_fl = requests.get(f'{xml_bs}{",".join(pg_ids)}')

In [111]:
soup_xml = BeautifulSoup(xml_fl.content, "xml")
pg_items = extract_xml(soup_xml, pg_links)

In [114]:
soup_xml[0]

KeyError: 0

In [119]:
pd.DataFrame(pg_items)

Unnamed: 0,name,game_id,yearpublished,minplayers,maxplayers,playingtime,minplaytime,maxplaytime,minage,boardgamecategory,...,BoardGameRank,StrategyGameRank,player_count_recs,suggested_numplayers,ThematicRank,WarGameRank,FamilyGameRank,CustomizableRank,AbstractGameRank,PartyGameRank
0,Brass: Birmingham,224517,2018,2,4,120,60,120,14,"[Age of Reason, Economic, Industry / Manufactu...",...,1,1.0,"{'Not Recommended': ['1', '4+'], 'Recommended'...","{'total_votes': 994, '1': {'Best': 0, 'Recomme...",,,,,,
1,Pandemic Legacy: Season 1,161936,2015,2,4,60,60,60,13,"[Environmental, Medical]",...,2,2.0,"{'Not Recommended': ['1', '4+'], 'Recommended'...","{'total_votes': 836, '1': {'Best': 19, 'Recomm...",1.0,,,,,
2,Gloomhaven,174430,2017,1,4,120,60,120,14,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,3,4.0,"{'Recommended': ['1', '2', '4'], 'Best': ['3']...","{'total_votes': 1492, '1': {'Best': 143, 'Reco...",2.0,,,,,
3,Ark Nova,342942,2021,1,4,150,90,150,14,"[Animals, Economic, Environmental]",...,4,3.0,"{'Recommended': ['1', '3'], 'Best': ['2'], 'No...","{'total_votes': 1538, '1': {'Best': 141, 'Reco...",,,,,,
4,Twilight Imperium: Fourth Edition,233078,2017,3,6,480,240,480,14,"[Civilization, Economic, Exploration, Negotiat...",...,5,5.0,"{'Not Recommended': ['1', '2', '6+'], 'Recomme...","{'total_votes': 524, '1': {'Best': 0, 'Recomme...",3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Decrypto,225694,2018,3,8,45,15,45,12,"[Deduction, Party Game, Spies/Secret Agents, W...",...,96,,"{'Not Recommended': ['1', '2', '3', '8+'], 'Be...","{'total_votes': 313, '1': {'Best': 0, 'Recomme...",,,,,,1.0
96,Inis,155821,2016,2,4,90,60,90,14,"[Card Game, Exploration, Medieval, Mythology, ...",...,97,85.0,"{'Not Recommended': ['1', '4+'], 'Recommended'...","{'total_votes': 442, '1': {'Best': 0, 'Recomme...",,,,,,
97,The 7th Continent,180263,2017,1,4,1000,5,1000,14,"[Adventure, Card Game, Exploration, Science Fi...",...,98,,"{'Best': ['1', '2'], 'Not Recommended': ['3', ...","{'total_votes': 643, '1': {'Best': 416, 'Recom...",28.0,,,,,
98,Keyflower,122515,2012,2,6,120,90,120,12,"[City Building, Economic, Renaissance, Territo...",...,99,80.0,"{'Not Recommended': ['1', '6+'], 'Recommended'...","{'total_votes': 418, '1': {'Best': 0, 'Recomme...",,,,,,


In [33]:
pg_items = extract_xml(soup_xml, pg_links)

In [116]:
item_0

<item id="224517" type="boardgame">
<thumbnail>https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg70Jgw__thumb/img/o18rjEemoWaVru9Y2TyPwuIaRfE=/fit-in/200x150/filters:strip_icc()/pic3490053.jpg</thumbnail>
<image>https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg70Jgw__original/img/FpyxH41Y6_ROoePAilPNEhXnzO8=/0x0/filters:format(jpeg)/pic3490053.jpg</image>
<name sortindex="1" type="primary" value="Brass: Birmingham"/>
<name sortindex="1" type="alternate" value="Brass. Бирмингем"/>
<name sortindex="1" type="alternate" value="Brass. Бірмінгем"/>
<name sortindex="1" type="alternate" value="ブラス：バーミンガム"/>
<name sortindex="1" type="alternate" value="工业革命：伯明翰(Chinese edition) (2018)"/>
<name sortindex="1" type="alternate" value="工業革命：伯明翰"/>
<name sortindex="1" type="alternate" value="브라스: 버밍엄"/>
<description>Brass: Birmingham is an economic strategy game sequel to Martin Wallace' 2007 masterpiece, Brass. Brass: Birmingham tells the story of competing entrepreneurs in Birmingham during the industrial 

In [48]:
soup_xml = BeautifulSoup(xml_fl.content, "xml")
soup_items = soup_xml.find_all("item")
item_0 = soup_items[0]

In [88]:
player_count_poll = item_0.find("poll",attrs={"name":"suggested_numplayers"})
player_count_results = player_count_poll.findAll("results")
result_dict = {"total_votes":int(player_count_poll.attrs["totalvotes"])}
for player_count in player_count_results:
    result_dict[player_count.attrs["numplayers"]] = {x.attrs["value"]:int(x.attrs["numvotes"]) for x in player_count.findAll("result")}
    result_dict[player_count.attrs["numplayers"]]["total_votes"] = 

In [None]:
aa = {x.attrs["value"]:int(x.attrs["numvotes"]) for x in player_count.findAll("result")}

In [104]:
aa

{'Best': 1, 'Recommended': 6, 'Not Recommended': 441}

In [103]:
aa = {x.attrs["value"]:int(x.attrs["numvotes"]) for x in player_count.findAll("result")}
max(aa, key=aa.get)

'Not Recommended'

In [97]:
sum(int(x.attrs["numvotes"]) for x in player_count.findAll("result"))

448

In [85]:
player_count_poll.attrs

{'name': 'suggested_numplayers',
 'title': 'User Suggested Number of Players',
 'totalvotes': '994'}

In [87]:
int(player_count_poll.attrs["totalvotes"])

994

In [78]:
player_count_poll

<poll name="suggested_numplayers" title="User Suggested Number of Players" totalvotes="994">
<results numplayers="1">
<result numvotes="0" value="Best"/>
<result numvotes="50" value="Recommended"/>
<result numvotes="528" value="Not Recommended"/>
</results>
<results numplayers="2">
<result numvotes="97" value="Best"/>
<result numvotes="635" value="Recommended"/>
<result numvotes="118" value="Not Recommended"/>
</results>
<results numplayers="3">
<result numvotes="541" value="Best"/>
<result numvotes="321" value="Recommended"/>
<result numvotes="11" value="Not Recommended"/>
</results>
<results numplayers="4">
<result numvotes="528" value="Best"/>
<result numvotes="284" value="Recommended"/>
<result numvotes="36" value="Not Recommended"/>
</results>
<results numplayers="4+">
<result numvotes="1" value="Best"/>
<result numvotes="6" value="Recommended"/>
<result numvotes="441" value="Not Recommended"/>
</results>
</poll>

In [73]:
aa

<results numplayers="1">
<result numvotes="0" value="Best"/>
<result numvotes="50" value="Recommended"/>
<result numvotes="528" value="Not Recommended"/>
</results>

In [77]:
{x.attrs["value"]:int(x.attrs["numvotes"]) for x in aa.findAll("result")}

{'Best': 0, 'Recommended': 50, 'Not Recommended': 528}

In [68]:
aa.findAll("result")[0]

[<result numvotes="0" value="Best"/>,
 <result numvotes="50" value="Recommended"/>,
 <result numvotes="528" value="Not Recommended"/>]

In [67]:
aa.attrs["numplayers"]

'1'

In [35]:
soup_xml

<?xml version="1.0" encoding="utf-8"?>
<items termsofuse="https://boardgamegeek.com/xmlapi/termsofuse"><item id="224517" type="boardgame">
<thumbnail>https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg70Jgw__thumb/img/o18rjEemoWaVru9Y2TyPwuIaRfE=/fit-in/200x150/filters:strip_icc()/pic3490053.jpg</thumbnail>
<image>https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg70Jgw__original/img/FpyxH41Y6_ROoePAilPNEhXnzO8=/0x0/filters:format(jpeg)/pic3490053.jpg</image>
<name sortindex="1" type="primary" value="Brass: Birmingham"/>
<name sortindex="1" type="alternate" value="Brass. Бирмингем"/>
<name sortindex="1" type="alternate" value="Brass. Бірмінгем"/>
<name sortindex="1" type="alternate" value="ブラス：バーミンガム"/>
<name sortindex="1" type="alternate" value="工业革命：伯明翰(Chinese edition) (2018)"/>
<name sortindex="1" type="alternate" value="工業革命：伯明翰"/>
<name sortindex="1" type="alternate" value="브라스: 버밍엄"/>
<description>Brass: Birmingham is an economic strategy game sequel to Martin Wallace' 2007 masterpiece, B

In [36]:
item_list = []
items = soup_xml.find_all("item")

In [38]:
items[0]

<item id="224517" type="boardgame">
<thumbnail>https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg70Jgw__thumb/img/o18rjEemoWaVru9Y2TyPwuIaRfE=/fit-in/200x150/filters:strip_icc()/pic3490053.jpg</thumbnail>
<image>https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg70Jgw__original/img/FpyxH41Y6_ROoePAilPNEhXnzO8=/0x0/filters:format(jpeg)/pic3490053.jpg</image>
<name sortindex="1" type="primary" value="Brass: Birmingham"/>
<name sortindex="1" type="alternate" value="Brass. Бирмингем"/>
<name sortindex="1" type="alternate" value="Brass. Бірмінгем"/>
<name sortindex="1" type="alternate" value="ブラス：バーミンガム"/>
<name sortindex="1" type="alternate" value="工业革命：伯明翰(Chinese edition) (2018)"/>
<name sortindex="1" type="alternate" value="工業革命：伯明翰"/>
<name sortindex="1" type="alternate" value="브라스: 버밍엄"/>
<description>Brass: Birmingham is an economic strategy game sequel to Martin Wallace' 2007 masterpiece, Brass. Brass: Birmingham tells the story of competing entrepreneurs in Birmingham during the industrial 