# Imports and Paths

In [165]:
from urllib import request
from bs4 import BeautifulSoup, Comment
import pandas as pd
from datetime import datetime
from shutil import copyfile
import time
import json

In [None]:
PATH = '../data/'

In [4]:
bs_pg = 'https://www.boardgamegeek.com/xmlapi2/'

# XML2 API

Base URI: /xmlapi2/thing?parameters
- id=NNN	
  - Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids.
- type=THINGTYPE	
  - Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
- versions=1	
  - Returns version info for the item.
- videos = 1	
  - Returns videos for the item.
- stats=1		
  - Returns ranking and rating stats for the item.
- historical=1		
  - Returns historical data over time. See page parameter.
- marketplace=1		
  - Returns marketplace data.
- comments=1		
  - Returns all comments about the item. Also includes ratings when commented. See page parameter.
- ratingcomments=1		
  - Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the \<comments\> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
- page=NNN		
  - Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
- pagesize=NNN		
  - Set the number of records to return in paging. Minimum is 10, maximum is 100.
- from=YYYY-MM-DD		
  - Not currently supported.
- to=YYYY-MM-DD		
  - Not currently supported.

# Get the id's of the top 2000 board games

In [50]:
pg_gm_rnks = 'https://boardgamegeek.com/browse/boardgame/page/'

In [51]:
pg_num = 1
pg = request.urlopen(f'{pg_gm_rnks}{str(pg_num)}')
soup = BeautifulSoup(pg, 'html.parser')

In [53]:
def extract_gm_id(soup):
    rows = soup.find('div', {'id': 'collection'}).find_all('tr')[1:]
    id_list = []
    for row in rows:
        id_list.append(int(row.find_all('a')[1]['href'].split('/')[2]))
    return id_list

In [80]:
gm_ids = []
for pg_num in range(1,21):
    pg = request.urlopen(f'{pg_gm_rnks}{str(pg_num)}')
    soup = BeautifulSoup(pg, 'html.parser')
    gm_ids += extract_gm_id(soup)

In [81]:
len(gm_ids)

2000

In [113]:
gm_ids[750]

17329

# Extract the info for each game

In [57]:
bs_pg = 'https://www.boardgamegeek.com/xmlapi2/'

## Using the extracted game id's

In [129]:
bs_pg_gm = f'{bs_pg}thing?type=boardgame&stats=1&ratingcomments=1&page=1&pagesize=10&id='

In [139]:
def extract_game_item(item):
    gm_dict = {}
    field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
    field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily','boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
    field_rank = [x['friendlyname'] for x in item.find_all('rank')]
    field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
    gm_dict['name'] = item.find('name')['value']
    gm_dict['id'] = item['id']
    gm_dict['num_of_rankings'] = int(item.find('comments')['totalitems'])
    for i in field_int:
        field_val = item.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_categ:
        gm_dict[i] = [x['value'] for x in item.find_all('link',{'type':i})]
    for i in field_rank:
        field_val = item.find('rank',{'friendlyname':i})
        if field_val is None or field_val['value'] == 'Not Ranked':
            gm_dict[i.replace(' ','')] = -1
        else:
            gm_dict[i.replace(' ','')] = int(field_val['value'])
    for i in field_stats:
        field_val = item.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = float(field_val['value'])
    return gm_dict

In [144]:
aa = f'{bs_pg_gm}{str(idx)}'

In [148]:
gm_list = []
idx_split = 4
idx_size = int(len(gm_ids)/idx_split)
for i in range(idx_split):
    idx = str(gm_ids[i*idx_size:(i+1)*idx_size]).replace(' ','')[1:-1]   
    pg = request.urlopen(f'{bs_pg_gm}{str(idx)}')
    xsoup = BeautifulSoup(pg, 'xml')
    gm_list += [extract_game_item(x) for x in xsoup.find_all('item')]
df = pd.DataFrame(gm_list)

In [150]:
df.to_csv(f'../data/bgg_top{len(gm_ids)}_{str(datetime.now().date())}.csv', index=False)

In [149]:
df.shape

(2000, 38)

## Iterating through every possible id

You can't get games by bgg ranking, so I will go through every game, extract their info and then go back and extract info based on rank

In [11]:
bs_pg_gm = f'{bs_pg}thing?type=boardgame&stats=1&id='

In [136]:
!ls '../data/'

In [73]:
def extract_game_data(soup):
    gm_dict = {}
    field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
    field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily','boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
    field_rank = [x['friendlyname'] for x in soup.find_all('rank')]
    field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
    gm_dict['name'] = soup.find('name')['value']
    gm_dict['id'] = soup.find('item')['id']
    for i in field_int:
        field_val = soup.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_categ:
        gm_dict[i] = [x['value'] for x in soup.find_all('link',{'type':i})]
    for i in field_rank:
        field_val = soup.find('rank',{'friendlyname':i})
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_stats:
        field_val = soup.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = float(field_val['value'])
    return gm_dict

In [184]:
def extract_game_item(item):
    gm_dict = {}
    field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
    field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily','boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
    field_rank = [x['friendlyname'] for x in item.find_all('rank')]
    field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
    gm_dict['name'] = item.find('name')['value']
    gm_dict['id'] = item['id']
    for i in field_int:
        field_val = item.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_categ:
        gm_dict[i] = [x['value'] for x in item.find_all('link',{'type':i})]
    for i in field_rank:
        field_val = item.find('rank',{'friendlyname':i})
        if field_val is None or field_val['value'] == 'Not Ranked':
            gm_dict[i.replace(' ','')] = -1
        else:
            gm_dict[i.replace(' ','')] = int(field_val['value'])
    for i in field_stats:
        field_val = item.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = float(field_val['value'])
    return gm_dict

In [181]:
gm_list = []

In [149]:
idx_start = 0
idx_dist = 1000
idx = str(tuple(range(idx_start, idx_start+idx_dist))).replace(' ','')[1:-1]
pg = request.urlopen(f'{bs_pg_gm}{str(idx)}')
soup = BeautifulSoup(pg, 'xml')
gm_list += [extract_game_item(x) for x in soup.find_all('item')]

CPU times: user 40.3 s, sys: 188 ms, total: 40.5 s
Wall time: 1min 3s


In [189]:
gm_list = []
idx_dist = 10**3
idx_max = 10**5
for idx_start in range(0,idx_max,idx_dist):    
    idx = str(tuple(range(idx_start, idx_start+idx_dist))).replace(' ','')[1:-1]
    pg = request.urlopen(f'{bs_pg_gm}{str(idx)}')
    soup = BeautifulSoup(pg, 'xml')
    gm_list += [extract_game_item(x) for x in soup.find_all('item')]
    df = pd.DataFrame(gm_list)
    if idx_start > 0:
        copyfile('../data/all_game_data.csv','../data/all_game_data_backup.csv')
    df.to_csv('../data/all_game_data.csv', index=False)
    print(idx_start+idx_dist, len(gm_list), str(datetime.now().time())[:8])

1000 855 14:57:40
2000 1702 14:58:20
3000 2569 14:58:59
4000 3460 14:59:37
5000 4320 15:00:12
6000 5221 15:00:48
7000 6108 15:01:21
8000 7002 15:01:57
9000 7842 15:02:29
10000 8727 15:03:03
11000 9625 15:04:14
12000 10522 15:05:21
13000 11415 15:06:31
14000 12273 15:07:42
15000 13160 15:08:49
16000 14035 15:09:57
17000 14909 15:11:03
18000 15781 15:12:04
19000 16629 15:12:58
20000 17479 15:14:04
21000 18316 15:15:04
22000 19158 15:16:01
23000 20012 15:17:03
24000 20868 15:17:59
25000 21749 15:19:00
26000 22626 15:20:02
27000 23487 15:21:04
28000 24369 15:22:15
29000 25240 15:23:19
30000 26122 15:24:25
31000 26977 15:25:26
32000 27828 15:26:23
33000 28706 15:27:26
34000 29566 15:28:27
35000 30419 15:29:30
36000 31301 15:30:35
37000 32211 15:31:39
38000 33083 15:32:38
39000 33932 15:33:45
40000 34742 15:34:48
41000 35492 15:35:38
42000 36132 15:36:32
43000 36770 15:37:23
44000 37215 15:38:02
45000 37297 15:38:14
46000 37366 15:38:27
47000 37507 15:38:45
48000 37607 15:38:59
49000 37661 1

ValueError: invalid literal for int() with base 10: ''

# Get the users and ratings for each game

Order the games by the total number of comments and then group them togather in groups of 200

In [163]:
id_by_num_of_rankings = df.loc[df.num_of_rankings.sort_values().index,'id'].astype(int).tolist()

In [122]:
all_game_dict = {gm_id_lookup:gm_comment_dict}

In [126]:
len(all_game_dict)

10

In [182]:
df['num_of_rankings'].loc[df.id==str(id_by_num_of_rankings[idx_start+idx_dist])].iloc[0]

752

In [183]:
df.loc[df.id==str(id_by_num_of_rankings[idx_start+idx_dist]),'num_of_rankings'].iloc[0]

752

In [185]:
aa = csoup.find_all('comments')

200

In [205]:
all_game_dict2 = {}

In [206]:
{x['username']:x['rating'] for x in item.find_all('comment')}

{'JohnCoveyou': '10',
 'Jonas Thierry': '10',
 'pmthompson': '10',
 'Shleaky': '10',
 'iafmars': '10',
 'inmyeye': '10',
 'ankara': '10',
 'dfwgeek': '10',
 'Flying Pip': '10',
 'antilles2210': '10',
 'Sekahedo': '10',
 'PAND4RK': '10',
 'Phil1221': '10',
 'mountainfire': '10',
 'Meepletopia': '10',
 'JohnnyCrow17': '10',
 'IntaglioDragon': '10',
 'meberl': '10',
 'leobpfb': '10',
 'neidiana': '10',
 'ironbuckeye': '10',
 'JellyfishSilver': '10',
 'ilgarrone': '10',
 'Necrofear00': '10',
 'ssj71': '10',
 'adampudliner': '10',
 'phurridog': '10',
 'unlimitedgaming': '10',
 'gdttek': '10',
 'engrraf': '10',
 'Tolkana': '9.8',
 'Hetajeta': '9.75',
 'MMats': '9',
 'Balargon': '9',
 'crucius': '9',
 'MagiRev': '9',
 'ChewyTKE609': '9',
 'Mvalentino': '9',
 'sgclouthier': '9',
 'nsgocev': '9',
 'ReverendGamer': '9',
 'illy': '9',
 'jeremyslayton': '9',
 'joefling34': '9',
 'Zelgadiss': '9',
 'khari': '9',
 'The Meeples Champ': '9',
 'tehflash': '9',
 'michael_witman': '9',
 'Nico_RJ': '9',
 

In [220]:
comments_total

752

In [None]:
bs_comments = 'https://www.boardgamegeek.com/xmlapi2/thing?ratingcomments=1&' 
idx_dist = 200
for idx_start in range(0,len(id_by_num_of_rankings),idx_dist):
    print(idx_start, str(datetime.now().time())[:8])
    idx = str(id_by_num_of_rankings[idx_start:idx_start+idx_dist]).replace(' ','')[1:-1]
    pg_ct = 1
    comments_total = df.loc[df.id==str(id_by_num_of_rankings[idx_start+idx_dist]),'num_of_rankings'].iloc[0]
    pg = request.urlopen(f'{bs_comments}id={idx}&page={pg_ct}')
    csoup = BeautifulSoup(pg, 'xml')
    for idx2, item in enumerate(csoup.find_all('comments')):
        all_game_dict[id_by_num_of_rankings[idx_start+idx2]] = {x['username']:x['rating'] for x in item.find_all('comment')}
    while pg_ct*100 < comments_total:
        pg_ct += 1
        pg = request.urlopen(f'{bs_comments}id={idx}&page={pg_ct}')
        csoup = BeautifulSoup(pg, 'xml')
        for idx2, item in enumerate(csoup.find_all('comments')):
            dict_loc = id_by_num_of_rankings[idx_start+idx2]
            if item.find_all('comment') is not None:
                all_game_dict[dict_loc].update({x['username']:x['rating'] for x in item.find_all('comment')})
    with open('../data/all_game_dict_v2.json', 'w') as fp:
        json.dump(all_game_dict, fp)    

0 23:00:00
200 23:00:20
400 23:02:41
600 23:06:06
800 23:10:30


In [224]:
[len(all_game_dict2[x]) for x in all_game_dict2.keys()]

[237,
 302,
 343,
 346,
 351,
 354,
 352,
 360,
 361,
 378,
 375,
 379,
 388,
 392,
 393,
 398,
 398,
 398,
 398,
 403,
 413,
 423,
 424,
 428,
 427,
 429,
 429,
 429,
 432,
 433,
 435,
 433,
 438,
 437,
 438,
 443,
 446,
 449,
 451,
 455,
 450,
 460,
 462,
 464,
 464,
 468,
 473,
 473,
 475,
 476,
 479,
 471,
 476,
 481,
 479,
 489,
 487,
 494,
 494,
 497,
 496,
 499,
 501,
 498,
 511,
 518,
 518,
 518,
 520,
 525,
 530,
 533,
 532,
 531,
 537,
 537,
 532,
 540,
 540,
 542,
 536,
 545,
 547,
 552,
 553,
 548,
 556,
 559,
 538,
 565,
 564,
 562,
 566,
 561,
 567,
 561,
 568,
 568,
 574,
 575,
 574,
 576,
 574,
 585,
 585,
 588,
 591,
 590,
 590,
 597,
 594,
 597,
 599,
 602,
 602,
 606,
 604,
 605,
 608,
 599,
 612,
 621,
 621,
 624,
 624,
 622,
 625,
 627,
 615,
 626,
 627,
 630,
 630,
 632,
 630,
 633,
 631,
 630,
 638,
 636,
 640,
 639,
 636,
 643,
 641,
 651,
 651,
 654,
 654,
 655,
 655,
 648,
 660,
 658,
 656,
 663,
 668,
 671,
 672,
 672,
 675,
 676,
 673,
 676,
 680,
 686,
 687

In [215]:
all_game_dict2[dict_loc].update({})

In [213]:
all_game_dict2.keys()

dict_keys([63170])

In [219]:
len(all_game_dict2[63170])

198

In [128]:
bs_comments = 'https://www.boardgamegeek.com/xmlapi2/thing?ratingcomments=1&' 

# gm_id_lookup = gm_ids[0]
for gm_id_lookup in gm_ids[2:10]:
    pg_ct = 1
    pg = request.urlopen(f'{bs_comments}id={gm_id_lookup}&page={pg_ct}')
    csoup = BeautifulSoup(pg, 'xml')
    comments_total = int(csoup.find('comments')['totalitems'])
    gm_comment_dict = {x['username']:x['rating'] for x in csoup.find_all('comment')}
    while pg_ct*100 < comments_total:
        pg_ct += 1
        pg = request.urlopen(f'{bs_comments}id={gm_id_lookup}&page={pg_ct}')
        csoup = BeautifulSoup(pg, 'xml')
        gm_comment_dict.update({x['username']:x['rating'] for x in csoup.find_all('comment')})
        time.sleep(2)
        if pg_ct*100%10**3 == 0:
            print(gm_id_lookup, pg_ct, str(datetime.now().time())[:8])
    all_game_dict[gm_id_lookup] = gm_comment_dict

182028 10 20:17:50
182028 20 20:18:13
182028 30 20:18:35
182028 40 20:19:00
182028 50 20:19:24
182028 60 20:19:49
182028 70 20:20:13
182028 80 20:20:38
182028 90 20:21:02
182028 100 20:21:27
182028 110 20:21:51
182028 120 20:22:15
182028 130 20:22:40
167791 10 20:23:19
167791 20 20:23:45
167791 30 20:24:12
167791 40 20:24:38
167791 50 20:25:04
167791 60 20:25:31
167791 70 20:25:57
167791 80 20:26:23
167791 90 20:26:50
167791 100 20:27:16
167791 110 20:27:42
167791 120 20:28:08
167791 130 20:28:34
167791 140 20:29:01
167791 150 20:29:27
167791 160 20:29:53
167791 170 20:30:20
167791 180 20:30:47
167791 190 20:31:13
167791 200 20:31:39
167791 210 20:32:05
167791 220 20:32:32
167791 230 20:32:58
167791 240 20:33:25
167791 250 20:33:51
167791 260 20:34:17
167791 270 20:34:43
167791 280 20:35:09
167791 290 20:35:32
167791 300 20:35:54
12333 10 20:36:26
12333 20 20:36:53
12333 30 20:37:19
12333 40 20:37:45
12333 50 20:38:12
12333 60 20:38:39
12333 70 20:39:05
12333 80 20:39:32
12333 90 20:39

In [166]:
with open('../data/all_game_dict.json', 'w') as fp:
    json.dump(all_game_dict, fp)

In [103]:
pg = request.urlopen(f'{bs_comments}id={gm_id_lookup}&page={str(pg_ct)}')
csoup = BeautifulSoup(pg, 'xml')

In [105]:
f'{bs_comments}id={gm_id_lookup}&pg={str(pg_ct)}'

'https://www.boardgamegeek.com/xmlapi2/thing?ratingcomments=1&id=174430&pg=2'

In [104]:
csoup

<?xml version="1.0" encoding="utf-8"?>
<items termsofuse="https://boardgamegeek.com/xmlapi/termsofuse"><item id="174430" type="boardgame">
<thumbnail>https://cf.geekdo-images.com/thumb/img/e7GyV4PaNtwmalU-EQAGecwoBSI=/fit-in/200x150/pic2437871.jpg</thumbnail>
<image>https://cf.geekdo-images.com/original/img/lDN358RgcYvQfYYN6Oy2TXpifyM=/0x0/pic2437871.jpg</image>
<name sortindex="1" type="primary" value="Gloomhaven"/>
<name sortindex="1" type="alternate" value="幽城迷港"/>
<description>Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the role of a wandering adventurer with their own special set of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story

In [98]:
aa = {x['username']:x['rating'] for x in csoup.find_all('comment')}

In [107]:
len(gm_comment_dict)

200

In [89]:
comment_dict = {x['username']:x['rating'] for x in csoup.find_all('comment')}

In [94]:
comment_dict

{174430: {'clarkkent22': '10',
  'Garroc': '10',
  'Neva Kee': '10',
  'Chris Coyote': '10',
  'Beaushek': '10',
  'ravenpolar': '10',
  'brenmcgovern': '10',
  'grovermerc': '10',
  'Ronnyknox': '10',
  'webs05': '10',
  'Iguloy': '10',
  'Slyght': '10',
  'Leonce': '10',
  'Wretched Git': '10',
  'rlphay': '10',
  'testicleez': '10',
  'crwills': '10',
  'WeeGee': '10',
  'Azil3': '10',
  'robbin 1': '10',
  'mcscowl': '10',
  'kittenhoarder': '10',
  'Kitarja': '10',
  'Shampoo4you': '10',
  'Blood Demon': '10',
  'stevelabny': '10',
  'vardamir': '10',
  'Aspudde': '10',
  'Zottelmonster': '10',
  'gimmster': '10',
  'Clinton': '10',
  'nadurgin': '10',
  'mergryphon': '10',
  'darcypennell': '10',
  'snorman': '10',
  'LouisDavid': '10',
  'Quotho': '10',
  'davidcoleman': '10',
  'camidon': '10',
  'repairmanjack': '10',
  'benjamininja': '10',
  'jamarre2': '10',
  'tilde72': '10',
  'bwarner34': '10',
  'olafpkyou': '10',
  'thesumo5': '10',
  'Tarkan': '10',
  'Electric421': '

In [85]:
csoup

<?xml version="1.0" encoding="utf-8"?>
<items termsofuse="https://boardgamegeek.com/xmlapi/termsofuse"><item id="174430" type="boardgame">
<thumbnail>https://cf.geekdo-images.com/thumb/img/e7GyV4PaNtwmalU-EQAGecwoBSI=/fit-in/200x150/pic2437871.jpg</thumbnail>
<image>https://cf.geekdo-images.com/original/img/lDN358RgcYvQfYYN6Oy2TXpifyM=/0x0/pic2437871.jpg</image>
<name sortindex="1" type="primary" value="Gloomhaven"/>
<name sortindex="1" type="alternate" value="幽城迷港"/>
<description>Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the role of a wandering adventurer with their own special set of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story