# Imports and Paths

In [158]:
from urllib import request
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from shutil import copyfile

In [None]:
PATH = '../data/'

In [4]:
bs_pg = 'https://www.boardgamegeek.com/xmlapi2/'

# XML2 API

Base URI: /xmlapi2/thing?parameters
- id=NNN	
  - Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids.
- type=THINGTYPE	
  - Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
- versions=1	
  - Returns version info for the item.
- videos = 1	
  - Returns videos for the item.
- stats=1		
  - Returns ranking and rating stats for the item.
- historical=1		
  - Returns historical data over time. See page parameter.
- marketplace=1		
  - Returns marketplace data.
- comments=1		
  - Returns all comments about the item. Also includes ratings when commented. See page parameter.
- ratingcomments=1		
  - Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the \<comments\> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
- page=NNN		
  - Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
- pagesize=NNN		
  - Set the number of records to return in paging. Minimum is 10, maximum is 100.
- from=YYYY-MM-DD		
  - Not currently supported.
- to=YYYY-MM-DD		
  - Not currently supported.

# Extract the info for each game

You can't get games by bgg ranking, so I will go through every game, extract their info and then go back and extract info based on rank

In [11]:
bs_pg_gm = f'{bs_pg}thing?type=boardgame&stats=1&id='

In [136]:
!ls '../data/'

In [73]:
def extract_game_data(soup):
    gm_dict = {}
    field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
    field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily','boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
    field_rank = [x['friendlyname'] for x in soup.find_all('rank')]
    field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
    gm_dict['name'] = soup.find('name')['value']
    gm_dict['id'] = soup.find('item')['id']
    for i in field_int:
        field_val = soup.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_categ:
        gm_dict[i] = [x['value'] for x in soup.find_all('link',{'type':i})]
    for i in field_rank:
        field_val = soup.find('rank',{'friendlyname':i})
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_stats:
        field_val = soup.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = float(field_val['value'])
    return gm_dict

In [184]:
def extract_game_item(item):
    gm_dict = {}
    field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
    field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily','boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
    field_rank = [x['friendlyname'] for x in item.find_all('rank')]
    field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
    gm_dict['name'] = item.find('name')['value']
    gm_dict['id'] = item['id']
    for i in field_int:
        field_val = item.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_categ:
        gm_dict[i] = [x['value'] for x in item.find_all('link',{'type':i})]
    for i in field_rank:
        field_val = item.find('rank',{'friendlyname':i})
        if field_val is None or field_val['value'] == 'Not Ranked':
            gm_dict[i.replace(' ','')] = -1
        else:
            gm_dict[i.replace(' ','')] = int(field_val['value'])
    for i in field_stats:
        field_val = item.find(i)
        if field_val is None:
            gm_dict[i] = -1
        else:
            gm_dict[i] = float(field_val['value'])
    return gm_dict

In [181]:
gm_list = []

In [149]:
idx_start = 0
idx_dist = 1000
idx = str(tuple(range(idx_start, idx_start+idx_dist))).replace(' ','')[1:-1]
pg = request.urlopen(f'{bs_pg_gm}{str(idx)}')
soup = BeautifulSoup(pg, 'xml')
gm_list += [extract_game_item(x) for x in soup.find_all('item')]

CPU times: user 40.3 s, sys: 188 ms, total: 40.5 s
Wall time: 1min 3s


In [189]:
gm_list = []
idx_dist = 10**3
idx_max = 10**5
for idx_start in range(0,idx_max,idx_dist):    
    idx = str(tuple(range(idx_start, idx_start+idx_dist))).replace(' ','')[1:-1]
    pg = request.urlopen(f'{bs_pg_gm}{str(idx)}')
    soup = BeautifulSoup(pg, 'xml')
    gm_list += [extract_game_item(x) for x in soup.find_all('item')]
    df = pd.DataFrame(gm_list)
    if idx_start > 0:
        copyfile('../data/all_game_data.csv','../data/all_game_data_backup.csv')
    df.to_csv('../data/all_game_data.csv', index=False)
    print(idx_start+idx_dist, len(gm_list), str(datetime.now().time())[:8])

1000 855 14:57:40
2000 1702 14:58:20
3000 2569 14:58:59
4000 3460 14:59:37
5000 4320 15:00:12
6000 5221 15:00:48
7000 6108 15:01:21
8000 7002 15:01:57
9000 7842 15:02:29
10000 8727 15:03:03
11000 9625 15:04:14
12000 10522 15:05:21
13000 11415 15:06:31
14000 12273 15:07:42
15000 13160 15:08:49
16000 14035 15:09:57
17000 14909 15:11:03
18000 15781 15:12:04
19000 16629 15:12:58
20000 17479 15:14:04
21000 18316 15:15:04
22000 19158 15:16:01
23000 20012 15:17:03
24000 20868 15:17:59
25000 21749 15:19:00
26000 22626 15:20:02
27000 23487 15:21:04
28000 24369 15:22:15
29000 25240 15:23:19
30000 26122 15:24:25
31000 26977 15:25:26
32000 27828 15:26:23
33000 28706 15:27:26
34000 29566 15:28:27
35000 30419 15:29:30
36000 31301 15:30:35
37000 32211 15:31:39
38000 33083 15:32:38
39000 33932 15:33:45
40000 34742 15:34:48
41000 35492 15:35:38
42000 36132 15:36:32
43000 36770 15:37:23
44000 37215 15:38:02
45000 37297 15:38:14
46000 37366 15:38:27
47000 37507 15:38:45
48000 37607 15:38:59
49000 37661 1

ValueError: invalid literal for int() with base 10: ''

In [176]:
pg = request.urlopen(f"{bs_pg_gm}{'1,2,3,4,5,6'}")
soup = BeautifulSoup(pg, 'xml')
gm_list2 = [extract_game_item(x) for x in soup.find_all('item')]

In [188]:
df.loc[df.BoardGameRank>0,'BoardGameRank'].sort_values()

84        57
39        71
83       107
189      125
167      127
11       134
403      148
206      149
367      175
2        180
0        197
4        211
51       212
213      214
111      223
108      224
67       230
154      262
12       293
47       294
414      321
136      327
417      331
17       349
44       365
327      366
10       379
153      414
80       419
43       455
       ...  
261    14026
262    14190
357    14212
239    14218
352    14239
289    14258
36     14295
330    14402
313    14413
57     14465
293    14483
310    14595
316    14637
344    14652
323    14860
117    14921
358    14939
52     14989
215    15031
340    15051
58     15184
162    15275
165    15374
394    15375
294    15399
146    15401
245    15586
30     15712
351    15848
365    15876
Name: BoardGameRank, Length: 426, dtype: int64

In [165]:
df.head().T

Unnamed: 0,0,1,2,3,4
AbstractGameRank,,,,,
AccessoryRank,,,,,
BoardGameRank,-1,-1,-1,-1,-1
Children'sGameRank,,,,,
CustomizableRank,,,,,
FamilyGameRank,,,,,
PartyGameRank,,,,,
StrategyGameRank,-1,-1,-1,,-1
ThematicRank,,,,,
WarGameRank,,,,,


In [48]:
gm_dict = {}
field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily','boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
field_rank = [x['friendlyname'] for x in soup.find_all('rank')]
field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
gm_dict['name'] = soup.find('name')['value']
for i in field_int:
    field_val = soup.find(i)
    if field_val is None:
        gm_dict[i] = -1
    else:
        gm_dict[i] = int(field_val['value'])
for i in field_categ:
    gm_dict[i] = [x['value'] for x in soup.find_all('link',{'type':i})]
for i in field_rank:
    field_val = soup.find('rank',{'friendlyname':i})
    if field_val is None:
        gm_dict[i] = -1
    else:
        gm_dict[i] = int(field_val['value'])
for i in field_stats:
    field_val = soup.find(i)
    if field_val is None:
        gm_dict[i] = -1
    else:
        gm_dict[i] = float(field_val['value'])
gm_dict

{'name': 'Panda Monium',
 'yearpublished': 1994,
 'minplayers': 3,
 'maxplayers': 6,
 'playingtime': 0,
 'minplaytime': 0,
 'maxplaytime': 0,
 'minage': 6,
 'boardgamecategory': ['Action / Dexterity',
  'Card Game',
  "Children's Game",
  'Movies / TV / Radio theme',
  'Real-time'],
 'boardgamemechanic': ['Acting'],
 'boardgamefamily': ['Animals: Bears', 'Animals: Pandas', 'Animals: Rabbits'],
 'boardgamedesigner': ['Hajo Bücken'],
 'boardgameartist': ['Dave Clegg',
  'Oliver Freudenreich',
  'Wesly Gibs',
  'Johannes Saurer'],
 'boardgamepublisher': ['AMIGO', 'Gamewright', 'Heye Verlag', 'Kanga Games'],
 'Board Game Rank': 13677,
 "Children's Game Rank": 416,
 'usersrated': 115.0,
 'average': 5.38087,
 'bayesaverage': 5.49125,
 'stddev': 1.26043,
 'median': 0.0,
 'owned': 254.0,
 'trading': 16.0,
 'wanting': 4.0,
 'wishing': 13.0,
 'numcomments': 56.0,
 'numweights': 15.0,
 'averageweight': 1.0}