# Imports and Paths

In [5]:
from urllib import request
from bs4 import BeautifulSoup, Comment
import pandas as pd
from datetime import datetime
from shutil import copyfile
import time
import json

In [None]:
PATH = '../data/'

In [4]:
bs_pg = 'https://www.boardgamegeek.com/xmlapi2/'

# XML2 API

Base URI: /xmlapi2/thing?parameters
- id=NNN	
  - Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids.
- type=THINGTYPE	
  - Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
- versions=1	
  - Returns version info for the item.
- videos=1	
  - Returns videos for the item.
- stats=1		
  - Returns ranking and rating stats for the item.
- historical=1		
  - Returns historical data over time. See page parameter.
- marketplace=1		
  - Returns marketplace data.
- comments=1		
  - Returns all comments about the item. Also includes ratings when commented. See page parameter.
- ratingcomments=1		
  - Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the \<comments\> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
- page=NNN		
  - Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
- pagesize=NNN		
  - Set the number of records to return in paging. Minimum is 10, maximum is 100.
- from=YYYY-MM-DD		
  - Not currently supported.
- to=YYYY-MM-DD		
  - Not currently supported.

# Get the ids of the top 2000 board games

In [2]:
pg_gm_rnks = 'https://boardgamegeek.com/browse/boardgame/page/'

In [51]:
# Fetch and parse the first page of the BGG ranking browser so that `soup`
# is available for trying out the id extraction interactively.
pg_num = 1
# Use the response as a context manager so the HTTP connection is released
# as soon as BeautifulSoup has consumed the page.
with request.urlopen(f'{pg_gm_rnks}{pg_num}') as pg:
    soup = BeautifulSoup(pg, 'html.parser')

In [3]:
def extract_gm_id(soup):
    """Return the game ids found on one parsed ranking page.

    Finds the ``div`` whose id is ``collection``, skips its first ``tr``
    (presumably the table header -- confirm against the page markup) and,
    for every remaining row, reads the ``href`` of the row's second ``a``
    tag and converts the third ``/``-separated piece of it to ``int``.

    Args:
        soup: parsed page offering BeautifulSoup-style ``find``/``find_all``.

    Returns:
        list[int]: one id per table row, in page order.
    """
    table_rows = soup.find('div', {'id': 'collection'}).find_all('tr')
    return [
        int(row.find_all('a')[1]['href'].split('/')[2])
        for row in table_rows[1:]
    ]

In [4]:
# Walk ranking pages 1..20 and accumulate the game ids found on each page.
gm_ids = []
for pg_num in range(1, 21):
    # Close every HTTP response once its page has been parsed instead of
    # leaving 20 connections for the garbage collector to clean up.
    with request.urlopen(f'{pg_gm_rnks}{pg_num}') as pg:
        soup = BeautifulSoup(pg, 'html.parser')
    gm_ids.extend(extract_gm_id(soup))

In [6]:
len(gm_ids)

2000

# Extract the info for each game

In [57]:
bs_pg = 'https://www.boardgamegeek.com/xmlapi2/'

## Using the extracted game ids

In [129]:
bs_pg_gm = f'{bs_pg}thing?type=boardgame&stats=1&ratingcomments=1&page=1&pagesize=10&id='

In [139]:
def extract_game_item(item):
    """Flatten one ``item`` element of a ``thing`` response into a dict.

    The result holds, in this order:
      * ``name`` (``value`` of the first ``name`` tag), ``id`` (the item's
        ``id`` attribute, kept as found) and ``num_of_rankings`` (the
        ``totalitems`` attribute of the ``comments`` tag, as ``int``);
      * the integer fields, ``-1`` when the tag is missing;
      * one list of ``value`` strings per ``link`` type;
      * one entry per ``rank`` tag, keyed by its ``friendlyname`` with
        spaces removed, ``-1`` when the value is ``'Not Ranked'``;
      * the statistics fields as ``float``, ``-1`` when the tag is missing.

    Args:
        item: element offering BeautifulSoup-style ``find``/``find_all``
            and attribute access via ``item[...]``.

    Returns:
        dict: the flattened record.
    """
    int_fields = ('yearpublished', 'minplayers', 'maxplayers', 'playingtime',
                  'minplaytime', 'maxplaytime', 'minage')
    link_types = ('boardgamecategory', 'boardgamemechanic', 'boardgamefamily',
                  'boardgamedesigner', 'boardgameartist', 'boardgamepublisher')
    rank_names = [tag['friendlyname'] for tag in item.find_all('rank')]
    stat_fields = ('usersrated', 'average', 'bayesaverage', 'stddev', 'median',
                   'owned', 'trading', 'wanting', 'wishing', 'numcomments',
                   'numweights', 'averageweight')

    record = {
        'name': item.find('name')['value'],
        'id': item['id'],
        'num_of_rankings': int(item.find('comments')['totalitems']),
    }

    for field in int_fields:
        node = item.find(field)
        record[field] = int(node['value']) if node is not None else -1

    for link_type in link_types:
        links = item.find_all('link', {'type': link_type})
        record[link_type] = [link['value'] for link in links]

    for rank_name in rank_names:
        node = item.find('rank', {'friendlyname': rank_name})
        key = rank_name.replace(' ', '')
        if node is not None and node['value'] != 'Not Ranked':
            record[key] = int(node['value'])
        else:
            record[key] = -1

    for field in stat_fields:
        node = item.find(field)
        record[field] = float(node['value']) if node is not None else -1

    return record

In [144]:
aa = f'{bs_pg_gm}{str(idx)}'

In [148]:
# Download the `thing` records for every id in gm_ids, spread over
# idx_split requests, and collect the flattened records in a DataFrame.
gm_list = []
idx_split = 4
# Ceiling division: with floor division the trailing ids were silently
# dropped whenever len(gm_ids) was not a multiple of idx_split.  max(1, ...)
# keeps the range step valid when gm_ids is empty.
idx_size = max(1, -(-len(gm_ids) // idx_split))
for start in range(0, len(gm_ids), idx_size):
    # The API takes a comma-delimited id list.
    idx = ','.join(str(gm_id) for gm_id in gm_ids[start:start + idx_size])
    # Close the HTTP response as soon as the XML has been parsed.
    with request.urlopen(f'{bs_pg_gm}{idx}') as pg:
        xsoup = BeautifulSoup(pg, 'xml')
    gm_list += [extract_game_item(x) for x in xsoup.find_all('item')]
df = pd.DataFrame(gm_list)

In [150]:
df.to_csv(f'../data/bgg_top{len(gm_ids)}_{str(datetime.now().date())}.csv', index=False)

In [149]:
df.shape

(2000, 38)

## Iterating through every possible id

You can't query games by BGG ranking, so I will go through every possible game id, extract each game's info, and then go back and select games based on rank.

In [11]:
bs_pg_gm = f'{bs_pg}thing?type=boardgame&stats=1&id='

In [136]:
!ls '../data/'

In [73]:
def extract_game_data(soup):
    """Flatten a parsed single-game ``thing`` response into a dict.

    The result holds ``name`` and ``id`` (the ``id`` attribute of the first
    ``item`` tag), the integer fields, one list of ``value`` strings per
    ``link`` type, one entry per ``rank`` tag and the statistics fields.
    Missing tags are recorded as ``-1``.

    Rank entries are keyed by the tag's ``friendlyname`` unchanged (spaces
    included), unlike ``extract_game_item``, which strips the spaces.  A
    rank whose value is ``'Not Ranked'`` is recorded as ``-1``; previously
    it raised ``ValueError`` in ``int()``.

    Args:
        soup: parsed response offering BeautifulSoup-style
            ``find``/``find_all``.

    Returns:
        dict: the flattened record.
    """
    gm_dict = {}
    field_int = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage']
    field_categ = ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily', 'boardgamedesigner', 'boardgameartist', 'boardgamepublisher']
    field_rank = [x['friendlyname'] for x in soup.find_all('rank')]
    field_stats = ['usersrated', 'average', 'bayesaverage', 'stddev', 'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments', 'numweights', 'averageweight']
    gm_dict['name'] = soup.find('name')['value']
    gm_dict['id'] = soup.find('item')['id']
    for i in field_int:
        field_val = soup.find(i)
        gm_dict[i] = -1 if field_val is None else int(field_val['value'])
    for i in field_categ:
        gm_dict[i] = [x['value'] for x in soup.find_all('link', {'type': i})]
    for i in field_rank:
        field_val = soup.find('rank', {'friendlyname': i})
        # 'Not Ranked' is not a number; map it to the same -1 sentinel that
        # is used for missing values.
        if field_val is None or field_val['value'] == 'Not Ranked':
            gm_dict[i] = -1
        else:
            gm_dict[i] = int(field_val['value'])
    for i in field_stats:
        field_val = soup.find(i)
        gm_dict[i] = -1 if field_val is None else float(field_val['value'])
    return gm_dict

In [184]:
def extract_game_item(item):
    """Flatten one ``item`` element of a ``thing`` response into a dict.

    Produces ``name`` and ``id`` first, then the integer fields, the
    ``link`` value lists, the rank entries and the statistics.  Missing
    tags become ``-1``; so do ranks whose value is ``'Not Ranked'``.  Rank
    keys are the ``friendlyname`` with spaces removed.

    Args:
        item: element offering BeautifulSoup-style ``find``/``find_all``
            and attribute access via ``item[...]``.

    Returns:
        dict: the flattened record.
    """
    def value_or_default(tag_name, convert):
        # Convert the tag's ``value`` attribute, or fall back to -1 when the
        # tag does not exist.
        node = item.find(tag_name)
        if node is None:
            return -1
        return convert(node['value'])

    rank_names = [tag['friendlyname'] for tag in item.find_all('rank')]

    record = {'name': item.find('name')['value'], 'id': item['id']}

    for tag_name in ('yearpublished', 'minplayers', 'maxplayers', 'playingtime',
                     'minplaytime', 'maxplaytime', 'minage'):
        record[tag_name] = value_or_default(tag_name, int)

    for link_type in ('boardgamecategory', 'boardgamemechanic', 'boardgamefamily',
                      'boardgamedesigner', 'boardgameartist', 'boardgamepublisher'):
        record[link_type] = [
            link['value'] for link in item.find_all('link', {'type': link_type})
        ]

    for rank_name in rank_names:
        node = item.find('rank', {'friendlyname': rank_name})
        ranked = node is not None and node['value'] != 'Not Ranked'
        record[rank_name.replace(' ', '')] = int(node['value']) if ranked else -1

    for tag_name in ('usersrated', 'average', 'bayesaverage', 'stddev', 'median',
                     'owned', 'trading', 'wanting', 'wishing', 'numcomments',
                     'numweights', 'averageweight'):
        record[tag_name] = value_or_default(tag_name, float)

    return record

In [181]:
gm_list = []

In [149]:
# Trial run: request one batch of idx_dist consecutive ids in a single call
# and append the parsed records to gm_list.
idx_start = 0
idx_dist = 1000
# Build the comma-delimited id list directly instead of trimming the repr
# of a tuple.
idx = ','.join(str(gm_id) for gm_id in range(idx_start, idx_start + idx_dist))
# Close the HTTP response once the XML has been parsed.
with request.urlopen(f'{bs_pg_gm}{idx}') as pg:
    soup = BeautifulSoup(pg, 'xml')
gm_list += [extract_game_item(x) for x in soup.find_all('item')]

CPU times: user 40.3 s, sys: 188 ms, total: 40.5 s
Wall time: 1min 3s


In [None]:
# Crawl ids 0 .. idx_max-1 in batches of idx_dist, checkpointing the
# accumulated records to CSV after every batch.
gm_list = []
idx_dist = 10**3
idx_max = 10**5
for idx_start in range(0, idx_max, idx_dist):
    idx = ','.join(str(gm_id) for gm_id in range(idx_start, idx_start + idx_dist))
    # Close each response after parsing; the loop issues idx_max/idx_dist
    # requests and previously left every connection open.
    with request.urlopen(f'{bs_pg_gm}{idx}') as pg:
        soup = BeautifulSoup(pg, 'xml')
    gm_list += [extract_game_item(x) for x in soup.find_all('item')]
    df = pd.DataFrame(gm_list)
    # Keep the previous checkpoint as a backup before overwriting it; there
    # is nothing to back up on the very first batch.
    if idx_start > 0:
        copyfile('../data/all_game_data.csv', '../data/all_game_data_backup.csv')
    df.to_csv('../data/all_game_data.csv', index=False)
    # Progress: ids covered so far, records collected, wall-clock time.
    print(idx_start + idx_dist, len(gm_list), str(datetime.now().time())[:8])

# Get the users and ratings for each game

Order the games by the total number of comments and then group them together in groups of 200 (note: the loop below currently uses `idx_dist = 25`).

In [6]:
# Load the saved snapshot and list its game ids ordered by ascending
# num_of_rankings.
df = pd.read_csv('../data/bgg_top2000_2018-10-06.csv')
id_by_num_of_rankings = (
    df['id']
    .loc[df['num_of_rankings'].sort_values().index]
    .astype(int)
    .tolist()
)

In [7]:
all_game_dict = {}

In [11]:
comments_total

80386

In [12]:
[len(all_game_dict[x]) for x in all_game_dict.keys()]

[32562,
 32869,
 33158,
 33283,
 34920,
 35921,
 35922,
 38381,
 38746,
 40723,
 42102,
 43678,
 43210,
 43912,
 44012,
 44034,
 44042,
 43980,
 44001,
 44055,
 44056,
 43991,
 43982,
 43960,
 43879]

In [14]:
df.columns

Index(['AbstractGameRank', 'BoardGameRank', 'Children'sGameRank',
       'CustomizableRank', 'FamilyGameRank', 'PartyGameRank', 'RPGItemRank',
       'StrategyGameRank', 'ThematicRank', 'WarGameRank', 'average',
       'averageweight', 'bayesaverage', 'boardgameartist', 'boardgamecategory',
       'boardgamedesigner', 'boardgamefamily', 'boardgamemechanic',
       'boardgamepublisher', 'id', 'maxplayers', 'maxplaytime', 'median',
       'minage', 'minplayers', 'minplaytime', 'name', 'num_of_rankings',
       'numcomments', 'numweights', 'owned', 'playingtime', 'stddev',
       'trading', 'usersrated', 'wanting', 'wishing', 'yearpublished'],
      dtype='object')

In [15]:
df.loc[df.num_of_rankings.sort_values().index,'num_of_rankings'].tail(25)

6      32676
602    32948
244    33294
10     33409
11     34993
52     36021
78     36061
199    38495
46     38952
119    40870
330    42294
91     43792
217    44846
47     45233
231    45340
33     47926
190    50867
14     54243
18     54923
128    56974
42     63987
72     65009
73     77430
136    79839
292    80386
Name: num_of_rankings, dtype: int64

In [16]:
with open('../data/all_game_dict_v4.json', 'w') as fp:
    json.dump(all_game_dict, fp)

In [17]:
pg_ct

443

In [10]:
# Download the rating comments for the games in id_by_num_of_rankings in
# batches of idx_dist, paging until the game with the most ratings in the
# batch is exhausted, and dump all_game_dict to JSON after every batch.
bs_comments = 'https://www.boardgamegeek.com/xmlapi2/thing?ratingcomments=1&'
idx_dist = 25
# NOTE: the start offset 1975 resumes an earlier, interrupted run.
for idx_start in range(1975, len(id_by_num_of_rankings), idx_dist):
    # Progress: batch offset, ratings collected so far, wall-clock time.
    print(idx_start, sum(len(ratings) for ratings in all_game_dict.values()), str(datetime.now().time())[:8])
    batch_ids = id_by_num_of_rankings[idx_start:idx_start + idx_dist]
    idx = ','.join(str(gm_id) for gm_id in batch_ids)
    pg_ct = 1
    # The ids are sorted by ascending num_of_rankings, so the last game of
    # the batch decides how many pages are needed.  batch_ids[-1] also
    # works for a short final batch, where idx_start+idx_dist-1 would index
    # past the end of the list.
    comments_total = df.loc[df.id == batch_ids[-1], 'num_of_rankings'].iloc[0]
    with request.urlopen(f'{bs_comments}id={idx}&page={pg_ct}') as pg:
        csoup = BeautifulSoup(pg, 'xml')
    # Key the results by the id carried on each <item> instead of by
    # position: positional matching silently attributes ratings to the
    # wrong game if an item is missing from, or reordered in, the response.
    for item in csoup.find_all('item'):
        dict_loc = int(item['id'])
        all_game_dict[dict_loc] = {x['username']: x['rating'] for x in item.find_all('comment')}
    # pg_ct*100 assumes 100 ratings per page -- TODO confirm against the
    # API's default page size.
    while pg_ct * 100 < comments_total:
        pg_ct += 1
        with request.urlopen(f'{bs_comments}id={idx}&page={pg_ct}') as pg:
            csoup = BeautifulSoup(pg, 'xml')
        for item in csoup.find_all('item'):
            dict_loc = int(item['id'])
            # find_all returns an empty list when nothing matches, so no
            # None check is needed; setdefault covers an id that was absent
            # from the first page.
            all_game_dict.setdefault(dict_loc, {}).update(
                {x['username']: x['rating'] for x in item.find_all('comment')}
            )
        # Pause between page requests to go easy on the server.
        time.sleep(2)
    with open('../data/all_game_dict_v4.json', 'w') as fp:
        json.dump(all_game_dict, fp)

1975 0 16:27:00


ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [58]:
algd_keys = list(all_game_dict.keys())
dict_small = {x:all_game_dict[x] for x in algd_keys[:500]}

In [59]:
df2 = pd.DataFrame(dict_small)

In [60]:
df3 = df2.dropna(thresh=50)
df2.shape, df3.shape

((154776, 500), (5970, 500))

In [55]:
df2.shape, df3.shape

((73528, 100), (4337, 100))

In [215]:
all_game_dict2[dict_loc].update({})

In [213]:
all_game_dict2.keys()

dict_keys([63170])

In [219]:
len(all_game_dict2[63170])

198

In [None]:
# Download every rating comment for a slice of gm_ids, one game at a time,
# and store a {username: rating} dict per game in all_game_dict.
bs_comments = 'https://www.boardgamegeek.com/xmlapi2/thing?ratingcomments=1&'

# gm_id_lookup = gm_ids[0]
for gm_id_lookup in gm_ids[2:10]:
    pg_ct = 1
    # Close each HTTP response as soon as its XML has been parsed.
    with request.urlopen(f'{bs_comments}id={gm_id_lookup}&page={pg_ct}') as pg:
        csoup = BeautifulSoup(pg, 'xml')
    # totalitems on the <comments> tag gives the number of ratings to page
    # through.
    comments_total = int(csoup.find('comments')['totalitems'])
    gm_comment_dict = {x['username']: x['rating'] for x in csoup.find_all('comment')}
    # pg_ct*100 assumes 100 ratings per page -- TODO confirm against the
    # API's default page size.
    while pg_ct * 100 < comments_total:
        pg_ct += 1
        with request.urlopen(f'{bs_comments}id={gm_id_lookup}&page={pg_ct}') as pg:
            csoup = BeautifulSoup(pg, 'xml')
        gm_comment_dict.update({x['username']: x['rating'] for x in csoup.find_all('comment')})
        # Pause between page requests to go easy on the server.
        time.sleep(2)
        # Report progress every 10 pages (same condition as
        # pg_ct*100 % 10**3 == 0).
        if pg_ct % 10 == 0:
            print(gm_id_lookup, pg_ct, str(datetime.now().time())[:8])
    all_game_dict[gm_id_lookup] = gm_comment_dict

In [166]:
with open('../data/all_game_dict.json', 'w') as fp:
    json.dump(all_game_dict, fp)

In [103]:
pg = request.urlopen(f'{bs_comments}id={gm_id_lookup}&page={str(pg_ct)}')
csoup = BeautifulSoup(pg, 'xml')

In [105]:
f'{bs_comments}id={gm_id_lookup}&pg={str(pg_ct)}'

'https://www.boardgamegeek.com/xmlapi2/thing?ratingcomments=1&id=174430&pg=2'

In [98]:
aa = {x['username']:x['rating'] for x in csoup.find_all('comment')}

In [107]:
len(gm_comment_dict)

200

In [None]:
comment_dict = {x['username']:x['rating'] for x in csoup.find_all('comment')}