# Imports and Paths

In [1]:
import urllib3
# urllib3 connection pool; a later scratch cell calls `http.request('GET', ...)` on it
# as an alternative to `urllib.request.urlopen` for fetching the same API URL.
http = urllib3.PoolManager()

In [2]:
from urllib import request
from bs4 import BeautifulSoup, Comment
import pandas as pd
from datetime import datetime
# from shutil import copyfile
# import time
import json

# Load in previous list of games

In [9]:
# Load the previously saved snapshot (the filename carries the date 2018-10-06) into a DataFrame.
df_gms_lst = pd.read_csv('../data/bgg_top2000_2018-10-06.csv')

In [11]:
# Show the column names of the loaded snapshot.
df_gms_lst.columns

Index(['AbstractGameRank', 'BoardGameRank', 'Children'sGameRank',
       'CustomizableRank', 'FamilyGameRank', 'PartyGameRank', 'RPGItemRank',
       'StrategyGameRank', 'ThematicRank', 'WarGameRank', 'average',
       'averageweight', 'bayesaverage', 'boardgameartist', 'boardgamecategory',
       'boardgamedesigner', 'boardgamefamily', 'boardgamemechanic',
       'boardgamepublisher', 'id', 'maxplayers', 'maxplaytime', 'median',
       'minage', 'minplayers', 'minplaytime', 'name', 'num_of_rankings',
       'numcomments', 'numweights', 'owned', 'playingtime', 'stddev',
       'trading', 'usersrated', 'wanting', 'wishing', 'yearpublished'],
      dtype='object')

In [31]:
# Initial content of the dataset-metadata.json file stored under ../data/kaggle/:
# descriptive fields, licence, and one resource entry for the first CSV snapshot.
metadata_dict = dict(
    title="BGG Top 2000",
    subtitle="Board Game Geek top 2000 games rankings",
    description="Board Game Geek top 2000 games rankings and other info",
    id="mseinstein/bgg_top2000",
    licenses=[dict(name="CC-BY-SA-4.0")],
    resources=[
        dict(
            path="bgg_top2000_2018-10-06.csv",
            description="Board Game Geek top 2000 games on 2018-10-06",
        ),
    ],
)

In [33]:
# Write the initial metadata as JSON. Mode 'w' replaces whatever the file held before,
# so re-running this cell resets the file to the single-resource dictionary above.
with open('../data/kaggle/dataset-metadata.json', 'w') as fp:
    fp.write(json.dumps(metadata_dict))

# Get the IDs of the top 2000 board games

In [3]:
# Base URL of the BoardGameGeek ranked-browse pages; the page number is appended when fetching.
pg_gm_rnks = 'https://boardgamegeek.com/browse/boardgame/page/'

In [4]:
def extract_gm_id(soup):
    """Return the integer game ids found on one parsed ranking page.

    Looks up the ``div`` whose id is ``collection`` and reads every ``tr``
    in it except the first (presumably the table header). For each row, the
    id is the third ``/``-separated piece of the ``href`` on the row's second
    ``a`` element.

    Parameters
    ----------
    soup : parsed page offering ``find`` / ``find_all`` (a BeautifulSoup object).

    Returns
    -------
    list of int
        One id per data row, in document order.
    """
    collection = soup.find('div', {'id': 'collection'})
    data_rows = collection.find_all('tr')[1:]
    return [
        int(tr.find_all('a')[1]['href'].split('/')[2])
        for tr in data_rows
    ]

In [5]:
def top_2k_gms(pg_gm_rnks, n_pages=20):
    """Collect the game ids from the first ``n_pages`` ranked-browse pages.

    Parameters
    ----------
    pg_gm_rnks : str
        Base URL of the browse pages; the 1-based page number is appended to it.
    n_pages : int, optional
        How many pages to fetch, starting at page 1. Defaults to 20, the
        value that used to be hard-coded.

    Returns
    -------
    list of int
        The ids returned by ``extract_gm_id`` for each page, concatenated in
        page order. Empty when ``n_pages`` is 0.
    """
    gm_ids = []
    for pg_num in range(1, n_pages + 1):
        # The context manager closes the HTTP response as soon as the page is parsed.
        with request.urlopen(f'{pg_gm_rnks}{pg_num}') as pg:
            soup = BeautifulSoup(pg, 'html.parser')
        gm_ids += extract_gm_id(soup)
    return gm_ids

In [6]:
# Scrape the ranking pages for the game ids; the trailing expression shows how many were collected.
gm_ids = top_2k_gms(pg_gm_rnks)
len(gm_ids)

2000

# Extract the info for each game in the top 2000 using the extracted game IDs

In [7]:
# Root of the BoardGameGeek XML API2.
bs_pg = 'https://www.boardgamegeek.com/xmlapi2/'
# `thing` query prefix: board games only, with stats and the first page of rating comments
# (extract_game_item reads the `totalitems` attribute of <comments>).
# A comma-separated id list is appended after `id=`.
bs_pg_gm = bs_pg + 'thing?' + '&'.join([
    'type=boardgame',
    'stats=1',
    'ratingcomments=1',
    'page=1',
    'pagesize=10',
    'id=',
])

In [8]:
def extract_game_item(item):
    """Flatten one ``<item>`` element of a ``thing`` response into a dict.

    Parameters
    ----------
    item : parsed ``<item>`` element offering ``find`` / ``find_all`` and
        attribute access by key (a BeautifulSoup tag).

    Returns
    -------
    dict
        Keys, in insertion order:

        * ``name`` - ``value`` of the first ``<name>`` element.
        * ``id`` - the item's ``id`` attribute, kept as a string.
        * ``num_of_rankings`` - ``totalitems`` of ``<comments>`` as int, or -1
          when the element is absent (e.g. the request did not ask for rating
          comments). This used to raise ``TypeError``.
        * the integer fields (year, player counts, play times, minimum age),
          -1 when the element is absent.
        * one list of ``value`` strings per ``<link>`` type (category,
          mechanic, family, designer, artist, publisher); empty when none.
        * one key per ``<rank>`` element, named after its ``friendlyname``
          with spaces removed; -1 when the value is ``'Not Ranked'``.
        * the rating statistics as floats, -1 when the element is absent.
    """
    missing = -1  # sentinel stored for every absent / unranked value
    int_fields = ('yearpublished', 'minplayers', 'maxplayers', 'playingtime',
                  'minplaytime', 'maxplaytime', 'minage')
    link_types = ('boardgamecategory', 'boardgamemechanic', 'boardgamefamily',
                  'boardgamedesigner', 'boardgameartist', 'boardgamepublisher')
    stat_fields = ('usersrated', 'average', 'bayesaverage', 'stddev', 'median',
                   'owned', 'trading', 'wanting', 'wishing', 'numcomments',
                   'numweights', 'averageweight')

    def _value_of(elem, cast):
        # Absent elements are recorded with the sentinel instead of raising.
        return missing if elem is None else cast(elem['value'])

    rank_names = [rank['friendlyname'] for rank in item.find_all('rank')]
    comments = item.find('comments')

    gm_dict = {}
    gm_dict['name'] = item.find('name')['value']
    gm_dict['id'] = item['id']
    # Treat a missing <comments> element like every other optional field.
    gm_dict['num_of_rankings'] = missing if comments is None else int(comments['totalitems'])

    for field in int_fields:
        gm_dict[field] = _value_of(item.find(field), int)

    for link_type in link_types:
        gm_dict[link_type] = [link['value'] for link in item.find_all('link', {'type': link_type})]

    for rank_name in rank_names:
        rank = item.find('rank', {'friendlyname': rank_name})
        unranked = rank is None or rank['value'] == 'Not Ranked'
        gm_dict[rank_name.replace(' ', '')] = missing if unranked else int(rank['value'])

    for field in stat_fields:
        gm_dict[field] = _value_of(item.find(field), float)

    return gm_dict

In [9]:
def create_df_gm_ranks(gm_ids, bs_pg_gm, idx_split=4):
    """Fetch the ``thing`` data for ``gm_ids`` and return it as a DataFrame.

    The ids are sent in at most ``idx_split`` requests. The chunk size is
    rounded up, so ids beyond a multiple of ``idx_split`` are no longer
    dropped, and no request is made for an empty chunk.

    Parameters
    ----------
    gm_ids : sequence of int
        Game ids to look up.
    bs_pg_gm : str
        URL prefix ending in ``id=``; the comma-separated chunk is appended.
    idx_split : int, optional
        Maximum number of requests to spread the ids over. Defaults to 4,
        the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` element returned, built from
        ``extract_game_item``. Empty when ``gm_ids`` is empty.

    Raises
    ------
    ValueError
        If ``idx_split`` is smaller than 1.
    """
    if idx_split < 1:
        raise ValueError(f'idx_split must be at least 1, got {idx_split}')

    gm_ids = list(gm_ids)
    gm_list = []
    if gm_ids:
        # Ceiling division: a remainder lands in the final chunk instead of being lost.
        idx_size = (len(gm_ids) + idx_split - 1) // idx_split
        for start in range(0, len(gm_ids), idx_size):
            idx = ','.join(str(gm_id) for gm_id in gm_ids[start:start + idx_size])
            # The context manager closes the HTTP response once it has been parsed.
            with request.urlopen(f'{bs_pg_gm}{idx}') as pg:
                xsoup = BeautifulSoup(pg, 'xml')
            gm_list += [extract_game_item(x) for x in xsoup.find_all('item')]
    return pd.DataFrame(gm_list)

In [10]:
# Fetch and parse the details for every collected id into one DataFrame (one row per returned <item>).
df = create_df_gm_ranks(gm_ids, bs_pg_gm)

In [20]:
# Save today's snapshot. The name follows the `bgg_top<N>_<date>.csv` pattern that the
# dataset metadata registers as the resource path, so the file and its metadata entry match
# (the previous `<date>_bgg_top<N>.csv` name did not).
df.to_csv(f'../data/kaggle/bgg_top{len(gm_ids)}_{datetime.now().date()}.csv', index=False)

In [11]:
# Sanity check: (rows, columns) of the assembled frame.
df.shape

(2000, 38)

In [14]:
# Read the stored dataset metadata back into a dictionary so a new resource can be added to it.
with open('../data/kaggle/dataset-metadata.json', 'rb') as f:
    meta_dict = json.loads(f.read())

In [13]:
# Scratch: rebuild only the first request chunk used by create_df_gm_ranks
# (the comma-separated ids of the first quarter of gm_ids) for manual inspection.
gm_list = []
idx_split = 4
idx_size = int(len(gm_ids) / idx_split)
i = 0  # only the first chunk is needed
idx = str(gm_ids[i * idx_size:(i + 1) * idx_size]).replace(' ', '')[1:-1]

In [16]:
# Hand-written comma-separated list of game ids; not referenced by any other cell shown here.
idx2 = '174430,161936,182028,167791,12333,187645,169786,220308,120677,193738,84876,173346,180263,115746,3076,102794,205637'

In [21]:
# Scratch: request the `thing` endpoint for the ids in `idx` via urllib.
# Relies on `idx` from the scratch cell that builds the first chunk.
pg = request.urlopen(f'{bs_pg_gm}{str(idx)}')

In [23]:
# Parse the urllib response as XML.
xsoup = BeautifulSoup(pg, 'xml')

In [29]:
# Count the <item> elements in the urllib response.
aa = xsoup.find_all('item')
len(aa)

500

In [None]:
# NOTE(review): leftover scratch call. This cell has no execution count and passes no
# arguments; urllib3's PoolManager.urlopen expects a method and a URL, so this presumably
# raises if run. TODO confirm and remove.
http.urlopen()

In [19]:
# Same request as the urllib scratch cell, issued through the urllib3 pool instead.
r = http.request('GET', f'{bs_pg_gm}{str(idx)}')



In [30]:
# Parse the body of the urllib3 response (r.data) as XML.
xsoup2 = BeautifulSoup(r.data, 'xml')

In [31]:
# Count the <item> elements parsed from the urllib3 response. This must read `xsoup2`;
# reading `xsoup` would only repeat the urllib count and compare nothing.
bb = xsoup2.find_all('item')
len(bb)

500

In [16]:
# Register today's snapshot in the dataset metadata.
snapshot_date = datetime.now().date()  # read the clock once so path and description always agree
new_resource = {
    'path': f'bgg_top{len(gm_ids)}_{snapshot_date}.csv',
    # Use the actual number of games rather than a hard-coded 2000, matching the path.
    'description': f'Board Game Geek top {len(gm_ids)} games on {snapshot_date}',
}
# Skip the append when this path is already listed, so re-running the cell does not add duplicates.
if all(res.get('path') != new_resource['path'] for res in meta_dict['resources']):
    meta_dict['resources'].append(new_resource)

In [17]:
# Display the metadata to check the resource list before writing it back.
meta_dict

{'title': 'BGG Top 2000',
 'subtitle': 'Board Game Geek top 2000 games rankings',
 'description': 'Board Game Geek top 2000 games rankings and other info',
 'id': 'mseinstein/bgg_top2000',
 'licenses': [{'name': 'CC-BY-SA-4.0'}],
 'resources': [{'path': 'bgg_top2000_2018-10-06.csv',
   'description': 'Board Game Geek top 2000 games on 2018-10-06'},
  {'path': 'bgg_top2000_2018-10-13.csv',
   'description': 'Board Game Geek top 2000 games on 2018-10-13'},
  {'path': 'bgg_top2000_2018-10-20.csv',
   'description': 'Board Game Geek top 2000 games on 2018-10-20'}]}

In [19]:
# Write the updated metadata back, replacing the previous file content.
with open('../data/kaggle/dataset-metadata.json', 'w') as fp:
    fp.write(json.dumps(meta_dict))

# BGG XML API2

Base URI: /xmlapi2/thing?parameters
- id=NNN	
  - Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids.
- type=THINGTYPE	
  - Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
- versions=1	
  - Returns version info for the item.
- videos=1
  - Returns videos for the item.
- stats=1		
  - Returns ranking and rating stats for the item.
- historical=1		
  - Returns historical data over time. See page parameter.
- marketplace=1		
  - Returns marketplace data.
- comments=1		
  - Returns all comments about the item. Also includes ratings when commented. See page parameter.
- ratingcomments=1		
  - Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the \<comments\> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
- page=NNN		
  - Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
- pagesize=NNN		
  - Set the number of records to return in paging. Minimum is 10, maximum is 100.
- from=YYYY-MM-DD		
  - Not currently supported.
- to=YYYY-MM-DD		
  - Not currently supported.