<a href="https://colab.research.google.com/github/MatheusSC017/BoardGameGeek_WebScraper/blob/main/BoardGameGeek_WebScrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
from bs4 import BeautifulSoup
from openpyxl import Workbook
from tqdm import tqdm
from time import sleep
import pandas as pd
import requests
import json
import re

# Getting ranking of Board Games

In [None]:
def get_table_data(url):
    response = requests.get(url)

    if response.status_code != 200:
        raise HTTPError()

    html = BeautifulSoup(response.text, 'html.parser')

    table = html.find(id='collectionitems').find_all('tr')
    head = table[0]
    data = table[1:]

    return data, head

In [None]:
def get_columns(head):
    fields = head.find_all('th')
    columns = [field.text.strip() for field in fields]
    return columns

In [None]:
def get_value(line):
    fields = line.find_all('td')
    values = [re.sub('[\n,\t]', ' ', field.text.strip()) for field in fields]
    return values

In [None]:
def save_dataframe(dataframe):
    wb = Workbook()

    ws = wb.active
    ws.title = 'BoardGameList'

    ws.append(list(dataframe.columns))
    for line in dataframe.values:
        ws.append(list(line))

    wb.save('GeekGames.xlsx')

In [None]:
geek_games_data = None
columns = None
i = 1
while True:
    try: 
        url = f'https://boardgamegeek.com/browse/boardgame/page/{i}'
        i += 1
        data, head = get_table_data(url)
        if columns is None:
            columns = get_columns(head)
            geek_games_data = pd.DataFrame(columns=columns)
    except:
        break
    for game in data:
        fields = game.find_all('td')
        values = [re.sub('[\n,\t]', ' ', field.text.strip()) for field in fields]
        if len(values) == 7:
            geek_games_data = geek_games_data.append(
                pd.Series(values, index=columns),
                ignore_index=True
            )
    sleep(10)
geek_games_data.drop('Shop', axis=1)
save_dataframe(geek_games_data)

In [None]:
geek_games_data.shape

(2000, 7)

In [None]:
geek_games_data.head()

Unnamed: 0,Board Game Rank,Thumbnail image,Title,Geek Rating,Avg Rating,Num Voters,Shop
0,1,,Gloomhaven (2017) Vanquish monsters with ...,8.474,8.71,51616,
1,2,,Brass: Birmingham (2018) Build networks ...,8.429,8.66,29998,
2,3,,Pandemic Legacy: Season 1 (2015) Mutating...,8.426,8.58,47341,
3,4,,Gloomhaven: Jaws of the Lion (2020) Vanqu...,8.258,8.6,21648,
4,5,,Terraforming Mars (2016) Compete with riv...,8.257,8.4,80817,


# Getting Game informations

In [3]:
response = requests.get('https://boardgamegeek.com/sitemapindex')
lxml = BeautifulSoup(response.text, 'lxml')
list_of_pages = lxml.find_all('loc')

In [4]:
page_list = list()

for list_of_page in tqdm(list_of_pages[:10]):
    response = requests.get(list_of_page.text.strip())
    lxml = BeautifulSoup(response.text, 'lxml')
    page_list.extend([page.text.strip() for page in lxml.find_all('loc')])
    
    sleep(10)

100%|██████████| 10/10 [02:02<00:00, 12.20s/it]


In [7]:
print(f'Number of board games: {len(page_list)}')
print(f'Link of the first game: {page_list[0]}')

Number of board games: 100000
Link of the first game: https://boardgamegeek.com/boardgame/1/die-macher


In [25]:
response = requests.get(page_list[0])
html = BeautifulSoup(response.text, 'html.parser')

In [41]:
game_board = html.find('script', attrs={'type': 'application/ld+json'}).text
game_board = json.loads(game_board)
game_board

{'@context': 'http://schema.org',
 '@type': 'Product',
 'aggregateRating': {'@type': 'AggregateRating',
  'bestRating': '10',
  'ratingValue': '7.60792',
  'reviewCount': '5505',
  'worstRating': '1'},
 'description': 'Players represent political parties attempting to gain power in Germany.',
 'image': 'https://cf.geekdo-images.com/rpwCZAjYLD940NWwP3SRoA__itemrep/img/66VYYgqwBJGOxsh_8sLaBYROeCU=/fit-in/246x300/filters:strip_icc()/pic4718279.jpg',
 'name': 'Die Macher'}

In [43]:
game_board['aggregateRating']['ratingValue']

'7.60792'