In [1]:
import requests
import pandas as pd
import difflib
from bs4 import BeautifulSoup
import re
# import fuzzywuzzy
from thefuzz import process
from thefuzz import fuzz
import numpy as np


In [2]:
def get_categories(url) -> 'List': 
    ''' 
    Params: str: Board game url (ex: https://boardgamegeek.com/xmlapi/boardgame/{gameID})

    Returns: List: List of categories 
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text)

    cat_text = soup.find_all('boardgamecategory')
    cat_list = []
    for cats in cat_text:
        s = str(cats)

        # extract important text between <stuff> important text </stuff>
        c = re.sub(r'<[^>]+>', '', s)
        cat_list.append(c)

    return(cat_list)

In [3]:
testUrl = 'https://boardgamegeek.com/xmlapi/boardgame/13'

print(get_categories(testUrl))

['Economic', 'Negotiation']


### Getting all the categories from BGG

In [4]:
categories_url = 'https://boardgamegeek.com/browse/boardgamecategory'

categories_response = requests.get(categories_url)

# print(response.text)

category_soup = BeautifulSoup(categories_response.text)
# print(soup.find_all('a'))

categories = []
for link in category_soup.find_all('a'):
    cat_string = link.get('href')
    if cat_string != None:
        categories.append(cat_string.split('/')[-1])
        
## the categories are, luckily, alphabetized
print(categories)

['abstract-strategy', 'action-dexterity', 'adventure', 'age-of-reason', 'american-civil-war', 'american-indian-wars', 'american-revolutionary-war', 'american-west', 'ancient', 'animals', 'arabian', 'aviation-flight', 'bluffing', 'book', 'card-game', 'childrens-game', 'city-building', 'civil-war', 'civilization', 'collectible-components', 'comic-book-strip', 'deduction', 'dice', 'economic', 'educational', 'electronic', 'environmental', 'expansion-for-base-game', 'exploration', 'fan-expansion', 'fantasy', 'farming', 'fighting', 'game-system', 'horror', 'humor', 'industry-manufacturing', 'korean-war', 'mafia', 'math', 'mature-adult', 'maze', 'medical', 'medieval', 'memory', 'miniatures', 'modern-warfare', 'movies-tv-radio-theme', 'murdermystery', 'music', 'mythology', 'napoleonic', 'nautical', 'negotiation', 'novel-based', 'number', 'party-game', 'pike-and-shot', 'pirates', 'political', 'post-napoleonic', 'prehistoric', 'print-and-play', 'puzzle', 'racing', 'real-time', 'religious', 'rena

### Merging bgg and shelfside data on name
ID might be better to be honest

In [5]:
bgg_full_gamelist = pd.read_csv('../data/bgg_gamelist.csv')
shelfside_gamelist = pd.read_csv('../data/Shelfside Games Reviewed - Review Data.csv')

## Renaming Game to name for compatibility
shelfside_gamelist = shelfside_gamelist.rename(columns = {'Game':'name'})

## need to drop rows either Ash or Dan hasn't given a rating
relevant_metrics = ['name', 'score', 'dan score', 'ash score']
data = shelfside_gamelist[relevant_metrics]
data_cleaned = data.dropna(subset = relevant_metrics)
# data_cleaned

## getting the urls for every game in the bgg_gamelist
game_ids = bgg_full_gamelist['id'].to_list()
game_urls = [f'https://boardgamegeek.com/xmlapi/boardgame/{x}' for x in game_ids]
bgg_full_gamelist['game_urls'] = game_urls
bgg_full_gamelist

Unnamed: 0,id,name,nrate,pic_url,game_urls
0,13,CATAN,128958,https://cf.geekdo-images.com/W3Bsga_uLP9kO91gZ...,https://boardgamegeek.com/xmlapi/boardgame/13
1,822,Carcassonne,128106,https://cf.geekdo-images.com/okM0dq_bEXnbyQTOv...,https://boardgamegeek.com/xmlapi/boardgame/822
2,30549,Pandemic,126448,https://cf.geekdo-images.com/S3ybV1LAp-8SnHIXL...,https://boardgamegeek.com/xmlapi/boardgame/30549
3,68448,7 Wonders,105376,https://cf.geekdo-images.com/35h9Za_JvMMMtx_92...,https://boardgamegeek.com/xmlapi/boardgame/68448
4,167791,Terraforming Mars,100322,https://cf.geekdo-images.com/wg9oOLcsKvDesSUdZ...,https://boardgamegeek.com/xmlapi/boardgame/167791
...,...,...,...,...,...
995,105593,Cheating Moth,5088,https://cf.geekdo-images.com/7w7ht4bmAnu0j-_qg...,https://boardgamegeek.com/xmlapi/boardgame/105593
996,2386,Chinese Checkers,5078,https://cf.geekdo-images.com/MFhxbJJRHjIb4oWn7...,https://boardgamegeek.com/xmlapi/boardgame/2386
997,244995,Illusion,5071,https://cf.geekdo-images.com/jcFaI9hr6iXwpwCZT...,https://boardgamegeek.com/xmlapi/boardgame/244995
998,226522,Exit: The Game – Dead Man on the Orient Express,5069,https://cf.geekdo-images.com/atCG4ujYJ-L5LKID3...,https://boardgamegeek.com/xmlapi/boardgame/226522


In [14]:
def matching(list_in, list_compare):

    list_out = [None]*len(list_in)
    for i in range(len(list_in)):
        # print(f'\nShelfname: {list_in[i]}')
        # score_list = process.extractWithoutOrder(list_in[i], list_compare, scorer = fuzz.partial_ratio, score_cutoff=81)
        score_list = process.extractWithoutOrder(list_in[i], list_compare, scorer = fuzz.ratio, score_cutoff=70)

        score_list = list(score_list)
        rating = []
        names = []

        if len(score_list) == 0:
            # no matches
            list_out[i] = list_in[i]
            continue
            
        for item in score_list:
            # print(item)
            new_name, name_score = item[0], item[1]
            if name_score == 100:
                list_out[i] = new_name
                # print(f'Exact match, Replaced shelfname: {list_in[i]} with bgg name: {new_name}')
                break
                
            elif name_score > 94:
                list_out[i] = new_name
                # print(f'Close match, Replaced shelfname: {list_in[i]} with bgg name: {new_name}')
                
            else:
                # print(f'No good matches')
                list_out[i] = list_in[i]
                continue

    return(list_out)

In [15]:
shelf_names = data_cleaned['name'].to_list()
bgg_names = bgg_full_gamelist['name'].to_list()

# print(len(shelf_names))
# print(len(bgg_names))
matched_names = matching(shelf_names, bgg_names)


In [20]:
data_cleaned['name'] = matched_names
data_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['name'] = matched_names


Unnamed: 0,name,score,dan score,ash score
0,Dominion,8.0,4.0,7.0
1,The Resistance,9.0,5.0,6.0
2,Incan Gold,7.0,8.0,8.0
3,Splendor,9.0,7.0,6.0
4,Dixit,6.0,8.0,4.0
...,...,...,...,...
120,Twilight Inscription,5.0,3.0,6.0
139,Acquire,9.0,6.0,7.0
140,Earth,10.0,6.0,6.0
144,Bloodstones,6.0,5.0,4.0


In [None]:


# data_cleaned['name'] = data_cleaned['name'].apply(
#   lambda x: process.extractOne(x, bgg_full_gamelist['name'], scorer=fuzz.partial_ratio, score_cutoff=80)[0]
# )
# data_cleaned

In [None]:
# df0 = pd.Dataframe

## super slow for loop but gotta see em: 
# for x in game_urls:
#     print(x)
#     print(get_categories(x))

In [None]:
## map goes like map(func, iterable) -- takes forever too but more than a thousand games
# result = map(get_categories, game_urls)
# list(result)

In [None]:
# def fuzzy_match(row, choices, scorer=fuzz.partial_ratio, threshold=80):
#     match = process.extractOne(row['name'], choices, scorer=scorer)
#     # Only return match if score is above a certain threshold, else return None
#     if match and match[1] >= threshold:
#         return match[0]
#     else:
#         return None

# # Apply fuzzy matching and create a new column for the matched name
# bgg_full_gamelist['fuzzy_name'] = bgg_full_gamelist.apply(
#     lambda row: fuzzy_match(row, data_cleaned['name']), axis=1
# )

# # Merge the dataframes based on the fuzzy matched name
# merged_df = pd.merge(
#     data_cleaned, 
#     bgg_full_gamelist, 
#     left_on='name', 
#     right_on='fuzzy_name', 
#     how="left"
# )
