In [1]:
import requests
import pandas as pd
import difflib
from bs4 import BeautifulSoup
import re
# import fuzzywuzzy
from thefuzz import process
from thefuzz import fuzz
import numpy as np


In [2]:
def get_categories(url) -> 'List': 
    ''' 
    Params: str: Board game url (ex: https://boardgamegeek.com/xmlapi/boardgame/{gameID})

    Returns: List: List of categories 
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text)

    cat_text = soup.find_all('boardgamecategory')
    cat_list = []
    for cats in cat_text:
        s = str(cats)

        # extract important text between <stuff> important text </stuff>
        c = re.sub(r'<[^>]+>', '', s)
        cat_list.append(c)

    return(cat_list)

def get_description(url) -> 'str': 
    ''' 
    Params: str: Board game url (ex: https://boardgamegeek.com/xmlapi/boardgame/{gameID})

    Returns: str: string of description
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text)

    cat_text = soup.find_all('description')

    return(cat_text)

In [3]:
testUrl = 'https://boardgamegeek.com/xmlapi/boardgame/13'

print(get_categories(testUrl))
print(get_description(testUrl))


['Economic', 'Negotiation']
[<description>In CATAN (formerly The Settlers of Catan), players try to be the dominant force on the island of Catan by building settlements, cities, and roads. On each turn dice are rolled to determine what resources the island produces. Players build by spending resources (sheep, wheat, wood, brick and ore) that are depicted by these resource cards; each land type, with the exception of the unproductive desert, produces a specific resource: hills produce brick, forests produce wood, mountains produce ore, fields produce wheat, and pastures produce sheep.&lt;br/&gt;&lt;br/&gt;Set-up includes randomly placing large hexagonal tiles (each showing a resource or the desert) in a honeycomb shape and surrounding them with water tiles, some of which contain ports of exchange. Number disks, which will correspond to die rolls (two 6-sided dice are used), are placed on each resource tile. Each player is given two settlements (think: houses) and roads (sticks) which ar

### Getting all the categories from BGG

In [4]:
categories_url = 'https://boardgamegeek.com/browse/boardgamecategory'

categories_response = requests.get(categories_url)

# print(response.text)

category_soup = BeautifulSoup(categories_response.text)
# print(soup.find_all('a'))

categories = []
for link in category_soup.find_all('a'):
    cat_string = link.get('href')
    if cat_string != None:
        categories.append(cat_string.split('/')[-1])
        
## the categories are, luckily, alphabetized
# print(categories)

In [5]:
# Outputs to Title format for later
category_titles = [x.replace('-', ' ').title() for x in categories] 

# Data Import


In [6]:
shelf_df = pd.read_csv('../data/Shelfside Games Reviewed - Review Data.csv')
relevant_metrics = ['name', 'id', 'shelfside score', 'dan score', 'ash score']
data = shelf_df[relevant_metrics]
data_cleaned = data.dropna(subset = ['id', 'shelfside score'])
data_cleaned

Unnamed: 0,name,id,shelfside score,dan score,ash score
0,Dominion,36218,8.0,4.0,7.0
1,The Resistance,41114,9.0,5.0,6.0
2,Incan Gold,15512,7.0,8.0,8.0
3,Splendor,148228,9.0,7.0,6.0
4,Dixit,39856,6.0,8.0,4.0
...,...,...,...,...,...
151,Brink Prototype,415078,6.0,,6.0
152,Degenesis: Clan Wars Prototype,388747,4.0,,6.0
153,Everdell,199792,9.0,7.0,9.0
154,Cyberpunk 2077: The Board Game,417542,7.0,,7.0


In [7]:
game_ids = data_cleaned['id'].to_list()
# game_ids = [x for x in game_ids if x != 'NaN']
game_urls = [f'https://boardgamegeek.com/xmlapi/boardgame/{id}' for id in game_ids]
game_categories = list(map(get_categories, game_urls))
game_descriptions = list(map(get_description, game_urls))

In [8]:
data_cleaned['categories'] = game_categories
data_cleaned['descriptions'] = game_descriptions
data_cleaned.to_csv('data_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['categories'] = game_categories
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['descriptions'] = game_descriptions


In [9]:
data_cleaned

Unnamed: 0,name,id,shelfside score,dan score,ash score,categories,descriptions
0,Dominion,36218,8.0,4.0,7.0,"[Card Game, Medieval]","[[&quot;You are a monarch, like your parents b..."
1,The Resistance,41114,9.0,5.0,6.0,"[Bluffing, Card Game, Deduction, Negotiation, ...",[[The Empire must fall. Our mission must succe...
2,Incan Gold,15512,7.0,8.0,8.0,"[Adventure, Bluffing, Exploration]",[[Diamant &mdash; also published as Incan Gold...
3,Splendor,148228,9.0,7.0,6.0,"[Card Game, Economic, Renaissance]",[[Splendor is a game of chip-collecting and ca...
4,Dixit,39856,6.0,8.0,4.0,"[Card Game, Humor, Party Game]","[[Each turn in Dixit, one player is the storyt..."
...,...,...,...,...,...,...,...
151,Brink Prototype,415078,6.0,,6.0,"[Science Fiction, Space Exploration]",[[The Moonrakers are on the brink.<br/><br/>Th...
152,Degenesis: Clan Wars Prototype,388747,4.0,,6.0,"[Miniatures, Wargame]","[[Degenesis: Clan Wars is an asymmetric, semi-..."
153,Everdell,199792,9.0,7.0,9.0,"[Animals, Card Game, City Building, Fantasy]","[[Within the charming valley of Everdell, bene..."
154,Cyberpunk 2077: The Board Game,417542,7.0,,7.0,"[Adventure, Dice, Miniatures, Science Fiction,...",[[Cyberpunk 2077: the Board Game is a fast-pac...


In [10]:
# user_matrix_ash = pd.pivot_table(data_cleaned, index = ['categories'])

pd.DataFrame(columns = category_titles)

Unnamed: 0,Abstract Strategy,Action Dexterity,Adventure,Age Of Reason,American Civil War,American Indian Wars,American Revolutionary War,American West,Ancient,Animals,...,Transportation,Travel,Trivia,Video Game Theme,Vietnam War,Wargame,Word Game,World War I,World War Ii,Zombies


In [11]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
data_cleaned['description'] = data_cleaned['description'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(data_cleaned['description'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(data_cleaned.index, index=data_cleaned['name']).drop_duplicates()

def content_recommender(name, cosine_sim=cosine_sim, df=data_cleaned, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[name]
    
    # Get the pairwise similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    game_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar games
    return df['name'].iloc[game_indices]

#Get recommendations
content_recommender('Dominion')

KeyError: 'description'