In [2]:
# setup

import pandas as pd 
import numpy as np

# This project is about the popularity of board games, and what makes a board game successful.

One of my main hobbies is board games. Board games have evolved significantly from the classics, such as _Monopoly_, _Scrabble_, and _The Game of Life_. The modern board game is incredibly diverse, with mechanics ranging from card drafting to worker placement and resource management.

One of the major board game sites, <a href="https://boardgamegeek.com/">_BoardGameGeek_</a>, is a platform for the community to document releases, provide ratings and reviews, and discuss the intricacies of the hobby.
This project will be using this dataset: <a href="https://www.kaggle.com/datasets/jvanelteren/boardgamegeek-reviews">BoardGameGeek Reviews</a>.
I'll be asking questions and searching for answers throughout the project.

## Data import and cleaning.

The dataset for this project is split into 5 CSV files. 
<ul>
    <li>A list of 15 Million reviews, collected in 2020</li>
    <li>A list of 19 Million reviews, collected in 2022</li>
    <li>A list of Board games with over 30 reviews, collected in 2020</li>
    <li>A list of Board games with over 30 reviews, collected in 2022</li>
    <li>A list of board games with detailed information.</li>
</ul>

In [5]:
# Directory the CSV files are read from.
data = "./data"

# Games list snapshot dated 2020-08-19.
games_2020_path = f"{data}/2020-08-19.csv"
df_20 = pd.read_csv(games_2020_path)
df_20.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Year,Rank,Average,Bayes average,Users rated,URL,Thumbnail
0,90,30549,Pandemic,2008,91,7.62,7.518,96186,/boardgame/30549/pandemic,https://cf.geekdo-images.com/micro/img/0m3-oqB...
1,172,822,Carcassonne,2000,173,7.42,7.311,96181,/boardgame/822/carcassonne,https://cf.geekdo-images.com/micro/img/z0tTaij...
2,380,13,Catan,1995,381,7.16,7.001,96171,/boardgame/13/catan,https://cf.geekdo-images.com/micro/img/e0y6Bog...
3,49,68448,7 Wonders,2010,50,7.76,7.662,79830,/boardgame/68448/7-wonders,https://cf.geekdo-images.com/micro/img/h-Ejv31...
4,87,36218,Dominion,2008,88,7.63,7.521,74933,/boardgame/36218/dominion,https://cf.geekdo-images.com/micro/img/VYp2s2f...


In [6]:
# Index the 2020 games by their BoardGameGeek ID; later cells compare the two
# snapshots through this index. Reassigning (instead of inplace=True) and
# guarding on the index name keeps the cell safe to re-run: once 'ID' is the
# index it is no longer a column, and a second set_index('ID') would raise.
if df_20.index.name != 'ID':
    df_20 = df_20.set_index('ID')

In [7]:
# Rows that exactly repeat an earlier row.
duplicate_rows_20 = df_20.loc[df_20.duplicated()]
display(duplicate_rows_20)

# Count how often each per-column pattern of missing values occurs.
df_20.isna().value_counts()

Unnamed: 0_level_0,Unnamed: 0,Name,Year,Rank,Average,Bayes average,Users rated,URL,Thumbnail
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


Unnamed: 0  Name   Year   Rank   Average  Bayes average  Users rated  URL    Thumbnail
False       False  False  False  False    False          False        False  False        19317
                                                                             True            13
Name: count, dtype: int64

In [8]:
# The data is very clean: the only missing values are 13 thumbnails. Drop that
# column along with the other columns that are not needed for the analysis.
to_drop = ["Thumbnail", "URL", "Unnamed: 0"]

# Reassign instead of inplace=True, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df_20 = df_20.drop(columns=to_drop, errors="ignore")

In [9]:
# Games list snapshot dated 2022-01-08, read from the same data directory.
games_2022_path = f"{data}/2022-01-08.csv"
df_22 = pd.read_csv(games_2022_path)
df_22.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Year,Rank,Average,Bayes average,Users rated,URL,Thumbnail
0,105,30549,Pandemic,2008,106,7.59,7.487,108975,/boardgame/30549/pandemic,https://cf.geekdo-images.com/S3ybV1LAp-8SnHIXL...
1,189,822,Carcassonne,2000,190,7.42,7.309,108738,/boardgame/822/carcassonne,https://cf.geekdo-images.com/okM0dq_bEXnbyQTOv...
2,428,13,Catan,1995,429,7.14,6.97,108024,/boardgame/13/catan,https://cf.geekdo-images.com/W3Bsga_uLP9kO91gZ...
3,72,68448,7 Wonders,2010,73,7.74,7.634,89982,/boardgame/68448/7-wonders,https://cf.geekdo-images.com/RvFVTEpnbb4NM7k0I...
4,103,36218,Dominion,2008,104,7.61,7.499,81561,/boardgame/36218/dominion,https://cf.geekdo-images.com/j6iQpZ4XkemZP07HN...


In [10]:
# Index the 2022 games by their BoardGameGeek ID; later cells compare the two
# snapshots through this index. Reassigning (instead of inplace=True) and
# guarding on the index name keeps the cell safe to re-run: once 'ID' is the
# index it is no longer a column, and a second set_index('ID') would raise.
if df_22.index.name != 'ID':
    df_22 = df_22.set_index('ID')

In [11]:
# Rows that exactly repeat an earlier row.
duplicate_rows_22 = df_22.loc[df_22.duplicated()]
display(duplicate_rows_22)

# Count how often each per-column pattern of missing values occurs.
df_22.isna().value_counts()

Unnamed: 0_level_0,Unnamed: 0,Name,Year,Rank,Average,Bayes average,Users rated,URL,Thumbnail
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


Unnamed: 0  Name   Year   Rank   Average  Bayes average  Users rated  URL    Thumbnail
False       False  False  False  False    False          False        False  False        21825
                                                                             True             6
Name: count, dtype: int64

In [12]:
# The data is very clean: the only missing values are 6 thumbnails. Drop that
# column along with the other columns that are not needed for the analysis.
# Reassign instead of inplace=True, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df_22 = df_22.drop(columns=to_drop, errors="ignore")

With just the data from the 2 lists of games, there are a few questions I'd like to ask:
<ol>
    <li>What are the highest rated games in both years?</li>
    <li>What is the average user rating in both years? Do users think the quality of games has gone up or down?</li>
    <li>How many more board games are there in 2022, compared to 2020?</li>
    <li>Which games had the highest change in rank?</li>
</ol>


### 1. What are the highest rated games in both years?

In [14]:
# Ascending sort puts Rank 1 first; keep the first 20 rows.
games_2020_by_rank = df_20.sort_values('Rank')
top20_2020 = games_2020_by_rank.iloc[:20]
top20_2020

Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
174430,Gloomhaven,2017,1,8.82,8.569,37053
161936,Pandemic Legacy: Season 1,2015,2,8.62,8.47,38345
224517,Brass: Birmingham,2018,3,8.64,8.305,14531
167791,Terraforming Mars,2016,4,8.43,8.282,57319
233078,Twilight Imperium (Fourth Edition),2017,5,8.69,8.213,11466
182028,Through the Ages: A New Story of Civilization,2015,6,8.45,8.211,21035
220308,Gaia Project,2017,7,8.5,8.162,14270
187645,Star Wars: Rebellion,2016,8,8.42,8.16,21089
12333,Twilight Struggle,2005,9,8.3,8.148,38869
193738,Great Western Trail,2016,10,8.29,8.109,26372


In [15]:
# Ascending sort puts Rank 1 first; keep the first 20 rows.
games_2022_by_rank = df_22.sort_values('Rank')
top20_2022 = games_2022_by_rank.iloc[:20]
top20_2022

Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
174430,Gloomhaven,2017,1,8.74,8.511,47827
161936,Pandemic Legacy: Season 1,2015,2,8.59,8.442,45041
224517,Brass: Birmingham,2018,3,8.66,8.418,25484
167791,Terraforming Mars,2016,4,8.42,8.274,74216
233078,Twilight Imperium: Fourth Edition,2017,5,8.68,8.262,16025
291457,Gloomhaven: Jaws of the Lion,2020,6,8.68,8.26,15918
220308,Gaia Project,2017,7,8.47,8.175,19169
187645,Star Wars: Rebellion,2016,8,8.42,8.172,25586
182028,Through the Ages: A New Story of Civilization,2015,9,8.38,8.15,25605
115746,War of the Ring: Second Edition,2012,10,8.51,8.136,15498


I've chosen an arbitrary cut-off of 20 for the top games. But how many games stayed in the top 20, and which ones?

In [17]:
# IDs present in the top 20 of both snapshots, kept in their 2020 rank order.
# A vectorised membership test replaces the Python loop, which also rebound
# the builtin name `id` at notebook scope.
in_both_top20 = top20_2020.index.isin(top20_2022.index)
stayed_in_top20_ids = top20_2020.index[in_both_top20].tolist()

# Show the 2022 figures for those games.
stayed_in_top20 = top20_2022.loc[stayed_in_top20_ids]
stayed_in_top20


#stayedString = ""
#for name in stayed_in_top20['Name'].values:
#    stayedString = stayedString + f"{name}\n" 
#print(f"{stayed_in_top20.shape[0]} games stayed in the top 20. The ones that stayed were: \n{stayedString}")

Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
174430,Gloomhaven,2017,1,8.74,8.511,47827
161936,Pandemic Legacy: Season 1,2015,2,8.59,8.442,45041
224517,Brass: Birmingham,2018,3,8.66,8.418,25484
167791,Terraforming Mars,2016,4,8.42,8.274,74216
233078,Twilight Imperium: Fourth Edition,2017,5,8.68,8.262,16025
182028,Through the Ages: A New Story of Civilization,2015,9,8.38,8.15,25605
220308,Gaia Project,2017,7,8.47,8.175,19169
187645,Star Wars: Rebellion,2016,8,8.42,8.172,25586
12333,Twilight Struggle,2005,13,8.28,8.113,43052
193738,Great Western Trail,2016,12,8.29,8.123,32960


In [18]:
def notin(first, second):
    """Return a list of the elements of ``first`` that are not in ``second``.

    The order of ``first`` (including any repeated elements) is preserved.
    Membership is tested with the ``in`` operator on ``second``.
    """
    return [item for item in first if item not in second]

# IDs that appear in only one of the two top-20 lists.
left_ids = notin(top20_2020.index, top20_2022.index)
entered_ids = notin(top20_2022.index, top20_2020.index)

# Games that dropped out are shown with their 2020 figures; newcomers with
# their 2022 figures.
left_top20 = top20_2020.loc[left_ids]
entered_top20 = top20_2022.loc[entered_ids]

print("These games left the top 20:")
display(left_top20)
print("These games entered the top 20:")
display(entered_top20)

These games left the top 20:


Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
205637,Arkham Horror: The Card Game,2016,19,8.19,7.929,25045
266192,Wingspan,2019,20,8.11,7.925,32712


These games entered the top 20:


Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
291457,Gloomhaven: Jaws of the Lion,2020,6,8.68,8.26,15918
167355,Nemesis,2018,17,8.39,7.982,17688


In [19]:
# Summary statistics for the numeric columns of the 2022 games list.
# NOTE(review): the output shows Year ranging from 0 to 3500, so Year appears
# to contain placeholder/invalid values - confirm before relying on it.
df_22.describe()

Unnamed: 0,Year,Rank,Average,Bayes average,Users rated
count,21831.0,21831.0,21831.0,21831.0,21831.0
mean,1987.440108,10916.0,6.41696,5.682149,866.964134
std,193.510505,6302.211199,0.930267,0.364763,3679.821978
min,0.0,1.0,1.04,0.0,30.0
25%,2001.0,5458.5,5.83,5.51,56.0
50%,2011.0,10916.0,6.45,5.545,122.0
75%,2017.0,16373.5,7.04,5.674,392.0
max,3500.0,21831.0,9.57,8.511,108975.0


In [20]:
# What is the mean of the per-game 'Average' rating in each of the two years?
avg_2020 = df_20.loc[:, 'Average'].mean()
avg_2022 = df_22.loc[:, 'Average'].mean()
print(
    f"The average in 2020 was: {avg_2020} \nThe average in 2022 was: {avg_2022}"
)

The average in 2020 was: 6.392255561303674 
The average in 2022 was: 6.416960285832074


In [21]:
# How about the games that lost/gained the most ranks?
#
# A game can be in one of three states:
#   * in both lists         -> 'Ranked change' = 2020 rank - 2022 rank
#                              (positive: numerically lower rank in 2022)
#   * only in the 2022 list -> 'Ranked change' is NaN
#   * only in the 2020 list -> not represented (the result is built from df_22)

def rank_change(id):
    """Return the 2020 rank minus the 2022 rank for the game with index ``id``.

    A positive value means the game holds a numerically lower rank in 2022
    than it did in 2020. Raises ``KeyError`` if ``id`` is missing from
    ``df_20`` or ``df_22``.
    """
    return df_20.at[id, 'Rank'] - df_22.at[id, 'Rank']

ids_20 = df_20.index
ids_22 = df_22.index

# Vectorised replacement for the per-row Python loop: align the 2020 ranks to
# the 2022 index (games missing from 2020 become NaN) and subtract once.
df_22_rank_change = df_22.assign(
    **{'Ranked change': df_20['Rank'].reindex(ids_22) - df_22['Rank']}
)

# Fixed seed so the displayed sample is the same on every run.
df_22_rank_change.sample(20, random_state=42)


Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated,Ranked change
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7927,Trivial Pursuit: Junior,1992,21268,4.84,5.443,154,-2610.0
91534,Struggle for Catan,2011,3341,6.15,5.862,2546,-416.0
16149,Up And Down,2005,9770,6.24,5.558,152,-1153.0
344408,Full Throttle!,2021,9808,6.91,5.557,73,
208093,Laga Jakarta,2016,11650,7.55,5.539,43,-767.0
279,The Very Clever Pipe Game,1996,4770,6.22,5.718,821,-650.0
7947,Lucky Catch,1997,15802,6.08,5.513,44,-1882.0
169513,Dorasure,2014,9467,6.39,5.562,144,-952.0
86443,Color-A-Do,2010,19501,5.03,5.488,43,-2391.0
5522,UNO Rummy-Up,1993,12618,5.9,5.532,136,-975.0


In [22]:
# Descending sort with NaN last: the head holds the largest positive changes.
biggest_gainers = df_22_rank_change.sort_values(
    by='Ranked change', ascending=False
).head(20)

# Descending sort with NaN first: the tail holds the most negative changes.
biggest_losers = df_22_rank_change.sort_values(
    by='Ranked change', ascending=False, na_position='first'
).tail(20)

print("The games that gained the most ranks were:")
display(biggest_gainers)
print("The games that lost the most ranks were:")
display(biggest_losers)

The games that gained the most ranks were:


Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated,Ranked change
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
273703,Altar Quest,2020,1500,7.92,6.291,972,11349.0
281515,Company of Heroes,2021,2624,8.83,5.984,401,10259.0
295905,Cosmic Frog,2020,2514,7.53,6.005,580,10234.0
314040,Pandemic Legacy: Season 0,2020,82,8.64,7.581,3570,9929.0
267009,Rome & Roll,2020,2969,6.99,5.921,656,9772.0
273330,Bloodborne: The Board Game,2021,941,7.99,6.572,1807,9416.0
308989,Bristol 1350,2021,3134,7.01,5.89,656,9369.0
304285,Infinity Gauntlet: A Love Letter Game,2020,1451,7.13,6.309,1728,9336.0
254708,Roll Player Adventures,2021,2068,8.47,6.114,518,9325.0
281466,Yedo: Deluxe Master Set,2020,1111,8.1,6.481,1060,8967.0


The games that lost the most ranks were:


Unnamed: 0_level_0,Name,Year,Rank,Average,Bayes average,Users rated,Ranked change
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
280281,Sinister Six,2019,17407,5.84,5.504,122,-4429.0
198836,3 Wishes,2016,19058,5.5,5.492,1035,-4449.0
223763,Atari's Centipede,2017,16845,5.79,5.507,143,-4553.0
240096,Moonshiners of the Apocalypse,2019,20607,5.57,5.471,151,-4575.0
180975,Crimson Creek,2016,19516,5.54,5.488,264,-4658.0
42702,Jackal,2009,17974,5.73,5.5,340,-4843.0
125028,Colorpop,2011,16684,5.55,5.508,609,-4877.0
175592,The Village Crone,2015,14931,5.7,5.518,373,-5042.0
287810,"New Phone, Who Dis?",2019,19957,5.29,5.482,78,-5065.0
131199,Soluna,2012,20655,5.28,5.469,200,-5315.0


In [91]:
# Rank changes are hard to interpret without knowing how many games joined or
# left the list between the two snapshots.
# "Removed" = IDs in the 2020 list that are absent from the 2022 list;
# "added"   = IDs in the 2022 list that are absent from the 2020 list.
still_listed_in_22 = df_20.index.isin(df_22.index)
already_listed_in_20 = df_22.index.isin(df_20.index)

df_removed = df_20[~still_listed_in_22]
df_added = df_22[~already_listed_in_20]

removed_no = len(df_removed)
added_no = len(df_added)
print(f"BGG removed {removed_no} games, and added {added_no} games.")

BGG removed 48 games, and added 2549 games.


In [24]:
# Got a DtypeWarning on the first run for columns 29 ("Board Game Rank") and
# 49 ("Children's Game Rank"), so both are read explicitly as strings.
dtypes = {
    "Board Game Rank": str,
    "Children's Game Rank": str,
}

# Load and index by game id in one expression; avoiding inplace=True keeps the
# cell safe to re-run.
df_game_details = pd.read_csv(
    f"{data}/games_detailed_info.csv", dtype=dtypes
).set_index('id')

# Preview goes last: only a cell's final expression is rendered, so the
# original mid-cell head() call was silently discarded.
df_game_details.head()

In [25]:
# List the column labels of the detailed game information table.
df_game_details.columns

Index(['Unnamed: 0', 'type', 'thumbnail', 'image', 'primary', 'alternate',
       'description', 'yearpublished', 'minplayers', 'maxplayers',
       'suggested_num_players', 'suggested_playerage',
       'suggested_language_dependence', 'playingtime', 'minplaytime',
       'maxplaytime', 'minage', 'boardgamecategory', 'boardgamemechanic',
       'boardgamefamily', 'boardgameexpansion', 'boardgameimplementation',
       'boardgamedesigner', 'boardgameartist', 'boardgamepublisher',
       'usersrated', 'average', 'bayesaverage', 'Board Game Rank',
       'Strategy Game Rank', 'Family Game Rank', 'stddev', 'median', 'owned',
       'trading', 'wanting', 'wishing', 'numcomments', 'numweights',
       'averageweight', 'boardgameintegration', 'boardgamecompilation',
       'Party Game Rank', 'Abstract Game Rank', 'Thematic Rank',
       'War Game Rank', 'Customizable Rank', 'Children's Game Rank',
       'RPG Item Rank', 'Accessory Rank', 'Video Game Rank', 'Amiga Rank',
       'Commodore 64

In [26]:
# Inspect the raw 'Board Game Rank' values; they are strings because the
# column was read with dtype=str.
df_game_details['Board Game Rank'].to_numpy()

array(['106', '191', '429', ..., '19461', '17262', '12772'], dtype=object)

# Quick analysis:

What is the average rating of a board game?
What's the highest ranking game of each genre?
What's the