In [20]:
import json

import numpy as np
import pandas as pd

## Import the data and look at it

In [1]:
# Load the raw game records. The encoding is passed explicitly because the data
# contains non-ASCII names (e.g. 'Vlaada Chvátil') and JSON is expected to be
# UTF-8; relying on the platform default encoding can mis-decode such text.
with open('data/games.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [2]:
game = data[0]

In [3]:
game

{'id': 174430,
 'name': 'Gloomhaven',
 'stats': {'usersrated': 34855,
  'average': 8.8311,
  'bayesaverage': 8.57594,
  'stddev': 1.60889,
  'median': 0.0,
  'owned': 56031,
  'trading': 347,
  'wanting': 1417,
  'wishing': 14655,
  'numcomments': 6553,
  'numweights': 1497,
  'averageweight': 3.827,
  'ranks': [{'id': '1',
    'name': 'boardgame',
    'friendlyname': 'Board Game Rank',
    'value': 1},
   {'id': '5496',
    'name': 'thematic',
    'friendlyname': 'Thematic Rank',
    'value': 1},
   {'id': '5497',
    'name': 'strategygames',
    'friendlyname': 'Strategy Game Rank',
    'value': 1}]},
 'image': 'https://cf.geekdo-images.com/original/img/lDN358RgcYvQfYYN6Oy2TXpifyM=/0x0/pic2437871.jpg',
 'thumbnail': 'https://cf.geekdo-images.com/thumb/img/e7GyV4PaNtwmalU-EQAGecwoBSI=/fit-in/200x150/pic2437871.jpg',
 'artists': ['Alexandr Elichev', 'Josh T. McDowell', 'Alvaro Nebot'],
 'designers': ['Isaac Childres'],
 'year': 2017,
 'description': 'Gloomhaven  is a game of Euro-inspi

In [4]:
game.keys()

dict_keys(['id', 'name', 'stats', 'image', 'thumbnail', 'artists', 'designers', 'year', 'description', 'categories', 'mechanics', 'min_age', 'publishers'])

In [5]:
type(game)

dict

## Count the number of unique values for mechanics, categories, designers, publishers

In [6]:
# One accumulator per list-valued field; each set will collect the distinct
# values of that field across all games.
categories_set, designers_set, publishers_set, mechanics_set = (
    set() for _ in range(4)
)

In [7]:
# Gather every distinct value of each list-valued field across all games.
# The loop variable is deliberately not named `game`, so the `game = data[0]`
# binding made earlier in the notebook is not overwritten by this loop.
for record in data:
    categories_set.update(record['categories'])
    mechanics_set.update(record['mechanics'])
    designers_set.update(record['designers'])
    publishers_set.update(record['publishers'])

In [8]:
len(categories_set)

83

In [9]:
len(mechanics_set)

182

In [10]:
len(publishers_set)

6070

In [11]:
len(designers_set)

8097

Way too many distinct designers and publishers - these fields will most likely be kept for reference only (FYI) and not used as model features

## Check for missing values

In [12]:
# Sanity check: an empty list is falsy, so truthiness can be used to detect
# a game whose list of values is empty.
lst = list()
if bool(lst) is False:
    print('list evaluates to false')

list evaluates to false


In [13]:
# Count, per field, the games whose list of values is empty (an empty list is
# falsy). Generator expressions are used rather than a `for game in data` loop
# so that the notebook-level `game` variable is not rebound as a side effect.
categories_missing = sum(1 for record in data if not record['categories'])
mechanics_missing = sum(1 for record in data if not record['mechanics'])
publishers_missing = sum(1 for record in data if not record['publishers'])
designers_missing = sum(1 for record in data if not record['designers'])

In [14]:
mechanics_missing

1448

In [15]:
categories_missing

166

In [16]:
publishers_missing

0

In [17]:
designers_missing

331

## Format Data

Overall, the data goes through 5 steps (numbered 0-4 below) in order for a given list-valued feature to be properly one-hot encoded; categories are used as the example.
0. The dictionary data is converted to Pandas DataFrame
1. Each game is expanded out to multiple rows, such that each row contains just 1 category from the game's categories
2. Then the data gets one-hot encoded via pd.get_dummies()
3. We take only the one-hot encoded columns plus the game id as the unique identifier, group by game id and sum the rows within each group, which brings all the categories for a given game onto just 1 row
4. The aggregated one-hot encoded data is joined back with the original dataset

In [21]:
# Step 0: turn the list of game dictionaries into a DataFrame with one row per game.
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,mechanics,min_age,publishers
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...","[Action Retrieval, Campaign / Battle Card Driv...",12,"[Cephalofair Games, Albi, Asmodee, Feuerland S..."
1,161936,Pandemic Legacy: Season 1,"{'usersrated': 36949, 'average': 8.62291, 'bay...",https://cf.geekdo-images.com/original/img/P_Sw...,https://cf.geekdo-images.com/thumb/img/WI5NmPd...,[Chris Quilliams],"[Rob Daviau, Matt Leacock]",2015,Pandemic Legacy is a co-operative campaign gam...,"[Environmental, Medical]","[Action Points, Cooperative Game, Hand Managem...",13,"[Z-Man Games, Inc., Asterion Press, Devir, Fil..."
2,182028,Through the Ages: A New Story of Civilization,"{'usersrated': 19929, 'average': 8.46877, 'bay...",https://cf.geekdo-images.com/original/img/1d2h...,https://cf.geekdo-images.com/thumb/img/Ohqc2KT...,"[Filip Murmak, Radim Pech, Jakub Politzer, Mil...",[Vlaada Chvátil],2015,Through the Ages: A New Story of Civilization ...,"[Card Game, Civilization, Economic]","[Action Points, Auction/Bidding, Auction: Dutc...",14,"[Czech Games Edition, Cranio Creations, Devir,..."


In [22]:
df.shape

(17311, 13)

In [23]:
# Step 1: expand each game into one row per category.
# Approach from:
# https://stackoverflow.com/questions/27263805/pandas-column-of-lists-create-a-row-for-each-list-element
lst_col = 'categories'

# Number of categories per game == number of times each of its other values is repeated.
# A game with an empty list gets a repeat count of 0 and therefore produces no rows.
category_counts = df[lst_col].str.len()

repeated_columns = {}
for col in df.columns.drop(lst_col):
    repeated_columns[col] = np.repeat(df[col].values, category_counts)

categories_split = pd.DataFrame(repeated_columns)
# Flatten all the per-game lists into a single column, aligned with the repeated rows.
categories_split[lst_col] = np.concatenate(df[lst_col].values)
# Restore the original column order.
categories_split = categories_split[df.columns]

In [24]:
categories_split.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,mechanics,min_age,publishers
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,Adventure,"[Action Retrieval, Campaign / Battle Card Driv...",12,"[Cephalofair Games, Albi, Asmodee, Feuerland S..."
1,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,Exploration,"[Action Retrieval, Campaign / Battle Card Driv...",12,"[Cephalofair Games, Albi, Asmodee, Feuerland S..."
2,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,Fantasy,"[Action Retrieval, Campaign / Battle Card Driv...",12,"[Cephalofair Games, Albi, Asmodee, Feuerland S..."


In [25]:
categories_split.shape

(45171, 13)

In [26]:
# Step 2: replace the single 'categories' column with one indicator column per
# distinct category, named 'categories_<value>'; the other columns are kept.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
categories_one_hot = pd.get_dummies(categories_split, columns=['categories'])
categories_one_hot.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,mechanics,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Action Retrieval, Campaign / Battle Card Driv...",...,0,0,0,0,0,0,0,0,0,0
1,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Action Retrieval, Campaign / Battle Card Driv...",...,0,0,0,0,0,0,0,0,0,0
2,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Action Retrieval, Campaign / Battle Card Driv...",...,0,0,0,0,0,0,0,0,0,0


In [27]:
categories_one_hot.shape

(45171, 95)

In [28]:
categories_one_hot.columns[:20]

Index(['id', 'name', 'stats', 'image', 'thumbnail', 'artists', 'designers',
       'year', 'description', 'mechanics', 'min_age', 'publishers',
       'categories_Abstract Strategy', 'categories_Action / Dexterity',
       'categories_Adventure', 'categories_Age of Reason',
       'categories_American Civil War', 'categories_American Indian Wars',
       'categories_American Revolutionary War', 'categories_American West'],
      dtype='object')

In [29]:
categories_one_hot.columns[12:15]

Index(['categories_Abstract Strategy', 'categories_Action / Dexterity',
       'categories_Adventure'],
      dtype='object')

In [95]:
# Select the one-hot columns by their 'categories_' prefix instead of by a
# hard-coded position, so the selection stays correct if the number or order
# of the other columns changes.
id_and_categories_columns = ['id'] + [
    column for column in categories_one_hot.columns
    if column.startswith('categories_')
]

In [98]:
categories_one_hot[id_and_categories_columns].head(4)

Unnamed: 0,id,categories_Abstract Strategy,categories_Action / Dexterity,categories_Adventure,categories_Age of Reason,categories_American Civil War,categories_American Indian Wars,categories_American Revolutionary War,categories_American West,categories_Ancient,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
0,174430,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# Step 3: bring each game back onto a single row by summing its one-hot rows,
# grouped by game id ('id' stays a regular column because as_index=False).
categories_by_game = categories_one_hot[id_and_categories_columns].groupby('id', as_index=False)
categories_collapsed = categories_by_game.sum()
categories_collapsed

Unnamed: 0,id,categories_Abstract Strategy,categories_Action / Dexterity,categories_Adventure,categories_Age of Reason,categories_American Civil War,categories_American Indian Wars,categories_American Revolutionary War,categories_American West,categories_Ancient,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17140,276894,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17141,278553,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17142,278751,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17143,279644,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [115]:
# Confirmed that only the 5 relevant categories for Gloomhaven have a 1; the rest are 0
categories_collapsed[categories_collapsed.id==174430]

Unnamed: 0,id,categories_Abstract Strategy,categories_Action / Dexterity,categories_Adventure,categories_Age of Reason,categories_American Civil War,categories_American Indian Wars,categories_American Revolutionary War,categories_American West,categories_Ancient,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
13403,174430,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [116]:
categories_collapsed.shape

(17145, 84)

In [117]:
# Confirming that the rows we lost are only for games that don't have any categories 
categories_collapsed.shape[0] + categories_missing == df.shape[0]

True

## Time to join the original data with the one hot encoded categories

In [134]:
df.shape

(17311, 13)

In [120]:
df.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,mechanics,min_age,publishers
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...","[Action Retrieval, Campaign / Battle Card Driv...",12,"[Cephalofair Games, Albi, Asmodee, Feuerland S..."
1,161936,Pandemic Legacy: Season 1,"{'usersrated': 36949, 'average': 8.62291, 'bay...",https://cf.geekdo-images.com/original/img/P_Sw...,https://cf.geekdo-images.com/thumb/img/WI5NmPd...,[Chris Quilliams],"[Rob Daviau, Matt Leacock]",2015,Pandemic Legacy is a co-operative campaign gam...,"[Environmental, Medical]","[Action Points, Cooperative Game, Hand Managem...",13,"[Z-Man Games, Inc., Asterion Press, Devir, Fil..."
2,182028,Through the Ages: A New Story of Civilization,"{'usersrated': 19929, 'average': 8.46877, 'bay...",https://cf.geekdo-images.com/original/img/1d2h...,https://cf.geekdo-images.com/thumb/img/Ohqc2KT...,"[Filip Murmak, Radim Pech, Jakub Politzer, Mil...",[Vlaada Chvátil],2015,Through the Ages: A New Story of Civilization ...,"[Card Game, Civilization, Economic]","[Action Points, Auction/Bidding, Auction: Dutc...",14,"[Czech Games Edition, Cranio Creations, Devir,..."


In [130]:
# Checking which columns overlap between the two
set(categories_collapsed.columns).intersection(set(df.columns))

{'id'}

In [144]:
len(set(df['id']).intersection(set(categories_collapsed['id'])))
# set(categories_collapsed['id'])

17145

In [149]:
# Step 4: attach the one-hot category columns to the original data.
# This is an inner join on 'id', so games that are absent from
# categories_collapsed (those with no categories) are dropped here.
df_with_categories = df.merge(categories_collapsed, how='inner', on='id')

In [151]:
df_with_categories[df_with_categories['id']==174430]

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0


In [150]:
# We haven't lost any rows! (except the ones with no categories, which is ok)
df_with_categories.shape

(17145, 96)

## Do the same for mechanics!

In [152]:
# Expand each game into one row per mechanic (same technique as for categories).
# Approach from:
# https://stackoverflow.com/questions/27263805/pandas-column-of-lists-create-a-row-for-each-list-element
lst_col = 'mechanics'

# Number of mechanics per game == number of times each of its other values is repeated.
# A game with an empty list gets a repeat count of 0 and therefore produces no rows.
mechanic_counts = df_with_categories[lst_col].str.len()

repeated_columns = {}
for col in df_with_categories.columns.drop(lst_col):
    repeated_columns[col] = np.repeat(df_with_categories[col].values, mechanic_counts)

mechanics_split = pd.DataFrame(repeated_columns)
# Flatten all the per-game lists into a single column, aligned with the repeated rows.
mechanics_split[lst_col] = np.concatenate(df_with_categories[lst_col].values)
# Restore the original column order.
mechanics_split = mechanics_split[df_with_categories.columns]

In [156]:
mechanics_split.shape

(43011, 96)

In [155]:
mechanics_split.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0
1,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0
2,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0


In [157]:
# Replace the single 'mechanics' column with one indicator column per distinct
# mechanic, named 'mechanics_<value>'; the other columns are kept.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
mechanics_one_hot = pd.get_dummies(mechanics_split, columns=['mechanics'])
mechanics_one_hot.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0
1,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0
2,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,0,0,0,0,0,0,0,0


In [159]:
mechanics_one_hot.shape

(43011, 277)

In [164]:
mechanics_one_hot.columns[95:]

Index(['mechanics_Acting', 'mechanics_Action Drafting',
       'mechanics_Action Points', 'mechanics_Action Queue',
       'mechanics_Action Retrieval', 'mechanics_Action Timer',
       'mechanics_Action/Event', 'mechanics_Advantage Token',
       'mechanics_Alliances', 'mechanics_Area Majority / Influence',
       ...
       'mechanics_Turn Order: Stat-Based', 'mechanics_Variable Phase Order',
       'mechanics_Variable Player Powers', 'mechanics_Variable Setup',
       'mechanics_Victory Points as a Resource', 'mechanics_Voting',
       'mechanics_Worker Placement',
       'mechanics_Worker Placement with Dice Workers',
       'mechanics_Worker Placement, Different Worker Types',
       'mechanics_Zone of Control'],
      dtype='object', length=182)

In [168]:
# Taking only the mechanics columns & the id column.
# The one-hot columns are selected by their 'mechanics_' prefix rather than by a
# hard-coded position, so the selection stays correct if the number or order of
# the preceding columns changes.
id_and_mechanics_columns = ['id'] + [
    column for column in mechanics_one_hot.columns
    if column.startswith('mechanics_')
]
mechanics_one_hot[id_and_mechanics_columns].head()

Unnamed: 0,id,mechanics_Acting,mechanics_Action Drafting,mechanics_Action Points,mechanics_Action Queue,mechanics_Action Retrieval,mechanics_Action Timer,mechanics_Action/Event,mechanics_Advantage Token,mechanics_Alliances,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,174430,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,174430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
# Collapse the one-hot encoded mechanics back to one row per game by summing
# each game's rows, grouped by game id ('id' stays a regular column).
mechanics_by_game = mechanics_one_hot[id_and_mechanics_columns].groupby('id', as_index=False)
mechanics_collapsed = mechanics_by_game.sum()
mechanics_collapsed.head()

Unnamed: 0,id,mechanics_Acting,mechanics_Action Drafting,mechanics_Action Points,mechanics_Action Queue,mechanics_Action Retrieval,mechanics_Action Timer,mechanics_Action/Event,mechanics_Advantage Token,mechanics_Alliances,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [170]:
mechanics_collapsed.shape

(15727, 183)

In [172]:
df_with_categories.shape

(17145, 96)

In [187]:
# Recount the games with missing mechanics within df_with_categories, because
# some of them may already have been removed when the games with missing
# categories were dropped.
mechanics_missing_new = sum(
    game_mechanics == [] for game_mechanics in df_with_categories['mechanics']
)

In [188]:
# Confirming that the rows we lost are only for games that don't have any mechanics! Yes!
mechanics_collapsed.shape[0] + mechanics_missing_new == df_with_categories.shape[0] 

True

In [189]:
# Finally, merge mechanics columns with df_with_categories.
# This is an inner join on 'id', so games that are absent from
# mechanics_collapsed (those with no mechanics) are dropped here.
df_with_categories_mechanics = df_with_categories.merge(mechanics_collapsed, how='inner', on='id')
df_with_categories_mechanics.head(3)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,1,0,0,0,0,0,0,0
1,161936,Pandemic Legacy: Season 1,"{'usersrated': 36949, 'average': 8.62291, 'bay...",https://cf.geekdo-images.com/original/img/P_Sw...,https://cf.geekdo-images.com/thumb/img/WI5NmPd...,[Chris Quilliams],"[Rob Daviau, Matt Leacock]",2015,Pandemic Legacy is a co-operative campaign gam...,"[Environmental, Medical]",...,0,0,1,0,0,0,0,0,0,0
2,182028,Through the Ages: A New Story of Civilization,"{'usersrated': 19929, 'average': 8.46877, 'bay...",https://cf.geekdo-images.com/original/img/1d2h...,https://cf.geekdo-images.com/thumb/img/Ohqc2KT...,"[Filip Murmak, Radim Pech, Jakub Politzer, Mil...",[Vlaada Chvátil],2015,Through the Ages: A New Story of Civilization ...,"[Card Game, Civilization, Economic]",...,0,0,0,0,0,0,0,0,0,0


In [190]:
df_with_categories_mechanics[df_with_categories_mechanics['id']==174430]

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,1,0,0,0,0,0,0,0


In [195]:
# We haven't lost any rows! (except the ones with no mechanics, which is ok)
df_with_categories_mechanics.shape[0] == min(df_with_categories.shape[0], mechanics_collapsed.shape[0])

True

## Checking that all the proper mechanics are here

In [209]:
# Pull out the Gloomhaven row (id 174430) and keep only its one-hot mechanics columns.
gloomhaven = df_with_categories_mechanics[df_with_categories_mechanics['id']==174430]
# Select by the 'mechanics_' prefix instead of a hard-coded column offset; the
# trailing underscore excludes the original list-valued 'mechanics' column.
gloomhaven_mechanics_columns = [
    column for column in gloomhaven.columns
    if column.startswith('mechanics_')
]
gloomhaven_mechanics = gloomhaven[gloomhaven_mechanics_columns]
gloomhaven_mechanics

Unnamed: 0,mechanics_Acting,mechanics_Action Drafting,mechanics_Action Points,mechanics_Action Queue,mechanics_Action Retrieval,mechanics_Action Timer,mechanics_Action/Event,mechanics_Advantage Token,mechanics_Alliances,mechanics_Area Majority / Influence,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [218]:
sum(list(gloomhaven_mechanics.values[0]))

15

In [221]:
# Gloomhaven is the first record, data[0]. Index `data` directly rather than
# relying on the `game` variable, whose value depends on which cells have been run.
len(data[0]['mechanics'])

15

## Saving data

In [223]:
df_with_categories_mechanics.head(2)

Unnamed: 0,id,name,stats,image,thumbnail,artists,designers,year,description,categories,...,mechanics_Turn Order: Stat-Based,mechanics_Variable Phase Order,mechanics_Variable Player Powers,mechanics_Variable Setup,mechanics_Victory Points as a Resource,mechanics_Voting,mechanics_Worker Placement,mechanics_Worker Placement with Dice Workers,"mechanics_Worker Placement, Different Worker Types",mechanics_Zone of Control
0,174430,Gloomhaven,"{'usersrated': 34855, 'average': 8.8311, 'baye...",https://cf.geekdo-images.com/original/img/lDN3...,https://cf.geekdo-images.com/thumb/img/e7GyV4P...,"[Alexandr Elichev, Josh T. McDowell, Alvaro Ne...",[Isaac Childres],2017,Gloomhaven is a game of Euro-inspired tactica...,"[Adventure, Exploration, Fantasy, Fighting, Mi...",...,0,0,1,0,0,0,0,0,0,0
1,161936,Pandemic Legacy: Season 1,"{'usersrated': 36949, 'average': 8.62291, 'bay...",https://cf.geekdo-images.com/original/img/P_Sw...,https://cf.geekdo-images.com/thumb/img/WI5NmPd...,[Chris Quilliams],"[Rob Daviau, Matt Leacock]",2015,Pandemic Legacy is a co-operative campaign gam...,"[Environmental, Medical]",...,0,0,1,0,0,0,0,0,0,0


In [229]:
df_dict = df_with_categories_mechanics.to_dict(orient='records')

In [231]:
# Write the list of per-game records (df_dict) to disk as JSON.
with open('games_one_hot.json', 'w') as fp:
    json.dump(df_dict, fp)

## Appendix

In [121]:
# categories_one_hot.groupby(by=['id', 'year'],as_index=False).sum()

In [122]:
# categories_one_hot.groupby(by=list(categories_one_hot.columns[:12]),as_index=False).sum()

In [123]:
# r.pivot(columns = 'categories')

In [22]:
# Inspect the nested 'stats' dictionary of the first record (Gloomhaven).
# `data[0]` is used directly rather than the `game` variable, whose value
# depends on which cells have been run.
data[0]['stats']

{'usersrated': 34855,
 'average': 8.8311,
 'bayesaverage': 8.57594,
 'stddev': 1.60889,
 'median': 0.0,
 'owned': 56031,
 'trading': 347,
 'wanting': 1417,
 'wishing': 14655,
 'numcomments': 6553,
 'numweights': 1497,
 'averageweight': 3.827,
 'ranks': [{'id': '1',
   'name': 'boardgame',
   'friendlyname': 'Board Game Rank',
   'value': 1},
  {'id': '5496',
   'name': 'thematic',
   'friendlyname': 'Thematic Rank',
   'value': 1},
  {'id': '5497',
   'name': 'strategygames',
   'friendlyname': 'Strategy Game Rank',
   'value': 1}]}