## BGG Data Feature Engineering

### Imports

In [12]:
import pandas as pd
import re

# Display settings: show every column at full cell width, and up to 100 rows
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

from IPython.display import display, Markdown

### Functions

In [14]:
def format_text(words):
    """Lower-case a label string and join multi-word labels with underscores.

    Every whitespace character is replaced by '_' except whitespace that sits
    directly next to a '//' separator, so 'Hand Management // Dice Rolling'
    becomes 'hand_management // dice_rolling'.

    Parameters
    ----------
    words : object
        Scalar value to format. Non-string values are converted with ``str()``.

    Returns
    -------
    str or missing value
        The formatted text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        returned unchanged instead of becoming the literal text 'none'/'nan',
        so that a later ``dropna()`` can still recognise them.
    """
    # str(float('nan')).lower() would give the string 'nan', which is no longer
    # detectable as missing; pass missing values through untouched instead
    if pd.isna(words):
        return words

    lowered = str(words).lower()

    # The lookbehind/lookahead skip whitespace immediately after or before '//'
    return re.sub(r'(?<!//)\s(?!//)', '_', lowered)

def remove_double_slashes(text):
    """Return ``text`` with every ' // ' separator collapsed to a single space."""
    pieces = text.split(' // ')
    return ' '.join(pieces)

### Creating dataframe copy for feature engineering

In [16]:
# Load games_clean.csv into its own DataFrame, fe_df, so that feature engineering is done on a separate in-memory copy of the cleaned data
fe_df = pd.read_csv(filepath_or_buffer='games_clean.csv')

### Formatting the dataframe

In [18]:
# Format every multi-label text column so that multi-word labels stay distinguishable:
# whitespace inside a label becomes an underscore, except next to the '//' separator
for text_column in ('category', 'mechanism', 'game designer', 'publisher'):
    fe_df[text_column] = fe_df[text_column].apply(format_text)

In [19]:
# Inspect the identifier columns next to the formatted text columns to confirm the result
fe_df.loc[:, ['id', 'name', 'category', 'mechanism', 'game designer', 'publisher']]

Unnamed: 0,id,name,category,mechanism,game designer,publisher
0,224517,Brass: Birmingham,economic // post-napoleonic // trains // age_of_reason // transportation // industry_/_manufacturing,loans // network_and_route_building // market // hand_management // tech_trees_/_tech_tracks // turn_order:_stat-based // variable_set-up // tags // income,matt_tolman // gavan_brown // martin_wallace,funforge // boardm_factory // lanlalen // gém_klub_kft. // cmon_global_limited // 盒拍工作室_hepa_studio // rebel_sp._z_o.o. // ghenos_games // lord_of_boards // arclight_games // conclave_editora // dexker_games // maldito_games // tlama_games // phalanx // giant_roc // board_game_rookie // white_goblin_games // roxley // crowd_games
1,161936,Pandemic Legacy: Season 1,medical // environmental,variable_player_powers // cooperative_game // hand_management // action_points // scenario_/_mission_/_campaign_game // legacy_game // point_to_point_movement // set_collection // trading // tags,matt_leacock // rob_daviau,z-man_games // lifestyle_boardgames_ltd // hobby_japan // devir // korea_boardgames // asterion_press // gém_klub_kft. // jolly_thinkers // ігромаг // mindok // filosofia_éditions // lacerta
2,174430,Gloomhaven,fighting // miniatures // adventure // fantasy // exploration,communication_limits // variable_player_powers // cooperative_game // simultaneous_action_selection // critical_hits_and_failures // legacy_game // tags // campaign_/_battle_card_driven // deck_construction // solo_/_solitaire_game // action_queue // narrative_choice_/_paragraph // card_play_conflict_resolution // line_of_sight // action_retrieval // hexagon_grid // role_playing // hand_management // scenario_/_mission_/_campaign_game // multi-use_cards // modular_board // grid_movement // once-per-game_abilities,isaac_childres,"games_warehouse // mybg_co.,_ltd. // arclight_games // albi_polska // feuerland_spiele // korea_boardgames // galápagos_jogos // gém_klub_kft. // hobby_world // cephalofair_games // albi"
3,342942,Ark Nova,economic // animals // environmental,open_drafting // variable_player_powers // variable_set-up // hand_management // increase_value_of_unchosen_resources // grid_coverage // end_game_bonuses // solo_/_solitaire_game // race // hexagon_grid // tile_placement // set_collection // action_queue // tags // income,mathias_wigge,capstone_games // テンデイズゲームズ(tendaysgames) // gém_klub_kft. // mindok // grok_games // cmon_global_limited // mipl // game_harbor // korea_boardgames // tower_tactic_games // cranio_creations // ludofy_creative // maldito_games // super_meeple // white_goblin_games // feuerland_spiele // portal_games // regatul_jocurilor // igames // lautapelit.fi
4,233078,Twilight Imperium: Fourth Edition,economic // space_exploration // political // wargame // negotiation // civilization // exploration // science_fiction,movement_points // variable_player_powers // action_drafting // follow // increase_value_of_unchosen_resources // tech_trees_/_tech_tracks // race // voting // hexagon_grid // modular_board // area-impulse // variable_phase_order // king_of_the_hill // grid_movement // dice_rolling // trading // variable_set-up,christian_t._petersen // corey_konieczka // dane_beltrami,sternenschimmermeer // galakta // arclight_games // geekach_games // edge_entertainment // asterion_press // galápagos_jogos // playfun_games // hobby_world // adc_blackfire_entertainment // fantasy_flight_games
...,...,...,...,...,...,...
4826,387144,R-ECO+,environmental // card_game // bluffing,open_drafting // hand_management,susumu_kawasaki,hobby_japan
4827,391288,Firefly: The Game – 10th Anniversary Collector's Edition,travel // space_exploration // adventure // movies_/_tv_/_radio_theme // science_fiction,variable_player_powers // pick-up_and_deliver // area_movement // solo_/_solitaire_game // take_that // dice_rolling // open_drafting // trading,joe_kepler // george_krubski // sean_sweigart // aaron_dill,"gale_force_nine,_llc"
4828,405752,Project L: Collector's Edition,abstract_strategy // video_game_theme // puzzle,open_drafting // action_points // pattern_building // tile_placement,michal_mikeš // adam_španěl // jan_soukal // filip_daňhel,boardcubator
4829,406767,Hegemony: Lead Your Class to Victory – Extended Edition,economic // educational // negotiation // political,"variable_player_powers // hand_management // action/event // simulation // roles_with_asymmetric_information // voting // worker_placement,_different_worker_types",varnavas_timotheou // vangelis_bagiartakis,giant_roc // boardm_factory // fox_in_the_box // galápagos_jogos // bumble3ee_interactive // hegemonic_project_games // ігромаг // don't_panic_games // mindok


### Creating bag of words

In [21]:
# Build a 'bag of words' for each game by joining its category, mechanism,
# game designer and publisher labels into one string (missing values are skipped)
fe_df['bag of words'] = fe_df[['category','mechanism','game designer','publisher']].apply(
    lambda row: ' '.join(row.dropna().astype(str)),
    axis=1
)

# Replace the ' // ' separator with a single space in every string cell of the
# DataFrame using DataFrame.map. The new 'bag of words' column is already part
# of fe_df here, so it is cleaned by this same pass and needs no separate one.
fe_df = fe_df.map(lambda cell: remove_double_slashes(cell) if isinstance(cell, str) else cell)

# Checking the new column 'bag of words' alongside the columns it was built from
fe_df[['id','name','category','mechanism','game designer','publisher','bag of words']].head(5)

Unnamed: 0,id,name,category,mechanism,game designer,publisher,bag of words
0,224517,Brass: Birmingham,economic post-napoleonic trains age_of_reason transportation industry_/_manufacturing,loans network_and_route_building market hand_management tech_trees_/_tech_tracks turn_order:_stat-based variable_set-up tags income,matt_tolman gavan_brown martin_wallace,funforge boardm_factory lanlalen gém_klub_kft. cmon_global_limited 盒拍工作室_hepa_studio rebel_sp._z_o.o. ghenos_games lord_of_boards arclight_games conclave_editora dexker_games maldito_games tlama_games phalanx giant_roc board_game_rookie white_goblin_games roxley crowd_games,economic post-napoleonic trains age_of_reason transportation industry_/_manufacturing loans network_and_route_building market hand_management tech_trees_/_tech_tracks turn_order:_stat-based variable_set-up tags income matt_tolman gavan_brown martin_wallace funforge boardm_factory lanlalen gém_klub_kft. cmon_global_limited 盒拍工作室_hepa_studio rebel_sp._z_o.o. ghenos_games lord_of_boards arclight_games conclave_editora dexker_games maldito_games tlama_games phalanx giant_roc board_game_rookie white_goblin_games roxley crowd_games
1,161936,Pandemic Legacy: Season 1,medical environmental,variable_player_powers cooperative_game hand_management action_points scenario_/_mission_/_campaign_game legacy_game point_to_point_movement set_collection trading tags,matt_leacock rob_daviau,z-man_games lifestyle_boardgames_ltd hobby_japan devir korea_boardgames asterion_press gém_klub_kft. jolly_thinkers ігромаг mindok filosofia_éditions lacerta,medical environmental variable_player_powers cooperative_game hand_management action_points scenario_/_mission_/_campaign_game legacy_game point_to_point_movement set_collection trading tags matt_leacock rob_daviau z-man_games lifestyle_boardgames_ltd hobby_japan devir korea_boardgames asterion_press gém_klub_kft. jolly_thinkers ігромаг mindok filosofia_éditions lacerta
2,174430,Gloomhaven,fighting miniatures adventure fantasy exploration,communication_limits variable_player_powers cooperative_game simultaneous_action_selection critical_hits_and_failures legacy_game tags campaign_/_battle_card_driven deck_construction solo_/_solitaire_game action_queue narrative_choice_/_paragraph card_play_conflict_resolution line_of_sight action_retrieval hexagon_grid role_playing hand_management scenario_/_mission_/_campaign_game multi-use_cards modular_board grid_movement once-per-game_abilities,isaac_childres,"games_warehouse mybg_co.,_ltd. arclight_games albi_polska feuerland_spiele korea_boardgames galápagos_jogos gém_klub_kft. hobby_world cephalofair_games albi","fighting miniatures adventure fantasy exploration communication_limits variable_player_powers cooperative_game simultaneous_action_selection critical_hits_and_failures legacy_game tags campaign_/_battle_card_driven deck_construction solo_/_solitaire_game action_queue narrative_choice_/_paragraph card_play_conflict_resolution line_of_sight action_retrieval hexagon_grid role_playing hand_management scenario_/_mission_/_campaign_game multi-use_cards modular_board grid_movement once-per-game_abilities isaac_childres games_warehouse mybg_co.,_ltd. arclight_games albi_polska feuerland_spiele korea_boardgames galápagos_jogos gém_klub_kft. hobby_world cephalofair_games albi"
3,342942,Ark Nova,economic animals environmental,open_drafting variable_player_powers variable_set-up hand_management increase_value_of_unchosen_resources grid_coverage end_game_bonuses solo_/_solitaire_game race hexagon_grid tile_placement set_collection action_queue tags income,mathias_wigge,capstone_games テンデイズゲームズ(tendaysgames) gém_klub_kft. mindok grok_games cmon_global_limited mipl game_harbor korea_boardgames tower_tactic_games cranio_creations ludofy_creative maldito_games super_meeple white_goblin_games feuerland_spiele portal_games regatul_jocurilor igames lautapelit.fi,economic animals environmental open_drafting variable_player_powers variable_set-up hand_management increase_value_of_unchosen_resources grid_coverage end_game_bonuses solo_/_solitaire_game race hexagon_grid tile_placement set_collection action_queue tags income mathias_wigge capstone_games テンデイズゲームズ(tendaysgames) gém_klub_kft. mindok grok_games cmon_global_limited mipl game_harbor korea_boardgames tower_tactic_games cranio_creations ludofy_creative maldito_games super_meeple white_goblin_games feuerland_spiele portal_games regatul_jocurilor igames lautapelit.fi
4,233078,Twilight Imperium: Fourth Edition,economic space_exploration political wargame negotiation civilization exploration science_fiction,movement_points variable_player_powers action_drafting follow increase_value_of_unchosen_resources tech_trees_/_tech_tracks race voting hexagon_grid modular_board area-impulse variable_phase_order king_of_the_hill grid_movement dice_rolling trading variable_set-up,christian_t._petersen corey_konieczka dane_beltrami,sternenschimmermeer galakta arclight_games geekach_games edge_entertainment asterion_press galápagos_jogos playfun_games hobby_world adc_blackfire_entertainment fantasy_flight_games,economic space_exploration political wargame negotiation civilization exploration science_fiction movement_points variable_player_powers action_drafting follow increase_value_of_unchosen_resources tech_trees_/_tech_tracks race voting hexagon_grid modular_board area-impulse variable_phase_order king_of_the_hill grid_movement dice_rolling trading variable_set-up christian_t._petersen corey_konieczka dane_beltrami sternenschimmermeer galakta arclight_games geekach_games edge_entertainment asterion_press galápagos_jogos playfun_games hobby_world adc_blackfire_entertainment fantasy_flight_games


### Saving feature engineered dataframe

In [23]:
# Write the feature-engineered DataFrame to games_fe.csv without the index column
fe_df.to_csv(path_or_buf='games_fe.csv', index=False)