## Board game dataset: one-hot encoding and train/test split

In [50]:
# import packages
import pandas as pd
import os

from sklearn.model_selection import train_test_split

Define helper functions

In [107]:
# helper functions

# one-hot encoding for n most prevalent features
def apply_one_hot(df: pd.DataFrame, column_name: str, n: int) -> pd.DataFrame:
    """One-hot encode the n most frequent items of a comma-separated column.

    Every value in ``column_name`` is cast to ``str`` and split on ``", "``.
    The ``n`` items that occur most often across all rows each become a new
    0/1 indicator column, appended to the right of the existing columns.

    Missing values get no special treatment: after the string cast they are
    counted like any other item (e.g. NaN becomes the item ``"nan"``), so they
    may show up as an indicator column that the caller wants to drop.

    Args:
        df (pd.DataFrame): dataframe with comma separated items. It is not
            modified; the result is built on a copy.
        column_name (str): name of column within the specified dataframe.
        n (int): how many of the most frequently occurring items to encode.
            Ties are broken by first appearance in the column.

    Returns:
        pd.DataFrame: copy of ``df`` with ``column_name`` cast to ``str`` and
        one integer indicator column per encoded item. The index of ``df``
        is preserved.
    """
    # work on a copy so the caller's frame is left untouched
    encoded = df.copy()
    encoded[column_name] = encoded[column_name].astype(str)

    # split every row once and reuse the result for counting and encoding;
    # str() guards against any value the cast above left as a non-string
    row_items = [str(value).split(", ") for value in encoded[column_name]]

    # get counts of each unique occurrence (insertion order = first appearance)
    item_count = {}
    for items in row_items:
        for item in items:
            item_count[item] = item_count.get(item, 0) + 1

    # sort by count, get n most frequently occurring items
    # (sorted() is stable, so equally frequent items keep first-seen order)
    top_items = sorted(item_count, key=item_count.get, reverse=True)[:n]

    # build the whole indicator frame in one go instead of appending row by
    # row; reusing the original index keeps rows aligned in the concat below
    row_sets = [set(items) for items in row_items]
    indicators = pd.DataFrame(
        {item: [int(item in present) for present in row_sets] for item in top_items},
        index=encoded.index,
    )

    # concatenate with original df
    return pd.concat([encoded, indicators], axis=1)


Read data and clean

In [118]:
# build the dataset path relative to the notebook's working directory
infile = os.path.join("data", "bgg_dataset.csv")

# the file is semicolon-delimited rather than comma-delimited
# NOTE(review): the preview further down shows "Rating Average" as 879/861 --
# if the file uses decimal commas, read_csv also needs decimal="," -- confirm
d = pd.read_csv(
    infile,
    sep=";",
)

In [119]:
# apply one-hot encoding to relevant columns

# column name -> number of most frequent items to encode for that column
cols_to_encode = {"Mechanics": 10, "Domains": 6}

for col, n in cols_to_encode.items():
    d = apply_one_hot(d, col, n)  # for n most prevalent items in each column

# finishing details -- missing values are cast to the string "nan" by the
# encoder and can surface as an indicator column named exactly "nan".
# Drop by exact label (all duplicates, no error if absent): a substring regex
# would also remove legitimate columns whose names merely contain "nan".
d = d.drop(columns="nan", errors="ignore")

In [121]:
# preview the first five rows of the encoded frame
d.head(n=5)

Unnamed: 0,ID,Name,Year Published,Min Players,Max Players,Play Time,Min Age,Users Rated,Rating Average,BGG Rank,...,Hexagon Grid,Simulation,Card Drafting,Tile Placement,Modular Board,Wargames,Strategy Games,Family Games,Thematic Games,Abstract Games
0,174430.0,Gloomhaven,2017.0,1,4,120,14,42055,879,1,...,1,0,0,0,1,0,1,0,1,0
1,161936.0,Pandemic Legacy: Season 1,2015.0,2,4,60,13,41643,861,2,...,0,0,0,0,0,0,1,0,1,0
2,224517.0,Brass: Birmingham,2018.0,2,4,120,14,19217,866,3,...,0,0,0,0,0,0,1,0,0,0
3,167791.0,Terraforming Mars,2016.0,1,5,120,12,64864,843,4,...,1,0,1,1,0,0,1,0,0,0
4,233078.0,Twilight Imperium: Fourth Edition,2017.0,3,6,480,14,13468,870,5,...,1,0,0,0,1,0,1,0,1,0


Split cleaned data into separate train and test sets

In [122]:
# hold out 20 percent of the rows as the test set;
# the fixed random_state keeps the split identical across runs
train, test = train_test_split(
    d,
    test_size=0.2,
    random_state=6778,
)