In [1]:
# Testing notebook: take the V1 example pipeline, fit it, and pickle it for inference in the app.

In [2]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Pre-Processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Models
from sklearn.neighbors import NearestNeighbors

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

import pickle

In [3]:
# Destination of the pickled payload that this notebook writes.
PICKLE_PATH = "../app/board_games_pipeline.pkl"

# Columns dropped from the frame right after loading (currently none).
COLS_TO_DROP = []

# Identifier columns; neither is part of FEATURE_COLS.
META_COLS = ["game_id", "name"]

# Every column handed to the pipeline, in the order used for the feature frame.
FEATURE_COLS = [
    "max_players",
    "max_playtime",
    "min_age",
    "min_players",
    "min_playtime",
    "playing_time",
    "category",
    "mechanic",
    "average_rating",
    "users_rated",
    "category_count",
    "mechanic_count",
    "has_expansion",
    "binned_playtime",
    "binned_mechanics",
    "binned_min_age",
    "binned_category",
    "len_description",
    "description_sentiment",
]

# Columns routed to the numeric (impute + scale) branch of the preprocessor.
NUMERIC_FEATURES = [
    "max_players",
    "max_playtime",
    "min_age",
    "min_players",
    "min_playtime",
    "playing_time",
    "average_rating",
    "users_rated",
    "category_count",
    "mechanic_count",
    "len_description",
    "description_sentiment",
]

# Columns routed to the binary (impute + ordinal-encode) branch.
BINARY_FEATURES = ["has_expansion"]

# Columns routed to the categorical (impute + one-hot) branch.
CATEGORICAL_FEATURES = [
    "category",
    "mechanic",
    "binned_playtime",
    "binned_mechanics",
    "binned_min_age",
    "binned_category",
]

## Define Preprocessor & Model 

In [4]:
# Numeric branch: replace missing values with the column mean, then
# standardise with StandardScaler's default settings.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [5]:
# Binary branch: fill pd.NA entries with the most frequent value, then map
# the remaining values to ordinal codes.
binary_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=pd.NA, strategy="most_frequent")),
        ("label", OrdinalEncoder()),
    ]
)

In [6]:
# Categorical branch: fill pd.NA entries with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=pd.NA, strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [7]:
# Route each group of columns to its own transformer; columns not named in
# any group are handled by ColumnTransformer's default `remainder` setting.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, NUMERIC_FEATURES),
        ("binary", binary_transformer, BINARY_FEATURES),
        ("cat", categorical_transformer, CATEGORICAL_FEATURES),
    ]
)

In [8]:
# Full pipeline: preprocessing followed by a cosine-distance nearest-neighbour
# index. n_neighbors is not set, so the scikit-learn default (5) applies.
nn_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", NearestNeighbors(metric="cosine")),
    ]
)

## Read & Process the Data

In [9]:
# Load the cleaned data set, drop incomplete rows and any configured columns,
# then keep only the first row for each case-insensitive game name.
df = (
    pd.read_csv("resources/ml_clean.csv")
    .dropna(how="any")
    .drop(columns=COLS_TO_DROP)
    .assign(name_lower=lambda games: games["name"].str.lower())
    .drop_duplicates(subset=["name_lower"])
    .reset_index(drop=True)  # a single re-index at the end yields the same 0..n-1 index
)
df

Unnamed: 0,game_id,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,mechanic,average_rating,users_rated,category_count,mechanic_count,has_expansion,binned_playtime,binned_mechanics,binned_min_age,binned_category,len_description,description_sentiment,name_lower
0,1,5,240,14,3,240,Die Macher,240,economic,area control / area influence,7.66508,4498,3,5,0,Very Long (121-240 minutes),Strategic Mechanics,Teen (13+),Strategy,222,0.091012,die macher
1,2,4,30,12,3,30,Dragonmaster,30,card game,trick-taking,6.60815,478,2,1,0,Short (0-30 minutes),Card & Pool Mechanics,Pre-Teen (10-12),Unknown/Miscellaneous,154,0.055291,dragonmaster
2,3,4,60,10,2,30,Samurai,60,abstract strategy,area control / area influence,7.44119,12019,2,4,0,Medium (31-60 minutes),Strategic Mechanics,Pre-Teen (10-12),Strategy,183,-0.056250,samurai
3,4,4,60,12,2,60,Tal der Könige,60,ancient,action point allowance system,6.60675,314,1,4,0,Medium (31-60 minutes),Strategic Mechanics,Pre-Teen (10-12),History/War,104,-0.007908,tal der könige
4,5,6,90,12,3,90,Acquire,90,economic,hand management,7.35830,15195,1,3,0,Long (61-120 minutes),Pattern & Design Mechanics,Pre-Teen (10-12),Strategy,191,0.168056,acquire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9354,214996,2,480,12,1,30,"Silver Bayonet: The First Team in Vietnam, 196...",480,vietnam war,hex-and-counter,8.35333,75,2,1,0,Extended (241+ minutes),Unique Mechanics,Pre-Teen (10-12),History/War,186,0.127083,"silver bayonet: the first team in vietnam, 196..."
9355,215437,5,45,13,2,45,Codex: Card-Time Strategy – Core Set,45,card game,deck / pool building,8.08780,82,4,3,1,Medium (31-60 minutes),Game Structure Mechanics,Teen (13+),Unknown/Miscellaneous,99,-0.193750,codex: card-time strategy – core set
9356,215471,4,20,12,2,20,Wind the Film!,20,card game,hand management,7.28016,63,1,2,0,Short (0-30 minutes),Card & Pool Mechanics,Pre-Teen (10-12),Unknown/Miscellaneous,152,0.318452,wind the film!
9357,216201,6,120,12,2,20,Robo Rally (2016),120,miniatures,action / movement programming,7.45871,341,3,4,0,Long (61-120 minutes),Action & Movement Mechanics,Pre-Teen (10-12),Real-life/Simulation,463,0.008621,robo rally (2016)


In [10]:
# Inspect the column labels of the cleaned frame.
df.columns

Index(['game_id', 'max_players', 'max_playtime', 'min_age', 'min_players',
       'min_playtime', 'name', 'playing_time', 'category', 'mechanic',
       'average_rating', 'users_rated', 'category_count', 'mechanic_count',
       'has_expansion', 'binned_playtime', 'binned_mechanics',
       'binned_min_age', 'binned_category', 'len_description',
       'description_sentiment', 'name_lower'],
      dtype='object')

In [11]:
# Feature matrix: every row, restricted to the model input columns.
X = df.loc[:, FEATURE_COLS]

## Fit & Save Model

In [12]:
# Fit the whole pipeline on the feature frame: the preprocessor is fitted first
# and its transformed output is then used to fit the NearestNeighbors step.
nn_pipeline.fit(X)

In [13]:
# Bundle what gets pickled: the feature columns plus each game's name,
# alongside the fitted pipeline.
payload = dict(
    feature_df=df[[*FEATURE_COLS, "name"]],
    pipeline=nn_pipeline,
)

In [14]:
# Write the payload to PICKLE_PATH. The context manager ensures the file
# handle is flushed and closed even if pickling raises; the original bare
# open() call left the handle open.
with open(PICKLE_PATH, 'wb') as pickle_file:
    pickle.dump(payload, pickle_file)