In [1]:
# Testing notebook: adapt the V1 example so it can be used for inference in the app.

In [2]:
# Import the required modules
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
import numpy as np

# Pre-Processing
# Scaling/encoding transformers, plus the Pipeline/ColumnTransformer wrappers
# that apply a different transformer to each group of columns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Models
# Unsupervised nearest-neighbour search used to find similar games
from sklearn.neighbors import NearestNeighbors

# suppress warnings
# NOTE: this silences *every* warning for the whole kernel session, including
# pandas/scikit-learn warnings that can point at real problems in later cells.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def make_recommendation(gamelist_length, game):
    """Recommend the games most similar to a named game.

    Loads ``resources/ml_clean.csv``, drops every row that has a missing
    value, preprocesses the feature columns (standardised numerics,
    ordinal-encoded ``has_expansion``, one-hot encoded ``category`` and
    ``mechanic``) and queries a cosine-distance nearest-neighbours model with
    the feature row of the target game.

    Parameters
    ----------
    gamelist_length : int
        Number of games to return. Capped at the number of complete rows in
        the dataset, because the neighbour search cannot return more
        neighbours than there are samples.
    game : str
        Exact value of the ``name`` column that identifies the target game.

    Returns
    -------
    list of dict
        One record per recommended game, holding every dataset column plus a
        ``distance`` key, ordered from smallest to largest cosine distance.
        The target game is part of the fitted data, so it normally appears
        first with a distance of ~0.

    Raises
    ------
    IndexError
        If no complete row in the dataset has ``name == game``.
    """
    # Step 1: Load the dataset and keep only complete rows
    # NOTE(review): the returned records contain an "Unnamed: 0" key, which
    # suggests the CSV was written with its index; left as-is so the returned
    # keys do not change for existing callers.
    df = pd.read_csv("resources/ml_clean.csv")
    df = df.dropna(how="any").reset_index(drop=True)

    # Step 2: Resolve the target game before doing any model work, so an
    # unknown name fails fast with a message that says what was not found
    matching_ids = df.loc[df["name"] == game, "game_id"]
    if matching_ids.empty:
        raise IndexError(f"Game {game!r} was not found in the dataset")
    game_id = matching_ids.iloc[0]

    # Step 3: Define the feature columns, grouped by how they are preprocessed
    numeric_features = ['max_players', 'max_playtime', 'min_age', 'min_players',
                        'min_playtime', 'playing_time',
                        'average_rating', 'users_rated', 'category_count', 'mechanic_count',
                        'len_description', 'description_sentiment']
    binary_features = ['has_expansion']
    categorical_features = ['category', 'mechanic']
    feature_cols = ['max_players', 'max_playtime', 'min_age', 'min_players',
                    'min_playtime', 'playing_time', 'category', 'mechanic',
                    'average_rating', 'users_rated', 'category_count', 'mechanic_count',
                    'has_expansion', 'len_description', 'description_sentiment']

    # Step 4: Define preprocessing steps for the data

    # Numeric features: fill missing values with the mean, then standardize
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # Binary features: fill missing values with the most frequent value, then ordinal-encode
    binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
        ('label', OrdinalEncoder())])

    # Categorical features: fill missing values with the most frequent value, then one-hot encode
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine all preprocessing steps into a single preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('binary', binary_transformer, binary_features),
            ('cat', categorical_transformer, categorical_features)])

    # Step 5: Build the preprocessed feature matrix for the Nearest Neighbors model
    X = df.loc[:, feature_cols]
    X_preprocessed = preprocessor.fit_transform(X)

    # Step 6: Fit the Nearest Neighbors model with cosine distance.
    # kneighbors() raises if asked for more neighbours than fitted samples,
    # so the requested list length is capped at the dataset size.
    k = min(gamelist_length, len(df))
    model1 = NearestNeighbors(n_neighbors=k, metric="cosine")
    model1.fit(X_preprocessed)

    # Step 7: Extract and preprocess the features of the target game
    game_features = df.loc[df.game_id == game_id, feature_cols]
    game_features_preprocessed = preprocessor.transform(game_features)

    # Step 8: Find the nearest neighbors (games most similar to the target game)
    distances, indices = model1.kneighbors(game_features_preprocessed)

    # Step 9: Retrieve the recommended games. Work on an explicit copy so that
    # adding the distance column never writes into a slice of `df`.
    games = df.iloc[indices[0]].copy()
    games["distance"] = distances[0]
    games = games.sort_values(by="distance")  # most similar first

    # Step 10: Return the recommended games as a list of dictionaries
    return games.to_dict(orient="records")

In [4]:
# Inputs the app would collect from the user
gamelist_length, game = 10, "Catan"

# Same call the Flask app makes; the result is a list of per-game record dicts
response = make_recommendation(gamelist_length=gamelist_length, game=game)

# Render the records as a table for a quick visual check
pd.DataFrame(response)

Unnamed: 0,game_id,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,mechanic,average_rating,users_rated,category_count,mechanic_count,has_expansion,len_description,description_sentiment,distance
0,13,4,120,10,3,60,Catan,120,negotiation,dice rolling,7.26569,67655,1,5,1,457,0.098867,2.220446e-16
1,30549,4,45,8,2,45,Pandemic,45,medical,action point allowance system,7.67234,62377,1,7,1,237,0.114088,0.01232204
2,36218,4,30,13,2,30,Dominion,30,card game,card drafting,7.69995,55930,2,3,1,289,0.130288,0.01251518
3,9209,5,60,8,2,30,Ticket to Ride,60,trains,hand management,7.48301,48227,2,3,1,229,0.042562,0.01413778
4,40692,5,80,8,2,40,Small World,80,fantasy,area control / area influence,7.35735,43144,3,4,1,250,0.084914,0.01536846
5,822,5,45,8,2,30,Carcassonne,45,city building,area control / area influence,7.43536,67056,3,2,1,204,-0.013889,0.01717995
6,68448,7,30,10,2,30,7 Wonders,30,ancient,card drafting,7.83595,51688,4,5,1,252,-0.012574,0.01906051
7,14996,5,60,8,2,30,Ticket to Ride: Europe,60,trains,card drafting,7.59192,35535,2,4,1,214,0.065083,0.019374
8,34635,4,90,10,2,60,Stone Age,90,dice,dice rolling,7.6334,30432,2,3,1,198,0.128114,0.0218127
9,2651,6,120,12,2,120,Power Grid,120,economic,auction/bidding,7.94499,42036,2,2,1,215,0.051769,0.02270821


In [5]:
def make_recommendation_1(gamelist_length, max_players, max_playtime, min_age, min_players, min_playtime, average_rating, random_state=None):
    """Recommend games that satisfy the given constraints.

    Loads ``resources/ml_clean.csv``, drops every row that has a missing
    value and keeps only the games that pass all filters. One of the
    remaining games is picked at random as the target, and its nearest
    neighbours (cosine distance on the preprocessed features) among the
    remaining games are returned.

    Parameters
    ----------
    gamelist_length : int
        Number of games to return. Capped at the number of games that pass
        the filters.
    max_players : int
        Keep games whose ``max_players`` is at least this value.
    max_playtime : int
        Keep games whose ``max_playtime`` is at most this value.
    min_age : int
        Keep games whose ``min_age`` is at most this value.
    min_players : int
        Keep games whose ``min_players`` is at most this value.
    min_playtime : int
        Keep games whose ``min_playtime`` is at least this value.
    average_rating : float
        Keep games whose ``average_rating`` is at least this value.
    random_state : int or None, optional
        Forwarded to ``DataFrame.sample`` when picking the target game. The
        default ``None`` keeps the pick random; pass an int to make the
        result reproducible.

    Returns
    -------
    list of dict
        One record per recommended game, holding every dataset column plus a
        ``distance`` key, in the order returned by the neighbour search.
        Empty when no game passes the filters.
    """
    # Step 1: Load the dataset and keep only complete rows
    df = pd.read_csv("resources/ml_clean.csv")
    df = df.dropna(how="any").reset_index(drop=True)

    # Step 2: Filter the dataset based on input parameters
    df_filtered = df[
        (df['max_players'] >= max_players) &
        (df['max_playtime'] <= max_playtime) &
        (df['min_age'] <= min_age) &
        (df['min_players'] <= min_players) &
        (df['min_playtime'] >= min_playtime) &
        (df['average_rating'] >= average_rating)
    ]

    # Nothing matches the constraints: there is nothing to fit a model on and
    # no target to sample, so report "no recommendations" instead of failing.
    if df_filtered.empty:
        return []

    # Step 3: Define the feature columns, grouped by how they are preprocessed
    numeric_features = ['max_players', 'max_playtime', 'min_age', 'min_players', 'min_playtime', 'playing_time',
                        'average_rating', 'users_rated', 'category_count', 'mechanic_count', 'len_description', 'description_sentiment']
    binary_features = ['has_expansion']
    categorical_features = ['category', 'mechanic']
    feature_cols = ['max_players', 'max_playtime', 'min_age', 'min_players', 'min_playtime', 'playing_time',
                    'category', 'mechanic', 'average_rating', 'users_rated', 'category_count', 'mechanic_count',
                    'has_expansion', 'len_description', 'description_sentiment']

    # Step 4: Define preprocessing steps for the data
    # Numeric: mean-impute, then standardize
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # Binary: impute with the most frequent value, then ordinal-encode
    binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label', OrdinalEncoder())])

    # Categorical: impute with the most frequent value, then one-hot encode
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('binary', binary_transformer, binary_features),
            ('cat', categorical_transformer, categorical_features)])

    # Step 5: Build the preprocessed feature matrix from the filtered games only
    X = df_filtered.loc[:, feature_cols]
    X_preprocessed = preprocessor.fit_transform(X)

    # Step 6: Fit the Nearest Neighbors model with cosine distance.
    # kneighbors() raises if asked for more neighbours than fitted samples,
    # so the requested list length is capped at the number of filtered games.
    k = min(gamelist_length, len(df_filtered))
    model1 = NearestNeighbors(n_neighbors=k, metric="cosine")
    model1.fit(X_preprocessed)

    # Step 7: Choose a random game from the filtered list to use as the target
    game_id = df_filtered.sample(1, random_state=random_state)['game_id'].values[0]
    game_features = df_filtered.loc[df_filtered.game_id == game_id, feature_cols]
    game_features_preprocessed = preprocessor.transform(game_features)

    # Step 8: Find the nearest neighbors (most similar games)
    distances, indices = model1.kneighbors(game_features_preprocessed)

    # Step 9: Retrieve the recommended games. `indices` are positions within
    # the fitted matrix, hence iloc; copy so that adding the distance column
    # never writes into a slice of `df_filtered`.
    games = df_filtered.iloc[indices[0]].copy()
    games["distance"] = distances[0]

    # Step 10: Return the recommended games
    return games.to_dict(orient="records")


In [6]:
# Constraints the app would collect from the user
gamelist_length = 10
max_players, min_players = 4, 3
max_playtime, min_playtime = 120, 60
min_age = 5
average_rating = 5

# Same call the Flask app makes; the result is a list of per-game record dicts
response = make_recommendation_1(
    gamelist_length=gamelist_length,
    max_players=max_players,
    max_playtime=max_playtime,
    min_age=min_age,
    min_players=min_players,
    min_playtime=min_playtime,
    average_rating=average_rating,
)

# Render the records as a table for a quick visual check
pd.DataFrame(response)

Unnamed: 0,game_id,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,mechanic,average_rating,users_rated,category_count,mechanic_count,has_expansion,len_description,description_sentiment,distance
0,39914,4,60,0,2,60,Zombiegeddon,60,horror,area enclosure,5.73534,212,2,4,0,187,0.035057,1.110223e-16
1,7097,6,60,0,2,60,Go: The International Travel Game,60,travel,pick-up and deliver,5.66574,108,1,4,0,122,0.08,0.3431732
2,36634,5,60,0,2,60,Rice Wars,60,medieval,area control / area influence,5.38685,187,1,3,0,175,-0.063889,0.3769063
3,38694,4,60,0,2,60,Godzilla: Kaiju World Wars,60,fighting,action point allowance system,5.28954,153,3,4,0,153,0.065909,0.3849078
4,40483,4,60,0,2,60,Anima: Beyond Good and Evil,60,card game,card drafting,6.21421,347,2,3,0,194,-0.031597,0.4571384
5,1654,6,60,0,2,60,Double Crossing,60,economic,pick-up and deliver,5.53111,90,2,2,0,50,-0.033333,0.5009582
6,129851,6,60,0,2,60,Czech Pub,60,humor,action point allowance system,5.464,50,2,3,0,96,0.282143,0.5166355
7,39536,4,60,0,2,60,Cat & Fish,60,animals,point to point movement,6.66623,77,2,3,0,190,-0.031264,0.5467722
8,39382,6,60,0,2,60,18EZ,60,economic,route/network building,6.21622,74,3,3,0,133,-0.068136,0.5593158
9,31909,8,60,0,2,60,Die Jagd nach dem Gral,60,deduction,pick-up and deliver,5.44906,53,2,1,0,218,-0.15119,0.6077272
