In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler # Choose one for scaling
from scipy.sparse import hstack, csr_matrix

### Loading the Data

In [9]:
# Load the games table that the EDA step wrote to disk.
df = pd.read_csv("Dataset/games_eda.csv")

# Print one randomly drawn title as a quick check that the load worked.
print("Sample names:", df['Name'].sample(n=1).tolist())

Sample names: ['DYNASTY WARRIORS 8: Xtreme Legends Complete Edition']


### Create a mapping from game name to index for easy lookup

In [10]:
# Several rows can share a title; keep only the first row for each name so
# that every title maps to exactly one row.
df_unique_names = df.drop_duplicates(subset='Name', keep='first')

# Lookup table: game name -> row label in df. df comes straight from
# read_csv with no index column, so labels coincide with row positions.
indices = pd.Series(df_unique_names.index, index=df_unique_names['Name'])

print(f"Number of unique game names: {len(indices)}")

Number of unique game names: 22066


### Model 1: TF-IDF + Cosine Similarity

Select text and handle missing values

In [11]:
# Missing descriptions become empty strings so the vectorizer receives text for every row.
corpus = df['About the game'].fillna(value='')

TF-IDF Vectorization

In [12]:
# Vectorizer settings:
#   stop_words='english' -> drop common English words
#   max_df=0.8           -> drop terms found in more than 80% of the documents
#   min_df=5             -> drop terms found in fewer than 5 documents
#   ngram_range=(1, 2)   -> use single words and two-word phrases
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.8,
    min_df=5,
    ngram_range=(1, 2),
)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Shape is (number of games, number of terms kept in the vocabulary).
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (22105, 89540)


Compute the pairwise cosine similarity between all games based on their TF-IDF vectors

In [13]:
# With a single argument every row is compared against every other row, which
# is the same as passing the matrix twice. The result is a dense
# (n_games x n_games) array, so memory use grows quadratically with the
# number of games.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)
print(f"Cosine similarity matrix shape: {cosine_sim_matrix.shape}")

Cosine similarity matrix shape: (22105, 22105)


In [14]:
def get_tfidf_recommendations(title, cosine_sim=cosine_sim_matrix, data_indices=indices, data_df=df, top_n=10):
    """
    Gets game recommendations based on TF-IDF cosine similarity.

    If the exact title is unknown, the first title that contains it
    (case-insensitively) is used instead and a notice is printed.

    Args:
        title (str): The title of the game to find recommendations for.
        cosine_sim (np.ndarray): The precomputed cosine similarity matrix.
        data_indices (pd.Series): Series mapping game titles to their index.
        data_df (pd.DataFrame): The original dataframe to get game names from index.
        top_n (int): Maximum number of recommendations to return (default 10).

    Returns:
        list: Up to ``top_n`` recommended game titles, most similar first,
        or None if neither the title nor a partial match is found.
    """
    if title not in data_indices:
        print(f"Error: Game '{title}' not found in the dataset.")
        # Try finding partial matches; skip non-string names (e.g. NaN),
        # which have no .lower().
        needle = title.lower()
        possible_matches = [
            name for name in data_indices.index
            if isinstance(name, str) and needle in name.lower()
        ]
        # Check for "no match" BEFORE touching possible_matches[0]; indexing
        # an empty list would raise IndexError instead of returning None.
        if not possible_matches:
            return None
        print(f"Finding similar games for: {possible_matches[0]} instead as '{title}' was not found.")
        title = possible_matches[0]

    idx = data_indices[title]  # Row of the query game in the similarity matrix

    # Similarity of the query game to every game, as a flat array.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank all games by descending similarity. A stable sort keeps tied games
    # in their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query game by its row number rather than by assuming it is
    # ranked first: with tied scores (duplicate rows, empty descriptions)
    # another game can be sorted ahead of it.
    ranked = ranked[ranked != idx][:top_n]

    # Return the titles of the most similar games
    return data_df['Name'].iloc[ranked].tolist()

### Model 2: K-Nearest Neighbors (KNN) on Combined Features

In [15]:
# Reuse the TF-IDF matrix built for Model 1 as the text features for KNN.
# This is an alias, not a copy: both names refer to the same matrix object.
tfidf_features = tfidf_matrix

Numerical Features (Relevant Features like popularity, rating, age, playtime)

In [16]:
# Numeric columns fed to the KNN model, one per line for easy editing.
numerical_cols = [
    'Log Peak CCU',
    'Review Ratio',
    'Game Age (Days)',
    'Log Median playtime forever',
    'Log Achievements',
    'Log DLC count',
    'Num Languages',
]

Handle any NaN or infinite values that may still remain

In [17]:
# Replace missing values and then +/- infinity with 0, writing the cleaned
# columns back into df. Re-running this cell gives the same result.
df[numerical_cols] = (
    df[numerical_cols]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

# Plain array view of the cleaned numeric columns, used as scaler input.
numerical_features_raw = df[numerical_cols].to_numpy()

Categorical Features (One-Hot Encoded Tags, Genres, Categories)

In [18]:
# Pick out the one-hot indicator columns by their name prefix; each list keeps
# the column order of df.
tag_cols = [name for name in df.columns if name.startswith('Tags_')]
genre_cols = [name for name in df.columns if name.startswith('Genres_')]
category_cols = [name for name in df.columns if name.startswith('Categories_')]

# Dense array of all indicator columns, ordered tags -> genres -> categories.
categorical_features = df[[*tag_cols, *genre_cols, *category_cols]].to_numpy()

Convert dense categorical features to sparse matrix format for efficient combination

In [19]:
# Store the dense one-hot array in CSR (compressed sparse row) form so it can
# later be column-stacked with the sparse TF-IDF matrix.
categorical_features_sparse = csr_matrix(categorical_features)

StandardScaler: Scales to zero mean and unit variance (good if data is normally distributed)

MinMaxScaler: Scales to a range [0, 1] (good if data isn't normally distributed or for distance metrics)

Since we found that our data was very right-skewed, we use MinMaxScaler here

In [20]:
# Learn each column's min and max, then rescale that column to the [0, 1] range.
scaler = MinMaxScaler()
numerical_features_scaled = scaler.fit(numerical_features_raw).transform(numerical_features_raw)


Convert the scaled numerical features to sparse matrix format as well

In [21]:
# Store the scaled numeric array in CSR (compressed sparse row) form so it can
# be column-stacked with the other sparse feature blocks.
numerical_features_scaled_sparse = csr_matrix(numerical_features_scaled)

Combine TF-IDF, numerical, and categorical features into a single sparse matrix

In [22]:
# Column-stack the three feature groups into one matrix (one row per game).
# format='csr' is requested explicitly because the KNN recommender pulls
# single rows out of this matrix with features[idx]; COO matrices do not
# support indexing, and without the argument the output format is left for
# SciPy to choose.
combined_features = hstack(
    [
        tfidf_features,
        numerical_features_scaled_sparse,
        categorical_features_sparse,
    ],
    format='csr',
)

print(f"Combined features matrix shape: {combined_features.shape}")
print(f"Feature breakdown: TF-IDF ({tfidf_features.shape[1]}), Scaled Numerical ({numerical_features_scaled_sparse.shape[1]}), One-Hot Categorical ({categorical_features_sparse.shape[1]})")

Combined features matrix shape: (22105, 90063)
Feature breakdown: TF-IDF (89540), Scaled Numerical (7), One-Hot Categorical (516)


We want 11 neighbors (1 self + 10 recommendations)

'cosine' metric is often good for high-dimensional, sparse data like this

'brute' algorithm checks all points, suitable for sparse data

In [23]:
# Exhaustive (brute-force) cosine-distance search using all CPU cores.
# 11 neighbours = the query game itself plus 10 recommendations.
knn_model = NearestNeighbors(
    n_neighbors=11,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn_model.fit(combined_features)

In [24]:
def get_knn_recommendations(title, model=knn_model, features=combined_features, data_indices=indices, data_df=df, top_n=10):
    """
    Gets game recommendations based on KNN on combined features.

    Args:
        title (str): The name of the game to get recommendations for.
        model (NearestNeighbors): The fitted KNN model.
        features (csr_matrix): The combined feature matrix used for fitting KNN.
        data_indices (pd.Series): Series mapping game titles to their index.
        data_df (pd.DataFrame): The original dataframe to get game names from index.
        top_n (int): Maximum number of recommendations to return (default 10).

    Returns:
        list: Up to ``top_n`` recommended game titles, nearest first,
        or None if title not found (partial matches are only printed as
        suggestions, never used automatically).
    """
    if title not in data_indices:
        print(f"Error: Game '{title}' not found in the dataset.")
        # Suggest titles containing the query; skip non-string names
        # (e.g. NaN), which have no .lower().
        needle = title.lower()
        possible_matches = [
            name for name in data_indices.index
            if isinstance(name, str) and needle in name.lower()
        ]
        if possible_matches:
            print(f"Did you mean one of these? {possible_matches[:5]}")
        return None

    # Get the index of the game
    idx = data_indices[title]

    # Get the feature vector for the target game
    query_vector = features[idx]

    # Ask for one neighbour more than needed, because the query game is
    # normally among its own neighbours; never ask for more neighbours than
    # there are fitted samples.
    n_query = min(top_n + 1, model.n_samples_fit_)
    neighbor_indices = model.kneighbors(
        query_vector, n_neighbors=n_query, return_distance=False
    )[0]

    # Remove the query game by its row number rather than by assuming it is
    # the first neighbour: games with identical feature vectors are tied at
    # distance 0 and may be returned ahead of it.
    neighbor_indices = neighbor_indices[neighbor_indices != idx][:top_n]

    # Return the names of the recommended games
    return data_df['Name'].iloc[neighbor_indices].tolist()

In [25]:
# Game to look up in both models.
target_game = 'The Binding of Isaac: Rebirth'

print(f"\n--- Recommendations for '{target_game}' ---")

# Model 1: TF-IDF + Cosine Similarity
print("\nModel 1 (TF-IDF + Cosine Similarity) Recommendations:")
tfidf_recs = get_tfidf_recommendations(target_game)
if not tfidf_recs:
    print(f"Could not generate TF-IDF recommendations for '{target_game}'.")
else:
    # Number the titles starting from 1.
    for rank, rec in enumerate(tfidf_recs, start=1):
        print(f"{rank}. {rec}")

# Model 2: KNN on Combined Features
print("\nModel 2 (KNN on Combined Features) Recommendations:")
knn_recs = get_knn_recommendations(target_game)
if not knn_recs:
    print(f"Could not generate KNN recommendations for '{target_game}'.")
else:
    for rank, rec in enumerate(knn_recs, start=1):
        print(f"{rank}. {rec}")



--- Recommendations for 'The Binding of Isaac: Rebirth' ---

Model 1 (TF-IDF + Cosine Similarity) Recommendations:
1. The Binding of Isaac
2. The Binding of YOU
3. Dead Space
4. There's Poop In My Soup
5. The Legend of Bum-Bo
6. Overture
7. Crystal Crisis
8. Dead Space™ 2
9. Chrono Survival
10. Paranautical Activity: Deluxe Atonement Edition

Model 2 (KNN on Combined Features) Recommendations:
1. Crawl
2. Skelly Selest
3. Wizard of Legend
4. Lovecraft's Untold Stories
5. 20XX
6. Enter the Gungeon
7. Colt Canyon
8. Risk of Rain
9. Toroom
10. Tallowmere
