# A Simple Game Recommendation System Built Using LightFM

## 1. Import all packages to set up the environment

In [8]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import os
import sys
import itertools
import pandas as pd
import numpy as np
from numpy.linalg import inv
import matplotlib.pyplot as plt
import seaborn as sns

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix

## 2. Data Extraction & Cleaning

First, read the game data from csv file. There are some DLCs included in the file, which will not be evaluated and recommended to users. Therefore, those rows are excluded.

In [9]:
games_df = pd.read_csv('steam_games.csv')
games_df = games_df[games_df['types'] == 'app'].copy()
games_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38021 entries, 0 to 40832
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   url                       38021 non-null  object 
 1   types                     38021 non-null  object 
 2   name                      38007 non-null  object 
 3   desc_snippet              24802 non-null  object 
 4   recent_reviews            2706 non-null   object 
 5   all_reviews               28470 non-null  object 
 6   release_date              37654 non-null  object 
 7   developer                 37721 non-null  object 
 8   publisher                 33021 non-null  object 
 9   popular_tags              37888 non-null  object 
 10  game_details              37546 non-null  object 
 11  languages                 38007 non-null  object 
 12  achievements              12194 non-null  float64
 13  genre                     37625 non-null  object 
 14  game_descri

Developers, publishers, tags, languages, genre, and game_details will be viewed as core features of a game. If any of those columns are null, the game does not have full information. Those games will also be excluded.

In [10]:
games_df.dropna(subset=['name','developer','publisher','popular_tags','languages','genre','game_details'], how='any', inplace=True)
games_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32546 entries, 0 to 40832
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   url                       32546 non-null  object 
 1   types                     32546 non-null  object 
 2   name                      32546 non-null  object 
 3   desc_snippet              24007 non-null  object 
 4   recent_reviews            2652 non-null   object 
 5   all_reviews               25835 non-null  object 
 6   release_date              32251 non-null  object 
 7   developer                 32546 non-null  object 
 8   publisher                 32546 non-null  object 
 9   popular_tags              32546 non-null  object 
 10  game_details              32546 non-null  object 
 11  languages                 32546 non-null  object 
 12  achievements              12060 non-null  float64
 13  genre                     32546 non-null  object 
 14  game_descri

Next, we read the user rating data. Data includes five columns: user ID, App ID, hours user spends on the game, whether they purchase the game, and whether they opened the game. Since this table shows the record of user's owned games, the "purchase" column is always 1. Furthermore, if user never open the game, the "hour" column will be 0. Therefore, "hour" column describes the overall rating of user on a game.

In [11]:
users_rating_df = pd.read_csv("steam_users_purchase_play.csv")
users_rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128804 entries, 0 to 128803
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   SteamID   128804 non-null  int64  
 1   AppID     128804 non-null  object 
 2   hours     128804 non-null  float64
 3   purchase  128804 non-null  int64  
 4   play      128804 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.9+ MB


In the next part, we found some game names in rating data, but not in game data. Those are counted as invaluable ratings and will be excluded from rating data. To approach this, we first convert the name of games in both data frame to lower case and remove any spaces.

In [12]:
users_rating_df['name_norm'] = (
    users_rating_df['AppID']
    .str.lower()                        
    .str.replace(r'\s+', '', regex=True)  
)

games_df['name_norm'] = (
    games_df['name']
    .str.lower()                        
    .str.replace(r'\s+', '', regex=True)  
)



Then, we found out the missing games and remove them from the rating data.

In [13]:
rated_items = set(users_rating_df['name_norm'].unique())
known_items = set(games_df['name_norm'].unique())

missing = rated_items - known_items

print(f"Total rated items:   {len(rated_items)}")
print(f"Total known games:    {len(known_items)}")
print(f"Unregistered items:   {len(missing)}")
print("Examples:", list(missing)[:20])

mask_missing = ~users_rating_df['name_norm'].isin(known_items)
missing_rows = users_rating_df[mask_missing]

print("Total rows with missing games:", len(missing_rows))

users_rating_df = users_rating_df[~mask_missing].copy()



Total rated items:   5153
Total known games:    32476
Unregistered items:   3215
Examples: ['nba2k10', 'deathraymanta', 'lillyandsashacurseoftheimmortals', 'gunz2thesecondduel', 'disciplesiiirenaissance', 'thedarkeyechainsofsatinav', 'lumberisland-thatspecialplace', 'icewinddaleenhancededition', 'thief-ghost', 'mlb2k11', 'thenovelist', 'ronin', 'aceofspades', 'shadowgatemacventureseries', 'cidthedummy', 'earth2150lostsouls', 'torvald-hunter(assaultclass)', 'woodletreeadventures', 'uebergame', 'badrats']
Total rows with missing games: 73007


## 3.Model Training

After the cleaning of data, we now make a "feature" column for each game to fit the model in future training.

In [14]:
def make_features(row):
    feats = []
    for pub in str(row.get('publisher', '')).split(','):
        pub = pub.strip()
        if pub:
            feats.append(f"publisher:{pub}")

    for dev in str(row.get('developer', '')).split(','):
        dev = dev.strip()
        if dev:
            feats.append(f"developer:{dev}")

    for lang in str(row.get('languages', '')).split(','):
        lang = lang.strip()
        if lang:
            feats.append(f"language:{lang}")

    for genre in str(row.get('genre', '')).split(','):
        genre = genre.strip()
        if genre:
            feats.append(f"genre:{genre}")
    for tag in str(row.get('popular_tags', '')).split(','):
        tag = tag.strip()
        if tag:
            feats.append(f"tag:{tag}")
    for detail in str(row.get('game_details', '')).split(','):
        detail = detail.strip()
        if detail:
            feats.append(f"detail:{detail}")
    return feats

text_cols = ['publisher', 'developer', 'types', 'popular_tags', 'languages', 'genre']
games_df[text_cols] = games_df[text_cols].fillna('').astype(str)

games_df["feature_list"] = games_df.apply(make_features, axis=1)



The "feature_list" includes all features a game has, including their publishers, developers, languages, genre, tags, and game details.

In [16]:
games_df["feature_list"]

0        [publisher:Bethesda Softworks, publisher:Bethe...
1        [publisher:PUBG Corporation, publisher:PUBG Co...
2        [publisher:Paradox Interactive, publisher:Para...
3        [publisher:Bohemia Interactive, publisher:Bohe...
4        [publisher:CCP, publisher:CCP, developer:CCP, ...
                               ...                        
40824    [publisher:Self-published, publisher:Self-publ...
40825    [publisher:Gustavo Contreras, publisher:Gustav...
40826    [publisher:OldGHZ, publisher:OldGHZ, developer...
40831    [publisher:CAPCOM CO., publisher:LTD, publishe...
40832    [publisher:Self-Publish, publisher:Self-Publis...
Name: feature_list, Length: 32546, dtype: object

Create data set and train the model:

In [17]:
all_users = users_rating_df['SteamID'].unique().tolist()  + [114514]
all_feats = set()
for feats in games_df['feature_list']:
    all_feats.update(feats)
all_feats = list(all_feats)

ds = Dataset()
ds.fit(
    users=all_users,     
    user_features= {},
    items=games_df['name_norm'].unique(),            
    item_features=all_feats  
)

In [18]:
(interactions, weights) = ds.build_interactions(
    [(row['SteamID'], row['name_norm'], row['hours'])
     for _, row in users_rating_df.iterrows()]
)

item_features = ds.build_item_features(
    [(row['name_norm'], row['feature_list'])
     for _, row in games_df.iterrows()]
)

model = LightFM(loss="warp")
model.fit(interactions,
          item_features=item_features,
          sample_weight=weights,
          epochs=30,
          num_threads=4)

<lightfm.lightfm.LightFM at 0x74f40db4eb50>

## 4. Call trained model

The lightFM model can work in two ways. The first way is provide recommendations for users who are already in the user data. This works similar to classic model of ALS, which we input a user's id and the recommendation result will easily show up. Here is an illustration to approach this:

In [19]:
# inv map
user_id_map, _, item_id_map, _ = ds.mapping()
inv_user_map = {v: k for k, v in user_id_map.items()}
inv_item_map = {v: k for k, v in item_id_map.items()}

orig_uid = 151603712 #take the existing uid as an example

# change id to index in order to read data
if orig_uid not in user_id_map:
    raise ValueError(f"unknown id: {orig_uid}")
uid_internal = user_id_map[orig_uid]
n_items = len(item_id_map)
item_idx = np.arange(n_items)

# Score for this user on all games
scores = model.predict(
    user_ids=uid_internal,
    item_ids=item_idx,   
    item_features=item_features
)

N = 10
top_internal = np.argsort(-scores)[:N]
recommended = [inv_item_map[i] for i in top_internal]

print(f"For user: {orig_uid} Top 10 games:\n", recommended)

For user: 151603712 Top 10 games:
 ['half-life2:deathmatch', 'half-lifedeathmatch:source', 'left4dead2', 'unturned', 'dungeondefenders', 'teamfortress2', "garry'smod", 'alienswarm', 'warframe', 'terraria']


The second way is called "Cold start". This is a method to create a recommendation list for users who are not in current user data(fresh new users). Cold start require user to provide scores on some existing games. Here is a simple example:

In [20]:
V_items = item_features.dot(model.item_embeddings)
print("V.shape:", V_items.shape)

# Get data set
uid_map, _,iid_map, feat_map = ds.mapping()

# new rating 
raw_new = {
    'half-life2:deathmatch': 5.0,
    'half-lifedeathmatch:source': 4.0
}
new_ratings = {
    item_id_map[name]: rating
    for name, rating in raw_new.items()
}

# Extract V to search similar games
idxs = np.array(list(new_ratings.keys()), dtype=int)  
Vi   = V_items[idxs, :]                                    # (m, k)
r    = np.array(list(new_ratings.values()))          # (m,)
I   = np.eye(len(V_items[0]))
λ   = 1e-6

# The analytical solution of least squares
u_new = inv(Vi.T @ Vi + λ * I) @ (Vi.T @ r)          # (k,)

# score item and give recommendations
scores = V_items @ u_new                                    # (n_items,)
top_idxs = np.argsort(-scores)[:10]
recommendations = [inv_item_map[i] for i in top_idxs]

print("Top-10 for new user:", recommendations)

V.shape: (32476, 10)
Top-10 for new user: ['echoofsoul', 'unturned', 'fallenearthfree2play', 'trove', 'robocraft', 'rift', 'realmofthemadgod', 'defiance', 'archeage', 'tera']


## 5. Model Evaluation

In [21]:
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

model.fit(train, item_features=item_features, epochs=30, num_threads=4)

k = 10
prec = precision_at_k(model, test, train_interactions=train,
                      item_features=item_features, k=k).mean()
rec  = recall_at_k(model, test, train_interactions=train,
                   item_features=item_features, k=k).mean()
auc  = auc_score(model, test, train_interactions=train,
                 item_features=item_features).mean()
print(f"Precision@{k}: {prec:.4f}  Recall@{k}: {rec:.4f}  AUC: {auc:.4f}")

Precision@10: 0.0660  Recall@10: 0.4087  AUC: 0.9927
