In [1]:
# Setup
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json, pickle

# Steam Game Recommendation System (AI Prototype)
Goal : Build a minimal AI-based recommender using Matrix Factorization/Neural Collaborative Filtering

# Introduction
## Data Source
The dataset being used in this case is the Steam Video Game and Budle Data - User and Item data and item data from Professor Julian McAluley of the University of California San Diego

### Metadata and Key Features
#### Dataset 1 - user-item data
Size : 527 MB  
Items : 88310  
Data Range :  
Data Format : SteamID - Items{item_id, playtime_forever}  
#### Dataset 2 - item data
Size :  
Items :  
Data Range :   
Data Format : ItemId - playtime_forever, (tags/genres/categories)  

In [2]:
from sklearn.preprocessing import MinMaxScaler

# Load user-item data
with open('data/item_user_data.json', encoding='utf-8') as f:
    user_data = json.load(f)

# Load item data
with open('data/item_data.json', encoding='utf-8') as f:
    item_data = json.load(f)

# Flatten the data to df
rows = []
for user in user_data:
    for item in user['items']:
        rows.append({
            'steam_id': user['steam_id'],
            'item_id': item['item_id'],
            'playtime': item['playtime_forever']
        })
userItem_df = pd.DataFrame(rows)

# Convert only relevant metadata to df
rows = []
for game in item_data:
    rows.append({
        'id' : game.get('id', None),
        'name' : game.get('app_name', None),
        'tags' : game.get('tags', None),
        'price' : game.get('price', None)
    })
item_df = pd.DataFrame(rows)

# Keep only games with metadata
valid_games = set(item_df['id'])
userItem_df = userItem_df[userItem_df['item_id'].isin(valid_games)]

# Normalize playtime (0-1 scale)
userItem_df['playtime_norm'] = (userItem_df['playtime'] - userItem_df['playtime'].min()) / \
(userItem_df['playtime'].max() - userItem_df['playtime'].min())

print(userItem_df)
print(item_df)

                  steam_id item_id  playtime  playtime_norm
0        76561197970982479      10         6       0.000009
1        76561197970982479      20         0       0.000000
2        76561197970982479      30         7       0.000011
3        76561197970982479      40         0       0.000000
4        76561197970982479      50         0       0.000000
...                    ...     ...       ...            ...
5153203  76561198329548331  227940        43       0.000067
5153204  76561198329548331  346330         0       0.000000
5153205  76561198329548331  373330         0       0.000000
5153206  76561198329548331  388490         3       0.000005
5153207  76561198329548331  521570         4       0.000006

[4294257 rows x 4 columns]
           id                      name  \
0      761140       Lost Summoner Kitty   
1      643980                 Ironbound   
2      670290   Real Pool 3D - Poolians   
3      767400                   弹炸人2222   
4      773570             Log Challen

# Exploratory Data Analysis

# Model Implementation

## Basline

### Collaborative Filtering

In [52]:
from surprise import Dataset, Reader, KNNBasic

# Prepare data
reader = Reader(rating_scale=(0,1))
data = Dataset.load_from_df(userItem_df[['steam_id', 'item_id', 'playtime_norm']], reader)

# Train KNN model (Item-based CF)
trainset = data.build_full_trainset()
sim_opts = {'name': 'pearson', 'user_based': False}
model_cf = KNNBasic(sim_options=sim_opts)
model_cf.fit(trainset)

# Save to disk
with open('Models/cf_model(KNNBasic_pear).pkl', 'wb') as f:
    pickle.dump(model_cf, f)

# Get recommendations for a user
pred = model_cf.predict(uid='76561197970982479', iid='730')
print(f"Predicted playtime: {pred.est:.2f}")

user_id = '76561197970982479'
played_game = set(userItem_df[userItem_df['steam_id'] == user_id]['item_id'])
all_games = set(userItem_df)
unplayed_game = all_games-played_game

# Player Predictions
predictions = []
for game_id in unplayed_game:
    pred = model_cf.predict(uid=user_id, iid=game_id)
    predictions.append((game_id,pred.est))

# get top 5
top_5 = sorted(predictions, key=lambda x: -x[1][:5])
print("Top 5 Recommended Games (CF) :")
for game_id, score in top_5:
    game_name = item_df[item_df['id'] == game_id]['title'].values[0]
    print(f"{game_name} (Predicted Playtime: {score:.2f})")

Computing the pearson similarity matrix...
Done computing similarity matrix.
Predicted playtime: 0.01


IndexError: invalid index to scalar variable.

In [47]:
# Load KNN model (Item-based CF)
with open('Models/cf_model(KNNBasic_pear).pkl', 'rb') as f:
    model_cf = pickle.load(f)

# Get recommendations for a user
pred = model_cf.predict(uid='76561197970982479', iid='730')
print(f"Predicted playtime for 'Counter-Strike:Global Offensive' : {pred.est:.2f}")

user_id = '76561197970982479'
played_game = set(userItem_df[userItem_df['steam_id'] == user_id]['item_id'])
all_games = set(item_df['id'])
unplayed_game = all_games-played_game
print(f"Played: {len(played_game)} | All: {len(all_games)} | Unplayed: {len(unplayed_game)}\n")


# Player Predictions
predictions = []
for game_id in unplayed_game:
    pred = model_cf.predict(uid=user_id, iid=game_id)
    predictions.append((game_id,pred.est))

# get top 5
top_5 = sorted(predictions, key=lambda x: -x[1])[:5]
print("Top 5 Recommended Games (CF) :")
for game_id, score in top_5:
    game_name = item_df[item_df['id'] == game_id]['name'].values[0]
    print(f"{game_name} (Predicted Playtime: {score:4f})")

Predicted playtime for 'Counter-Strike:Global Offensive' : 0.01
Played: 232 | All: 32133 | Unplayed: 31901

Top 5 Recommended Games (CF) :
Pristine world (Predicted Playtime: 0.036610)
SpiritSphere (Predicted Playtime: 0.036610)
The Quivering (Predicted Playtime: 0.036610)
Shrooms (Predicted Playtime: 0.018529)
Cat President ~A More Purrfect Union~ (Predicted Playtime: 0.018398)


### Content Based Filtering

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

item_df['tags'] = item_df['tags'].fillna('').apply(
    lambda x: ' '.join(x) if isinstance(x, list) else x
)
# Convert lists of tags into strings
item_df['tags_str'] = item_df['tags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Create TF-IDF matrix (weights)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(item_df['tags_str'])

# Save TF-IDF and matrix
np.save('Models/tfidf_matrix.npy', tfidf_matrix.toarray())
pickle.dump(tfidf, open('Models/tfidf_model.pkl', 'wb'))

# Run and Save cosine matrix
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
np.save('Models/cosine_sim.npy', cos_sim)

# Get recommendations for a game
game_idx = item_df[item_df['app_name'] == 'Counter-Strike'].index[0]
sim_score = list(enumerate(cos_sim[game_id]))
sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)[1:6] #top 5
rec_games = item_df.iloc[[i[0] for i in sim_score]]['app_name']
print(rec_games)

KeyError: 'app_name'

In [4]:
# Load cosine simularity
cosine_sim = np.load('Models/cosine_sim.npy')

# Get recommendations for a game
game_idx = item_df[item_df['name'] == 'Counter-Strike'].index[0]
sim_score = list(enumerate(cos_sim[game_id]))
sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)[1:6] #top 5
rec_games = item_df.iloc[[i[0] for i in sim_score]]['name']
print(rec_games)

KeyboardInterrupt: 

## AI-Based Model

In [3]:
import torch
import torch.nn as nn

class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=32):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, emb_size)
        self.item_embed = nn.Embedding(num_items, emb_size)
        self.fc = nn.Sequential(
            nn.Linear(emb_size * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    
    def forward(self, user, item):
        u = self.user_embed(user)
        i = self.item_embed(item)
        x = torch.cat([u, i], dim=1)
        return self.fc(x)

# preare data
user_ids = userItem_df['steam_id'].astype('category').cat.codes.values
item_ids = userItem_df['item_id'].astype('category').cat.codes.values
labels = userItem_df['playtime_norm'].values

train_data = torch.utils.data.TensorDataset(
    torch.tensor(user_ids, dtype=torch.long),
    torch.tensor(item_ids, dtype=torch.long),
    torch.tensor(labels, dtype=torch.float)
)

# training
model = NCF(num_users=max(user_ids)+1, num_items=max(item_ids)+1)
optimzer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()

for epoch in range(10):
    for user, item, label in torch.utils.data.DataLoader(train_data, batch_size=64):
        pred = model(user, item)
        loss = loss_fn(pred.squeeze(), label)
        optimzer.zero_grad()
        loss.backward()
        optimzer.step()
        
torch.save(model.state_dict(), 'ncf_model.pth')

KeyboardInterrupt: 

# Evaluation

## Baseline

## AI-Based Model

# Demo

# Conclusion
## Summary
## Limitations
## Future