In [None]:
# Setup
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import json

# Steam Game Recommendation System (AI Prototype)
**Goal:** Build a minimal AI-based recommender using matrix factorization or neural collaborative filtering.

# Introduction
## Data Source
The dataset used here is the Steam Video Game and Bundle Data (user-item data and item metadata) published by Professor Julian McAuley of the University of California, San Diego.

### Metadata and Key Features
#### Dataset 1 - user-item data
Size : 527 MB  
Items : 88310  
Data Range :  
Data Format : SteamID - Items{item_id, playtime_forever}  
#### Dataset 2 - item data
Size :  
Items :  
Data Range :   
Data Format : ItemId - playtime_forever, (tags/genres/categories)  

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Load user-item data.
# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# text decodes the same way on every platform.
# NOTE(review): json.load requires strict JSON -- confirm both files are in
# that form (not one-record-per-line dumps) before relying on this cell.
with open('item_user_data.json', encoding='utf-8') as f:
    user_data = json.load(f)

# Load item data
with open('item_data.json', encoding='utf-8') as f:
    item_data = json.load(f)

# Flatten the nested user -> items structure into one row per (user, game).
# Column names are passed explicitly so an empty input still yields a frame
# with the expected schema instead of a column-less one.
rows = [
    {
        'steam_id': user['steam_id'],
        'item_id': item['item_id'],
        'playtime': item['playtime_forever'],
    }
    for user in user_data
    for item in user['items']
]
userItem_df = pd.DataFrame(rows, columns=['steam_id', 'item_id', 'playtime'])

# Convert only relevant metadata to df
rows = [
    {
        'id': game['id'],
        'name': game['app_name'],
        'tags': game['tags'],
        'price': game['price'],
    }
    for game in item_data
]
item_df = pd.DataFrame(rows, columns=['id', 'name', 'tags', 'price'])

# Keep only games with metadata.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below does not raise SettingWithCopyWarning.
valid_games = set(item_df['id'])
userItem_df = userItem_df[userItem_df['item_id'].isin(valid_games)].copy()

# Normalize playtime (0-1 scale) with min-max scaling.
# Guard the degenerate case (empty frame, or every playtime identical) where
# the range is zero/NaN and the division would yield NaN/inf.
playtime_min = userItem_df['playtime'].min()
playtime_range = userItem_df['playtime'].max() - playtime_min
if playtime_range > 0:
    userItem_df['playtime_norm'] = (userItem_df['playtime'] - playtime_min) / playtime_range
else:
    userItem_df['playtime_norm'] = 0.0

# Exploratory Data Analysis

# Model Implementation

## Baseline

In [None]:
from surprise import Dataset, Reader, KNNBasic

# Prepare data.
# Surprise reads the three columns positionally as (user, item, rating);
# the normalized playtime stands in for the rating on a 0-1 scale.
reader = Reader(rating_scale=(0,1))
data = Dataset.load_from_df(userItem_df[['steam_id', 'item_id', 'playtime_norm']], reader)

# Train KNN model (Item-based CF): user_based=False computes item-item
# cosine similarities.
trainset = data.build_full_trainset()
sim_opts = {'name': 'cosine', 'user_based': False}
model_cf = KNNBasic(sim_options=sim_opts)
model_cf.fit(trainset)

# Example user, defined once and reused below.
user_id = '76561197970982479'

# Single prediction for one (user, game) pair. The estimate is on the
# normalized 0-1 playtime scale, not raw minutes.
pred = model_cf.predict(uid=user_id, iid='730')
print(f"Predicted playtime: {pred.est:.2f}")

# Candidate games = every game in the interaction data minus the ones this
# user already owns. The user column is 'steam_id', and the game universe
# must come from the 'item_id' column (iterating the DataFrame itself would
# only yield its column names).
played_game = set(userItem_df.loc[userItem_df['steam_id'] == user_id, 'item_id'])
all_games = set(userItem_df['item_id'])
unplayed_game = all_games - played_game

# Player Predictions: score every candidate game for this user.
predictions = []
for game_id in unplayed_game:
    pred = model_cf.predict(uid=user_id, iid=game_id)
    predictions.append((game_id, pred.est))

# get top 5: sort by predicted score (descending), then slice the sorted
# list -- the slice applies to the list, not to the float score.
top_5 = sorted(predictions, key=lambda x: x[1], reverse=True)[:5]
print("Top 5 Recommended Games (CF) :")
for game_id, score in top_5:
    # item_df stores the display name in the 'name' column.
    game_name = item_df[item_df['id'] == game_id]['name'].values[0]
    print(f"{game_name} (Predicted Playtime: {score:.2f})")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build one text document per game from its tags.
# TfidfVectorizer needs a string per row, so list/tuple tags are joined with
# spaces, and anything that is neither a sequence of tags nor a string
# (e.g. missing values) becomes an empty document.
# NOTE(review): multi-word tags are split into separate tokens by the
# default tokenizer -- confirm that is acceptable for this baseline.
tag_docs = item_df['tags'].apply(
    lambda tags: ' '.join(tags) if isinstance(tags, (list, tuple))
    else (tags if isinstance(tags, str) else '')
)

# Create TF-IDF matrix (weights)
tfidf = TfidfVectorizer(stop_words='english')
tfidx_matrix = tfidf.fit_transform(tag_docs)

# Pairwise game-to-game similarity.
# NOTE(review): this is an n_games x n_games matrix; check it fits in memory
# for the full catalogue.
cos_sim = cosine_similarity(tfidx_matrix, tfidx_matrix)

# Get recommendations for a game.
# Resolve the query game's *positional* row (cos_sim is positional) using the
# 'name' column; raises IndexError when the title is not in item_df.
game_idx = (item_df['name'] == 'Counter-Strike').to_numpy().nonzero()[0][0]

# Score every other game against the query game. The query row is excluded
# explicitly rather than by slicing off the first result, since games with
# identical tags tie at the top and the query is not guaranteed to be first.
sim_score = [(i, score) for i, score in enumerate(cos_sim[game_idx]) if i != game_idx]
sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)[:5] #top 5
rec_games = item_df.iloc[[i[0] for i in sim_score]]['name']
print(rec_games)

## AI-Based Model

In [None]:
import torch
import torch.nn as nn

# Evaluation

## Baseline

## AI-Based Model

# Demo

# Conclusion
## Summary
## Limitations
## Future