In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import json, os, sys

### 1. Data preprocessing

In [9]:
# Load the tabular datasets
users = pd.read_csv('../data/users.csv')
recommendations = pd.read_csv('../data/recommendations.csv')
games = pd.read_csv('../data/games.csv')

# games_metadata.json is parsed line by line (one JSON object per line).
# The encoding is pinned to UTF-8 so the read does not depend on the OS
# locale default, and blank lines are skipped because json.loads('')
# raises JSONDecodeError.
games_metadata_list = []
with open('../data/games_metadata.json', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            games_metadata_list.append(json.loads(line))

# Flatten the parsed records (including nested keys) into a DataFrame
games_metadata = pd.json_normalize(games_metadata_list)

In [10]:
# Attach the metadata columns to every game row (left join on app_id)
games_full = games.merge(games_metadata, on='app_id', how='left')

# Attach the game columns to every user review (left join on app_id)
data = recommendations.merge(games_full, on='app_id', how='left')

# Replace missing values in the listed columns with neutral defaults
missing_defaults = {'hours': 0, 'helpful': 0, 'funny': 0, 'tags': ''}
data = data.fillna(missing_defaults)

In [11]:
# Map raw user_id / app_id values to contiguous integer indices
user_encoder = LabelEncoder().fit(data['user_id'])
game_encoder = LabelEncoder().fit(data['app_id'])

data['user_id_enc'] = user_encoder.transform(data['user_id'])
data['app_id_enc'] = game_encoder.transform(data['app_id'])

# Each encoder was fitted on the very column it encodes, so every class
# occurs at least once and the class count equals the number of distinct
# encoded ids.
num_users = len(user_encoder.classes_)
num_games = len(game_encoder.classes_)

In [12]:
# Collect every distinct tag across all games (rows without tags are skipped)
# and fit a label encoder on that vocabulary
all_tags = list({tag for tag_list in games_full['tags'].dropna() for tag in tag_list})
tag_encoder = LabelEncoder().fit(all_tags)

def encode_tags(tags_list):
    """Encode one game's tags into integer ids using ``tag_encoder``.

    Args:
        tags_list: A list/tuple/array of tag strings, or a missing-value
            marker (``None``, ``NaN``, ``''``) for games without tags.

    Returns:
        An array of encoded tag ids, or an empty list when there are no tags.
    """
    # Games without metadata carry NaN after the left merge. NaN is truthy,
    # so a plain `not tags_list` check lets it through to the encoder, and an
    # ndarray with several elements makes that check raise. Testing the type
    # and length explicitly covers both cases.
    is_sequence = isinstance(tags_list, (list, tuple, np.ndarray))
    if not is_sequence or len(tags_list) == 0:
        return []
    return tag_encoder.transform(tags_list)

# Store each game's encoded tag ids (output of encode_tags) in a new 'tags_enc' column
games_full['tags_enc'] = games_full['tags'].apply(encode_tags)

### 2. Data processing

In [13]:
# Group the interactions by encoded user
user_group = data.groupby('user_id_enc')

# One sample per (history prefix -> next game) pair
train_data = []

for user_id, group in user_group:
    # Order the user's reviews chronologically
    history = group.sort_values('date')
    game_seq = history['app_id_enc'].values
    hour_seq = history['hours'].values
    liked_seq = history['is_recommended'].astype(int).values

    # For every position after the first, the preceding reviews form the
    # input sequence and the review at that position is the target.
    train_data.extend(
        {
            'user_id': user_id,
            'input_games': game_seq[:cut],
            'input_hours': hour_seq[:cut],
            'target_game': game_seq[cut],
            'label': liked_seq[cut],
        }
        for cut in range(1, len(game_seq))
    )

In [None]:
class GameRecommendationDataset(Dataset):
    """Dataset wrapper around a sequence of prepared sample dicts.

    Each sample dict must provide the keys ``user_id``, ``input_games``,
    ``input_hours``, ``target_game`` and ``label``; indexing returns the same
    keys with every value converted to a tensor.
    """

    # Output fields, in order, with the tensor dtype each one is converted to
    _FIELD_DTYPES = (
        ('user_id', torch.long),
        ('input_games', torch.long),
        ('input_hours', torch.float),
        ('target_game', torch.long),
        ('label', torch.float),
    )

    def __init__(self, data):
        """Keep a reference to the indexable collection of sample dicts."""
        self.data = data

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors."""
        record = self.data[idx]
        return {
            field: torch.tensor(record[field], dtype=dtype)
            for field, dtype in self._FIELD_DTYPES
        }

def pad_collate(samples):
    """Collate samples with variable-length histories into one batch.

    The samples produced from ``train_data`` hold history prefixes of
    different lengths, which the default collate function cannot stack.
    This function pads ``input_games`` and ``input_hours`` to the longest
    history in the batch and stacks the scalar fields.

    Padding is added on the LEFT so the most recent interaction always sits
    in the last position, which is the time step the model reads
    (``output[:, -1, :]``).

    NOTE(review): the pad value 0 is also a valid encoded game id. A
    dedicated padding index would require reserving an extra embedding row
    in the model.

    Args:
        samples: List of dicts as returned by ``GameRecommendationDataset``.

    Returns:
        Dict with the same keys; ``input_games`` / ``input_hours`` have shape
        ``[batch, max_len]`` and the other fields have shape ``[batch]``.
    """
    max_len = max(sample['input_games'].size(0) for sample in samples)
    games = torch.zeros((len(samples), max_len), dtype=torch.long)
    hours = torch.zeros((len(samples), max_len), dtype=torch.float)
    for row, sample in enumerate(samples):
        start = max_len - sample['input_games'].size(0)
        games[row, start:] = sample['input_games']
        hours[row, start:] = sample['input_hours']
    return {
        'user_id': torch.stack([sample['user_id'] for sample in samples]),
        'input_games': games,
        'input_hours': hours,
        'target_game': torch.stack([sample['target_game'] for sample in samples]),
        'label': torch.stack([sample['label'] for sample in samples]),
    }


# Split the samples into training and test sets
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)

train_dataset = GameRecommendationDataset(train_set)
test_dataset = GameRecommendationDataset(test_set)

# Build the data loaders; pad_collate makes variable-length histories batchable
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True,
                          collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=False,
                         collate_fn=pad_collate)

### 3. Model construction

In [None]:
class RecommendationModel(nn.Module):
    """GRU over a user's game history that outputs one probability per sample.

    At every time step the user embedding is concatenated with the embedding
    of the game at that step; the GRU output of the last time step is mapped
    to a single sigmoid probability.

    NOTE(review): ``forward`` accepts ``input_hours`` but does not use it,
    and the model has no input for the target game, so the prediction
    depends only on the user and the input history.

    Args:
        num_users: Number of rows in the user embedding table.
        num_games: Number of rows in the game embedding table.
        embedding_dim: Size of each user / game embedding vector.
        hidden_dim: Size of the GRU hidden state.
    """

    def __init__(self, num_users, num_games, embedding_dim, hidden_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.game_embedding = nn.Embedding(num_games, embedding_dim)

        # Each step's input is [user embedding ; game embedding]
        self.rnn = nn.GRU(embedding_dim * 2, hidden_dim, batch_first=True)

        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user_id, input_games, input_hours):
        """Compute one probability per sample.

        Args:
            user_id: Long tensor of shape ``[batch_size]``.
            input_games: Long tensor of shape ``[batch_size, seq_len]``.
            input_hours: Accepted for interface compatibility; currently unused.

        Returns:
            Float tensor of shape ``[batch_size]`` with values in ``[0, 1]``.
        """
        user_emb = self.user_embedding(user_id)  # [batch_size, embedding_dim]
        games_emb = self.game_embedding(input_games)  # [batch_size, seq_len, embedding_dim]

        # Repeat the user embedding along the sequence axis
        user_emb_expanded = user_emb.unsqueeze(1).expand_as(games_emb)

        # Concatenate along the feature axis
        rnn_input = torch.cat([user_emb_expanded, games_emb], dim=-1)  # [batch_size, seq_len, embedding_dim * 2]

        output, _ = self.rnn(rnn_input)  # [batch_size, seq_len, hidden_dim]
        last_output = output[:, -1, :]  # output of the final time step

        logits = self.fc(last_output)
        prob = self.sigmoid(logits)

        # Drop only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis when batch_size == 1, yielding a 0-dim tensor
        # that no longer matches a label of shape [1].
        return prob.squeeze(-1)

In [None]:
# Model hyperparameters
embedding_dim = 64
hidden_dim = 128

# Size the embedding tables from the encoded id counts
model = RecommendationModel(
    num_users=num_users,
    num_games=num_games,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)

### 4. Model training

In [None]:
# Binary cross-entropy on probabilities (the model already applies a sigmoid)
criterion = nn.BCELoss()
# Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for batch in train_loader:
        # Standard step: clear gradients, forward, loss, backward, update
        optimizer.zero_grad()
        output = model(batch['user_id'], batch['input_games'], batch['input_hours'])
        loss = criterion(output, batch['label'])
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Report the mean per-batch loss for the epoch
    total_loss = sum(batch_losses)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

### 5. Model evaluation

In [None]:
# Measure accuracy on the test loader with gradients disabled
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        label = batch['label']
        output = model(batch['user_id'], batch['input_games'], batch['input_hours'])
        # Threshold the probability at 0.5 to get a hard 0/1 prediction
        predicted = (output > 0.5).float()
        total += label.size(0)
        correct += predicted.eq(label).sum().item()
print(f"Test Accuracy: {100 * correct / total:.2f}%")

### 6. Recommendation function

In [None]:
def recommend_games_for_user(model, user_id_raw, top_k=10):
    """Print and display the top-k unplayed games for one user.

    Args:
        model: Trained model called as ``model(user_id, input_games, input_hours)``.
        user_id_raw: Raw (un-encoded) user id; must be known to ``user_encoder``.
        top_k: Maximum number of games to show.

    Returns:
        None. The result is printed and rendered with ``display``.

    NOTE(review): the model takes no candidate-game input, so every unplayed
    game receives the same score and the ranking below falls back to
    ascending encoded game id. Candidate-aware scoring needs a model change.
    """
    user_id = user_encoder.transform([user_id_raw])[0]

    # The user's history in chronological order
    user_history = data[data['user_id_enc'] == user_id].sort_values('date')
    played_games = user_history['app_id_enc'].unique()
    input_games = torch.tensor(user_history['app_id_enc'].values, dtype=torch.long).unsqueeze(0)
    input_hours = torch.tensor(user_history['hours'].values, dtype=torch.float).unsqueeze(0)

    # Candidates are all encoded games the user has not reviewed; sorting
    # makes the iteration order (and therefore tie-breaking) deterministic.
    unplayed_games = sorted(set(range(num_games)) - set(played_games))

    # The model inputs do not depend on the candidate, so one forward pass
    # is enough rather than one per candidate game.
    model.eval()
    with torch.no_grad():
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        prob = model(user_tensor, input_games, input_hours).item()

    # Named `scored` to avoid shadowing the module-level `recommendations` frame
    scored = [(game_id, prob) for game_id in unplayed_games]

    # Highest probability first (stable sort keeps candidate order among ties)
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the top K and map the encoded ids back to raw app_ids
    top_games = scored[:top_k]
    top_encoded_ids = [game_id for game_id, _ in top_games]
    top_game_ids = list(game_encoder.inverse_transform(top_encoded_ids)) if top_encoded_ids else []

    # isin() returns rows in games_full order, so restore the ranking order
    rank_of = {app_id: position for position, app_id in enumerate(top_game_ids)}
    recommended_games_info = games_full[games_full['app_id'].isin(top_game_ids)]
    recommended_games_info = (
        recommended_games_info
        .assign(_rank=recommended_games_info['app_id'].map(rank_of))
        .sort_values('_rank')
        .drop(columns='_rank')
    )

    # Show the recommended games
    print(f"Recommended Games for User {user_id_raw}:")
    display(recommended_games_info[['app_id', 'title', 'positive_ratio', 'price_final']])

In [None]:
# Replace with a user_id that actually exists in the dataset
# target_user_id = users['user_id'].iloc[0]
# recommend_games_for_user(model, target_user_id)