In [2]:
import os

# Expose only GPU 0 to CUDA-based libraries, then echo the setting back.
os.environ.update(CUDA_VISIBLE_DEVICES="0")
print(os.getenv("CUDA_VISIBLE_DEVICES"))

0


In [3]:
import torch
# Report whether PyTorch can see a usable CUDA device
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

False


In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import json

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


### 1. Data preprocessing

In [7]:
# Load the raw tables. Paths are relative to the notebook's working directory
# (the files are expected under ../data).
users = pd.read_csv('../data/users.csv')
recommendations = pd.read_csv('../data/recommendations.csv')
games = pd.read_csv('../data/games.csv')

# games_metadata.json is parsed as JSON Lines (one JSON object per line).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding, and blank lines are skipped instead of crashing
# json.loads.
with open('../data/games_metadata.json', 'r', encoding='utf-8') as f:
    games_metadata_list = [json.loads(line) for line in f if line.strip()]
games_metadata = pd.json_normalize(games_metadata_list)

In [8]:
# Attach the metadata to every game (games without metadata are kept)
games_full = games.merge(games_metadata, how='left', on='app_id')

# Start from the reviews and attach first the game columns, then the user
# columns; left joins keep every review row.
data = (
    recommendations
    .merge(games_full, how='left', on='app_id')
    .merge(users, how='left', on='user_id')
)

In [10]:
# Per-column replacements for missing values: 0 for the counters,
# an empty string for the text fields and 'unknown' for the rating label.
missing_value_defaults = dict(
    hours=0, helpful=0, funny=0,
    tags='', description='', rating='unknown',
)
data.fillna(missing_value_defaults, inplace=True)

### 2. Feature Engineering

In [11]:
# Map raw user_id / app_id values to dense integer ids (0..n-1),
# stored in the '<column>_enc' columns.
user_encoder, game_encoder = LabelEncoder(), LabelEncoder()
for raw_col, encoder in (('user_id', user_encoder), ('app_id', game_encoder)):
    data[f'{raw_col}_enc'] = encoder.fit_transform(data[raw_col])

# Number of distinct users / games (one encoder class per distinct raw id)
num_users = len(user_encoder.classes_)
num_games = len(game_encoder.classes_)

In [12]:
def _tag_list(tags):
    """Return `tags` as a list; a non-list value (e.g. the '' filler) means "no tags"."""
    return list(tags) if isinstance(tags, (list, tuple, np.ndarray)) else []


# Tags are a per-game attribute (they arrive through the merge on app_id), so
# work on one row per game instead of once per review row.
first_row_of_game = ~data['app_id'].duplicated()
tags_by_game = (
    data.loc[first_row_of_game, ['app_id', 'tags']]
    .set_index('app_id')['tags']
    .map(_tag_list)
)

# Collect every distinct tag and fit the encoder on them
all_tags = sorted({tag for tags in tags_by_game for tag in tags})
tag_encoder = LabelEncoder()
tag_encoder.fit(all_tags)


def encode_tags(tags_list):
    """Encode a list of tag names as an array of integer tag ids.

    Non-list input (such as the '' used to fill missing tags) and empty lists
    yield an empty integer array instead of being passed to the encoder.
    """
    tags_list = _tag_list(tags_list)
    if not tags_list:
        return np.array([], dtype=np.int64)
    return tag_encoder.transform(tags_list)


# Encode each game once, then broadcast the result to its review rows
encoded_by_game = tags_by_game.map(encode_tags)
data['tags_enc'] = data['app_id'].map(encoded_by_game)
num_tags = len(tag_encoder.classes_)

In [13]:
# Platform-support flags, converted to integers so they can be fed to the model
system_cols = ['win', 'mac', 'linux', 'steam_deck']
data[system_cols] = data[system_cols].astype(int)

In [14]:
# Parse the review date and the game release date into datetime columns
for date_col in ('date', 'date_release'):
    data[date_col] = pd.to_datetime(data[date_col])

In [15]:
review_dates = data['date']

# Age of each review in days, measured against the most recent review in the
# dataset (not against today's date)
current_date = review_dates.max()
data['days_since_review'] = (current_date - review_dates).dt.days

# Calendar features of the review date: month (1-12) and weekday (Monday=0)
data['review_month'] = review_dates.dt.month
data['review_day_of_week'] = review_dates.dt.dayofweek

#### Temp stop

In [17]:
# Checkpoint the merged, feature-engineered frame to disk.
# NOTE: CSV stores no dtypes, so datetime and array-valued columns come back
# as plain strings when this file is read again.
data.to_csv(path_or_buf='../data/data_temp.csv', index=False)

In [None]:
# Reload the checkpoint written by data.to_csv above.
# CSV stores no dtypes: the date columns have to be parsed again, and empty
# text fields are read back as NaN, so they are reset to '' (the same filler
# used during preprocessing) to keep the text features working.
data = pd.read_csv('../data/data_temp.csv', parse_dates=['date', 'date_release'])
data = data.fillna({'tags': '', 'description': ''})

#### Continue

In [16]:
# TF-IDF features of the game descriptions.
# A description belongs to a game, not to a review, so the vectorizer is fitted
# on one row per game. Fitting and densifying once per review row repeats every
# description for each of its reviews and is what exhausted memory here.
first_row_of_game = ~data['app_id'].duplicated()
game_descriptions = data.loc[first_row_of_game, ['app_id', 'description']]
# NaN can appear if `data` was reloaded from CSV; treat it as an empty text
game_text = game_descriptions['description'].fillna('').astype(str)

tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(game_text)
tfidf_terms = list(tfidf_vectorizer.get_feature_names_out())

# A term that equals an existing column name would silently corrupt the
# feature table, so fail loudly instead (this also catches a cell re-run).
clashing_terms = sorted(set(tfidf_terms) & set(data.columns))
if clashing_terms:
    raise ValueError(
        f'TF-IDF term columns already exist in `data` (name clash or cell re-run): {clashing_terms}'
    )

# Small per-game table; float32 halves the footprint once it is broadcast
tfidf_df = pd.DataFrame(tfidf_matrix.toarray().astype(np.float32), columns=tfidf_terms)
tfidf_df.insert(0, 'app_id', game_descriptions['app_id'].to_numpy())

# Attach the per-game features to every review row. A left merge keeps the row
# order and yields a fresh RangeIndex.
# NOTE(review): this still adds 100 float32 columns per review row; if that
# does not fit in RAM, keep `tfidf_df` separate and look features up by app_id.
data = data.merge(tfidf_df, on='app_id', how='left', validate='many_to_one')

MemoryError: Unable to allocate 3.68 GiB for an array with shape (12, 41154794) and data type int64

In [None]:
# Columns treated as continuous inputs
numeric_features = [
    'hours', 'helpful', 'funny', 'products', 'reviews',
    'positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount',
    'days_since_review', 'review_month', 'review_day_of_week'
]

# Standardise each numeric column to zero mean / unit variance, overwriting the
# raw values in place.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
scaler = StandardScaler()
scaler.fit(data[numeric_features])
data[numeric_features] = scaler.transform(data[numeric_features])

In [None]:
# Turn the textual rating label into an integer id
rating_encoder = LabelEncoder()
data['rating_enc'] = rating_encoder.fit_transform(data['rating'])

# Model input columns. The three id columns come first, followed by the
# platform flags, the scaled numeric columns and the TF-IDF term columns.
feature_columns = [
    'user_id_enc', 'app_id_enc', 'rating_enc',
    *system_cols,
    *numeric_features,
    *tfidf_vectorizer.get_feature_names_out(),
]

# Feature matrix and binary target
X = data[feature_columns]
y = data['is_recommended'].astype(int)

### 3. Model training

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The id columns travel inside the float32 feature tensor and are cast back to
# integers by the model. float32 represents integers exactly only up to 2**24,
# so larger ids would silently map to the wrong embedding rows.
assert max(num_users, num_games) <= 2 ** 24, (
    'encoded ids exceed the range float32 can represent exactly'
)


def _to_float_tensor(values):
    """Convert a DataFrame/Series to a float32 tensor on `device`.

    The conversion goes straight to float32: `.values` on the mixed int/float
    frame would first build a float64 copy of the whole matrix.
    """
    return torch.from_numpy(values.to_numpy(dtype=np.float32)).to(device)


# Convert to tensors and move them to the selected device
X_train_tensor = _to_float_tensor(X_train)
y_train_tensor = _to_float_tensor(y_train)

X_test_tensor = _to_float_tensor(X_test)
y_test_tensor = _to_float_tensor(y_test)

# Datasets and loaders; only the training data is shuffled
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [None]:
class GameRecommendationModel(nn.Module):
    """Embedding + MLP binary classifier.

    The input is a single float tensor of shape (batch, input_dim) whose first
    three columns hold the user id, game id and rating id (cast to integers
    inside `forward`); the remaining `input_dim - 3` columns are dense features.

    Args:
        num_users: number of rows in the user embedding table.
        num_games: number of rows in the game embedding table.
        num_ratings: number of rows in the rating embedding table.
        input_dim: total number of input columns, including the 3 id columns.
        embedding_dim: size of each embedding vector (default 64).
    """

    def __init__(self, num_users, num_games, num_ratings, input_dim, embedding_dim=64):
        super().__init__()

        # Embedding tables for the three id columns
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.game_embedding = nn.Embedding(num_games, embedding_dim)
        self.rating_embedding = nn.Embedding(num_ratings, embedding_dim)

        # Fully connected head: the 3 id columns are replaced by 3 embeddings
        self.fc1 = nn.Linear(input_dim - 3 + embedding_dim * 3, 256)
        self.fc2 = nn.Linear(256, 64)
        self.output = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a probability in [0, 1] for every row of `x`, shape (batch,)."""
        # Ids arrive as floats inside the feature tensor; cast them for lookup
        user_id = x[:, 0].long()
        game_id = x[:, 1].long()
        rating_id = x[:, 2].long()

        user_emb = self.user_embedding(user_id)
        game_emb = self.game_embedding(game_id)
        rating_emb = self.rating_embedding(rating_id)

        # Everything after the id columns is used as-is
        other_features = x[:, 3:]

        x = torch.cat([user_emb, game_emb, rating_emb, other_features], dim=1)

        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.output(x)
        x = self.sigmoid(x)
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis for a batch of one and return a 0-d tensor that no
        # longer matches a (1,)-shaped target.
        return x.squeeze(-1)

In [None]:
# Width of the feature matrix and number of distinct encoded ratings
input_dim = len(X_train.columns)
num_ratings = len(rating_encoder.classes_)

# Build the network, then move its parameters to the selected device
model = GameRecommendationModel(num_users, num_games, num_ratings, input_dim)
model = model.to(device)

In [None]:
# Binary cross-entropy on probabilities (the model already ends in a sigmoid)
criterion = nn.BCELoss()
# Adam over all model parameters with a learning rate of 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
num_epochs = 5

for epoch_no in range(1, num_epochs + 1):
    model.train()
    running_loss = 0
    for features, labels in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        # Weight the batch mean by the batch size so the epoch average is per sample
        running_loss += batch_loss.item() * len(features)
    avg_loss = running_loss / len(train_dataset)
    print(f'Epoch [{epoch_no}/{num_epochs}], Loss: {avg_loss:.4f}')

### 4. Model Evaluation

In [None]:
# Accuracy on the held-out set, thresholding the predicted probability at 0.5
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for features, labels in test_loader:
        predicted = (model(features) > 0.5).float()
        correct += int((predicted == labels).sum())
        total += len(labels)
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

### 5. Recommendation function

In [None]:
def recommend_games(model, user_id_raw, top_k=10):
    """Print and display the `top_k` highest-scoring games the user has no review for.

    Every game without a review by this user is scored by `model`. Each
    candidate row combines the game's own feature values with the user's id
    and the user/review-level values from the user's first row in `data`, so
    only the game-dependent inputs vary between candidates.

    Args:
        model: trained model taking a (batch, len(feature_columns)) float tensor.
        user_id_raw: raw user id as it appears in data['user_id'].
        top_k: maximum number of games to show (fewer if fewer candidates exist).

    Returns:
        None. The result is printed and shown with `display`.

    Raises:
        ValueError: from `user_encoder.transform` when the user id was not seen
            when the encoder was fitted.
    """
    model.eval()
    user_id = user_encoder.transform([user_id_raw])[0]

    # All review rows of this user; the first one supplies the user-side values
    user_rows = data[data['user_id_enc'] == user_id]
    user_row = user_rows.iloc[0]
    played_games = list(user_rows['app_id_enc'].unique())

    # One representative row per game, found in a single pass over `data`
    # (a per-candidate boolean scan of the whole frame does not scale).
    game_rows = data.loc[~data['app_id_enc'].duplicated(), feature_columns]
    candidates = game_rows[~game_rows['app_id_enc'].isin(played_games)].copy()
    if candidates.empty:
        print(f"No unplayed games available for User {user_id_raw}.")
        return

    # Columns describing the user or a single review rather than the game.
    # They must come from the target user, not from whichever user wrote the
    # game's representative row.
    # NOTE(review): this attribution is inferred from the column names; confirm
    # against the schemas of users.csv / recommendations.csv.
    user_context_cols = {
        'user_id_enc', 'hours', 'helpful', 'funny', 'products', 'reviews',
        'days_since_review', 'review_month', 'review_day_of_week',
    }
    for col in feature_columns:
        if col in user_context_cols:
            candidates[col] = user_row[col]

    # Explicit float32 conversion: row/frame values of mixed dtype are not
    # directly convertible to a tensor.
    candidate_tensor = torch.tensor(
        candidates[feature_columns].to_numpy(dtype=np.float32)
    ).to(device)

    # Score all candidates and keep the best ones (never ask for more than exist)
    with torch.no_grad():
        scores = model(candidate_tensor).reshape(-1)
        k = max(0, min(top_k, scores.numel()))
        top_k_indices = torch.topk(scores, k).indices.cpu().numpy()

    # Map the winning positions back to encoded ids, then to raw app ids
    recommended_game_ids = candidates['app_id_enc'].to_numpy()[top_k_indices]
    recommended_app_ids = game_encoder.inverse_transform(recommended_game_ids)

    # Look the games up and restore the ranking order (best score first)
    rank_by_app = {app_id: rank for rank, app_id in enumerate(recommended_app_ids)}
    recommended_games = games_full[games_full['app_id'].isin(list(rank_by_app))]
    recommended_games = recommended_games.sort_values(
        'app_id', key=lambda ids: ids.map(rank_by_app), kind='stable'
    )

    # Show the result
    print(f"Recommended Games for User {user_id_raw}:")
    display(recommended_games[['app_id', 'title', 'positive_ratio', 'price_final']])

In [None]:
# Example: recommend games for a specific user
# target_user_id = users['user_id'].iloc[0]
# recommend_games(model, target_user_id)