In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout, LSTM, GRU
from tensorflow.keras.optimizers import Adam


In [2]:
# Load games.csv
games_df = pd.read_csv('../data/games.csv')

# Load games_metadata.json, which is read as JSON Lines (one object per line).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale, and blank lines are skipped because
# json.loads('') raises.
with open('../data/games_metadata.json', 'r', encoding='utf-8') as f:
    games_metadata_list = [json.loads(line) for line in f if line.strip()]
metadata_df = pd.json_normalize(games_metadata_list)

# Load recommendations.csv
recommendations_df = pd.read_csv('../data/recommendations.csv')

# Load users.csv
users_df = pd.read_csv('../data/users.csv')


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

### 删除无关数据

In [3]:
# Keep only interactions with at least two hours of play time.
played_enough = recommendations_df['hours'] >= 2
recommendations_df = recommendations_df.loc[played_enough]

# Count, per user, the distinct games and the distinct reviews.
per_user = recommendations_df.groupby('user_id')
user_game_counts = per_user['app_id'].nunique()
user_review_counts = per_user['review_id'].nunique()

# Users that have at least one game and at least one review.
has_games = user_game_counts > 0
has_reviews = user_review_counts > 0
valid_users = user_game_counts.loc[has_games & has_reviews].index

# Restrict both tables to those users.
recommendations_df = recommendations_df.loc[recommendations_df['user_id'].isin(valid_users)]
users_df = users_df.loc[users_df['user_id'].isin(valid_users)]


### game特征处理

In [4]:
# Columns of games.csv that are kept for feature engineering.
games_features = ['app_id', 'title', 'date_release', 'win', 'mac', 'linux',
                  'rating', 'positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount', 'steam_deck']

# Take an explicit copy: later cells add and overwrite columns on games_df,
# and doing that on a bare column selection triggers SettingWithCopyWarning.
games_df = games_df[games_features].copy()


In [5]:
# Parse the release date (unparseable values become NaT), split it into
# year / month / day columns, then discard the raw date column.
games_df = (
    games_df
    .assign(date_release=lambda frame: pd.to_datetime(frame['date_release'], errors='coerce'))
    .assign(
        release_year=lambda frame: frame['date_release'].dt.year,
        release_month=lambda frame: frame['date_release'].dt.month,
        release_day=lambda frame: frame['date_release'].dt.day,
    )
    .drop(columns='date_release')
)


In [6]:
# Encode the textual 'rating' column as integer labels and replace the
# original column with the encoded one.
rating_encoder = LabelEncoder()
rating_codes = rating_encoder.fit_transform(games_df['rating'])
games_df = games_df.assign(rating_encoded=rating_codes).drop(columns='rating')


In [7]:
# Platform-support flags are cast to integers.
bool_features = ['win', 'mac', 'linux', 'steam_deck']
games_df = games_df.astype({flag: int for flag in bool_features})


In [8]:
# Min-max scale the numeric game columns.
# MinMaxScaler is already imported in the first cell.
num_features = ['positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount']
scaler = MinMaxScaler()
scaled_game_values = scaler.fit_transform(games_df[num_features])
games_df[num_features] = scaled_game_values


### games metadata 处理

In [9]:
# Keep only the join key and the tag list. The explicit copy lets later cells
# overwrite the 'tags' column without triggering SettingWithCopyWarning.
metadata_df = metadata_df[['app_id', 'tags']].copy()


In [10]:
# Make sure every 'tags' entry is a list; any value that is not already a list
# is replaced by an empty list.
metadata_df['tags'] = [
    tag_value if isinstance(tag_value, list) else []
    for tag_value in metadata_df['tags']
]


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the tag lists: one 0/1 column per distinct tag.
mlb = MultiLabelBinarizer()
tags_encoded = mlb.fit_transform(metadata_df['tags'])

# Reuse metadata_df's index so the axis=1 concat below pairs each row of tag
# indicators with the app_id it came from; with a default RangeIndex the rows
# would be misaligned whenever metadata_df's index is not 0..n-1.
tags_df = pd.DataFrame(tags_encoded, columns=mlb.classes_, index=metadata_df.index)
metadata_df = pd.concat([metadata_df['app_id'], tags_df], axis=1)


### recommendations 处理

In [12]:
# Keep only interactions with at least two hours of play time.
recommendations_df = recommendations_df.loc[recommendations_df['hours'].ge(2)]


In [13]:
# Parse the review date (unparseable values become NaT), derive calendar
# parts from it, then discard the raw date column.
recommendations_df = (
    recommendations_df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
        weekday=lambda frame: frame['date'].dt.weekday,
    )
    .drop(columns='date')
)


In [14]:
# Columns of the recommendations table that are carried into the merged data set.
recommendations_features = [
    'user_id', 'app_id', 'is_recommended', 'hours',
    'year', 'month', 'day', 'weekday',
]
recommendations_df = recommendations_df.loc[:, recommendations_features]


### user 处理

In [15]:
# Count, per user, the distinct games and the number of recommendation rows.
interactions_by_user = recommendations_df.groupby('user_id')
user_game_counts = interactions_by_user['app_id'].nunique()
user_review_counts = interactions_by_user['is_recommended'].count()

# Users that have at least one game and at least one review.
has_games = user_game_counts > 0
has_reviews = user_review_counts > 0
valid_users = user_game_counts.loc[has_games & has_reviews].index

# Restrict both tables to those users.
users_df = users_df.loc[users_df['user_id'].isin(valid_users)]
recommendations_df = recommendations_df.loc[recommendations_df['user_id'].isin(valid_users)]


In [16]:
# Columns of the users table that are carried into the merged data set.
users_features = ['user_id', 'products', 'reviews']

# Explicit copy: the next cell overwrites 'products' and 'reviews', and doing
# that on a bare column selection triggers SettingWithCopyWarning.
users_df = users_df[users_features].copy()


In [17]:
# Min-max scale the per-user counters with a dedicated scaler, so the scaler
# fitted on the game columns keeps its fitted parameters instead of being
# refitted here.
user_numeric_columns = ['products', 'reviews']
user_scaler = MinMaxScaler()
# Work on an independent copy so the assignment never targets a filtered slice.
users_df = users_df.copy()
users_df[user_numeric_columns] = user_scaler.fit_transform(users_df[user_numeric_columns])

### 数据合并

In [18]:
# Attach the tag indicator columns to every game; games without metadata are kept.
games_full_df = games_df.merge(metadata_df, how='left', on='app_id')


In [None]:
# Start from the interactions and left-join the user columns, then the game columns.
data_df = (
    recommendations_df
    .merge(users_df, how='left', on='user_id')
    .merge(games_full_df, how='left', on='app_id')
)


In [None]:
# Report the shape of the merged frame and show its first rows.
merged_shape = data_df.shape
print("合并后的数据形状：", merged_shape)
merged_preview = data_df.head()
print(merged_preview)


### Model

In [None]:
# Numeric model inputs. Only columns that actually exist in data_df are
# listed: 'helpful' and 'funny' are not kept when the recommendations table is
# reduced to its feature columns, and no 'release_days' column is ever
# created -- the release date is available as release_year / release_month /
# release_day instead.
numerical_features = ['hours', 'positive_ratio', 'user_reviews', 'price_final', 'price_original',
                      'discount', 'release_year', 'release_month', 'release_day', 'products', 'reviews']
# Identifier, calendar, platform-flag and encoded-rating inputs.
categorical_features = ['user_id', 'app_id', 'year', 'month', 'day', 'weekday', 'win', 'mac', 'linux', 'steam_deck', 'rating_encoded']
# One indicator column per tag seen by the MultiLabelBinarizer.
tag_features = mlb.classes_.tolist()

# Fail early with a readable message if a listed column is absent.
missing_columns = [name for name in numerical_features + categorical_features
                   if name not in data_df.columns]
assert not missing_columns, f"missing expected columns: {missing_columns}"

In [None]:
# Full ordered list of model input columns.
features = numerical_features + categorical_features + tag_features

# Copy the selection so the scaling and label-encoding cells that follow
# modify X itself rather than a slice of data_df (avoids
# SettingWithCopyWarning and keeps data_df untouched).
X = data_df[features].copy()

# Binary target: whether the user recommended the game.
y = data_df['is_recommended'].astype(int)


In [None]:
# Fit a fresh min-max scaler on the numeric columns of X and write the scaled
# values back into those columns.
scaler = MinMaxScaler()
scaled_numeric = scaler.fit_transform(X[numerical_features])
X[numerical_features] = scaled_numeric


In [None]:
# Label-encode the categorical columns that are stored as Python objects
# (e.g. strings); columns with any other dtype are left as they are.
object_columns = [name for name in categorical_features if X[name].dtype == 'object']
for name in object_columns:
    encoder = LabelEncoder()
    X[name] = encoder.fit_transform(X[name].astype(str))

# Switch from pandas containers to plain NumPy arrays.
X = X.to_numpy()
y = y.to_numpy()



In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Treat each user's interactions as a time series.
# Order the rows by user and then chronologically. The raw 'date' column is
# dropped during preprocessing, so the derived year / month / day columns are
# used as the time key; a stable sort keeps the existing order of rows that
# share the same day.
data_df = data_df.sort_values(by=['user_id', 'year', 'month', 'day'], kind='stable')

# One group of rows per user.
grouped = data_df.groupby('user_id')

# Number of consecutive interactions per training sequence (tunable).
sequence_length = 5

X_sequences = []
y_sequences = []

for user_id, group in grouped:
    n_rows = len(group)
    if n_rows < sequence_length:
        # Not enough history to form even one window.
        continue

    # Convert to NumPy once per user instead of re-slicing the DataFrame for
    # every window.
    feature_matrix = group[features].values
    labels = group['is_recommended'].values

    # Sliding window: each window's label is the recommendation flag of its
    # last interaction.
    for start in range(n_rows - sequence_length + 1):
        stop = start + sequence_length
        X_sequences.append(feature_matrix[start:stop])
        y_sequences.append(labels[stop - 1])

# NOTE(review): the windows are built from data_df, not from the scaled /
# encoded X above, and left-merged rows may contain NaN -- confirm this is
# intended before training.
X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking

# Number of input features per timestep, taken from the prepared sequences.
n_features = X_sequences.shape[2]

# Two stacked LSTM layers with dropout, followed by a small dense head and a
# single sigmoid unit for the binary target. The Masking layer is configured
# with mask_value=0.
model = Sequential([
    Masking(mask_value=0., input_shape=(sequence_length, n_features)),
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary.
model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Both callbacks monitor validation loss: one stops training after 5 epochs
# without improvement, the other saves only the best model to disk.
early_stopping = EarlyStopping(patience=5, monitor='val_loss')
model_checkpoint = ModelCheckpoint('best_lstm_model.h5', monitor='val_loss', save_best_only=True)
training_callbacks = [early_stopping, model_checkpoint]

# Train with 20% of the sequences used for validation.
history = model.fit(
    X_sequences,
    y_sequences,
    batch_size=256,
    epochs=50,
    validation_split=0.2,
    callbacks=training_callbacks,
)


In [None]:
# One figure per metric, each showing the training curve and the validation
# curve: first loss, then accuracy. matplotlib.pyplot is already imported as
# plt in the first cell.
metric_curves = (
    ('loss', '训练损失', 'val_loss', '验证损失'),
    ('accuracy', '训练准确率', 'val_accuracy', '验证准确率'),
)
for train_key, train_label, val_key, val_label in metric_curves:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.show()


In [None]:
from tensorflow.keras.models import load_model

# Reload the checkpoint file written during training.
best_model_path = 'best_lstm_model.h5'
best_model = load_model(best_model_path)
