In [8]:
# 数据处理
import pandas as pd
import numpy as np
import json


# 自然语言处理
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 推荐系统构建
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# 数据可视化（可选）
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:

# 读取 CSV 文件
games_df = pd.read_csv('../data/games.csv')
users_df = pd.read_csv('../data/users.csv')
recommendations_df = pd.read_csv('../data/recommendations.csv')

# 读取 JSON 文件
games_metadata = []
with open('../data/games_metadata.json', 'r') as f:
    for line in f:
        games_metadata.append(json.loads(line))

# 转换 JSON 文件为 DataFrame
games_metadata_df = pd.json_normalize(games_metadata)


In [13]:
# 将 games_df 和 games_metadata_df 按照 app_id 合并
merged_games_df = pd.merge(games_df, games_metadata_df, on='app_id', how='inner')

# 查看合并后的数据结构
print(merged_games_df.head())


   app_id                              title date_release   win    mac  linux  \
0   13500  Prince of Persia: Warrior Within™   2008-11-21  True  False  False   
1   22364            BRINK: Agents of Change   2011-08-03  True  False  False   
2  113020       Monaco: What's Yours Is Mine   2013-04-24  True   True   True   
3  226560                 Escape Dead Island   2014-11-18  True  False  False   
4  249050            Dungeon of the ENDLESS™   2014-10-27  True   True  False   

          rating  positive_ratio  user_reviews  price_final  price_original  \
0  Very Positive              84          2199         9.99            9.99   
1       Positive              85            21         2.99            2.99   
2  Very Positive              92          3722        14.99           14.99   
3          Mixed              61           873        14.99           14.99   
4  Very Positive              88          8784        11.99           11.99   

   discount  steam_deck               

In [6]:
# 处理游戏描述列
# 假设 description 是游戏描述的列名
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# 如果有缺失值，我们将其填充为空字符串
games_df['description'] = games_df['description'].fillna('')

# 将游戏描述转化为TF-IDF矩阵
tfidf_matrix = tfidf_vectorizer.fit_transform(games_df['description'])

# 计算余弦相似性
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 打印相似性矩阵的形状
print(cosine_sim.shape)


KeyError: 'description'