In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Read the data
df = pd.read_csv('steam5.csv')

# Data Cleaning

# Parse 'genres', 'developer', 'publisher' fields
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['developer'] = df['developer'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['publisher'] = df['publisher'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Clean 'number_of_reviews_from_purchased_people' and 'number_of_english_reviews' to numeric
def clean_number(x):
    if isinstance(x, str):
        x = x.replace(',', '').replace('(', '').replace(')', '').strip()
        return int(x) if x.isdigit() else np.nan
    return np.nan

df['number_of_reviews_from_purchased_people'] = df['number_of_reviews_from_purchased_people'].apply(clean_number)
df['number_of_english_reviews'] = df['number_of_english_reviews'].apply(clean_number)

# Clean 'release_date' column
def clean_date(date_str):
    try:
        # Attempt parsing with different formats
        return pd.to_datetime(date_str, format='%d %b, %Y', errors='coerce') or \
               pd.to_datetime(date_str, format='%b %Y', errors='coerce') or \
               pd.to_datetime(date_str, errors='coerce')
    except Exception:
        return pd.NaT

df['release_date'] = df['release_date'].apply(clean_date)

# Check for invalid dates
invalid_dates = df[df['release_date'].isna()]
print(f"Rows with invalid dates: {len(invalid_dates)}")

# Map 'overall_player_rating' to numeric scores
rating_map = {
    'Overwhelmingly Positive': 7,
    'Very Positive': 6,
    'Positive': 5,
    'Mostly Positive': 5,
    'Mixed': 4,
    'Negative': 3,
    'Mostly Negative': 3,
    'Very Negative': 2,
    'Overwhelmingly Negative': 1
}
df['player_rating_score'] = df['overall_player_rating'].map(rating_map)
df['player_rating_score'] = df['player_rating_score'].fillna(df['player_rating_score'].median())

# Create 'combined_features' for each game
def combine_features(row):
    genres = ' '.join(row['genres'])
    developers = ' '.join(row['developer'])
    publishers = ' '.join(row['publisher'])
    description = row['short_description']
    return f"{genres} {developers} {publishers} {description}"

df['combined_features'] = df.apply(combine_features, axis=1)

# Create the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create reverse mapping of game titles to indices
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

# Function to get recommendations
def get_recommendations(name, cosine_sim=cosine_sim):
    if name not in indices:
        return f"Game '{name}' not found in dataset."
    idx = indices[name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Exclude the first one (itself), get top 10
    game_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[game_indices]

# Example usage
recommended_games = get_recommendations('Black Myth: Wukong')
print("Recommended games:")
print(recommended_games)

Rows with invalid dates: 2
Recommended games:
114                                   God of War
110                              Hogwarts Legacy
33                                    Diablo® IV
50                                         Hades
4                                     ELDEN RING
76                     STAR WARS Jedi: Survivor™
109                              DARK SOULS™ III
71                                     Lies of P
81                               Sonic Frontiers
48     Sekiro™: Shadows Die Twice - GOTY Edition
Name: name, dtype: object


In [33]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime
import ast
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv('steam5.csv')

# Step 1: Data Cleaning

# Parse genres and other list-like columns
def parse_list_column(column):
    return column.apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

df['genres'] = parse_list_column(df['genres'])

# Clean review count columns
def clean_review_count(value):
    if pd.isnull(value):
        return 0
    # Extract only numeric parts from the string
    numeric_part = re.findall(r'\d+', value.replace(',', ''))
    if numeric_part:
        return sum(map(int, numeric_part))
    return 0

df['number_of_reviews_from_purchased_people'] = df['number_of_reviews_from_purchased_people'].apply(clean_review_count)
df['number_of_english_reviews'] = df['number_of_english_reviews'].apply(clean_review_count)

# Parse release date
def parse_release_date(date_str):
    try:
        return datetime.strptime(date_str, '%d %b, %Y')
    except ValueError:
        try:
            return datetime.strptime(date_str, '%b %Y')
        except ValueError:
            return pd.NaT

df['release_date'] = df['release_date'].apply(parse_release_date)

# Drop rows with missing values for essential columns
df = df.dropna(subset=['release_date', 'overall_player_rating'])

# Step 2: Feature Engineering

# One-Hot Encode genres
genres_dummies = df['genres'].str.join('|').str.get_dummies()

# Label Encode overall_player_rating
rating_le = LabelEncoder()
df['player_rating_encoded'] = rating_le.fit_transform(df['overall_player_rating'])

# Combine descriptions for TF-IDF vectorization
df['combined_description'] = df['short_description'] + ' ' + df['long_description']

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_description'])

# Create DataFrame from TF-IDF matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine all features
features_df = pd.concat([
    df[['number_of_reviews_from_purchased_people', 'number_of_english_reviews', 'player_rating_encoded']],
    genres_dummies,
    tfidf_df
], axis=1)

# Replace NaN values with 0
features_df = features_df.fillna(0)

# Ensure all features are numeric
print(features_df.dtypes)

# Step 3: KNN Modeling

# Initialize and fit NearestNeighbors model
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(features_df)

# KNN Recommendation Function
def recommend_games(game_name, n_recommendations=5):
    try:
        game_index = df[df['name'] == game_name].index[0]
        game_features = features_df.iloc[game_index].values.reshape(1, -1)
        # Ensure the input for NearestNeighbors has valid feature names
        game_features = pd.DataFrame(game_features, columns=features_df.columns)
        distances, indices = nn_model.kneighbors(game_features, n_neighbors=n_recommendations+1)
        recommended_games = df.iloc[indices[0][1:]]['name']
        return recommended_games.tolist()
    except IndexError:
        return "Game not found in the dataset."

# Split data for KNN Regressor
X = features_df
y = df['player_rating_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit KNeighborsRegressor
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)

# Predict ratings for the test set
y_pred = knn_regressor.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Rating Prediction Function
def predict_rating(game_name):
    try:
        game_index = df[df['name'] == game_name].index[0]
        game_features = features_df.iloc[game_index].values.reshape(1, -1)
        # Ensure the input for KNeighborsRegressor has valid feature names
        game_features = pd.DataFrame(game_features, columns=features_df.columns)
        predicted_rating = knn_regressor.predict(game_features)
        rating_label = rating_le.inverse_transform([int(round(predicted_rating[0]))])[0]
        return rating_label
    except IndexError:
        return "Game not found in the dataset."

# Example Usage
# Recommend games
game_name = 'Black Myth: Wukong'
print(f"Games recommended for '{game_name}':")
print(recommend_games(game_name))

# Predict rating
print(f"Predicted rating for '{game_name}':")
print(predict_rating(game_name))


number_of_reviews_from_purchased_people      int64
number_of_english_reviews                    int64
player_rating_encoded                        int32
1980s                                        int64
1990's                                       int64
                                            ...   
you                                        float64
your                                       float64
yours                                      float64
yourself                                   float64
zombies                                    float64
Length: 853, dtype: object
Mean Squared Error: 3.54
Games recommended for 'Black Myth: Wukong':
['NARAKA: BLADEPOINT', 'Russian Fishing 4', 'Shawarma Legend', 'Downhill Pro Racer', 'PUBG: BATTLEGROUNDS']
Predicted rating for 'Black Myth: Wukong':
Overwhelmingly Positive


In [36]:
# 导入必要的库
import pandas as pd
import numpy as np
import ast
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity



# 数据清洗
def parse_list_column(column):
    """安全解析嵌套字符串字段"""
    def safe_parse(x):
        if pd.isnull(x):  # 处理缺失值
            return []
        try:
            return ast.literal_eval(x) if isinstance(x, str) else x
        except (ValueError, SyntaxError):
            return []
    return column.apply(safe_parse)

# 应用清洗函数到嵌套字段
df['genres'] = parse_list_column(df['genres'])
df['minimum_system_requirement'] = parse_list_column(df['minimum_system_requirement'])
df['recommend_system_requirement'] = parse_list_column(df['recommend_system_requirement'])
df['developer'] = parse_list_column(df['developer'])
df['publisher'] = parse_list_column(df['publisher'])

# 转换评价数量为数值
def parse_review_number(s):
    if pd.isnull(s):
        return 0
    s = ''.join(filter(str.isdigit, str(s)))  # 提取数字
    return int(s) if s else 0

df['number_of_reviews_from_purchased_people'] = df['number_of_reviews_from_purchased_people'].apply(parse_review_number)
df['number_of_english_reviews'] = df['number_of_english_reviews'].apply(parse_review_number)

# 格式化发布日期
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, '%d %b, %Y')
    except:
        return pd.NaT

df['release_date'] = df['release_date'].apply(parse_date)
df['release_date'] = df['release_date'].fillna(df['release_date'].median())  # 填充缺失值

# 特征工程
# 编码类别特征
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

# 编码玩家评价
rating_mapping = {
    'Overwhelmingly Positive': 5,
    'Very Positive': 4,
    'Positive': 3,
    'Mixed': 2,
    'Negative': 1,
    'Very Negative': 0
}
df['overall_player_rating'] = df['overall_player_rating'].map(rating_mapping).fillna(0)

# 文本特征提取
tfidf = TfidfVectorizer(max_features=1000)
description_tfidf = tfidf.fit_transform(df['short_description'].fillna(''))

# 构建特征矩阵
features = np.hstack([
    description_tfidf.toarray(),
    df[['number_of_reviews_from_purchased_people', 'number_of_english_reviews']].values,
    genres_encoded
])

# 标签
labels = df['overall_player_rating'].values

# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# 模型训练
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# 模型预测
y_pred = svm_classifier.predict(X_test)

# 结果评估
print("分类模型评估结果：")
print(classification_report(y_test, y_pred))

# 游戏推荐（基于内容的推荐）
def recommend_games(game_name, top_n=5):
    if game_name not in df['name'].values:
        print(f"游戏 {game_name} 不在数据集中。")
        return []
    idx = df.index[df['name'] == game_name][0]
    game_vector = features[idx].reshape(1, -1)
    similarities = cosine_similarity(game_vector, features)
    similar_indices = similarities[0].argsort()[-top_n-1:-1][::-1]
    recommended_games = df.iloc[similar_indices]['name'].values
    return recommended_games

# 示例：推荐与 "Black Myth: Wukong" 相似的游戏
recommended = recommend_games("Black Myth: Wukong")
print("推荐的游戏：", recommended)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()