### Install and import libraries


In [2]:
!pip install surprise



In [3]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader
from surprise import SVD
from surprise import KNNBasic
from surprise import accuracy
# NOTE: this import rebinds `train_test_split` to Surprise's splitter (it shadows
# the scikit-learn one imported above). The alias on the next line gives the
# Surprise splitter an unambiguous name.
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import train_test_split as surprise_train_test_split

In [4]:
import os
# Show the current working directory; the relative data paths used below resolve against it.
print(os.getcwd())

C:\Users\w1305\Downloads


### Set working directory

In [6]:
# NOTE(review): machine-specific absolute Windows path — this cell fails on any
# machine where the directory does not exist.
os.chdir(r"C:\Users\w1305\Downloads")
# Confirm the directory change took effect.
print(os.getcwd())

C:\Users\w1305\Downloads


### Step1: Load and preprocess game metadata


In [8]:
# Read the JSON Lines file: one JSON object per line.
json_file_path = "games_metadata.json"
metadata_list = []
skipped_lines = 0  # malformed lines are counted instead of being dropped silently

with open(json_file_path, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        try:
            metadata_list.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in {json_file_path}")

metadata_df = pd.DataFrame(metadata_list)

# Keep only the fields used downstream. `.copy()` makes this an independent
# frame so the column assignment below does not write into a slice.
metadata_df = metadata_df[["app_id", "tags"]].copy()

# Flatten each tag list into one space-separated string; anything that is not
# a list (e.g. NaN) becomes an empty string.
metadata_df["tags"] = metadata_df["tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")

### Step 2: Load game data and merge with metadata

In [10]:
# Both sides of the join need an integer `app_id` key.
metadata_df["app_id"] = metadata_df["app_id"].astype(int)

# Load the game table and left-join the tag strings onto it, so every game
# row is kept even when no metadata record matches.
steam_data = (
    pd.read_csv("steamgame.csv")
    .astype({"app_id": int})
    .merge(metadata_df, on="app_id", how="left")
)

# Games without a metadata match get an empty tag string rather than NaN.
steam_data["tags"] = steam_data["tags"].fillna("").astype(str)

### Step 3: Load user and recommendation data

In [12]:
# Load the user table and the per-review recommendation records from CSV files
# in the current working directory.
users_df = pd.read_csv("users.csv")
recommendations_df = pd.read_csv("recommendations.csv")

In [13]:
# Preview the first rows of the game table after the tag merge.
steam_data.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,tags
0,13500,Prince of Persia: Warrior Within™,11/21/2008,True,False,False,Very Positive,84,2199,9.99,9.99,0,True,Action Adventure Parkour Third Person Great So...
1,22364,BRINK: Agents of Change,8/3/2011,True,False,False,Positive,85,21,2.99,2.99,0,True,Action
2,113020,Monaco: What's Yours Is Mine,4/24/2013,True,True,True,Very Positive,92,3722,14.99,14.99,0,True,Co-op Stealth Indie Heist Local Co-Op Strategy...
3,226560,Escape Dead Island,11/18/2014,True,False,False,Mixed,61,873,14.99,14.99,0,True,Zombies Adventure Survival Action Third Person...
4,249050,Dungeon of the ENDLESS™,10/27/2014,True,True,False,Very Positive,88,8784,11.99,11.99,0,True,Roguelike Strategy Tower Defense Pixel Graphic...


In [14]:
# Preview the first rows of the recommendation records.
recommendations_df.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,51580,0
1,304390,4,0,2017-02-17,False,11.5,2586,1
2,1085660,2,0,2019-11-17,True,336.5,253880,2
3,703080,0,0,2022-09-23,True,27.4,259432,3
4,526870,0,0,2021-01-10,True,7.9,23869,4


In [15]:
# Summary statistics of the playtime column, which feeds the weighted score later on.
print(recommendations_df["hours"].describe())

count    4.115479e+07
mean     1.006022e+02
std      1.761675e+02
min      0.000000e+00
25%      7.800000e+00
50%      2.730000e+01
75%      9.920000e+01
max      1.000000e+03
Name: hours, dtype: float64


In [16]:
# Preview the first rows of the user table.
users_df.head()

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2


### Step 4: Merge recommendations with game and user data

In [18]:
# Attach game details to every recommendation record (left join on app_id,
# so records without a matching game are kept).
recommendations_games = recommendations_df.merge(steam_data, on="app_id", how="left")

# Attach user details to every record (left join on user_id).
recommendations_full = recommendations_games.merge(users_df, on="user_id", how="left")
# Prints pandas' truncated text view of the full merged frame.
print(recommendations_full)

           app_id  helpful  funny        date  is_recommended  hours  \
0          975370        0      0  2022-12-12            True   36.3   
1          304390        4      0  2017-02-17           False   11.5   
2         1085660        2      0  2019-11-17            True  336.5   
3          703080        0      0  2022-09-23            True   27.4   
4          526870        0      0  2021-01-10            True    7.9   
...           ...      ...    ...         ...             ...    ...   
41154789   633230        0      0  2021-02-15            True   41.0   
41154790   758870        8      0  2019-07-18           False    8.0   
41154791   696170        3     10  2018-03-26           False    2.0   
41154792   696170        0      0  2018-06-11            True    4.0   
41154793  1089980        2      0  2020-09-16            True   14.0   

           user_id  review_id                              title date_release  \
0            51580          0                     Dwar

In [19]:
# Sanity checks on the merged frame: size, missing join keys, and column dtypes.
print(recommendations_full.shape)
print(recommendations_full[['user_id', 'app_id']].isnull().sum())
print(recommendations_full.dtypes)

(41154794, 23)
user_id    0
app_id     0
dtype: int64
app_id              int64
helpful             int64
funny               int64
date               object
is_recommended       bool
hours             float64
user_id             int64
review_id           int64
title              object
date_release       object
win                  bool
mac                  bool
linux                bool
rating             object
positive_ratio      int64
user_reviews        int64
price_final       float64
price_original    float64
discount            int64
steam_deck           bool
tags               object
products            int64
reviews             int64
dtype: object


In [20]:
# List the distinct textual rating labels; these are the values that get
# mapped to numeric scores in Step 6.
unique_ratings = recommendations_full["rating"].unique()
print(unique_ratings)

['Overwhelmingly Positive' 'Mixed' 'Very Positive' 'Mostly Positive'
 'Positive' 'Mostly Negative' 'Overwhelmingly Negative' 'Negative'
 'Very Negative']


### Step 5: Filter the data for active users and popular games

Select the 5,000 users and the 5,000 games that appear most often in the recommendation records.
Also, drop records with a playtime (hours) of 2.5 or less to remove low-engagement entries.

In [22]:
# The 5,000 users with the most recommendation records.
top_users = recommendations_full["user_id"].value_counts().head(5000).index

# The 5,000 games with the most recommendation records.
top_games = recommendations_full["app_id"].value_counts().head(5000).index

# Keep records that involve a top user AND a top game AND more than 2.5 hours
# of playtime (records with 2.5 hours or less are dropped).
# `.copy()` makes `filtered_df` an independent frame: later cells add columns
# to it, and doing that on a slice of `recommendations_full` triggers
# SettingWithCopyWarning.
filtered_df = recommendations_full[
    (recommendations_full["user_id"].isin(top_users)) &
    (recommendations_full["app_id"].isin(top_games)) &
    (recommendations_full["hours"] > 2.5)
].copy()


print(filtered_df)

           app_id  helpful  funny        date  is_recommended  hours  \
422        435150       83      0  2017-12-15            True  199.9   
425        275850        0      0  2020-10-10            True   13.6   
691        105600        5      3  2014-09-28            True  911.8   
712       1794680        0      0  2022-02-19            True   77.3   
1180       686810        6      0  2020-07-20            True   67.4   
...           ...      ...    ...         ...             ...    ...   
41154313   203160        2      0  2014-01-29            True   10.0   
41154516   979640        3      0  2021-08-16           False   10.0   
41154723   897450        0      0  2020-11-12            True    6.0   
41154771   362960        0      0  2019-07-06            True   35.0   
41154784   633230        0      0  2021-06-21           False   20.0   

           user_id  review_id                                          title  \
422        6334645        422  Divinity: Original Sin 2

### Step 6: Map ratings to numeric scores and compute a weighted score

Define a mapping from textual ratings to numeric scores.
Compute a weighted score combining playtime ('hours') and rating score.
Fill any missing weighted_score values using a fallback based on hours.

In [None]:
# Textual review label -> ordinal score, symmetric around "Mixed" (0).
# On both sides the "Mostly ..." label is the weakest non-neutral level, so
# "Mostly Negative" (-1) ranks above "Negative" (-2), mirroring
# "Mostly Positive" (1) below "Positive" (2).
rating_map = {
    "Overwhelmingly Positive": 4,
    "Very Positive": 3,
    "Positive": 2,
    "Mostly Positive": 1,
    "Mixed": 0,
    "Mostly Negative": -1,
    "Negative": -2,
    "Very Negative": -3,
    "Overwhelmingly Negative": -4,
}

# Labels missing from the map (or NaN) become NaN here.
rating_score = filtered_df["rating"].map(rating_map)

# Blend playtime and rating: 30% hours + 70% rating score.
weighted_score = 0.3 * filtered_df["hours"] + 0.7 * rating_score

# Fallback when the rating score is missing: use hours alone.
# NOTE(review): the fallback weight is 0.7 while hours are weighted 0.3 in the
# main formula — confirm this asymmetry is intended.
weighted_score = weighted_score.fillna(filtered_df["hours"] * 0.7)

# `assign` returns a new frame, so nothing is written into a slice of
# `recommendations_full`.
filtered_df = filtered_df.assign(
    rating_score=rating_score,
    weighted_score=weighted_score,
)

### Step 7: Prepare data for collaborative filtering using the surprise library

Create a Surprise Reader specifying the rating scale.
Load the data from filtered_df (using 'user_id', 'app_id', and 'weighted_score') into a Surprise Dataset.
Split the data into training and test sets.

In [None]:
# The rating scale must cover the real range of the target. `weighted_score`
# can be negative (0.3 * hours + 0.7 * a negative rating score), so the lower
# bound is taken from the data instead of being fixed at 0; otherwise targets
# would fall outside the declared scale.
weighted_scores = filtered_df["weighted_score"]
reader = Reader(rating_scale=(weighted_scores.min(), weighted_scores.max()))

# Surprise expects exactly three columns, in the order: user, item, rating.
data = Dataset.load_from_df(filtered_df[["user_id", "app_id", "weighted_score"]], reader)

### Step 8: Train collaborative filtering models with surprise

Train an SVD model and compute its RMSE on the test set.
Then, configure and train a KNN model (using cosine similarity, item-based) and evaluate its RMSE.



In [None]:
# Use Surprise's splitter through its explicit alias: the bare name
# `train_test_split` is rebound to scikit-learn's function by a later cell,
# which would break this cell when it is re-run.
# Fixed seeds make the split and the SVD initialisation reproducible.
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# Score the held-out 20%; `accuracy.rmse` prints the RMSE and returns it.
predictions = svd_model.test(testset)
accuracy.rmse(predictions)

In [None]:
# Similarity configuration: cosine similarity, computed between items (games)
# rather than between users because "user_based" is False.
sim_options = {
    "name": "cosine",
    "user_based": False
}

# Fit the KNN model on the same training split as the SVD model.
knn_model = KNNBasic(sim_options=sim_options)
knn_model.fit(trainset)

# Score the held-out test split and report RMSE for the KNNBasic model.
knn_predictions = knn_model.test(testset)
accuracy.rmse(knn_predictions)

### Step 9: Define recommendation functions using collaborative filtering

Define three functions:
1. `recommend_games_svd`: Recommend games for a user using the SVD model.
2. `recommend_new_games_knn`: Recommend new games (that the user hasn't played) using KNN.
3. `recommend_similar_games_knn`: Recommend games similar to a given game (excluding the game itself) using KNN.

In [None]:
def recommend_games_svd(user_id, top_n=10):
    """Recommend unplayed games for a user, ranked by the SVD model's predicted score.

    Args:
        user_id: Raw user id as it appears in `filtered_df["user_id"]`.
        top_n: Maximum number of games to return.

    Returns:
        DataFrame with columns `app_id`, `title`, `tags`, `positive_ratio`,
        one row per game, ordered from highest to lowest predicted score.
    """
    # Games this user already has a record for (a set gives O(1) membership tests).
    played_games = set(filtered_df.loc[filtered_df["user_id"] == user_id, "app_id"])
    unplayed_games = [game for game in filtered_df["app_id"].unique() if game not in played_games]

    # Predict a score for every candidate and keep the top N.
    predictions = [svd_model.predict(user_id, game) for game in unplayed_games]
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    top_games = [pred.iid for pred in predictions[:top_n]]

    # Look up display details, then re-order rows to follow the ranking
    # (a plain `isin` filter would return them in frame order instead).
    recommended_df = (
        filtered_df.loc[
            filtered_df["app_id"].isin(top_games),
            ["app_id", "title", "tags", "positive_ratio"],
        ]
        .drop_duplicates(subset=["app_id"])
        .set_index("app_id")
        .loc[top_games]
        .reset_index()
    )

    return recommended_df

# Recommend games to one user with the SVD model
svd_recommendations = recommend_games_svd(6528047)

display(svd_recommendations)

In [None]:
# `knn_model` is item-based ("user_based": False), so its `get_neighbors`
# returns neighbouring *games*. Looking up similar *users* needs user-user
# similarities, so a user-based KNN model is fitted on the same training split.
knn_user_model = KNNBasic(sim_options={"name": "cosine", "user_based": True})
knn_user_model.fit(trainset)


def recommend_new_games_knn(user_id, top_n=10):
    """Recommend games the user has not played, taken from the most similar users.

    Args:
        user_id: Raw user id as it appears in `filtered_df["user_id"]`.
        top_n: Number of neighbouring users to consult and the maximum number
            of games to return.

    Returns:
        DataFrame with columns `app_id`, `title`, `tags`, `positive_ratio`
        (one row per game, sorted by `positive_ratio` descending), or the
        string "Error: ..." when the user is not part of the training set.
    """
    user_trainset = knn_user_model.trainset

    # Surprise raises ValueError for ids that are not in the training set;
    # that expected case is reported as a message, other errors propagate.
    try:
        inner_id = user_trainset.to_inner_uid(user_id)
    except ValueError as e:
        return f"Error: {e}"

    # Nearest users (inner ids), converted back to raw user ids.
    neighbors = knn_user_model.get_neighbors(inner_id, k=top_n)
    similar_users = [user_trainset.to_raw_uid(n) for n in neighbors]

    # Games the target user already has a record for.
    user_played_games = set(filtered_df.loc[filtered_df["user_id"] == user_id, "app_id"])

    # Games played by the similar users, minus the ones already played.
    similar_users_games = filtered_df.loc[
        filtered_df["user_id"].isin(similar_users),
        ["app_id", "title", "tags", "positive_ratio"],
    ]
    new_recommended_games = similar_users_games[~similar_users_games["app_id"].isin(user_played_games)]

    # One row per game, best-rated first.
    new_recommended_games = (
        new_recommended_games
        .drop_duplicates(subset=["app_id"])
        .sort_values(by="positive_ratio", ascending=False)
        .head(top_n)
    )

    return new_recommended_games

# Example: recommend games that the user with `user_id = 6528047` has not played
knn_new_recommendations = recommend_new_games_knn(6528047)
display(knn_new_recommendations)

In [None]:
def recommend_similar_games_knn(game_id, top_n=10):
    """Return up to `top_n` games that the KNN model rates as closest to `game_id`.

    The queried game is never part of the result. Rows are ordered by
    `positive_ratio` (highest first). On any failure the function returns the
    string "Error: ..." instead of a DataFrame.
    """
    try:
        fitted_trainset = knn_model.trainset

        # Ask for one neighbour more than needed so that dropping the queried
        # game below still leaves `top_n` candidates.
        neighbour_inner_ids = knn_model.get_neighbors(
            fitted_trainset.to_inner_iid(game_id), k=top_n + 1
        )

        # Translate inner ids back to raw app ids, skipping the queried game.
        candidate_ids = []
        for inner in neighbour_inner_ids:
            raw_id = fitted_trainset.to_raw_iid(inner)
            if raw_id != game_id:
                candidate_ids.append(raw_id)
        candidate_ids = candidate_ids[:top_n]

        # Pull the display columns for the candidates, one row per game.
        is_candidate = filtered_df["app_id"].isin(candidate_ids)
        game_info = filtered_df.loc[is_candidate, ["app_id", "title", "tags", "positive_ratio"]]
        game_info = game_info.drop_duplicates(subset=["app_id"])
        game_info = game_info.sort_values(by="positive_ratio", ascending=False)

        return game_info.head(top_n)

    except Exception as e:
        return f"Error: {e}"

# Example: games most similar to `app_id = 1245620` (excluding the game itself)
knn_recommendations = recommend_similar_games_knn(1245620)
display(knn_recommendations)


### Step 10: Prepare data for neural network recommender

Use scikit-learn to encode user and game IDs, split data into training and test sets, and extract rating data.
Also, apply log transformation and normalization on the rating (weighted_score) for numerical stability.

In [None]:
# Re-importing here rebinds `train_test_split` to scikit-learn's version
# (the import cell leaves the name bound to Surprise's splitter).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Drop rows that lack any of the three fields the model needs.
filtered_df = filtered_df.dropna(subset=["user_id", "app_id", "weighted_score"])

# Encode raw ids as contiguous integers (0..n-1) so they can index embedding
# tables. The encoders are fitted once, on the full filtered data, so train
# and test share the same id space.
user_encoder = LabelEncoder()
game_encoder = LabelEncoder()
filtered_df = filtered_df.assign(
    user_encoded=user_encoder.fit_transform(filtered_df["user_id"]),
    game_encoded=game_encoder.fit_transform(filtered_df["app_id"]),
)

# 80/20 split with a fixed seed. The explicit copies make `train` and `test`
# independent frames; both already carry the encoded id columns.
train, test = train_test_split(filtered_df, test_size=0.2, random_state=42)
train = train.copy()
test = test.copy()

# Model inputs: encoded user and game ids.
train_users = train["user_encoded"].to_numpy()
train_games = train["game_encoded"].to_numpy()
test_users = test["user_encoded"].to_numpy()
test_games = test["game_encoded"].to_numpy()

# Targets, with negative scores clipped to 0 so that log1p is well defined.
# `np.clip` returns new arrays, so the `weighted_score` columns of `train`
# and `test` are left untouched (an in-place edit of the column's underlying
# array would either change the frame or fail on a read-only view).
train_scores = np.clip(train["weighted_score"].to_numpy(), 0, None)
test_scores = np.clip(test["weighted_score"].to_numpy(), 0, None)

# Log-compress the heavy right tail of the scores; non-finite results are
# replaced by 0 as a guard.
train_scores_log = np.nan_to_num(np.log1p(train_scores), nan=0, posinf=0, neginf=0)
test_scores_log = np.nan_to_num(np.log1p(test_scores), nan=0, posinf=0, neginf=0)

# Standardize using statistics from the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
train_scores_scaled = scaler.fit_transform(train_scores_log.reshape(-1, 1)).flatten()
test_scores_scaled = scaler.transform(test_scores_log.reshape(-1, 1)).flatten()


### Step 11: Define and train the neural network model

Build a neural network model using Keras.
The model consists of two embedding layers (for user and game IDs), followed by dense layers and dropout for regularization.
The final layer outputs a continuous rating prediction.

In [None]:
# All layers come from the same `keras` namespace (`from tensorflow import keras`);
# mixing in layers imported from the standalone `keras` package can pair
# incompatible implementations.

# Size of each user / game embedding vector (typically 32-128).
embedding_dim = 64

# User branch: one integer id -> embedding vector.
user_input = keras.layers.Input(shape=(1,), name="user_input")
user_embedding = keras.layers.Embedding(
    input_dim=len(user_encoder.classes_), output_dim=embedding_dim, name="user_embedding"
)(user_input)
user_vec = keras.layers.Flatten()(user_embedding)

# Game branch: one integer id -> embedding vector.
# The layer name "game_embedding" is looked up later to read the learned weights.
game_input = keras.layers.Input(shape=(1,), name="game_input")
game_embedding = keras.layers.Embedding(
    input_dim=len(game_encoder.classes_), output_dim=embedding_dim, name="game_embedding"
)(game_input)
game_vec = keras.layers.Flatten()(game_embedding)

# Concatenate both vectors and regress the score with a small MLP.
merged = keras.layers.Concatenate()([user_vec, game_vec])
hidden = keras.layers.Dense(128, activation="relu")(merged)
hidden = keras.layers.Dropout(0.2)(hidden)
hidden = keras.layers.Dense(64, activation="relu")(hidden)
output = keras.layers.Dense(1, activation="linear")(hidden)  # continuous score prediction

# Build and compile: MSE loss, MAE reported as an extra metric.
model = keras.models.Model([user_input, game_input], output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Print the model summary
model.summary()


### Step 12: Train the neural network

Train the neural network using the training data.
We use a batch size of 512 over 11 epochs.

In [None]:
# Re-extract the model inputs and targets from the split frames.
# NOTE(review): the targets here are the raw `weighted_score` column, not the
# log-transformed / standardized arrays computed in Step 10 — confirm that
# training on the raw scale is intended.
train_users = train["user_encoded"].values
train_games = train["game_encoded"].values
train_scores = train["weighted_score"].values


test_users = test["user_encoded"].values
test_games = test["game_encoded"].values
test_scores = test["weighted_score"].values


# Train for 11 epochs with batches of 512; the test split doubles as the
# validation set reported after each epoch.
history = model.fit(
    [train_users, train_games], train_scores,
    validation_data=([test_users, test_games], test_scores),
    epochs=11, batch_size=512, verbose=1
)

In [None]:
def recommend_games_nn(user_id, top_n=10, exclude_played=True):
    """Recommend games for a user, ranked by the neural network's predicted score.

    Args:
        user_id: Raw user id; must be one the `user_encoder` was fitted on.
        top_n: Maximum number of games to return.
        exclude_played: When True (default), games the user already has a
            record for are left out, matching `recommend_games_svd`. Pass
            False to rank every game.

    Returns:
        DataFrame with columns `app_id`, `title`, `tags`, `positive_ratio`,
        one row per game, ordered from highest to lowest predicted score.

    Raises:
        ValueError: If `user_id` is unknown to `user_encoder`.
    """
    columns = ["app_id", "title", "tags", "positive_ratio"]
    user_id = int(user_id)
    user_idx = user_encoder.transform([user_id])[0]

    # Candidate games as encoded indices; `game_encoder.classes_[i]` is the raw
    # app id behind encoded index i.
    candidates = np.arange(len(game_encoder.classes_))
    if exclude_played:
        played_ids = filtered_df.loc[filtered_df["user_id"] == user_id, "app_id"].unique()
        candidates = candidates[~np.isin(game_encoder.classes_, played_ids)]
    if candidates.size == 0:
        return filtered_df.iloc[0:0][columns]

    # Score every candidate for this user in one batch.
    predicted_scores = model.predict([np.full(len(candidates), user_idx), candidates]).flatten()

    # Highest predicted scores first.
    ranked = candidates[np.argsort(predicted_scores)[::-1][:top_n]]
    top_games_ids = list(game_encoder.inverse_transform(ranked))

    # Look up display details, then re-order rows to follow the ranking
    # (a plain `isin` filter would return them in frame order instead).
    recommended_df = (
        filtered_df.loc[filtered_df["app_id"].isin(top_games_ids), columns]
        .drop_duplicates(subset=["app_id"])
        .set_index("app_id")
        .loc[top_games_ids]
        .reset_index()
    )

    return recommended_df

# Recommend games to one user with the neural network
nn_recommendations = recommend_games_nn(9850860)

display(nn_recommendations)

In [None]:
def recommend_similar_games_nn(app_id, top_n=10):
    """Find the games whose learned embeddings lie closest to a given game's.

    Args:
        app_id: Raw game id; must be one the `game_encoder` was fitted on.
        top_n: Maximum number of games to return.

    Returns:
        DataFrame with columns `app_id`, `title`, `tags`, `positive_ratio`,
        one row per game, ordered from nearest to farthest (L2 distance in
        embedding space). The queried game itself is never included.

    Raises:
        ValueError: If `app_id` is unknown to `game_encoder`.
    """
    columns = ["app_id", "title", "tags", "positive_ratio"]
    app_id = int(app_id)

    # Encoded index of the queried game.
    game_idx = game_encoder.transform([app_id])[0]

    # Learned game embedding matrix: one row per encoded game index.
    game_embedding_weights = model.get_layer("game_embedding").get_weights()[0]

    # L2 distance from the queried game's vector to every game vector.
    target_game_vector = game_embedding_weights[game_idx].reshape(1, -1)
    distances = np.linalg.norm(game_embedding_weights - target_game_vector, axis=1)

    # Nearest first. The queried game is removed by index rather than by
    # assuming it sorts to position 0 (ties at distance 0 could displace it).
    order = np.argsort(distances)
    top_games_idx = order[order != game_idx][:top_n]

    # Back to raw game ids.
    top_games_ids = list(game_encoder.inverse_transform(top_games_idx))

    # Look up display details, then re-order rows to follow the similarity
    # ranking (a plain `isin` filter would return them in frame order instead).
    recommended_df = (
        filtered_df.loc[filtered_df["app_id"].isin(top_games_ids), columns]
        .drop_duplicates(subset=["app_id"])
        .set_index("app_id")
        .loc[top_games_ids]
        .reset_index()
    )

    return recommended_df

# Find games similar to a given game
similar_games_nn = recommend_similar_games_nn(1245620)
display(similar_games_nn)


### Step 14: Evaluate the neural network model

Evaluate the neural network on the test set using Keras's evaluate() function.
We compute RMSE (Root Mean Squared Error) from the returned MSE.
Also, we compare with RMSE values from the SVD and KNN models (from the Surprise library).

In [None]:
# Evaluate the model on the test split.
# `evaluate` returns the loss followed by the compiled metrics; the model was
# compiled with loss="mse" and metrics=["mae"], so score[0] is the MSE.
score = model.evaluate([test_users, test_games], test_scores, batch_size=512, verbose=1)

# RMSE is the square root of the MSE loss.
rmse = np.sqrt(score[0])

print("Test RMSE:", rmse)


In [None]:
# NOTE(review): `mean_squared_error` is imported but not used in this cell.
from sklearn.metrics import mean_squared_error

# `accuracy.rmse` prints its own "RMSE: ..." line by default; verbose=False
# suppresses it so each value is reported once, with its label.
print("SVD RMSE:", accuracy.rmse(predictions, verbose=False))
print("KNN RMSE:", accuracy.rmse(knn_predictions, verbose=False))

### Step 15: Explore specific data queries

Perform sample queries on the data.
For example, filter games that have "Elden Ring" in their title, and list the games played by a specific user (sorted by hours played).

In [None]:
# Case-insensitive title search; rows with a missing title count as non-matches (na=False).
filtered_df[filtered_df["title"].str.contains("Elden Ring", case=False, na=False)]

In [None]:
# All filtered records for one user, reduced to the game id, title and playtime.
user_games = filtered_df[filtered_df["user_id"] == 9850860][["app_id", "title", "hours"]]
user_games = user_games.sort_values(by="hours", ascending=False)  # Most-played games first
display(user_games)
