In [None]:
import pandas as pd
import numpy as np
import ast
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error




# MODEL 1 - Price-Based Filtering

In [None]:
# Load the raw inputs for the price model; the trailing comments list the columns each file is expected to hold.
purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)
prices = pd.read_csv("./clean_datasets/prices.csv")  # (gameid, date_acquired, price_usd)

In [None]:
# Build one row of price statistics per player from the games in their library.
# Only needs to run once: the next cell saves the result and later runs can just
# read the CSV back.

# How many trailing library entries are held out as the prediction label.
TARGET_WINDOW = 5

# gameid -> price, built once instead of scanning `prices` for every single game.
# The first row per gameid wins, matching the previous `.values[0]` lookup.
price_by_game = (
    prices.drop_duplicates(subset="gameid", keep="first")
    .set_index("gameid")["price_usd"]
    .to_dict()
)

player_data = []
for playerid, library in zip(purchased_games["playerid"], purchased_games["library"]):
    games = ast.literal_eval(library)

    # Keep library order; skip games with no known (or a missing) price so a
    # single NaN cannot turn every statistic of the player into NaN.
    prices_list = [
        price_by_game[game]
        for game in games
        if game in price_by_game and not pd.isna(price_by_game[game])
    ]

    # Split into history (features) and the trailing entries (label).
    # Previously the label was computed from the same entries as `last_5_avg`,
    # so the label was always equal to one of the features (label leakage).
    # NOTE(review): "trailing" assumes the library is stored in purchase order — confirm.
    holdout = min(TARGET_WINDOW, len(prices_list) // 2)
    if holdout == 0:
        continue  # fewer than two priced games: no history/label split is possible
    history = prices_list[:-holdout]
    recent = prices_list[-holdout:]

    player_data.append({
        "playerid": playerid,
        "mean_price": np.mean(history),
        "median_price": np.median(history),
        "min_price": np.min(history),
        "max_price": np.max(history),
        "last_5_avg": np.mean(history[-TARGET_WINDOW:]),
        "target_price": np.mean(recent),  # prediction label
    })

# Preview a few rows instead of dumping the whole list into the cell output.
player_data[:5]



In [None]:
df1 = pd.DataFrame(player_data)
# Write the file once; on later runs comment this out and just read the file instead.
df1.to_csv("./data_model/player_data_prices.csv", index=False)
# Alternatively, read the saved file (see the next cell).

In [None]:
# Read the saved per-player statistics instead of recomputing them.
df= pd.read_csv("./data_model/player_data_prices.csv")

In [None]:
# Prepare the training data: the label is `target_price`, and every column other
# than the player id and the label is used as a feature.
y = df["target_price"]
X = df.drop(columns=["playerid", "target_price"])

# Keep 20% of the rows aside for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the XGBoost regressor.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Suggest games whose price is closest to the price predicted for a known player
def suggest_game_for_player(playerid, top_n=10, *, player_features=None,
                            price_model=None, price_catalog=None):
    """Return the `top_n` catalog rows priced closest to the player's predicted price.

    Parameters
    ----------
    playerid : value compared against the "playerid" column of `player_features`.
    top_n : number of catalog rows to return (default 10).
    player_features : DataFrame of per-player price statistics; defaults to the
        notebook-level `df`.
    price_model : fitted regressor exposing `predict`; defaults to the
        notebook-level `model`.
    price_catalog : DataFrame with a "price_usd" column; defaults to the
        notebook-level `prices`.

    Returns
    -------
    DataFrame of catalog rows ordered from closest to farthest price, or a
    message string when the player has no row in `player_features`.
    """
    # The defaults are resolved at call time. Later cells rebind `df` and `model`
    # to Spark/ALS objects, so pass these explicitly when calling after them.
    if player_features is None:
        player_features = df
    if price_model is None:
        price_model = model
    if price_catalog is None:
        price_catalog = prices

    player_info = player_features[player_features["playerid"] == playerid]
    if player_info.empty:
        return "Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu ng∆∞·ªùi ch∆°i n√†y."

    # Use the columns the model was fitted on when it records them, so the
    # feature order can never drift from the training frame.
    feature_columns = getattr(price_model, "feature_names_in_", None)
    if feature_columns is None:
        feature_columns = ["mean_price", "median_price", "min_price", "max_price", "last_5_avg"]
    features = player_info.iloc[[0]][list(feature_columns)]

    predicted_price = price_model.predict(features)[0]

    # Rank catalog rows by absolute price distance; rows with a missing price
    # are dropped instead of being picked up through NaN sort positions.
    distance = (price_catalog["price_usd"] - predicted_price).abs().to_numpy(dtype=float)
    order = np.argsort(distance, kind="stable")
    order = order[~np.isnan(distance[order])][:top_n]

    return price_catalog.iloc[order]



In [None]:
# Run the price-based suggestion for one player id and print the result.
test_player = 76561198060698936
print(suggest_game_for_player(test_player))

# MODEL 2 - Content-Based Filtering

In [None]:
import pandas as pd
import re
import ast 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from sklearn.decomposition import TruncatedSVD


In [None]:
# Run the "popular games" notebook/file first so that the top-10%-most-popular games file exists.
# NOTE(review): later cells also read 'developers', 'publishers' and 'supported_languages'
# from top_10_percent_game, which the column list in the trailing comment does not mention — confirm.

purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # playerid, library
top_10_percent_game = pd.read_csv("./data_model/top_10_percent_games.csv")  # gameid, title, genres, description
achievements = pd.read_csv("./clean_datasets/achievements.csv")  # achievementid, gameid, title, description
history = pd.read_csv("./clean_datasets/history.csv")  # playerid, achievementid, date_acquired

In [None]:
def preprocess_text(text):
    """Normalise a text field: lower-case, drop punctuation, collapse whitespace.

    Missing values (None or a float NaN, as pandas yields for empty CSV cells)
    become an empty string instead of raising; other non-string values are
    converted with str() first.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Remove punctuation first so that "a - b" does not leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    return text

# Normalise each metadata column into a matching 'processed_*' column.
source_to_processed = (
    ('title', 'processed_title'),
    ('genres', 'processed_genres'),
    ('developers', 'processed_developers'),
    ('publishers', 'processed_publishers'),
    ('supported_languages', 'processed_languages'),
)
for source_column, processed_column in source_to_processed:
    top_10_percent_game[processed_column] = top_10_percent_game[source_column].apply(preprocess_text)

# Build one space-separated text per game to feed the vectorizer.
combined_text = top_10_percent_game[source_to_processed[0][1]]
for _, processed_column in source_to_processed[1:]:
    combined_text = combined_text + ' ' + top_10_percent_game[processed_column]
top_10_percent_game['combined_features'] = combined_text
top_10_percent_game

In [None]:
# Run this once; afterwards the saved file can be read instead.
# Build TF-IDF vectors from each game's combined text features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
games_tfidf = tfidf_vectorizer.fit_transform(top_10_percent_game['combined_features'])

# Pairwise cosine similarity between all games.
cosine_sim = cosine_similarity(games_tfidf)

# Label rows and columns with gameid and save the matrix to CSV for reuse.
cosine_sim_df = pd.DataFrame(cosine_sim, index=top_10_percent_game['gameid'], columns=top_10_percent_game['gameid'])
cosine_sim_df.to_csv('./data_model/cosine_similarity.csv', index=True)


In [None]:
# Optional: reload the saved matrix instead of recomputing it.
# cosine_sim_df = pd.read_csv('./data_model/cosine_similarity.csv')
# NOTE(review): the file was written with index=True, so reading it as above leaves gameid
# as an ordinary column and the column labels as strings; presumably index_col=0 plus an
# int conversion of the columns is needed before the recommender can use it — confirm.
cosine_sim_df

In [None]:
def preprocess_purchased_games(purchased_games):
    """Parse the 'library' column into Python objects, in place.

    String cells are parsed with ast.literal_eval (falling back to an empty
    list when parsing fails), list cells are kept, anything else becomes an
    empty list. The same DataFrame object is modified and returned.
    """
    def _parse_library(raw):
        # Lists are already in the desired form.
        if isinstance(raw, list):
            return raw
        # Anything that is neither a list nor a string counts as "no games".
        if not isinstance(raw, str):
            return []
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Unparseable text also counts as "no games".
            parsed = []
        return parsed

    parsed_column = purchased_games['library'].apply(_parse_library)
    purchased_games['library'] = parsed_column
    return purchased_games


In [None]:
def recommend_games_for_player(playerid, purchased_games, cosine_sim_df, games_df, top_n=10):
    """Recommend games similar to the ones a player already owns.

    Parameters
    ----------
    playerid : player identifier; compared as a string against 'playerid'.
    purchased_games : DataFrame with 'playerid' and 'library' (list of game ids,
        or its string representation). It is not modified.
    cosine_sim_df : square game-by-game similarity DataFrame labelled by game id.
    games_df : catalog with 'gameid', 'title', 'genres'; its row order is used
        as the fallback ranking.
    top_n : maximum number of recommendations.

    Returns
    -------
    DataFrame with columns ['gameid', 'title', 'genres'], best match first.
    Falls back to the first catalog rows when the player is unknown or owns
    nothing, and pads with catalog rows when too few similar games exist.
    """
    display_columns = ['gameid', 'title', 'genres']

    # Compare ids as strings without rewriting the caller's 'playerid' column.
    playerid = str(playerid)
    player_games = purchased_games[purchased_games['playerid'].astype(str) == playerid]

    if player_games.empty:
        print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y ng∆∞·ªùi ch∆°i {playerid} trong danh s√°ch purchased_games.")
        return games_df.head(top_n)[display_columns]  # fall back to the first catalog rows

    purchased_game_ids = player_games['library'].values[0]

    # Accept a library that is still in its raw string form.
    if isinstance(purchased_game_ids, str):
        try:
            purchased_game_ids = ast.literal_eval(purchased_game_ids)
        except (ValueError, SyntaxError):
            purchased_game_ids = []
    if not isinstance(purchased_game_ids, (list, tuple, set)):
        purchased_game_ids = []  # e.g. NaN for a missing library

    if not purchased_game_ids:
        print(f"‚ö†Ô∏è Ng∆∞·ªùi ch∆°i {playerid} ch∆∞a mua game n√†o.")
        return games_df.head(top_n)[display_columns]

    # Sum the similarity columns of every owned game in one vectorised step
    # (instead of concatenating Series in a loop).
    owned_in_matrix = [game_id for game_id in purchased_game_ids if game_id in cosine_sim_df.columns]
    recommended = []
    if owned_in_matrix:
        similar_scores = (
            cosine_sim_df.loc[:, owned_in_matrix]
            .sum(axis=1)
            .sort_values(ascending=False, kind="stable")
        )
        not_owned = ~similar_scores.index.isin(purchased_game_ids)
        recommended = list(similar_scores[not_owned].head(top_n).index)

    # Pad with catalog rows that are neither owned nor already recommended.
    # (Index.append() only accepts Index objects, so a plain list is used.)
    if len(recommended) < top_n:
        print(f"‚ö†Ô∏è Kh√¥ng ƒë·ªß game g·ª£i √Ω. B·ªï sung th√™m game ph·ªï bi·∫øn.")
        excluded = set(purchased_game_ids) | set(recommended)
        filler = games_df.loc[~games_df['gameid'].isin(excluded), 'gameid']
        recommended.extend(filler.head(top_n - len(recommended)).tolist())

    # Return rows in recommendation order rather than in catalog order.
    rank_of = {game_id: rank for rank, game_id in enumerate(recommended)}
    result = games_df[games_df['gameid'].isin(recommended)][display_columns]
    return (
        result.assign(_rank=result['gameid'].map(rank_of))
        .sort_values('_rank', kind="stable")
        .drop(columns='_rank')
    )



In [None]:
purchased_games = preprocess_purchased_games(purchased_games)  # Convert the 'library' column

In [None]:
# Run the content-based recommender for one player id and print the result.
playerid = 76561198060698936  
recommended_games = recommend_games_for_player(playerid, purchased_games, cosine_sim_df, top_10_percent_game)
print(recommended_games)

# MODEL 3 - Collaborative Filtering

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

In [None]:
# Get (or create) the Spark session used by the ALS cells below.
spark = SparkSession.builder.appName("GameRecommendation").getOrCreate()


In [None]:

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
import pickle


In [None]:
# Reload the raw file: the content-based section replaced 'library' with parsed lists in the frame it used.
purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)


In [None]:
# Display the freshly loaded frame.
purchased_games

In [None]:
# purchased_games['library'] = purchased_games['library'].apply(lambda x: x.split(','))


In [None]:
# Display the frame again (the split step in the cell above is commented out, so it is unchanged).
purchased_games

In [None]:
# Collect every distinct game id (as a string) that appears in any library.
all_games = set()
for games in purchased_games["library"]:
    if isinstance(games, str):  # raw CSV text such as "[1, 2, 3]"
        raw_ids = games.split(',')
    elif isinstance(games, list):  # already parsed
        raw_ids = games
    else:
        continue  # missing library (e.g. NaN): nothing to add

    # Strip brackets AND surrounding whitespace, so " 2" and "2" are one game,
    # and drop the empty id produced by an empty library ("[]").
    cleaned_games = [str(game).strip().strip("[]").strip() for game in raw_ids]
    all_games.update(game for game in cleaned_games if game)

# Sort before numbering: set iteration order for strings differs between
# kernel sessions, which would silently re-shuffle the column indices.
# NOTE: an interaction_matrix.npz saved under a different numbering has to be
# regenerated, otherwise its columns will not line up with this mapping.
game_to_index = {game: idx for idx, game in enumerate(sorted(all_games))}
index_to_game = {idx: game for game, idx in game_to_index.items()}

# Map player_id -> row index (position in purchased_games)
player_to_index = {player_id: idx for idx, player_id in enumerate(purchased_games["playerid"])}




In [None]:
# Show the size and a small sorted sample instead of printing the whole set.
print(f"{len(all_games)} distinct games")
sorted(all_games)[:10]

In [None]:
# Show only the first few entries instead of printing the whole mapping.
list(index_to_game.items())[:10]

In [None]:
# Show only the first few entries instead of printing the whole mapping.
list(player_to_index.items())[:10]

In [None]:
# Show only the first few entries instead of printing the whole mapping.
list(game_to_index.items())[:10]

In [None]:
# from scipy.sparse import lil_matrix, save_npz, load_npz

# num_players = len(purchased_games)
# num_games = len(all_games)
# interaction_matrix = lil_matrix((num_players, num_games), dtype=np.uint8)

# for i, games in enumerate(purchased_games["library"]):
#     if isinstance(games, str):  # N·∫øu games l√† chu·ªói, chuy·ªÉn n√≥ th√†nh danh s√°ch
#         games = games.strip("[]").split(",")  # T√°ch chu·ªói th√†nh danh s√°ch

#     for game in games:
#         game = str(game).strip()  # ƒê·∫£m b·∫£o game ID l√† chu·ªói kh√¥ng c√≥ kho·∫£ng tr·∫Øng

#         if game in game_to_index:  # Ki·ªÉm tra game c√≥ t·ªìn t·∫°i trong game_to_index kh√¥ng
#             # if i < 1:     
#                 # print("Abc")
#                 # print(game_to_index[game])

#             interaction_matrix[i, game_to_index[game]] = 1

# # Chuy·ªÉn ma tr·∫≠n sang CSR (t·ªëi ∆∞u t√≠nh to√°n)
# interaction_matrix = interaction_matrix.tocsr()

# # L∆∞u ma tr·∫≠n v√†o file ƒë·ªÉ t√°i s·ª≠ d·ª•ng m√† kh√¥ng c·∫ßn t√≠nh to√°n l·∫°i
# save_npz("./data_model/interaction_matrix.npz", interaction_matrix)




In [None]:
from pathlib import Path
from scipy.sparse import lil_matrix, save_npz, load_npz

INTERACTION_MATRIX_PATH = Path("./data_model/interaction_matrix.npz")


def _build_interaction_matrix(libraries, game_to_index):
    """Return a CSR players x games 0/1 matrix built from the library column."""
    builder = lil_matrix((len(libraries), len(game_to_index)), dtype=np.uint8)
    for row_idx, games in enumerate(libraries):
        if isinstance(games, str):  # raw CSV text such as "[1, 2, 3]"
            games = games.strip("[]").split(",")
        elif not isinstance(games, list):
            continue  # missing library
        for game in games:
            raw_key = str(game).strip("[]")
            # Try the key exactly as split first, then without surrounding
            # whitespace, so the lookup works however the mapping was keyed.
            col_idx = game_to_index.get(raw_key, game_to_index.get(raw_key.strip()))
            if col_idx is not None:
                builder[row_idx, col_idx] = 1
    return builder.tocsr()


# Load the cached matrix when it exists and still has the expected shape;
# otherwise rebuild and save it, so a fresh kernel or checkout still runs.
# NOTE(review): a cache built under a different game_to_index ordering cannot be
# detected by shape alone — delete the file after changing the mapping.
expected_shape = (len(purchased_games), len(game_to_index))
interaction_matrix = None
if INTERACTION_MATRIX_PATH.exists():
    interaction_matrix = load_npz(str(INTERACTION_MATRIX_PATH))
    if interaction_matrix.shape != expected_shape:
        interaction_matrix = None  # stale cache

if interaction_matrix is None:
    interaction_matrix = _build_interaction_matrix(purchased_games["library"], game_to_index)
    INTERACTION_MATRIX_PATH.parent.mkdir(parents=True, exist_ok=True)
    save_npz(str(INTERACTION_MATRIX_PATH), interaction_matrix)


In [None]:
# Flatten the sparse matrix into (player_index, game_index, rating) triples.
# Reading the COO coordinates avoids slicing the matrix one row at a time in Python.
interaction_coo = interaction_matrix.tocsr().tocoo()
# .nonzero() skipped explicitly stored zeros, so filter them out here as well.
stored = interaction_coo.data != 0
data = [
    (player_idx, game_idx, 1.0)  # plain Python int/int/float for Spark
    for player_idx, game_idx in zip(
        interaction_coo.row[stored].tolist(),
        interaction_coo.col[stored].tolist(),
    )
]

In [None]:
# Display the sparse matrix object.
interaction_matrix

In [None]:
# Show the count and the first few triples instead of printing one tuple per interaction.
print(f"{len(data)} interactions")
data[:10]

In [None]:
# Create the PySpark DataFrame from the interaction triples.
# NOTE: this rebinds `df`, which suggest_game_for_player() in the price-model section reads as a pandas frame.
columns = ["player_id", "game_id", "rating"]
df = spark.createDataFrame(data, columns)

In [None]:
# Cast each column to the type listed here, then print the schema to confirm.
for column_name, spark_type in (("player_id", "int"), ("game_id", "int"), ("rating", "float")):
    df = df.withColumn(column_name, col(column_name).cast(spark_type))
df.printSchema()  # confirm the resulting column types


In [None]:
# Display the Spark DataFrame object.
df

In [None]:
# Print the Spark session object.
print(spark)


In [None]:
# Print the schema again (same call as at the end of the cast cell).
df.printSchema()


In [None]:
from pyspark.sql.functions import col, count

In [None]:
# df.count() # Ki·ªÉm tra s·ªë d√≤ng trong DataFrame
# print(df.schema)  # Xem ki·ªÉu d·ªØ li·ªáu c·ªßa t·ª´ng c·ªôt


In [None]:
# df.describe().show()  # Ki·ªÉm tra th·ªëng k√™ d·ªØ li·ªáu
# df.select([col(c).isNull().sum() for c in df.columns]).show()  # Ki·ªÉm tra gi√° tr·ªã null
# df.show(5)  # Xem th·ª≠ 5 d√≤ng ƒë·∫ßu

In [None]:
# Initialise the ALS model
from pyspark.ml.recommendation import ALS

als = ALS(
    maxIter=10,         # number of iterations
    regParam=0.1,       # regularisation strength
    userCol="player_id",
    itemCol="game_id",
    ratingCol="rating",
    # Every rating built above is the constant 1.0 ("player owns game"), i.e.
    # implicit feedback. Explicit-feedback ALS would only learn to predict 1
    # everywhere, so treat the values as interaction confidence instead.
    implicitPrefs=True,
    coldStartStrategy="drop",  # drop predictions for users/items unseen in training
    seed=42,            # reproducible factor initialisation
)

# Train the model.
# NOTE: this rebinds `model`, which suggest_game_for_player() in the price-model
# section reads as the XGBoost regressor.
model = als.fit(df)


In [None]:
# Display the sparse matrix object.
interaction_matrix

In [None]:
# Materialise the full players x games array.
# NOTE(review): in the visible cells this dense copy is only displayed, never used for
# computation, and it can be very large for a big catalog — consider dropping it.
interaction_matrix_dense = interaction_matrix.toarray()


In [None]:
# Display the dense array.
interaction_matrix_dense

In [None]:

from scipy.sparse.linalg import svds

num_components = 50

# svds needs k strictly smaller than both matrix dimensions, and a floating-point
# matrix: the interaction matrix holds small integers, so cast it first.
svd_rank = min(num_components, min(interaction_matrix.shape) - 1)
U_svd, sigma_svd, Vt_svd = svds(interaction_matrix.astype(np.float64), k=svd_rank)

# Turn the vector of singular values into a diagonal matrix.
sigma_svd = np.diag(sigma_svd)

# Save the factors for reuse.
with open("./data_model/svd_model.pkl", "wb") as f:
    pickle.dump((U_svd, sigma_svd, Vt_svd), f)


In [None]:
# density = interaction_matrix.nnz / (interaction_matrix.shape[0] * interaction_matrix.shape[1])
# print(f"M·∫≠t ƒë·ªô d·ªØ li·ªáu kh√¥ng r·ªóng: {density:.6f}")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_users(player_index, user_embeddings_sparse, top_k=20):
    """Return the row indices of the `top_k` users most similar to `player_index`.

    Similarity is the cosine similarity between rows of `user_embeddings_sparse`
    (a 2-D sparse matrix or array, one row per user). The player is always
    excluded from the result; ties are resolved in favour of the lower index.
    """
    # A one-element list keeps the query row 2-D (1 x d) for sparse and dense input.
    query = user_embeddings_sparse[[player_index]]
    similarities = np.asarray(cosine_similarity(query, user_embeddings_sparse)).ravel()

    # Sort by decreasing similarity, then remove the player explicitly. Assuming
    # the player is always ranked first breaks when another user has an identical
    # embedding, or when the player's own embedding is all zeros.
    ranked = np.argsort(-similarities, kind="stable")
    ranked = ranked[ranked != player_index]
    return ranked[:top_k]

In [None]:
def recommend_games(player_id, U_svd, sigma_svd, Vt_svd, interaction_matrix, 
                    game_to_index, index_to_game, player_to_index, num_recommendations=10):
    """Recommend games owned by the players most similar to `player_id`.

    Parameters
    ----------
    player_id : key into `player_to_index`.
    U_svd, sigma_svd : SVD factors; rows of `U_svd @ sigma_svd` are the user embeddings.
    Vt_svd, game_to_index : accepted for interface compatibility; not used here.
    interaction_matrix : sparse players x games matrix (non-zero = owns the game).
    index_to_game : maps a column index of `interaction_matrix` to a game id.
    player_to_index : maps a player id to a row index of `interaction_matrix`.
    num_recommendations : maximum number of games to return.

    Returns
    -------
    List of game ids the player does not own, ordered by how many of the
    similar players own them (ties by column index). Empty for an unknown player.
    """
    from collections import Counter
    from scipy.sparse import csr_matrix

    # Unknown player: report and return nothing.
    if player_id not in player_to_index:
        print("‚ùå Player ID kh√¥ng h·ª£p l·ªá.")
        return []

    player_index = player_to_index[player_id]

    # User embeddings, wrapped as a sparse matrix so that row indexing stays 2-D.
    user_embeddings_sparse = csr_matrix(U_svd @ sigma_svd)
    similar_users = get_similar_users(player_index, user_embeddings_sparse)

    # Count, per game, how many of the similar players own it.
    owners_per_game = Counter()
    for similar_user in similar_users:
        owners_per_game.update(int(idx) for idx in interaction_matrix[similar_user].nonzero()[1])

    # Games the current player already owns are never recommended.
    already_owned = {int(idx) for idx in interaction_matrix[player_index].nonzero()[1]}

    # Rank explicitly: slicing an unordered set would return arbitrary games.
    ranked = sorted(
        (idx for idx in owners_per_game if idx not in already_owned),
        key=lambda idx: (-owners_per_game[idx], idx),
    )
    return [index_to_game[idx] for idx in ranked[:num_recommendations]]


In [None]:
player_id = 76561198060698936  # id of the player to recommend for
recommended_games = recommend_games(player_id, U_svd, sigma_svd, Vt_svd, 
                                    interaction_matrix, game_to_index, index_to_game, player_to_index)

# Print the recommended game ids.
print("üéÆ Game ƒë∆∞·ª£c ƒë·ªÅ xu·∫•t:", recommended_games)
