In [1]:
import pandas as pd
import numpy as np
import ast
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error




# MODEL - Price - Based Filtering

In [2]:
purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)
prices = pd.read_csv("./clean_datasets/prices.csv")  # (gameid, date_acquired, price_usd)

In [None]:
# Tạo DataFrame mở rộng từ danh sách game của mỗi người chơi
# Chỉ cần chạy 1 lần - lưu lại rồi đọc thôithôi
player_data = []
for _, row in purchased_games.iterrows():
    playerid = row["playerid"]
    
    games = ast.literal_eval(row["library"])  
    
    prices_list = [prices.loc[prices["gameid"] == game, "price_usd"].values[0] 
                   for game in games if game in prices["gameid"].values]

    if len(prices_list) > 0:
        player_data.append({
            "playerid": playerid,
            "mean_price": np.mean(prices_list),
            "median_price": np.median(prices_list),
            "min_price": np.min(prices_list),
            "max_price": np.max(prices_list),
            "last_5_avg": np.mean(prices_list[-5:]) if len(prices_list) >= 5 else np.mean(prices_list),
            "target_price": np.mean(prices_list[-5:])  # Nhãn dự đoán
        })

player_data



[{'playerid': 76561198060698936,
  'mean_price': np.float64(12.17198039020626),
  'median_price': np.float64(9.156466326846973),
  'min_price': np.float64(0.0),
  'max_price': np.float64(64.89299077680998),
  'last_5_avg': np.float64(22.801252667582872),
  'target_price': np.float64(22.801252667582872)},
 {'playerid': 76561198287452552,
  'mean_price': np.float64(16.017915074726385),
  'median_price': np.float64(14.341199391605024),
  'min_price': np.float64(0.0),
  'max_price': np.float64(71.84235684967149),
  'last_5_avg': np.float64(4.836150258269301),
  'target_price': np.float64(4.836150258269301)},
 {'playerid': 76561198040436563,
  'mean_price': np.float64(18.778864045909398),
  'median_price': np.float64(15.331456320106083),
  'min_price': np.float64(0.0),
  'max_price': np.float64(71.84235684967149),
  'last_5_avg': np.float64(46.85818424150624),
  'target_price': np.float64(46.85818424150624)},
 {'playerid': 76561198042412488,
  'mean_price': np.float64(14.815043336376467),
 

In [None]:
df1 = pd.DataFrame(player_data)
# Write 1 lần thôi - nhớ comment lại lần sau chỉ cần đọc file
df1.to_csv("./data_model/player_data_prices.csv", index=False)
# Hoặc đọc file 

In [None]:
# Đọc file thay vì chạy lại
df= pd.read_csv("./data_model/player_data_prices.csv")

In [7]:
# Chuẩn bị dữ liệu huấn luyện
X = df.drop(columns=["playerid", "target_price"])
y = df["target_price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Huấn luyện mô hình XGBoost
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Đánh giá mô hình
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Dự đoán giá tối ưu cho người chơi mới
def suggest_game_for_player(playerid):
    # Lấy danh sách game của người chơi từ player_data
    player_info = df[df["playerid"] == playerid]
    
    if player_info.empty:
        return "Không tìm thấy dữ liệu người chơi này."
    
    # Trích xuất thông tin về giá
    mean_price = player_info["mean_price"].values[0]
    median_price = player_info["median_price"].values[0]
    min_price = player_info["min_price"].values[0]
    max_price = player_info["max_price"].values[0]
    last_5_avg = player_info["last_5_avg"].values[0]

    # Dự đoán mức giá phù hợp
    feature_vector = np.array([[mean_price, median_price, min_price, max_price, last_5_avg]])
    predicted_price = model.predict(feature_vector)[0]

    # Tìm các game có giá gần nhất với mức giá này
    suggested_games = prices.iloc[(prices["price_usd"] - predicted_price).abs().argsort()[:10]]  # Lấy 10 game gần nhất
    
    return suggested_games



RMSE: 0.24174656623292673


In [8]:
# DDự đoán cho một người chơi mới với danh sách gameid đã mua
test_player = 76561198060698936
print(suggest_game_for_player(test_player))

        gameid date_acquired  price_usd
49635  1582650    2025-02-24  22.695552
16231   667970    2025-02-24  22.693167
67504  2116800    2025-02-24  22.710504
28225   991780    2025-02-24  22.712488
1361    200510    2025-02-24  22.679996
43721  1417880    2025-02-24  22.674023
66925  2099880    2025-02-24  22.723771
28452   998990    2025-02-24  22.723771
18471   724250    2025-02-24  22.723771
61261  1911610    2025-02-24  22.667224


# MODEL 2 - Content-Based Filtering

In [9]:
import pandas as pd
import re
import ast 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
from sklearn.decomposition import TruncatedSVD


In [None]:
# Chạy file popular game trước để lấy 10 % game phổ biến

purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # playerid, library
top_10_percent_game = pd.read_csv("./data_model/top_10_percent_games.csv")  # gameid, title, genres, description
achievements = pd.read_csv("./clean_datasets/achievements.csv")  # achievementid, gameid, title, description
history = pd.read_csv("./clean_datasets/history.csv")  # playerid, achievementid, date_acquired

In [12]:
def preprocess_text(text):
    """ Hàm tiền xử lý văn bản: chuyển thành chữ thường và loại bỏ dấu câu """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Loại bỏ khoảng trắng thừa
    text = re.sub(r'[^\w\s]', '', text)  # Loại bỏ dấu câu
    return text

top_10_percent_game['processed_title'] = top_10_percent_game['title'].apply(preprocess_text)
top_10_percent_game['processed_genres'] = top_10_percent_game['genres'].apply(preprocess_text)
top_10_percent_game['processed_developers'] = top_10_percent_game['developers'].apply(preprocess_text)
top_10_percent_game['processed_publishers'] = top_10_percent_game['publishers'].apply(preprocess_text)
top_10_percent_game['processed_languages'] = top_10_percent_game['supported_languages'].apply(preprocess_text)

# Tạo một cột tổng hợp các thông tin để vector hóa
top_10_percent_game['combined_features'] = top_10_percent_game['processed_title'] + ' ' + \
                              top_10_percent_game['processed_genres'] + ' ' + \
                              top_10_percent_game['processed_developers'] + ' ' + \
                              top_10_percent_game['processed_publishers'] + ' ' + \
                              top_10_percent_game['processed_languages']
top_10_percent_game

Unnamed: 0,gameid,title,developers,publishers,genres,supported_languages,release_date,purchase_count,processed_title,processed_genres,processed_developers,processed_publishers,processed_languages,combined_features
0,730,Counter-Strike 2,['Valve'],['Valve'],"['Action', 'Free To Play']","['Czech', 'Danish', 'Dutch', 'English', 'Finni...",2012-08-21,43967.0,counterstrike 2,action free to play,valve,valve,czech danish dutch english finnish french germ...,counterstrike 2 action free to play valve valv...
1,578080,PUBG: BATTLEGROUNDS,['PUBG Corporation'],"['KRAFTON, Inc.']","['Action', 'Adventure', 'Massively Multiplayer...","['English', 'Korean', 'Simplified Chinese', 'F...",2017-12-21,28356.0,pubg battlegrounds,action adventure massively multiplayer free to...,pubg corporation,krafton inc,english korean simplified chinese french germa...,pubg battlegrounds action adventure massively ...
2,550,Left 4 Dead 2,['Valve'],['Valve'],['Action'],"['Danish', 'Dutch', 'English', 'Finnish', 'Fre...",2009-11-16,24727.0,left 4 dead 2,action,valve,valve,danish dutch english finnish french german ita...,left 4 dead 2 action valve valve danish dutch ...
3,218620,PAYDAY 2,['OVERKILL - a Starbreeze Studio.'],['Starbreeze Publishing AB'],"['Action', 'RPG']","['English', 'German', 'French', 'Italian', 'Sp...",2013-08-13,23785.0,payday 2,action rpg,overkill a starbreeze studio,starbreeze publishing ab,english german french italian spanish spain d...,payday 2 action rpg overkill a starbreeze stu...
4,304930,Unturned,['Smartly Dressed Games'],['Smartly Dressed Games'],"['Action', 'Adventure', 'Casual', 'Indie', 'Fr...",['English'],2017-07-07,21620.0,unturned,action adventure casual indie free to play,smartly dressed games,smartly dressed games,english,unturned action adventure casual indie free to...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9819,736110,Deep Sorrow,['QuickSave'],['SA Industry'],"['Adventure', 'Indie']","['English', 'Turkish']",2017-11-12,211.0,deep sorrow,adventure indie,quicksave,sa industry,english turkish,deep sorrow adventure indie quicksave sa indus...
9820,462530,8i - Make VR Human,['8i'],['8i'],['Adventure'],['English'],2016-04-25,211.0,8i make vr human,adventure,8i,8i,english,8i make vr human adventure 8i 8i english
9821,257870,Eschalon: Book III,['Basilisk Games'],['Basilisk Games'],"['Adventure', 'Indie', 'RPG']",['English'],2014-02-14,211.0,eschalon book iii,adventure indie rpg,basilisk games,basilisk games,english,eschalon book iii adventure indie rpg basilisk...
9822,440550,Atomik: RunGunJumpGun,['ThirtyThree'],['Good Shepherd Entertainment'],"['Action', 'Indie']","['English', 'French', 'Italian', 'German', 'Sp...",2016-08-31,211.0,atomik rungunjumpgun,action indie,thirtythree,good shepherd entertainment,english french italian german spanish spain j...,atomik rungunjumpgun action indie thirtythree ...


In [None]:
# Chạy 1 lần rồi sau đó đọc file thôi
# Tạo vector TF-IDF cho các đặc trưng tổng hợp của game
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
games_tfidf = tfidf_vectorizer.fit_transform(top_10_percent_game['combined_features'])

# Tính toán độ tương đồng cosine giữa các game
cosine_sim = cosine_similarity(games_tfidf)

# Lưu kết quả cosine similarity vào CSV để tái sử dụng
cosine_sim_df = pd.DataFrame(cosine_sim, index=top_10_percent_game['gameid'], columns=top_10_percent_game['gameid'])
cosine_sim_df.to_csv('./data_model/cosine_similarity.csv', index=True)


In [14]:
# cosine_sim_df = pd.read_csv('./data_model/cosine_similarity.csv')
cosine_sim_df

gameid,730,578080,550,218620,304930,1172470,230410,271590,407530,4000,...,680930,263120,378110,1290070,516480,736110,462530,257870,440550,654770
gameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730,1.000000,0.231547,0.834468,0.153975,0.030039,0.256958,0.212027,0.187571,0.226222,0.601394,...,0.006700,0.063802,0.001633,0.109609,0.007027,0.024264,0.001282,0.001752,0.158963,0.419233
578080,0.231547,1.000000,0.206520,0.081990,0.032047,0.218642,0.190354,0.133946,0.155125,0.205261,...,0.006164,0.064558,0.006824,0.066954,0.012225,0.030224,0.005357,0.007320,0.166996,0.199079
550,0.834468,0.206520,1.000000,0.155021,0.005418,0.222725,0.182292,0.188846,0.227759,0.605480,...,0.006745,0.064236,0.001644,0.110353,0.007075,0.024429,0.001291,0.001763,0.160043,0.422082
218620,0.153975,0.081990,0.155021,1.000000,0.005581,0.169452,0.107812,0.143676,0.124387,0.106638,...,0.006949,0.031280,0.001694,0.093977,0.007288,0.002515,0.001330,0.013895,0.092167,0.135492
304930,0.030039,0.032047,0.005418,0.005581,1.000000,0.043347,0.032390,0.026428,0.021065,0.010901,...,0.024649,0.010245,0.006368,0.030318,0.021589,0.013813,0.005000,0.043029,0.009489,0.009494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736110,0.024264,0.030224,0.024429,0.002515,0.013813,0.013704,0.026162,0.011620,0.028655,0.030656,...,0.007305,0.018352,0.011408,0.001968,0.018037,1.000000,0.008956,0.017876,0.033892,0.026697
462530,0.001282,0.005357,0.001291,0.001330,0.005000,0.007246,0.001382,0.006144,0.004400,0.001340,...,0.001249,0.006643,0.006032,0.001041,0.006529,0.008956,1.000000,0.006470,0.001481,0.001167
257870,0.001752,0.007320,0.001763,0.013895,0.043029,0.009900,0.014446,0.028701,0.032122,0.005661,...,0.024025,0.013259,0.008241,0.032672,0.013031,0.017876,0.006470,1.000000,0.006259,0.004930
440550,0.158963,0.166996,0.160043,0.092167,0.009489,0.201685,0.156294,0.141421,0.112399,0.158687,...,0.011452,0.068329,0.001886,0.084073,0.012391,0.033892,0.001481,0.006259,1.000000,0.163714


In [15]:
def preprocess_purchased_games(purchased_games):
    def safe_eval(val):
        if isinstance(val, str):  
            try:
                return ast.literal_eval(val)
            except (ValueError, SyntaxError):
                return []  # Trả về danh sách rỗng nếu lỗi
        elif isinstance(val, list):
            return val  
        return []  # Trả về danh sách rỗng nếu không phải list

    purchased_games['library'] = purchased_games['library'].apply(safe_eval)
    return purchased_games


In [16]:
def recommend_games_for_player(playerid, purchased_games, cosine_sim_df, games_df, top_n=10):
    """Gợi ý game dựa trên lịch sử mua game của người chơi."""
    
    # Chuyển playerid và gameid về kiểu string để tránh lỗi
    purchased_games['playerid'] = purchased_games['playerid'].astype(str)
    playerid = str(playerid)

    # Lọc danh sách game mà người chơi đã mua
    player_games = purchased_games[purchased_games['playerid'] == playerid]

    if player_games.empty:
        print(f"⚠️ Không tìm thấy người chơi {playerid} trong danh sách purchased_games.")
        return games_df.head(top_n)[['gameid', 'title', 'genres']]  # Trả về game phổ biến
    
    # Lấy danh sách game đã mua
    purchased_game_ids = player_games['library'].values[0]
    
    # Nếu danh sách rỗng, trả về game phổ biến
    if not purchased_game_ids:
        print(f"⚠️ Người chơi {playerid} chưa mua game nào.")
        return games_df.head(top_n)[['gameid', 'title', 'genres']]
    
    # Tính điểm tương đồng
    similar_scores = pd.Series(dtype=float)
    for game_id in purchased_game_ids:
        if game_id in cosine_sim_df.index:
            similar_scores = pd.concat([similar_scores, cosine_sim_df[game_id]])
        

    # Tổng hợp điểm số, sắp xếp giảm dần
    similar_scores = similar_scores.groupby(similar_scores.index).sum().sort_values(ascending=False)
    
    # Loại bỏ game đã mua, chọn 10 game đầu tiên
    recommended_game_ids = similar_scores[~similar_scores.index.isin(purchased_game_ids)].head(top_n).index
    
    # Nếu không đủ game để gợi ý (vì không tìm thấy trong cosine_sim_df), bổ sung thêm game phổ biến
    if len(recommended_game_ids) < top_n:
        print(f"⚠️ Không đủ game gợi ý. Bổ sung thêm game phổ biến.")
        recommended_game_ids = recommended_game_ids.append(games_df[~games_df['gameid'].isin(purchased_game_ids)].head(top_n - len(recommended_game_ids))['gameid'])
    
    return games_df[games_df['gameid'].isin(recommended_game_ids)][['gameid', 'title', 'genres']]



In [17]:
purchased_games = preprocess_purchased_games(purchased_games)  # Chuyển đổi cột 'library'

In [18]:
playerid = 76561198060698936  
recommended_games = recommend_games_for_player(playerid, purchased_games, cosine_sim_df, top_10_percent_game)
print(recommended_games)

       gameid                title  \
659    527230         For The King   
2095  1272080             PAYDAY 3   
2220  1062830                 Embr   
2335   467950     Survive in Space   
2694  1220150            Blue Fire   
3501  1502190           From Space   
5090   934700        Dead Island 2   
5237  1677280  Company of Heroes 3   
5310  1676840      For The King II   
6224   705120    Death Coming/死神来了   

                                                 genres  
659           ['Adventure', 'Indie', 'RPG', 'Strategy']  
2095                     ['Action', 'Adventure', 'RPG']  
2220  ['Action', 'Casual', 'Indie', 'Simulation', 'S...  
2335  ['Action', 'Adventure', 'Casual', 'Indie', 'RPG']  
2694                   ['Action', 'Adventure', 'Indie']  
3501                                ['Action', 'Indie']  
5090                            ['Action', 'Adventure']  
5237                             ['Action', 'Strategy']  
5310          ['Adventure', 'Indie', 'RPG', 'Strategy']  
6

# MODEL 3 - Collaborative Filtering

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

In [3]:
spark = SparkSession.builder.appName("GameRecommendation").getOrCreate()


In [4]:

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
import pickle


In [5]:
purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)


In [46]:
purchased_games

Unnamed: 0,playerid,library
0,76561198060698936,"[60, 1670, 3830, 1600, 2900, 2910, 2920, 4800,..."
1,76561198287452552,"[10, 80, 100, 240, 2990, 6880, 6910, 6920, 698..."
2,76561198040436563,"[10, 80, 100, 300, 20, 30, 40, 50, 60, 70, 130..."
3,76561198042412488,"[300, 240, 220, 320, 360, 4300, 4800, 4000, 61..."
4,76561198119605821,"[47870, 108600, 550, 271590, 331470, 381210, 2..."
...,...,...
102543,76561199063275634,[]
102544,76561198003275888,"[3920, 2600, 6980, 4540, 4550, 7830, 22330, 22..."
102545,76561198944668572,[]
102546,76561198033563710,[]


In [31]:
# purchased_games['library'] = purchased_games['library'].apply(lambda x: x.split(','))


In [47]:
purchased_games

Unnamed: 0,playerid,library
0,76561198060698936,"[60, 1670, 3830, 1600, 2900, 2910, 2920, 4800,..."
1,76561198287452552,"[10, 80, 100, 240, 2990, 6880, 6910, 6920, 698..."
2,76561198040436563,"[10, 80, 100, 300, 20, 30, 40, 50, 60, 70, 130..."
3,76561198042412488,"[300, 240, 220, 320, 360, 4300, 4800, 4000, 61..."
4,76561198119605821,"[47870, 108600, 550, 271590, 331470, 381210, 2..."
...,...,...
102543,76561199063275634,[]
102544,76561198003275888,"[3920, 2600, 6980, 4540, 4550, 7830, 22330, 22..."
102545,76561198944668572,[]
102546,76561198033563710,[]


In [6]:
all_games = set()
for games in purchased_games["library"]:
    if isinstance(games, str):  # Nếu là chuỗi, tách game
        cleaned_games = [game.strip("[]") for game in games.split(',')]
    elif isinstance(games, list):  # Nếu là danh sách, xử lý trực tiếp
        cleaned_games = [game.strip("[]") for game in games]
    all_games.update(cleaned_games)

game_to_index = {game: idx for idx, game in enumerate(all_games)}
index_to_game = {idx: game for game, idx in game_to_index.items()}

# Tạo ánh xạ player_id → index
player_to_index = {player_id: idx for idx, player_id in enumerate(purchased_games["playerid"])}




In [35]:
all_games

{' 566190',
 '',
 ' 1597730',
 ' 2393890',
 '20540',
 ' 1503060',
 '4540',
 ' 1360600',
 '201810',
 ' 510420',
 ' 620670',
 ' 1004860',
 ' 957070',
 ' 298140',
 ' 423120',
 ' 811870',
 ' 415270',
 ' 1589120',
 ' 695360',
 ' 1347580',
 ' 509360',
 ' 303860',
 ' 279160',
 ' 2313130',
 ' 262900',
 ' 325880',
 '244770',
 ' 742470',
 ' 1068150',
 ' 1397470',
 ' 789240',
 ' 491330',
 ' 2339040',
 ' 1706090',
 ' 2286770',
 ' 2888360',
 ' 398450',
 ' 583200',
 ' 404620',
 ' 2206240',
 ' 893010',
 ' 1233260',
 ' 1722040',
 ' 753390',
 ' 404040',
 ' 2216830',
 ' 847510',
 ' 589060',
 '23490',
 ' 1585130',
 ' 1379630',
 ' 2435090',
 ' 2560640',
 ' 653940',
 ' 284870',
 ' 42950',
 ' 1869370',
 ' 15560',
 ' 400200',
 ' 2635360',
 ' 1379930',
 ' 884400',
 '666220',
 ' 862800',
 ' 270030',
 ' 365560',
 ' 569130',
 ' 244050',
 ' 1765300',
 ' 1451480',
 ' 572220',
 ' 1125510',
 ' 2197890',
 '3720',
 ' 493290',
 ' 338640',
 ' 646280',
 ' 2777190',
 ' 241680',
 ' 819020',
 ' 557570',
 ' 374510',
 ' 14551

In [49]:
index_to_game

{0: '',
 1: ' 692190',
 2: ' 912550',
 3: ' 893070',
 4: ' 1401610',
 5: ' 2304490',
 6: ' 58300',
 7: ' 471730',
 8: ' 860530',
 9: ' 1736360',
 10: ' 2680840',
 11: ' 486130',
 12: ' 1100970',
 13: '552500',
 14: ' 314410',
 15: ' 1383150',
 16: ' 1636350',
 17: ' 563270',
 18: ' 765170',
 19: ' 1182670',
 20: ' 2186700',
 21: ' 1997490',
 22: ' 340880',
 23: ' 348850',
 24: ' 1647550',
 25: ' 1709170',
 26: '360940',
 27: ' 1953540',
 28: ' 510520',
 29: ' 2928280',
 30: '1049590',
 31: ' 459940',
 32: ' 593340',
 33: ' 1710750',
 34: ' 398810',
 35: ' 1447350',
 36: ' 837270',
 37: ' 459770',
 38: ' 815240',
 39: ' 560050',
 40: ' 646200',
 41: ' 2289650',
 42: ' 721390',
 43: ' 600280',
 44: ' 935560',
 45: ' 1459500',
 46: ' 879510',
 47: '314160',
 48: ' 8600',
 49: ' 287120',
 50: '13500',
 51: '491280',
 52: ' 15740',
 53: ' 1026160',
 54: ' 861190',
 55: '33520',
 56: ' 307340',
 57: ' 692610',
 58: ' 43600',
 59: ' 305740',
 60: ' 366280',
 61: ' 765320',
 62: ' 1414230',
 6

In [48]:
player_to_index

{76561198060698936: 0,
 76561198287452552: 1,
 76561198040436563: 2,
 76561198042412488: 3,
 76561198119605821: 4,
 76561198049686270: 5,
 76561198155814250: 6,
 76561198083492916: 7,
 76561198150634683: 8,
 76561198836367256: 9,
 76561198131958442: 10,
 76561198106910534: 11,
 76561198122070915: 12,
 76561198740883087: 13,
 76561198016756834: 14,
 76561198072740562: 15,
 76561198144540723: 16,
 76561198096557299: 17,
 76561198884939938: 18,
 76561198220441373: 19,
 76561198046610425: 20,
 76561198801797946: 21,
 76561199000909663: 22,
 76561199004958958: 23,
 76561198120109043: 24,
 76561198856855325: 25,
 76561198146275005: 26,
 76561197989446733: 27,
 76561197997318138: 28,
 76561198015133782: 29,
 76561199158139117: 30,
 76561197962051588: 31,
 76561198191613675: 32,
 76561198080851200: 33,
 76561198230516004: 34,
 76561198282337972: 35,
 76561198391791168: 36,
 76561198213729077: 37,
 76561199248076925: 38,
 76561198234203304: 39,
 76561199225844826: 40,
 76561198087768441: 41,
 7

In [47]:
game_to_index

{'': 0,
 ' 692190': 1,
 ' 912550': 2,
 ' 893070': 3,
 ' 1401610': 4,
 ' 2304490': 5,
 ' 58300': 6,
 ' 471730': 7,
 ' 860530': 8,
 ' 1736360': 9,
 ' 2680840': 10,
 ' 486130': 11,
 ' 1100970': 12,
 '552500': 13,
 ' 314410': 14,
 ' 1383150': 15,
 ' 1636350': 16,
 ' 563270': 17,
 ' 765170': 18,
 ' 1182670': 19,
 ' 2186700': 20,
 ' 1997490': 21,
 ' 340880': 22,
 ' 348850': 23,
 ' 1647550': 24,
 ' 1709170': 25,
 '360940': 26,
 ' 1953540': 27,
 ' 510520': 28,
 ' 2928280': 29,
 '1049590': 30,
 ' 459940': 31,
 ' 593340': 32,
 ' 1710750': 33,
 ' 398810': 34,
 ' 1447350': 35,
 ' 837270': 36,
 ' 459770': 37,
 ' 815240': 38,
 ' 560050': 39,
 ' 646200': 40,
 ' 2289650': 41,
 ' 721390': 42,
 ' 600280': 43,
 ' 935560': 44,
 ' 1459500': 45,
 ' 879510': 46,
 '314160': 47,
 ' 8600': 48,
 ' 287120': 49,
 '13500': 50,
 '491280': 51,
 ' 15740': 52,
 ' 1026160': 53,
 ' 861190': 54,
 '33520': 55,
 ' 307340': 56,
 ' 692610': 57,
 ' 43600': 58,
 ' 305740': 59,
 ' 366280': 60,
 ' 765320': 61,
 ' 1414230': 62,
 '

In [None]:
# from scipy.sparse import lil_matrix, save_npz, load_npz

# num_players = len(purchased_games)
# num_games = len(all_games)
# interaction_matrix = lil_matrix((num_players, num_games), dtype=np.uint8)

# for i, games in enumerate(purchased_games["library"]):
#     if isinstance(games, str):  # Nếu games là chuỗi, chuyển nó thành danh sách
#         games = games.strip("[]").split(",")  # Tách chuỗi thành danh sách

#     for game in games:
#         game = str(game).strip()  # Đảm bảo game ID là chuỗi không có khoảng trắng

#         if game in game_to_index:  # Kiểm tra game có tồn tại trong game_to_index không
#             # if i < 1:     
#                 # print("Abc")
#                 # print(game_to_index[game])

#             interaction_matrix[i, game_to_index[game]] = 1

# # Chuyển ma trận sang CSR (tối ưu tính toán)
# interaction_matrix = interaction_matrix.tocsr()

# # Lưu ma trận vào file để tái sử dụng mà không cần tính toán lại
# save_npz("./data_model/interaction_matrix.npz", interaction_matrix)




In [8]:
from scipy.sparse import lil_matrix, save_npz, load_npz

interaction_matrix = load_npz("./data_model/interaction_matrix.npz")


In [9]:
data = []
for player_id, games in enumerate(interaction_matrix):
    for game_index in games.nonzero()[1]:
        data.append((int(player_id), int(game_index), float(1)))  # Chuyển đổi kiểu dữ liệu

In [60]:
interaction_matrix

<Compressed Sparse Row sparse matrix of dtype 'uint8'
	with 6139343 stored elements and shape (102548, 45093)>

In [10]:
data

[(0, 13, 1.0),
 (0, 64, 1.0),
 (0, 93, 1.0),
 (0, 108, 1.0),
 (0, 128, 1.0),
 (0, 136, 1.0),
 (0, 237, 1.0),
 (0, 256, 1.0),
 (0, 264, 1.0),
 (0, 277, 1.0),
 (0, 319, 1.0),
 (0, 333, 1.0),
 (0, 370, 1.0),
 (0, 418, 1.0),
 (0, 506, 1.0),
 (0, 529, 1.0),
 (0, 559, 1.0),
 (0, 643, 1.0),
 (0, 653, 1.0),
 (0, 662, 1.0),
 (0, 666, 1.0),
 (0, 676, 1.0),
 (0, 684, 1.0),
 (0, 694, 1.0),
 (0, 721, 1.0),
 (0, 744, 1.0),
 (0, 820, 1.0),
 (0, 835, 1.0),
 (0, 842, 1.0),
 (0, 855, 1.0),
 (0, 874, 1.0),
 (0, 875, 1.0),
 (0, 948, 1.0),
 (0, 1114, 1.0),
 (0, 1139, 1.0),
 (0, 1160, 1.0),
 (0, 1199, 1.0),
 (0, 1219, 1.0),
 (0, 1220, 1.0),
 (0, 1269, 1.0),
 (0, 1316, 1.0),
 (0, 1355, 1.0),
 (0, 1370, 1.0),
 (0, 1411, 1.0),
 (0, 1442, 1.0),
 (0, 1459, 1.0),
 (0, 1465, 1.0),
 (0, 1466, 1.0),
 (0, 1476, 1.0),
 (0, 1488, 1.0),
 (0, 1514, 1.0),
 (0, 1541, 1.0),
 (0, 1594, 1.0),
 (0, 1670, 1.0),
 (0, 1683, 1.0),
 (0, 1701, 1.0),
 (0, 1708, 1.0),
 (0, 1710, 1.0),
 (0, 1768, 1.0),
 (0, 1924, 1.0),
 (0, 1960, 1.0),

In [11]:
# Tạo DataFrame PySpark
columns = ["player_id", "game_id", "rating"]
df = spark.createDataFrame(data, columns)

In [12]:
df = df.withColumn("player_id", col("player_id").cast("int"))
df = df.withColumn("game_id", col("game_id").cast("int"))
df = df.withColumn("rating", col("rating").cast("float"))
df.printSchema()  # Xác nhận lại kiểu dữ liệu


root
 |-- player_id: integer (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- rating: float (nullable = true)



In [13]:
df

DataFrame[player_id: int, game_id: int, rating: float]

In [80]:
print(spark)


<pyspark.sql.session.SparkSession object at 0x00000264179AB2C0>


In [14]:
df.printSchema()


root
 |-- player_id: integer (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- rating: float (nullable = true)



In [15]:
from pyspark.sql.functions import col, count

In [None]:
# df.count() # Kiểm tra số dòng trong DataFrame
# print(df.schema)  # Xem kiểu dữ liệu của từng cột


In [95]:
# df.describe().show()  # Kiểm tra thống kê dữ liệu
# df.select([col(c).isNull().sum() for c in df.columns]).show()  # Kiểm tra giá trị null
# df.show(5)  # Xem thử 5 dòng đầu

In [18]:
# Khởi tạo mô hình ALS
from pyspark.ml.recommendation import ALS

als = ALS(
    maxIter=10,         # Số vòng lặp
    regParam=0.1,       # Hệ số điều chuẩn
    userCol="player_id",
    itemCol="game_id",
    ratingCol="rating",
    coldStartStrategy="drop"  # Loại bỏ dữ liệu không đủ thông tin
)

# Huấn luyện mô hình
model = als.fit(df)


Py4JJavaError: An error occurred while calling o123.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 17) (10.230.135.254 executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1140)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1074)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:181)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.sql.execution.SQLExecutionRDD.$anonfun$compute$1(SQLExecutionRDD.scala:52)
	at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:158)
	at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.io.IOException: CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessImpl.create(Native Method)
	at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:500)
	at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:159)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1111)
	... 45 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1492)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1465)
	at org.apache.spark.rdd.RDD.$anonfun$isEmpty$1(RDD.scala:1602)
	at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1602)
	at org.apache.spark.ml.recommendation.ALS$.train(ALS.scala:975)
	at org.apache.spark.ml.recommendation.ALS.$anonfun$fit$1(ALS.scala:737)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.recommendation.ALS.fit(ALS.scala:714)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1140)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1074)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:181)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.sql.execution.SQLExecutionRDD.$anonfun$compute$1(SQLExecutionRDD.scala:52)
	at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:158)
	at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.io.IOException: CreateProcess error=2, The system cannot find the file specified
	at java.base/java.lang.ProcessImpl.create(Native Method)
	at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:500)
	at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:159)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1111)
	... 45 more


In [40]:
interaction_matrix

<Compressed Sparse Row sparse matrix of dtype 'uint8'
	with 11314322 stored elements and shape (102548, 45093)>

In [10]:
interaction_matrix_dense = interaction_matrix.toarray()


In [51]:
interaction_matrix_dense

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], shape=(102548, 45093), dtype=uint8)

In [12]:

from scipy.sparse.linalg import svds

num_components = 50
U_svd, sigma_svd, Vt_svd = svds(interaction_matrix, k=num_components)

# Chuyển sigma thành ma trận chéo
sigma_svd = np.diag(sigma_svd)

# Lưu kết quả để sử dụng lại
with open("./data_model/svd_model.pkl", "wb") as f:
    pickle.dump((U_svd, sigma_svd, Vt_svd), f)


In [None]:
# density = interaction_matrix.nnz / (interaction_matrix.shape[0] * interaction_matrix.shape[1])
# print(f"Mật độ dữ liệu không rỗng: {density:.6f}")

Mật độ dữ liệu không rỗng: 0.000022


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_users(player_index, user_embeddings_sparse, top_k=20):
    """
    Lấy danh sách top_k người chơi có độ tương đồng cao nhất với player_index
    """
    similarities = cosine_similarity(user_embeddings_sparse[player_index], user_embeddings_sparse).flatten()
    # print(similarities)
    similar_users = np.argsort(similarities)[::-1][1:top_k + 1]  # Lấy top_k người chơi gần nhất
    return similar_users

In [None]:
def recommend_games(player_id, U_svd, sigma_svd, Vt_svd, interaction_matrix, 
                    game_to_index, index_to_game, player_to_index, num_recommendations=10):
    
    # Kiểm tra xem player_id có trong player_to_index không
    if player_id not in player_to_index:
        print("❌ Player ID không hợp lệ.")
        return []

    # Chuyển player_id thành index
    player_index = player_to_index[player_id]

    # Tạo sparse embedding để tiết kiệm bộ nhớ
    from scipy.sparse import csr_matrix
    user_embeddings_sparse = csr_matrix(U_svd @ sigma_svd)
    print(user_embeddings_sparse)
    # Lấy danh sách người chơi tương tự
    similar_users = get_similar_users(player_index, user_embeddings_sparse)
    print(similar_users)
    print(similarities[similar_users])
    # Tập hợp các game mà những người chơi tương tự đã mua
    recommended_games = set()
    for similar_user in similar_users:
        recommended_games.update(interaction_matrix[similar_user].nonzero()[1])
    print(recommended_games)
    # Loại bỏ game mà người chơi hiện tại đã mua
    purchased_games = set(interaction_matrix[player_index].nonzero()[1])
    final_recommendations = [index_to_game[idx] for idx in recommended_games if idx not in purchased_games]

    return final_recommendations[:num_recommendations]  # Lấy đúng số lượng cần thiết


In [35]:
player_id = 76561198060698936  # ID của người chơi cần gợi ý
recommended_games = recommend_games(player_id, U_svd, sigma_svd, Vt_svd, 
                                    interaction_matrix, game_to_index, index_to_game, player_to_index)

print("🎮 Game được đề xuất:", recommended_games)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5127352 stored elements and shape (102548, 50)>
  Coords	Values
  (0, 0)	-3.199705324680498e-14
  (0, 49)	0.9999999999999994
  (1, 0)	3.120232990197707e-19
  (1, 1)	3.365463234185258e-31
  (1, 2)	-2.7076299194655933e-30
  (1, 3)	-5.65808625996907e-30
  (1, 4)	1.950863313451742e-30
  (1, 5)	4.785472906613285e-30
  (1, 6)	3.465348249497095e-30
  (1, 7)	1.3787599251779802e-31
  (1, 8)	-2.8356016880604663e-30
  (1, 9)	9.153565903018076e-32
  (1, 10)	-1.2031352007715917e-30
  (1, 11)	-7.578228213176958e-31
  (1, 12)	1.8627039173432797e-30
  (1, 13)	2.310158739209067e-30
  (1, 14)	7.16857349471279e-30
  (1, 15)	-3.332073008460134e-31
  (1, 16)	2.8337138536115683e-30
  (1, 17)	-1.1638269668514513e-30
  (1, 18)	3.487774619972938e-31
  (1, 19)	-2.026489941451901e-30
  (1, 20)	5.5312268621112864e-30
  (1, 21)	-6.091752939830233e-31
  (1, 22)	5.6468343683781634e-30
  :	:
  (102547, 25)	1.618140256785191e-19
  (102547, 26)	3.12094818991