In [1]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


# MODEL - Price - Based Filtering

In [2]:
import pandas as pd
import numpy as np
import ast
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)
prices = pd.read_csv("./clean_datasets/prices.csv")  # (gameid, date_acquired, price_usd)

In [4]:
# Tạo DataFrame mở rộng từ danh sách game của mỗi người chơi
# Chỉ cần chạy 1 lần - lưu lại rồi đọc thôithôi
import os

if (not os.path.exists("./data_model/player_data_prices.csv")):
    
    player_data = []
    for _, row in purchased_games.iterrows():
        playerid = row["playerid"]
        
        games = ast.literal_eval(row["library"])  
        
        prices_list = [prices.loc[prices["gameid"] == game, "price_usd"].values[0] 
                    for game in games if game in prices["gameid"].values]

        if len(prices_list) > 0:
            player_data.append({
                "playerid": playerid,
                "mean_price": np.mean(prices_list),
                "median_price": np.median(prices_list),
                "min_price": np.min(prices_list),
                "max_price": np.max(prices_list),
                "last_5_avg": np.mean(prices_list[-5:]) if len(prices_list) >= 5 else np.mean(prices_list),
                "target_price": np.mean(prices_list[-5:])  # Nhãn dự đoán
            })

    player_data


    df1 = pd.DataFrame(player_data)
    # Write 1 lần thôi - nhớ comment lại lần sau chỉ cần đọc file
    df1.to_csv("./data_model/player_data_prices.csv", index=False)
# Hoặc đọc file 

In [5]:
# Đọc file thay vì chạy lại
df= pd.read_csv("./data_model/player_data_prices.csv")

In [6]:
# Chuẩn bị dữ liệu huấn luyện
X = df.drop(columns=["playerid", "target_price"])
y = df["target_price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Huấn luyện mô hình XGBoost
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Đánh giá mô hình
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Dự đoán giá tối ưu cho người chơi mới
def suggest_game_for_player(playerid):
    # Lấy danh sách game của người chơi từ player_data
    player_info = df[df["playerid"] == playerid]
    
    if player_info.empty:
        return "Không tìm thấy dữ liệu người chơi này."
    
    # Trích xuất thông tin về giá
    mean_price = player_info["mean_price"].values[0]
    median_price = player_info["median_price"].values[0]
    min_price = player_info["min_price"].values[0]
    max_price = player_info["max_price"].values[0]
    last_5_avg = player_info["last_5_avg"].values[0]

    # Dự đoán mức giá phù hợp
    feature_vector = np.array([[mean_price, median_price, min_price, max_price, last_5_avg]])
    predicted_price = model.predict(feature_vector)[0]

    # Tìm các game có giá gần nhất với mức giá này
    suggested_games = prices.iloc[(prices["price_usd"] - predicted_price).abs().argsort()[:10]]  # Lấy 10 game gần nhất
    
    return suggested_games



RMSE: 0.24174656623292673


In [7]:
# DDự đoán cho một người chơi mới với danh sách gameid đã mua
test_player = 76561198060698936
print(suggest_game_for_player(test_player))

        gameid date_acquired  price_usd
49635  1582650    2025-02-24  22.695552
16231   667970    2025-02-24  22.693167
67504  2116800    2025-02-24  22.710504
28225   991780    2025-02-24  22.712488
1361    200510    2025-02-24  22.679996
43721  1417880    2025-02-24  22.674023
66925  2099880    2025-02-24  22.723771
18471   724250    2025-02-24  22.723771
28452   998990    2025-02-24  22.723771
61261  1911610    2025-02-24  22.667224


# MODEL 2 - Content-Based Filtering

In [8]:
import pandas as pd
import re
import ast 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
from sklearn.decomposition import TruncatedSVD


In [10]:
# Chạy file popular game trước để lấy 10 % game phổ biến

purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # playerid, library
top_10_percent_game = pd.read_csv("./data_model/top_10_percent_games.csv")  # gameid, title, genres, description
achievements = pd.read_csv("./clean_datasets/achievements.csv")  # achievementid, gameid, title, description
history = pd.read_csv("./clean_datasets/history.csv")  # playerid, achievementid, date_acquired

In [11]:
def preprocess_text(text):
    """ Hàm tiền xử lý văn bản: chuyển thành chữ thường và loại bỏ dấu câu """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Loại bỏ khoảng trắng thừa
    text = re.sub(r'[^\w\s]', '', text)  # Loại bỏ dấu câu
    return text

top_10_percent_game['processed_title'] = top_10_percent_game['title'].apply(preprocess_text)
top_10_percent_game['processed_genres'] = top_10_percent_game['genres'].apply(preprocess_text)
top_10_percent_game['processed_developers'] = top_10_percent_game['developers'].apply(preprocess_text)
top_10_percent_game['processed_publishers'] = top_10_percent_game['publishers'].apply(preprocess_text)
top_10_percent_game['processed_languages'] = top_10_percent_game['supported_languages'].apply(preprocess_text)

# Tạo một cột tổng hợp các thông tin để vector hóa
top_10_percent_game['combined_features'] = top_10_percent_game['processed_title'] + ' ' + \
                              top_10_percent_game['processed_genres'] + ' ' + \
                              top_10_percent_game['processed_developers'] + ' ' + \
                              top_10_percent_game['processed_publishers'] + ' ' + \
                              top_10_percent_game['processed_languages']
top_10_percent_game

Unnamed: 0,gameid,title,developers,publishers,genres,supported_languages,release_date,purchase_count,processed_title,processed_genres,processed_developers,processed_publishers,processed_languages,combined_features
0,730,Counter-Strike 2,['Valve'],['Valve'],"['Action', 'Free To Play']","['Czech', 'Danish', 'Dutch', 'English', 'Finni...",2012-08-21,43967.0,counterstrike 2,action free to play,valve,valve,czech danish dutch english finnish french germ...,counterstrike 2 action free to play valve valv...
1,578080,PUBG: BATTLEGROUNDS,['PUBG Corporation'],"['KRAFTON, Inc.']","['Action', 'Adventure', 'Massively Multiplayer...","['English', 'Korean', 'Simplified Chinese', 'F...",2017-12-21,28356.0,pubg battlegrounds,action adventure massively multiplayer free to...,pubg corporation,krafton inc,english korean simplified chinese french germa...,pubg battlegrounds action adventure massively ...
2,550,Left 4 Dead 2,['Valve'],['Valve'],['Action'],"['Danish', 'Dutch', 'English', 'Finnish', 'Fre...",2009-11-16,24727.0,left 4 dead 2,action,valve,valve,danish dutch english finnish french german ita...,left 4 dead 2 action valve valve danish dutch ...
3,218620,PAYDAY 2,['OVERKILL - a Starbreeze Studio.'],['Starbreeze Publishing AB'],"['Action', 'RPG']","['English', 'German', 'French', 'Italian', 'Sp...",2013-08-13,23785.0,payday 2,action rpg,overkill a starbreeze studio,starbreeze publishing ab,english german french italian spanish spain d...,payday 2 action rpg overkill a starbreeze stu...
4,304930,Unturned,['Smartly Dressed Games'],['Smartly Dressed Games'],"['Action', 'Adventure', 'Casual', 'Indie', 'Fr...",['English'],2017-07-07,21620.0,unturned,action adventure casual indie free to play,smartly dressed games,smartly dressed games,english,unturned action adventure casual indie free to...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9819,1227280,Despot's Game: Dystopian Battle Simulator,['Konfa Games'],['tinyBuild'],"['Indie', 'RPG', 'Strategy']","['English', 'Russian', 'French', 'Italian', 'G...",2022-09-29,211.0,despots game dystopian battle simulator,indie rpg strategy,konfa games,tinybuild,english russian french italian german spanish ...,despots game dystopian battle simulator indie ...
9820,315460,Dig or Die,['Gaddy Games'],['Gaddy Games'],"['Action', 'Indie', 'RPG', 'Strategy']","['English', 'French', 'Russian', 'Simplified C...",2018-07-10,211.0,dig or die,action indie rpg strategy,gaddy games,gaddy games,english french russian simplified chinese germ...,dig or die action indie rpg strategy gaddy gam...
9821,798280,CrocoMars,['Anatoliy Loginovskikh'],['Anatoliy Loginovskikh'],"['Action', 'Adventure', 'Free To Play', 'Indie...","['English', 'Russian']",2018-03-05,211.0,crocomars,action adventure free to play indie simulation...,anatoliy loginovskikh,anatoliy loginovskikh,english russian,crocomars action adventure free to play indie ...
9822,736110,Deep Sorrow,['QuickSave'],['SA Industry'],"['Adventure', 'Indie']","['English', 'Turkish']",2017-11-12,211.0,deep sorrow,adventure indie,quicksave,sa industry,english turkish,deep sorrow adventure indie quicksave sa indus...


In [12]:
def preprocess_purchased_games(purchased_games):
    def safe_eval(val):
        if isinstance(val, str):  
            try:
                return ast.literal_eval(val)
            except (ValueError, SyntaxError):
                return []  # Trả về danh sách rỗng nếu lỗi
        elif isinstance(val, list):
            return val  
        return []  # Trả về danh sách rỗng nếu không phải list

    purchased_games['library'] = purchased_games['library'].apply(safe_eval)
    return purchased_games

In [13]:
purchased_games = preprocess_purchased_games(purchased_games)  # Chuyển đổi cột 'library'

In [14]:
import os
if (not os.path.exists('./data_model/cosine_similarity.csv')):

    # Chạy 1 lần rồi sau đó đọc file thôi
    # Tạo vector TF-IDF cho các đặc trưng tổng hợp của game
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    games_tfidf = tfidf_vectorizer.fit_transform(top_10_percent_game['combined_features'])

    # Tính toán độ tương đồng cosine giữa các game
    cosine_sim = cosine_similarity(games_tfidf)

    # Lưu kết quả cosine similarity vào CSV để tái sử dụng
    cosine_sim_df = pd.DataFrame(cosine_sim, index=top_10_percent_game['gameid'], columns=top_10_percent_game['gameid'])
    cosine_sim_df.to_csv('./data_model/cosine_similarity.csv', index=True)


In [15]:
cosine_sim_df = pd.read_csv('./data_model/cosine_similarity.csv', index_col='gameid')
cosine_sim_df

Unnamed: 0_level_0,730,578080,550,218620,304930,1172470,230410,271590,407530,4000,...,462530,339110,444590,2129490,2777190,1227280,315460,798280,736110,489580
gameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730,1.000000,0.231469,0.834490,0.153933,0.030013,0.256865,0.211969,0.187524,0.226219,0.601471,...,0.001282,0.105134,0.006881,0.096230,0.406934,0.138206,0.155061,0.040831,0.024253,0.006410
578080,0.231469,1.000000,0.206466,0.081970,0.032027,0.218630,0.190309,0.133918,0.155094,0.205203,...,0.005361,0.090268,0.011976,0.084308,0.236089,0.122069,0.121766,0.042780,0.030220,0.005897
550,0.834490,0.206466,1.000000,0.154977,0.005417,0.222664,0.182262,0.188796,0.227752,0.605548,...,0.001291,0.105847,0.006928,0.096883,0.379674,0.139143,0.156112,0.011811,0.024417,0.006453
218620,0.153933,0.081970,0.154977,1.000000,0.005581,0.169418,0.107787,0.143649,0.124385,0.106598,...,0.001330,0.040727,0.019069,0.062066,0.136441,0.099878,0.122118,0.012167,0.002515,0.006648
304930,0.030013,0.032027,0.005417,0.005581,1.000000,0.043320,0.032363,0.026432,0.021066,0.010907,...,0.005003,0.015444,0.014288,0.007952,0.045409,0.022194,0.039364,0.038184,0.013819,0.008401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227280,0.138206,0.122069,0.139143,0.099878,0.022194,0.161300,0.146963,0.147407,0.122043,0.142645,...,0.001493,0.055945,0.019633,0.089149,0.158117,1.000000,0.177982,0.023495,0.008733,0.005808
315460,0.155061,0.121766,0.156112,0.122118,0.039364,0.161307,0.146181,0.155724,0.139634,0.155366,...,0.001329,0.049821,0.022825,0.084098,0.159840,0.177982,1.000000,0.025856,0.007777,0.010147
798280,0.040831,0.042780,0.011811,0.012167,0.038184,0.057864,0.044027,0.018359,0.023119,0.022900,...,0.005910,0.007317,0.016879,0.014597,0.053023,0.023495,0.025856,1.000000,0.016325,0.009925
736110,0.024253,0.030220,0.024417,0.002515,0.013819,0.013713,0.026151,0.011627,0.028651,0.030642,...,0.008962,0.043389,0.017673,0.028392,0.030481,0.008733,0.007777,0.016325,1.000000,0.007670


In [16]:
def recommend_games_for_player(playerid, purchased_games, cosine_sim_df, games_df, top_n=10):
    """Gợi ý game dựa trên lịch sử mua game của người chơi."""
    
    # Chuyển playerid và gameid về kiểu string để tránh lỗi
    purchased_games['playerid'] = purchased_games['playerid'].astype(str)
    playerid = str(playerid)

    # Lọc danh sách game mà người chơi đã mua
    player_games = purchased_games[purchased_games['playerid'] == playerid]

    if player_games.empty:
        print(f"⚠️ Không tìm thấy người chơi {playerid} trong danh sách purchased_games.")
        return games_df.head(top_n)[['gameid', 'title', 'genres']]  # Trả về game phổ biến
    
    # Lấy danh sách game đã mua
    purchased_game_ids = player_games['library'].values[0]
    
    # Nếu danh sách rỗng, trả về game phổ biến
    if not purchased_game_ids:
        print(f"⚠️ Người chơi {playerid} chưa mua game nào.")
        return games_df.head(top_n)[['gameid', 'title', 'genres']]
    
    # Precompute the set for faster membership checking
    cosine_index_set = set(cosine_sim_df.index)

    # print(set(cosine_sim_df.index))

    similar_scores_list = []  # List to accumulate series

    for game_id in purchased_game_ids:
        if game_id in cosine_index_set:
            str_game_id = str(game_id)  # Ensure game_id is a string
            try:
                similar_scores_list.append(cosine_sim_df[str_game_id])
            except Exception as e:
                print("Exception")
                

    # Concatenate once outside the loop
    if similar_scores_list:
        similar_scores = pd.concat(similar_scores_list)
    else:
        similar_scores = pd.Series(dtype=float)
        

    # Tổng hợp điểm số, sắp xếp giảm dần
    similar_scores = similar_scores.groupby(similar_scores.index).sum().sort_values(ascending=False)
    
    # Loại bỏ game đã mua, chọn 10 game đầu tiên
    recommended_game_ids = similar_scores[~similar_scores.index.isin(purchased_game_ids)].head(top_n).index
    
    # Nếu không đủ game để gợi ý (vì không tìm thấy trong cosine_sim_df), bổ sung thêm game phổ biến
    if len(recommended_game_ids) < top_n:
        print(f"⚠️ Không đủ game gợi ý. Bổ sung thêm game phổ biến.")
        recommended_game_ids = recommended_game_ids.append(games_df[~games_df['gameid'].isin(purchased_game_ids)].head(top_n - len(recommended_game_ids))['gameid'])
    
    return games_df[games_df['gameid'].isin(recommended_game_ids)][['gameid', 'title', 'genres']]



In [17]:
playerid = 76561198060698936  
recommended_games = recommend_games_for_player(playerid, purchased_games, cosine_sim_df, top_10_percent_game)
print(recommended_games)

       gameid                title  \
658    527230         For The King   
2095  1272080             PAYDAY 3   
2221  1062830                 Embr   
2334   467950     Survive in Space   
2694  1220150            Blue Fire   
3504  1502190           From Space   
5099   934700        Dead Island 2   
5244  1677280  Company of Heroes 3   
5313  1676840      For The King II   
6221   705120    Death Coming/死神来了   

                                                 genres  
658           ['Adventure', 'Indie', 'RPG', 'Strategy']  
2095                     ['Action', 'Adventure', 'RPG']  
2221  ['Action', 'Casual', 'Indie', 'Simulation', 'S...  
2334  ['Action', 'Adventure', 'Casual', 'Indie', 'RPG']  
2694                   ['Action', 'Adventure', 'Indie']  
3504                                ['Action', 'Indie']  
5099                            ['Action', 'Adventure']  
5244                             ['Action', 'Strategy']  
5313          ['Adventure', 'Indie', 'RPG', 'Strategy']  
6

# MODEL 3 - Collaborative Filtering

In [18]:
%pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [19]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
import findspark

In [20]:
findspark.init()
spark = SparkSession.builder.appName("GameRecommendation").getOrCreate()

In [21]:

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
import pickle


In [22]:
purchased_games = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)


In [23]:
purchased_games

Unnamed: 0,playerid,library
0,76561198060698936,"[60, 1670, 3830, 1600, 2900, 2910, 2920, 4800,..."
1,76561198287452552,"[10, 80, 100, 240, 2990, 6880, 6910, 6920, 698..."
2,76561198040436563,"[10, 80, 100, 300, 20, 30, 40, 50, 60, 70, 130..."
3,76561198042412488,"[300, 240, 220, 320, 360, 4300, 4800, 4000, 61..."
4,76561198119605821,"[47870, 108600, 550, 271590, 331470, 381210, 2..."
...,...,...
102543,76561199063275634,[]
102544,76561198003275888,"[3920, 2600, 6980, 4540, 4550, 7830, 22330, 22..."
102545,76561198944668572,[]
102546,76561198033563710,[]


In [24]:
# purchased_games['library'] = purchased_games['library'].apply(lambda x: x.split(','))


In [25]:
purchased_games

Unnamed: 0,playerid,library
0,76561198060698936,"[60, 1670, 3830, 1600, 2900, 2910, 2920, 4800,..."
1,76561198287452552,"[10, 80, 100, 240, 2990, 6880, 6910, 6920, 698..."
2,76561198040436563,"[10, 80, 100, 300, 20, 30, 40, 50, 60, 70, 130..."
3,76561198042412488,"[300, 240, 220, 320, 360, 4300, 4800, 4000, 61..."
4,76561198119605821,"[47870, 108600, 550, 271590, 331470, 381210, 2..."
...,...,...
102543,76561199063275634,[]
102544,76561198003275888,"[3920, 2600, 6980, 4540, 4550, 7830, 22330, 22..."
102545,76561198944668572,[]
102546,76561198033563710,[]


In [26]:
all_games = set()
for games in purchased_games["library"]:
    if isinstance(games, str):  # Nếu là chuỗi, tách game
        cleaned_games = [game.strip("[]") for game in games.split(',')]
    elif isinstance(games, list):  # Nếu là danh sách, xử lý trực tiếp
        cleaned_games = [game.strip("[]") for game in games]
    all_games.update(cleaned_games)

game_to_index = {game: idx for idx, game in enumerate(all_games)}
index_to_game = {idx: game for game, idx in game_to_index.items()}

# Tạo ánh xạ player_id → index
player_to_index = {player_id: idx for idx, player_id in enumerate(purchased_games["playerid"])}




In [27]:
all_games

{'',
 ' 487350',
 '782670',
 ' 625410',
 '225360',
 ' 650820',
 ' 366570',
 ' 2754690',
 ' 108700',
 ' 1315200',
 ' 1115780',
 ' 15200',
 ' 592120',
 '317250',
 ' 1837330',
 ' 339840',
 ' 1567060',
 ' 1495100',
 ' 340470',
 ' 281410',
 ' 611970',
 ' 547000',
 ' 352530',
 ' 1086850',
 ' 2475800',
 ' 695690',
 ' 2287560',
 ' 541760',
 ' 523090',
 ' 389160',
 ' 546440',
 ' 991780',
 ' 2347380',
 '444480',
 '3020',
 ' 765140',
 ' 513790',
 ' 663590',
 ' 348490',
 ' 391420',
 ' 596950',
 ' 2074890',
 ' 15060',
 ' 669280',
 ' 712840',
 ' 578740',
 ' 1766540',
 ' 1329790',
 ' 1065580',
 ' 335560',
 ' 502940',
 ' 2763670',
 ' 787850',
 ' 1292500',
 ' 545690',
 ' 547810',
 ' 389230',
 ' 1016810',
 ' 566180',
 ' 1849760',
 ' 1556100',
 '1888160',
 ' 2332550',
 ' 730430',
 ' 627690',
 ' 929010',
 ' 658610',
 ' 1457080',
 ' 355050',
 ' 1031140',
 ' 298630',
 ' 280720',
 ' 645750',
 ' 2756380',
 ' 981600',
 ' 820020',
 ' 61530',
 ' 660040',
 ' 1253870',
 ' 611760',
 ' 557330',
 ' 666820',
 ' 181058

In [28]:
index_to_game

{0: '',
 1: ' 487350',
 2: '782670',
 3: ' 625410',
 4: '225360',
 5: ' 650820',
 6: ' 366570',
 7: ' 2754690',
 8: ' 108700',
 9: ' 1315200',
 10: ' 1115780',
 11: ' 15200',
 12: ' 592120',
 13: '317250',
 14: ' 1837330',
 15: ' 339840',
 16: ' 1567060',
 17: ' 1495100',
 18: ' 340470',
 19: ' 281410',
 20: ' 611970',
 21: ' 547000',
 22: ' 352530',
 23: ' 1086850',
 24: ' 2475800',
 25: ' 695690',
 26: ' 2287560',
 27: ' 541760',
 28: ' 523090',
 29: ' 389160',
 30: ' 546440',
 31: ' 991780',
 32: ' 2347380',
 33: '444480',
 34: '3020',
 35: ' 765140',
 36: ' 513790',
 37: ' 663590',
 38: ' 348490',
 39: ' 391420',
 40: ' 596950',
 41: ' 2074890',
 42: ' 15060',
 43: ' 669280',
 44: ' 712840',
 45: ' 578740',
 46: ' 1766540',
 47: ' 1329790',
 48: ' 1065580',
 49: ' 335560',
 50: ' 502940',
 51: ' 2763670',
 52: ' 787850',
 53: ' 1292500',
 54: ' 545690',
 55: ' 547810',
 56: ' 389230',
 57: ' 1016810',
 58: ' 566180',
 59: ' 1849760',
 60: ' 1556100',
 61: '1888160',
 62: ' 2332550'

In [29]:
player_to_index

{76561198060698936: 0,
 76561198287452552: 1,
 76561198040436563: 2,
 76561198042412488: 3,
 76561198119605821: 4,
 76561198049686270: 5,
 76561198155814250: 6,
 76561198083492916: 7,
 76561198150634683: 8,
 76561198836367256: 9,
 76561198131958442: 10,
 76561198106910534: 11,
 76561198122070915: 12,
 76561198740883087: 13,
 76561198016756834: 14,
 76561198072740562: 15,
 76561198144540723: 16,
 76561198096557299: 17,
 76561198884939938: 18,
 76561198220441373: 19,
 76561198046610425: 20,
 76561198801797946: 21,
 76561199000909663: 22,
 76561199004958958: 23,
 76561198120109043: 24,
 76561198856855325: 25,
 76561198146275005: 26,
 76561197989446733: 27,
 76561197997318138: 28,
 76561198015133782: 29,
 76561199158139117: 30,
 76561197962051588: 31,
 76561198191613675: 32,
 76561198080851200: 33,
 76561198230516004: 34,
 76561198282337972: 35,
 76561198391791168: 36,
 76561198213729077: 37,
 76561199248076925: 38,
 76561198234203304: 39,
 76561199225844826: 40,
 76561198087768441: 41,
 7

In [30]:
game_to_index

{'': 0,
 ' 487350': 1,
 '782670': 2,
 ' 625410': 3,
 '225360': 4,
 ' 650820': 5,
 ' 366570': 6,
 ' 2754690': 7,
 ' 108700': 8,
 ' 1315200': 9,
 ' 1115780': 10,
 ' 15200': 11,
 ' 592120': 12,
 '317250': 13,
 ' 1837330': 14,
 ' 339840': 15,
 ' 1567060': 16,
 ' 1495100': 17,
 ' 340470': 18,
 ' 281410': 19,
 ' 611970': 20,
 ' 547000': 21,
 ' 352530': 22,
 ' 1086850': 23,
 ' 2475800': 24,
 ' 695690': 25,
 ' 2287560': 26,
 ' 541760': 27,
 ' 523090': 28,
 ' 389160': 29,
 ' 546440': 30,
 ' 991780': 31,
 ' 2347380': 32,
 '444480': 33,
 '3020': 34,
 ' 765140': 35,
 ' 513790': 36,
 ' 663590': 37,
 ' 348490': 38,
 ' 391420': 39,
 ' 596950': 40,
 ' 2074890': 41,
 ' 15060': 42,
 ' 669280': 43,
 ' 712840': 44,
 ' 578740': 45,
 ' 1766540': 46,
 ' 1329790': 47,
 ' 1065580': 48,
 ' 335560': 49,
 ' 502940': 50,
 ' 2763670': 51,
 ' 787850': 52,
 ' 1292500': 53,
 ' 545690': 54,
 ' 547810': 55,
 ' 389230': 56,
 ' 1016810': 57,
 ' 566180': 58,
 ' 1849760': 59,
 ' 1556100': 60,
 '1888160': 61,
 ' 2332550': 62

In [31]:
import os
if (not os.path.exists("./data_model/interaction_matrix.npz")):

    from scipy.sparse import lil_matrix, save_npz, load_npz

    num_players = len(purchased_games)
    num_games = len(all_games)
    interaction_matrix = lil_matrix((num_players, num_games), dtype=np.uint8)

    for i, games in enumerate(purchased_games["library"]):
        if isinstance(games, str):  # Nếu games là chuỗi, chuyển nó thành danh sách
            games = games.strip("[]").split(",")  # Tách chuỗi thành danh sách

        for game in games:
            game = str(game).strip()  # Đảm bảo game ID là chuỗi không có khoảng trắng

            if game in game_to_index:  # Kiểm tra game có tồn tại trong game_to_index không
                # if i < 1:     
                    # print("Abc")
                    # print(game_to_index[game])

                interaction_matrix[i, game_to_index[game]] = 1

    # Chuyển ma trận sang CSR (tối ưu tính toán)
    interaction_matrix = interaction_matrix.tocsr()

    # Lưu ma trận vào file để tái sử dụng mà không cần tính toán lại
    save_npz("./data_model/interaction_matrix.npz", interaction_matrix)




In [32]:
from scipy.sparse import lil_matrix, save_npz, load_npz

interaction_matrix = load_npz("./data_model/interaction_matrix.npz")


In [33]:
data = []
for player_id, games in enumerate(interaction_matrix):
    for game_index in games.nonzero()[1]:
        data.append((int(player_id), int(game_index), float(1)))  # Chuyển đổi kiểu dữ liệu

In [34]:
interaction_matrix

<102548x45093 sparse matrix of type '<class 'numpy.uint8'>'
	with 6139343 stored elements in Compressed Sparse Row format>

In [35]:
data

[(0, 13, 1.0),
 (0, 61, 1.0),
 (0, 124, 1.0),
 (0, 140, 1.0),
 (0, 263, 1.0),
 (0, 264, 1.0),
 (0, 321, 1.0),
 (0, 336, 1.0),
 (0, 344, 1.0),
 (0, 372, 1.0),
 (0, 385, 1.0),
 (0, 420, 1.0),
 (0, 428, 1.0),
 (0, 443, 1.0),
 (0, 464, 1.0),
 (0, 468, 1.0),
 (0, 527, 1.0),
 (0, 550, 1.0),
 (0, 628, 1.0),
 (0, 765, 1.0),
 (0, 775, 1.0),
 (0, 793, 1.0),
 (0, 826, 1.0),
 (0, 831, 1.0),
 (0, 864, 1.0),
 (0, 878, 1.0),
 (0, 880, 1.0),
 (0, 931, 1.0),
 (0, 952, 1.0),
 (0, 1002, 1.0),
 (0, 1012, 1.0),
 (0, 1019, 1.0),
 (0, 1045, 1.0),
 (0, 1083, 1.0),
 (0, 1117, 1.0),
 (0, 1256, 1.0),
 (0, 1280, 1.0),
 (0, 1312, 1.0),
 (0, 1414, 1.0),
 (0, 1440, 1.0),
 (0, 1504, 1.0),
 (0, 1505, 1.0),
 (0, 1558, 1.0),
 (0, 1585, 1.0),
 (0, 1666, 1.0),
 (0, 1715, 1.0),
 (0, 1756, 1.0),
 (0, 1802, 1.0),
 (0, 1811, 1.0),
 (0, 1824, 1.0),
 (0, 1837, 1.0),
 (0, 1846, 1.0),
 (0, 1880, 1.0),
 (0, 1886, 1.0),
 (0, 1920, 1.0),
 (0, 1974, 1.0),
 (0, 2001, 1.0),
 (0, 2042, 1.0),
 (0, 2083, 1.0),
 (0, 2133, 1.0),
 (0, 2148, 

In [36]:
# Tạo DataFrame PySpark
columns = ["player_id", "game_id", "rating"]
df = spark.createDataFrame(data, columns)

In [37]:
df = df.withColumn("player_id", col("player_id").cast("int"))
df = df.withColumn("game_id", col("game_id").cast("int"))
df = df.withColumn("rating", col("rating").cast("float"))
df.printSchema()  # Xác nhận lại kiểu dữ liệu


root
 |-- player_id: integer (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- rating: float (nullable = true)



In [38]:
df

DataFrame[player_id: int, game_id: int, rating: float]

In [39]:
print(spark)


<pyspark.sql.session.SparkSession object at 0x7fa53e870ed0>


In [40]:
df.printSchema()


root
 |-- player_id: integer (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- rating: float (nullable = true)



In [41]:
from pyspark.sql.functions import col, count

In [42]:
# df.count() # Kiểm tra số dòng trong DataFrame
# print(df.schema)  # Xem kiểu dữ liệu của từng cột


In [43]:
# df.describe().show()  # Kiểm tra thống kê dữ liệu
# df.select([col(c).isNull().sum() for c in df.columns]).show()  # Kiểm tra giá trị null
# df.show(5)  # Xem thử 5 dòng đầu

In [44]:
# Khởi tạo mô hình ALS
from pyspark.ml.recommendation import ALS

als = ALS(
    maxIter=10,         # Số vòng lặp
    regParam=0.1,       # Hệ số điều chuẩn
    userCol="player_id",
    itemCol="game_id",
    ratingCol="rating",
    coldStartStrategy="drop"  # Loại bỏ dữ liệu không đủ thông tin
)

# Huấn luyện mô hình
model = als.fit(df)


In [45]:
interaction_matrix

<102548x45093 sparse matrix of type '<class 'numpy.uint8'>'
	with 6139343 stored elements in Compressed Sparse Row format>

In [46]:
interaction_matrix_dense = interaction_matrix.toarray()


In [47]:
interaction_matrix_dense

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [48]:

from scipy.sparse.linalg import svds
import numpy as np
import pickle

# Convert the existing interaction_matrix to float type
interaction_matrix = interaction_matrix.astype(float)

# Now perform SVD
num_components = 50
U_svd, sigma_svd, Vt_svd = svds(interaction_matrix, k=num_components)

# Convert sigma to diagonal matrix
sigma_svd = np.diag(sigma_svd)

# Save results for reuse
with open("./data_model/svd_model.pkl", "wb") as f:
    pickle.dump((U_svd, sigma_svd, Vt_svd), f)

In [49]:
# density = interaction_matrix.nnz / (interaction_matrix.shape[0] * interaction_matrix.shape[1])
# print(f"Mật độ dữ liệu không rỗng: {density:.6f}")

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_users(player_index, user_embeddings_sparse, top_k=20):
    """
    Lấy danh sách top_k người chơi có độ tương đồng cao nhất với player_index
    """
    similarities = cosine_similarity(user_embeddings_sparse[player_index], user_embeddings_sparse).flatten()
    # print(similarities)
    similar_users = np.argsort(similarities)[::-1][1:top_k + 1]  # Lấy top_k người chơi gần nhất
    return similar_users, similarities

In [51]:
def recommend_games(player_id, U_svd, sigma_svd, Vt_svd, interaction_matrix, 
                    game_to_index, index_to_game, player_to_index, num_recommendations=10):
    
    # Kiểm tra xem player_id có trong player_to_index không
    if player_id not in player_to_index:
        print("❌ Player ID không hợp lệ.")
        return []

    # Chuyển player_id thành index
    player_index = player_to_index[player_id]

    # Tạo sparse embedding để tiết kiệm bộ nhớ
    from scipy.sparse import csr_matrix
    user_embeddings_sparse = csr_matrix(U_svd @ sigma_svd)
    print(user_embeddings_sparse)
    # Lấy danh sách người chơi tương tự
    similar_users, similarities = get_similar_users(player_index, user_embeddings_sparse)
    # print(similar_users)
    # print(similarities[similar_users])
    # Tập hợp các game mà những người chơi tương tự đã mua
    recommended_games = set()
    for similar_user in similar_users:
        recommended_games.update(interaction_matrix[similar_user].nonzero()[1])
    # print(recommended_games)
    # Loại bỏ game mà người chơi hiện tại đã mua
    purchased_games = set(interaction_matrix[player_index].nonzero()[1])
    final_recommendations = [index_to_game[idx] for idx in recommended_games if idx not in purchased_games]

    return final_recommendations[:num_recommendations]  # Lấy đúng số lượng cần thiết


In [52]:
player_id = 76561198060698936  # ID của người chơi cần gợi ý
recommended_games = recommend_games(player_id, U_svd, sigma_svd, Vt_svd, 
                                    interaction_matrix, game_to_index, index_to_game, player_to_index)

print("🎮 Game được đề xuất:", recommended_games)


  (0, 0)	-0.17961992283603564
  (0, 1)	-0.540623003329793
  (0, 2)	1.5729651394308188
  (0, 3)	-0.9583456079630254
  (0, 4)	1.4299063662637124
  (0, 5)	-1.0183641458025374
  (0, 6)	3.448572289909638
  (0, 7)	1.3744843397176305
  (0, 8)	0.945269478564186
  (0, 9)	-0.3401382960159317
  (0, 10)	-0.849616015465323
  (0, 11)	-0.46430195232673277
  (0, 12)	-0.3584271840267247
  (0, 13)	-0.39098361771246976
  (0, 14)	-1.8755136717119223
  (0, 15)	-0.2578508332866567
  (0, 16)	1.3549762272209853
  (0, 17)	3.6045352908913357
  (0, 18)	-0.028755695442010027
  (0, 19)	-3.6062459357518937
  (0, 20)	-0.6486748132668381
  (0, 21)	0.8555720446087028
  (0, 22)	1.1127472117431834
  (0, 23)	-0.742455901177111
  (0, 24)	-1.3088371256031466
  :	:
  (102547, 25)	-1.0750307164235916e-17
  (102547, 26)	-2.2771542326745516e-18
  (102547, 27)	-1.0059000376784809e-16
  (102547, 28)	-3.188062864981181e-17
  (102547, 29)	6.674939993855127e-18
  (102547, 30)	-1.034028542030521e-16
  (102547, 31)	2.5682279166976364