# Model 1: Ideal price prediction using XGBRegressor
- predict suitable game price for userid based on purchased_game, rcm games close to this price
- input: purchased_game, prices

In [1]:
import pandas as pd

In [2]:
purchased_games = pd.read_csv('./clean_datasets/purchased_games_demo.csv')
purchased_games['library'] = purchased_games['library'].apply(eval)

prices = pd.read_csv('./clean_datasets/prices.csv')

In [3]:
rows = []
for index, row in purchased_games.iterrows():
    for game in row['library']:
        rows.append({'playerid': row['playerid'], 'gameid': game})
        
purchased_df = pd.DataFrame(rows)

In [4]:
purchase_with_price = purchased_df.merge(prices, on='gameid', how='left')[['playerid', 'gameid', 'price_usd']]

In [5]:
purchase_with_price.dropna(subset=['price_usd'], inplace=True)

In [6]:
# check null
purchase_with_price.isnull().sum()

playerid     0
gameid       0
price_usd    0
dtype: int64

In [28]:
purchase_with_price

Unnamed: 0,playerid,gameid,price_usd
0,76561198060698936,60,4.99
1,76561198060698936,1670,5.99
2,76561198060698936,3830,9.99
3,76561198060698936,1600,14.99
4,76561198060698936,2900,9.99
...,...,...,...
11258669,76561198003275888,987840,44.99
11258671,76561198003275888,249050,11.99
11258672,76561198003275888,2140020,14.99
11258673,76561198003275888,1010750,9.99


## Calculate target_stats for each player

In [10]:
def compute_stats(df):
    prices = df['price_usd']
    # convert to list
    prices = prices.tolist()
    if (len(prices) < 5):
        return pd.Series({
            'first_purchase': None,
            'second_purchase': None,
            'third_purchase': None,
            'mean_price': None,
            'last_purchase': None, # target
        })
    return pd.Series({
        'first_purchase': prices[-4],
        'second_purchase': prices[-3],
        'third_purchase': prices[-2],
        'mean_price': sum(prices) / len(prices),
        'last_purchase': prices[-1], # target
        # 'median_price': prices.median(),
        # 'min_price': prices.min(),
        # 'max_price': prices.max(),
        # 'last_5_avg': prices.tail(5).mean(),
    })

target_stats = purchase_with_price.groupby('playerid').apply(compute_stats).reset_index()
# remove player with None first purchase
target_stats.dropna(subset=['first_purchase'], inplace=True)


  target_stats = purchase_with_price.groupby('playerid').apply(compute_stats).reset_index()


In [11]:
target_stats

Unnamed: 0,playerid,first_purchase,second_purchase,third_purchase,mean_price,last_purchase
0,76561197960270682,24.99,9.99,29.99,7.522958,0.00
1,76561197960272112,2.99,0.00,59.99,12.825287,19.99
2,76561197960272169,0.00,9.99,14.99,7.859773,0.00
3,76561197960273069,19.99,44.99,9.99,21.500769,0.00
4,76561197960273410,49.99,19.79,1.99,9.978214,9.99
...,...,...,...,...,...,...
46925,76561199755008174,9.99,0.00,9.99,3.995882,9.99
46928,76561199769422588,0.00,0.00,0.00,0.000000,0.00
46929,76561199777619741,0.00,0.00,0.00,4.220000,7.99
46932,76561199783370251,0.00,0.00,0.00,4.999167,0.00


In [31]:
# save the target stats to a CSV file
target_stats.to_csv("./data_model/target_stats.csv", index=False)

## Build the model (start from here)

In [12]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [33]:
target_stats = pd.read_csv('./data_model/target_stats.csv')

In [23]:
x = target_stats.drop(columns=['playerid', 'last_purchase'])
# y = target_stats['mean_price']
y = target_stats['last_purchase']

In [28]:
x

Unnamed: 0,first_purchase,second_purchase,third_purchase,mean_price
0,24.99,9.99,29.99,7.522958
1,2.99,0.00,59.99,12.825287
2,0.00,9.99,14.99,7.859773
3,19.99,44.99,9.99,21.500769
4,49.99,19.79,1.99,9.978214
...,...,...,...,...
46925,9.99,0.00,9.99,3.995882
46928,0.00,0.00,0.00,0.000000
46929,0.00,0.00,0.00,4.220000
46932,0.00,0.00,0.00,4.999167


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [25]:
model = XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
model.fit(x_train, y_train)


## Predict and recommend

In [54]:
# get price for top 10% of games
top_10_percent_games = pd.read_csv('./clean_datasets/top_10_percent_games.csv')

In [59]:
def get_recommended_games(playerid, top_n=10):
    
    user_features = target_stats[target_stats['playerid'] == playerid].drop(columns=["playerid"])
    predicted_price = model.predict(user_features)[0]
    print(f"Ideal price for user: ${predicted_price:.2f}")

    owned_games = purchased_games[purchased_games['playerid'] == playerid]['library'].values[0]
    
    # Filter out owned games
    recommender_pool = prices[~prices['gameid'].isin(owned_games)]
    recommender_pool = recommender_pool[recommender_pool['gameid'].isin(top_10_percent_games['gameid'].tolist())]
        
    recommender_pool['price_diff'] = (recommender_pool['price_usd'] - predicted_price).abs()

    recommended_games = recommender_pool.sort_values('price_diff').head(top_n)
    recommended_games = recommended_games[['gameid', 'price_usd']]
    
    # include game info
    recommended_games = recommended_games.merge(top_10_percent_games, on='gameid', how='left')
    
    return recommended_games[['gameid', 'price_usd', 'title']]

In [16]:
import numpy as np

In [48]:
purchase_with_price

# print max price
max_price = purchase_with_price['price_usd'].min()
max_price

np.float64(0.0)

In [18]:
target_stats

Unnamed: 0,playerid,first_purchase,second_purchase,third_purchase,mean_price,last_purchase
0,76561197960270682,24.99,9.99,29.99,7.522958,0.00
1,76561197960272112,2.99,0.00,59.99,12.825287,19.99
2,76561197960272169,0.00,9.99,14.99,7.859773,0.00
3,76561197960273069,19.99,44.99,9.99,21.500769,0.00
4,76561197960273410,49.99,19.79,1.99,9.978214,9.99
...,...,...,...,...,...,...
46925,76561199755008174,9.99,0.00,9.99,3.995882,9.99
46928,76561199769422588,0.00,0.00,0.00,0.000000,0.00
46929,76561199777619741,0.00,0.00,0.00,4.220000,7.99
46932,76561199783370251,0.00,0.00,0.00,4.999167,0.00


In [41]:
user_feature = target_stats[target_stats['playerid'] == 76561199755008174][['first_purchase', 'second_purchase', 'third_purchase', 'mean_price']]

In [42]:
user_feature = user_feature.values[0]
user_feature = np.array(user_feature).reshape(1, -1)
predicted_price = model.predict(user_feature)[0]

In [43]:
predicted_price

np.float32(3.328533)

In [44]:
from sklearn.metrics import mean_squared_error

# Đánh giá mô hình
y_pred = model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 16.169415990310537


In [36]:
predicted_price = model.predict(user_feature)[0]

XGBoostError: [18:39:54] C:\actions-runner\_work\xgboost\xgboost\src\predictor\cpu_predictor.cc:779: Check failed: m->NumColumns() == model.learner_model_param->num_feature (1 vs. 4) : Number of columns in data must equal to trained model.

In [60]:
get_recommended_games(76561197963765138, 10)


Ideal price for user: $5.83


Unnamed: 0,gameid,price_usd,title
0,1640,5.99,Disciples II: Gallean's Return
1,391660,5.99,Incitement 3
2,255940,5.99,The 11th Hour
3,337450,5.99,Dream Tale
4,459130,5.99,Love in the Glen
5,318220,5.99,Enforcer: Police Crime Action
6,318430,5.99,Squishy the Suicidal Pig
7,351820,5.99,Z.A.R.
8,352890,5.99,Caravan
9,798610,5.99,Mermaid Adventures: The Frozen Time


In [None]:
# check if a game is in the library
# def is_game_in_library(user_id, game_id):
#     user_games = purchased[purchased['playerid'] == user_id]['library'].values[0]
#     return game_id in user_games

# print(is_game_in_library(76561198220441373, 2100))  # Example game ID
