In [41]:
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [33]:
df = pd.read_excel('processed_gamelist.xlsx')
df.head()

Unnamed: 0.1,Unnamed: 0,gameId,Name,Price,Rating,hasDLC,inAppPurchases,Release Date,Multiplayer,Action,Adventure,Casual,Early Access,Indie,Massively Multiplayer,RPG,Racing,Simulation,Sports,Strategy
0,0,1903340,Clair Obscur: Expedition 33,67.48,Very Positive,1,0,2025-04-24,0,1,0,0,0,0,0,1,0,0,0,0
1,1,2993780,FANTASY LIFE i: The Girl Who Steals Time,59.99,Very Positive,0,0,2025-05-21,1,0,0,0,0,0,0,1,0,0,0,0
2,2,730,Counter-Strike 2,0.0,Very Positive,1,1,2012-08-21,1,1,0,0,0,0,0,0,0,0,0,0
3,3,1091500,Cyberpunk 2077,59.99,Very Positive,1,0,2020-12-09,0,0,0,0,0,0,0,1,0,0,0,0
4,4,2488370,Cash Cleaner Simulator,14.79,Very Positive,1,0,2025-05-08,0,0,0,1,0,1,0,0,0,1,0,0


In [34]:
df["month"] = df["Release Date"].dt.month
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
df = df.drop(columns=['month'])
df["days_since_2000"] = (df["Release Date"] - pd.Timestamp("2000-01-01")).dt.days
df.columns

Index(['Unnamed: 0', 'gameId', 'Name', 'Price', 'Rating', 'hasDLC',
       'inAppPurchases', 'Release Date', 'Multiplayer', 'Action', 'Adventure',
       'Casual', 'Early Access', 'Indie', 'Massively Multiplayer', 'RPG',
       'Racing', 'Simulation', 'Sports', 'Strategy', 'month_sin', 'month_cos',
       'days_since_2000'],
      dtype='object')

In [35]:
# Split

y = df['Price']
X = df.drop(columns=['Unnamed: 0', 'gameId', 'Name', 'Price', 'Rating', 'Release Date'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [44]:
# 1) Modèle
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=2000,        # grand + early stopping -> il s'arrêtera avant si besoin
    learning_rate=0.03,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    tree_method="hist",       # "gpu_hist" si GPU CUDA dispo
    random_state=42,
    eval_metric="rmse",
)

# 2) Entraînement avec early stopping sur le set de test (ou fais un vrai valid split si tu préfères)
xgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],    # ou un vrai jeu de validation
    verbose=False
)

# 3) Prédictions + métriques
y_pred = xgb.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"Best iteration: {getattr(xgb, 'best_iteration', None)}")
print(f"RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")

# 4) Importances de features (gain)
imp = pd.Series(xgb.feature_importances_, index=X.columns).sort_values(ascending=False)
print(imp.head(10))

Best iteration: None
RMSE: 34.720 | MAE: 13.414 | R²: -0.233
Adventure                0.103669
hasDLC                   0.095365
inAppPurchases           0.087974
month_cos                0.077146
Strategy                 0.075785
Indie                    0.071004
Massively Multiplayer    0.068074
Action                   0.064334
days_since_2000          0.060707
month_sin                0.057646
dtype: float32
