# Lancer le pipeline sur Google Colab
Ce notebook installe les dépendances, charge vos données, encode les labels en valeurs numériques (format attendu par XGBoost), puis entraîne et évalue le modèle. Adaptez `DATA_PATH` et `TARGET_COL` à vos fichiers.

In [None]:
# 1) Install the dependencies (Colab)
# The leading `!` hands the line to the notebook's shell; `-q` keeps pip's output quiet.
!pip install -q pandas scikit-learn xgboost

In [None]:
# 2) Imports et configuration
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from xgboost import XGBClassifier

# Settings to adapt to your own files:
# - DATA_PATH: CSV to load (upload it to Colab or mount Drive first)
# - TARGET_COL: name of the column to predict
DATA_PATH = "/content/csv_games_fusionnes.csv"
TARGET_COL = "rb1"

# Keep DataFrame previews compact in the notebook output.
for _option, _limit in (("display.max_rows", 20), ("display.max_columns", 50)):
    pd.set_option(_option, _limit)


In [None]:
# 3) Load the data and inspect the labels

df = pd.read_csv(DATA_PATH)
n_rows, n_cols = df.shape
print(f"Dataset: {n_rows} lignes, {n_cols} colonnes")

# Stop here with an explicit message when the configured target is not a column.
if TARGET_COL not in df.columns:
    raise ValueError(f"Colonne cible '{TARGET_COL}' absente du dataset")

# Distinct target values, missing entries excluded.
target_values = df[TARGET_COL].dropna()
unique_labels = target_values.unique()
print(f"Labels uniques: {len(unique_labels)}")
print("Aperçu:", unique_labels[:20])

In [None]:
# 4) Numeric encoding of the classes (LabelEncoder)

# The stratified train/test split performed later raises a ValueError as soon
# as one class has a single sample. Such classes are excluded here, before the
# ids are assigned, so the encoded ids stay contiguous (0..K-1).
MIN_SAMPLES_PER_CLASS = 2

class_counts = df[TARGET_COL].value_counts()  # missing targets are not counted
rare_labels = class_counts.index[class_counts < MIN_SAMPLES_PER_CLASS]
is_rare = df[TARGET_COL].isin(rare_labels)

# Rows kept for modelling: target present and class frequent enough.
mask = df[TARGET_COL].notna() & ~is_rare
if len(rare_labels) > 0:
    print(
        f"Classes ignorées (< {MIN_SAMPLES_PER_CLASS} exemples): "
        f"{len(rare_labels)} ({int(is_rare.sum())} lignes)"
    )
if not mask.any():
    raise ValueError(
        f"Aucune ligne exploitable pour la colonne cible '{TARGET_COL}'"
    )

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df.loc[mask, TARGET_COL])
print(f"Classes encodées: {len(label_encoder.classes_)}")

# Preview of the first 15 label -> id pairs; the id is the position in classes_.
print("Mapping (label -> id) aperçu:")
for idx, lbl in enumerate(label_encoder.classes_[:15]):
    print(f"  {lbl} -> {idx}")

In [None]:
# 5) Feature preparation and train/test split

# Every column except the target is used as a feature, restricted to the rows
# selected by `mask` so that X lines up with y_encoded.
feature_cols = [col for col in df.columns if col != TARGET_COL]
X_raw = df.loc[mask, feature_cols]

# One-hot encode the non-numeric columns, keeping every indicator level.
X = pd.get_dummies(X_raw, drop_first=False)

# 80/20 split, reproducible, with class proportions preserved in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

train_shape, test_shape = X_train.shape, X_test.shape
print(f"X train: {train_shape} | X test: {test_shape}")

In [None]:
# 6) XGBoost training (binary or multi-class)

num_classes = len(label_encoder.classes_)
if num_classes < 2:
    raise ValueError(
        f"Au moins 2 classes sont nécessaires pour l'entraînement (trouvé: {num_classes})"
    )

# The multi-class objective is only selected when there are more than two
# classes. NOTE(review): with exactly two classes, XGBClassifier is expected to
# decode predictions as a binary problem, so "multi:softprob" would not yield
# class ids from predict() -- confirm against the installed xgboost version.
if num_classes == 2:
    objective_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
    }
else:
    objective_params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
        "eval_metric": "mlogloss",
    }

model = XGBClassifier(
    **objective_params,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist",
)

model.fit(X_train, y_train)
print("Modèle entraîné ✅")

In [None]:
# 7) Evaluation and decoding of the predictions

y_pred_ids = model.predict(X_test)

# Global scores are computed directly on the encoded ids.
acc = accuracy_score(y_test, y_pred_ids)
f1 = f1_score(y_test, y_pred_ids, average="macro")
print(f"Accuracy: {acc:.3f} | F1-macro: {f1:.3f}")

# Map ids back to the original labels so the per-class report is readable.
y_test_labels, y_pred_labels = (
    label_encoder.inverse_transform(ids) for ids in (y_test, y_pred_ids)
)

print("\nClassification report (labels décodés):")
report = classification_report(y_test_labels, y_pred_labels)
print(report)

In [None]:
# 8) Optional: save the model and the encoder in the Colab filesystem
import joblib

# The encoder is saved next to the model: it is needed to turn predicted ids
# back into the original labels.
model_path = "/content/xgb_model.pkl"
encoder_path = "/content/label_encoder.pkl"
for artifact, destination in ((model, model_path), (label_encoder, encoder_path)):
    joblib.dump(artifact, destination)
print(f"Fichiers sauvegardés: {model_path} et {encoder_path}")