# Tennis Match Prediction with Optuna

## Match parameters

✅ Données d’entrée nécessaires
Voici les caractéristiques essentielles (features) à fournir pour chaque match :

|Variable|	Type|	Description|
| --- | --- | --- |
|Rank_1|	int|	Classement ATP de Player_1|
|Rank_2|	int|	Classement ATP de Player_2|
|Pts_1|	int|	Points ATP de Player_1|
|Pts_2|	int|	Points ATP de Player_2|
|Odd_1|	float|	Cote pré-match de Player_1 (optionnelle mais utile)|
|Odd_2|	float|	Cote pré-match de Player_2|
|Surface|	str|	Surface du match (Hard, Clay, Grass)|
|Round|	str|	Tour du match (1st Round, Quarterfinal, etc.)|
|Best of|	int|	Format du match : nombre maximal de sets (3 ou 5)|
|Court|	str|	Indoor / Outdoor|

## Installation des bibliothèques

In [None]:
# Install the packages listed in requirements.txt.
pip install -r requirements.txt

## Data Acquisition

In [None]:
import csv
from datetime import datetime, timedelta

def parse_date_flexible(date_str):
    """Parse a date string by trying several known formats in turn.

    Args:
        date_str: Text to parse. Leading and trailing whitespace is ignored.
            Anything that is not a string (for example ``None``, which
            ``csv.DictReader`` yields for a missing cell) is treated as
            unparseable instead of raising.

    Returns:
        A naive ``datetime`` built with the first format that matches, or
        ``None`` when the value is not a string or matches no format.
    """
    if not isinstance(date_str, str):
        return None

    texte = date_str.strip()
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%d/%m/%Y %H:%M', '%d/%m/%Y'):
        try:
            return datetime.strptime(texte, fmt)
        except ValueError:
            # Not this format: try the next one.
            continue
    return None

def filtrer_et_normaliser_matchs(fichier_entree, fichier_sortie, jours_max=1825):
    """Copy the recent matches of a CSV file to a new file with uniform dates.

    Every row whose ``Date`` column can be parsed and is at most ``jours_max``
    days older than the current time is written to ``fichier_sortie`` with
    ``Date`` rewritten as ``YYYY-MM-DD HH:MM:SS``. Rows with a missing or
    unparseable date are dropped. Rows dated in the future are kept, because
    their age is negative.

    Args:
        fichier_entree: Path of the comma-separated input file (UTF-8, with a
            header row).
        fichier_sortie: Path of the output file; it is overwritten.
        jours_max: Maximum age of a match in days. Defaults to 1825
            (about five years), the value that used to be hard-coded.

    Returns:
        None. The number of rows kept is printed.
    """
    maintenant = datetime.now()
    fenetre = timedelta(days=jours_max)
    matchs_conserves = 0

    with open(fichier_entree, mode='r', encoding='utf-8') as infile, \
         open(fichier_sortie, mode='w', newline='', encoding='utf-8') as outfile:

        lecteur = csv.DictReader(infile, delimiter=',')
        champs = lecteur.fieldnames

        # An empty input file has no header (fieldnames is None); writing a
        # header from it would raise, so the output is simply left empty.
        if champs:
            ecrivain = csv.DictWriter(outfile, fieldnames=champs, delimiter=',')
            ecrivain.writeheader()

            for ligne in lecteur:
                # DictReader fills missing cells with None; normalise to ''
                # so the parser reports "unparseable" instead of failing.
                date_match = parse_date_flexible(ligne.get('Date') or '')
                if date_match and (maintenant - date_match <= fenetre):
                    # Normalise the date format
                    ligne['Date'] = date_match.strftime('%Y-%m-%d %H:%M:%S')
                    ecrivain.writerow(ligne)
                    matchs_conserves += 1

    # The message states the actual window instead of a fixed "5 years".
    print(f"{matchs_conserves} matchs conservés (joués dans les {jours_max} derniers jours).")


# Extract the ATP matches played within the last five years
filtrer_et_normaliser_matchs('../data/atp_tennis.csv', '../data/matches_atp_5_dernieres_années.csv')

# Extract the WTA matches of the last three months (currently disabled)
#filtrer_et_normaliser_matchs('../data/wta.csv', '../data/matches_wta_3_derniers_mois.csv')


#### Constitution du DF

In [None]:
import pandas as pd
from collections import defaultdict, deque
import re
import joblib

### 1. Load the filtered ATP matches written by the data-acquisition step
df = pd.read_csv("../data/matches_atp_5_dernieres_années.csv")

def count_total_games(score_str):
    """Add up the games of every ``<digits>-<digits>`` pair found in a score.

    Args:
        score_str: Score text such as ``"6-4 7-6"``, or a missing value.

    Returns:
        ``None`` when ``score_str`` is missing according to ``pd.isna``;
        otherwise the sum of both numbers of every pair found (0 when the
        text contains no pair).
    """
    if not pd.isna(score_str):
        total = 0
        for games_left, games_right in re.findall(r'(\d+)-(\d+)', score_str):
            total += int(games_left) + int(games_right)
        return total
    return None

# Total number of games per match; rows without a score are dropped.
df['Total_Games'] = df['Score'].apply(count_total_games)
df = df.dropna(subset=['Total_Games'])

# 🎯 Classes of total games: 4-game-wide buckets 15-18, 19-22, ..., 35-38.
# right=False makes every bucket [b, b + 4), so each label names exactly the
# game counts it contains. With pd.cut's default right-closed buckets the
# label "15-18" actually held 15..19 and "19-22" held 20..23.
bins = list(range(15, 40, 4))
labels = [f"{b}-{b+3}" for b in bins[:-1]]
df["Games_Class"] = pd.cut(df["Total_Games"], bins=bins, labels=labels, right=False)
# Matches whose total falls outside 15..38 belong to no bucket and are dropped.
df = df.dropna(subset=["Games_Class"])

# 🎯 Base features derived from rankings, points and bookmaker odds
df = df.assign(
    Rank_Diff=df["Rank_1"].sub(df["Rank_2"]),
    Pts_Diff=df["Pts_1"].sub(df["Pts_2"]),
    Odds_Ratio=df["Odd_1"].div(df["Odd_2"]),
    # 1 when the bookmaker gives Player_1 the lower odd
    Book_Fav=df["Odd_1"].lt(df["Odd_2"]).astype(int),
    Avg_Rank=df["Rank_1"].add(df["Rank_2"]).div(2),
    Odd_Diff=df["Odd_1"].sub(df["Odd_2"]).abs(),
)

# Ordinal position of each round; rounds missing from this list map to NaN.
round_order = {
    round_name: position
    for position, round_name in enumerate(
        [
            "1st Round", "2nd Round", "3rd Round", "4th Round",
            "Quarterfinal", "Semifinal", "Final",
        ],
        start=1,
    )
}
df["Round_Ordinal"] = df["Round"].map(round_order)

### 2. Cleaning
# Columns that must be present for a match to be usable.
_required_columns = [
    'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2',
    'Surface', 'Round', 'Best of', 'Court',
]
df = df.dropna(subset=_required_columns)
# Target: 1 when Player_1 is the recorded winner, 0 otherwise.
df['Winner_encoded'] = df['Winner'].eq(df['Player_1']).astype(int)
# Unparseable dates become NaT.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

### 3. Load the complete history used for H2H and recent form
full_df = pd.read_csv("../data/atp_tennis.csv").dropna(
    subset=["Player_1", "Player_2", "Winner", "Date"]
)
full_df["Date"] = pd.to_datetime(full_df["Date"], errors='coerce')
# Drop rows whose date could not be parsed, then order chronologically.
full_df = full_df.dropna(subset=["Date"]).sort_values("Date")

### 4. Head-to-head (H2H) history
# h2h_dict maps the alphabetically sorted pair (player_a, player_b) to
# [wins of player_a, wins of player_b] over the whole history. Wins are
# counted against the SORTED pair: counting them by the row's
# Player_1/Player_2 position mixed both players' wins whenever the same pair
# appeared in both orders, while every reader of the dict assumes sorted order.
h2h_dict = defaultdict(lambda: [0, 0])

# Pre-match snapshot (wins of Player_1, wins of Player_2) for each
# (Player_1, Player_2, Date). full_df is sorted by Date, so the snapshot only
# contains earlier matches and never the result of the match itself.
h2h_before_match = {}

for _, row in full_df.iterrows():
    p1, p2, winner = row["Player_1"], row["Player_2"], row["Winner"]
    key = tuple(sorted([p1, p2]))
    wins_a, wins_b = h2h_dict[key]
    h2h_before_match.setdefault(
        (p1, p2, row["Date"]),
        (wins_a, wins_b) if p1 == key[0] else (wins_b, wins_a),
    )
    if winner == key[0]:
        h2h_dict[key][0] += 1
    elif winner == key[1]:
        h2h_dict[key][1] += 1


def get_h2h(p1, p2):
    """Return [wins of p1, wins of p2] over the whole recorded history.

    The totals include every match in ``h2h_dict``; use them for future
    matches only, not as training features of past matches.
    """
    key = tuple(sorted([p1, p2]))
    h2h = h2h_dict.get(key, [0, 0])
    return h2h if p1 <= p2 else h2h[::-1]


# Training features use the pre-match snapshot, not the final totals, which
# would leak the outcome being predicted. Matches that cannot be found in the
# history (same players and same date) get 0-0.
_h2h_pairs = [
    h2h_before_match.get(match_key, (0, 0))
    for match_key in zip(df["Player_1"], df["Player_2"], df["Date"])
]
df["H2H_P1"] = [wins_p1 for wins_p1, _ in _h2h_pairs]
df["H2H_P2"] = [wins_p2 for _, wins_p2 in _h2h_pairs]
df["H2H_Diff"] = df["H2H_P1"] - df["H2H_P2"]

### 5. Recent form (wins in each player's last 5 matches)
recent_form = defaultdict(lambda: deque(maxlen=5))
win_history = {}

# Switch to a positional index BEFORE recording the history: win_history is
# keyed by the loop index and joined back on the index below. Resetting only
# after the loop paired each match with another row's form whenever rows had
# been dropped or reordered.
full_df = full_df.reset_index(drop=True)

for i, row in full_df.iterrows():
    p1, p2, winner = row["Player_1"], row["Player_2"], row["Winner"]
    # Form is read before the current result is appended, so it only
    # reflects earlier matches.
    win_p1 = sum(recent_form[p1])
    win_p2 = sum(recent_form[p2])
    win_history[i] = (win_p1, win_p2)
    recent_form[p1].append(1 if winner == p1 else 0)
    recent_form[p2].append(1 if winner == p2 else 0)

# Form of every player after the last match of the history.
recent_form_dict = {player: sum(matches) for player, matches in recent_form.items()}

# Attach the pre-match form to the history rows
win_df = pd.DataFrame.from_dict(win_history, orient='index', columns=["Wins_Last5_P1", "Wins_Last5_P2"])
full_df = pd.concat([full_df, win_df], axis=1)

# Merge into df. One row per key, so a duplicated key in the history cannot
# multiply rows of df.
form_lookup = full_df[
    ["Player_1", "Player_2", "Date", "Wins_Last5_P1", "Wins_Last5_P2"]
].drop_duplicates(subset=["Player_1", "Player_2", "Date"])
df = pd.merge(df, form_lookup, on=["Player_1", "Player_2", "Date"], how="left")

df["Form_Diff"] = df["Wins_Last5_P1"] - df["Wins_Last5_P2"]
# Matches without a counterpart in the history get a neutral form of 0.
df[["Wins_Last5_P1", "Wins_Last5_P2", "Form_Diff"]] = df[["Wins_Last5_P1", "Wins_Last5_P2", "Form_Diff"]].fillna(0)

### 6. Final feature lists
# Numeric features, grouped by origin; the concatenation order is the column
# order handed to the models.
_raw_match_features = ['Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2', 'Best of']
_derived_features = [
    'Rank_Diff', 'Pts_Diff', 'Odds_Ratio', 'Book_Fav', 'Avg_Rank',
    'Odd_Diff', 'Round_Ordinal',
]
_h2h_features = ['H2H_P1', 'H2H_P2', 'H2H_Diff']
_form_features = ['Wins_Last5_P1', 'Wins_Last5_P2', 'Form_Diff']

numeric_features = [
    *_raw_match_features,
    *_derived_features,
    *_h2h_features,
    *_form_features,
]

categorical_features = ['Surface', 'Court']
all_features = numeric_features + categorical_features

# Persist the H2H and recent-form lookup tables so they can be reloaded at prediction time
joblib.dump(dict(h2h_dict), "../models/h2h_dict.pkl")
joblib.dump(recent_form_dict, "../models/recent_form_dict.pkl")

# Final dataset. X and y must be defined here: the check below and the
# train/test split of the winner model both use them (they were commented
# out, which raised NameError).
X = df[all_features]
y = df["Winner_encoded"]


# Sanity check
print("✅ Dataset prêt. Nombre de lignes :", len(df))
print("✅ Features numériques :", numeric_features)
print("✅ X shape :", X.shape, "| y shape :", y.shape)



# 🔧 Features for the Bradley-Terry-like model
df_bt = pd.DataFrame()

# Player_1 minus Player_2 for every numeric feature whose name contains "_1"
# and that has a "_2" counterpart in df. Note that the "Odd_Diff" built here
# is signed, whereas df["Odd_Diff"] is an absolute value.
for p1_col in numeric_features:
    if "_1" not in p1_col:
        continue
    p2_col = p1_col.replace("_1", "_2")
    if p2_col not in df.columns:
        continue
    df_bt[p1_col.replace("_1", "_Diff")] = df[p1_col] - df[p2_col]

# Differences that already exist in df
for precomputed in ("Form_Diff", "H2H_Diff"):
    df_bt[precomputed] = df[precomputed]

# Categorical features
df_bt[categorical_features] = df[categorical_features]

# Target
y_bt = df["Winner_encoded"]





### Boucle rolling window

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# OneHotEncoder is used below but was only imported by a later cell, so this
# cell raised NameError when the notebook was run top to bottom.
from sklearn.preprocessing import OneHotEncoder

# 🧹 Preprocessing for the rolling window: imputation plus standard scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # scaled inputs for the logistic regression
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# 🔧 Rolling-window parameters: train on `train_window_months` months, test on
# the following month, then slide forward by one month.
train_window_months = 6
start_date = pd.to_datetime("2019-01-01")
end_date = pd.to_datetime("2024-12-31")

df = df.sort_values("Date").dropna(subset=["Date"])

rolling_results = []
current_date = start_date

while current_date + pd.DateOffset(months=train_window_months + 1) <= end_date:
    train_end = current_date + pd.DateOffset(months=train_window_months)
    test_start = train_end
    test_end = test_start + pd.DateOffset(months=1)

    df_train = df[(df["Date"] >= current_date) & (df["Date"] < train_end)]
    df_test = df[(df["Date"] >= test_start) & (df["Date"] < test_end)]

    # Skip windows with too few matches to train on or to evaluate.
    if len(df_train) < 200 or len(df_test) < 50:
        current_date += pd.DateOffset(months=1)
        continue

    X_train = df_train[all_features]
    y_train = df_train["Winner_encoded"]
    X_test = df_test[all_features]
    y_test = df_test["Winner_encoded"]

    # The classifier needs both outcomes to train, and roc_auc_score raises
    # when the test month holds a single outcome: skip instead of crashing.
    if y_train.nunique() < 2 or y_test.nunique() < 2:
        print(f"⚠️ Fenêtre {test_start.strftime('%Y-%m')} ignorée : une seule classe présente")
        current_date += pd.DateOffset(months=1)
        continue

    # Logistic regression pipeline. It is left unfitted here because
    # CalibratedClassifierCV clones and fits its own copies; the pipeline
    # itself is only fitted for the uncalibrated fallback below.
    base_model = Pipeline([
        ("preprocessor", preprocessor),
        ("clf", LogisticRegression(max_iter=2000, solver='saga', penalty='l2', random_state=42))
    ])

    try:
        cal_iso = CalibratedClassifierCV(estimator=base_model, method='isotonic', cv=3)
        cal_iso.fit(X_train, y_train)
        proba_iso = cal_iso.predict_proba(X_test)[:, 1]
        brier_iso = brier_score_loss(y_test, proba_iso)

        cal_sig = CalibratedClassifierCV(estimator=base_model, method='sigmoid', cv=3)
        cal_sig.fit(X_train, y_train)
        proba_sig = cal_sig.predict_proba(X_test)[:, 1]
        brier_sig = brier_score_loss(y_test, proba_sig)

        # NOTE(review): the calibration method is chosen with the Brier score
        # of the test month itself, so the metrics reported for that month
        # are optimistic. Consider choosing it on a validation slice of the
        # training window.
        if brier_iso < brier_sig:
            y_proba = proba_iso
            selected_cal = "isotonic"
        else:
            y_proba = proba_sig
            selected_cal = "sigmoid"

    except Exception as e:
        # Best effort: fall back to the uncalibrated model for this month.
        print(f"⚠️ Calibration échouée pour {test_start.strftime('%Y-%m')} : {e}")
        base_model.fit(X_train, y_train)
        y_proba = base_model.predict_proba(X_test)[:, 1]
        selected_cal = "non_calibré"

    y_pred = (y_proba >= 0.5).astype(int)

    rolling_results.append({
        "Train_start": current_date.strftime("%Y-%m"),
        "Train_end": train_end.strftime("%Y-%m"),
        "Test_month": test_start.strftime("%Y-%m"),
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "LogLoss": log_loss(y_test, y_proba),
        "Brier": brier_score_loss(y_test, y_proba),
        "Calibration": selected_cal
    })

    print(f"📅 {test_start.strftime('%Y-%m')} | Cal: {selected_cal} | Acc: {rolling_results[-1]['Accuracy']:.3f} | AUC: {rolling_results[-1]['AUC']:.3f}")

    current_date += pd.DateOffset(months=1)

# 📊 Summary
rolling_df = pd.DataFrame(rolling_results)
display(rolling_df)


#### Custom Data Extraction Functions

#### Applying Functions

## Machine Learning

### Import des bibliothèques et chargement des données

In [None]:
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, log_loss, balanced_accuracy_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestClassifier, 
    GradientBoostingClassifier
)
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve

import joblib




### Entraîne un modèle de classification pour le vainqueur du match

In [None]:
### 4. Split
# Both splits share the same size and seed and are stratified on their target.
_split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **_split_options)

# Dedicated split for the Bradley-Terry model
X_train_bt, X_test_bt, y_train_bt, y_test_bt = train_test_split(
    df_bt, y_bt, stratify=y_bt, **_split_options
)

### 5. Preprocessing
# Numeric variables: missing values replaced by the column mean
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Categorical variables: missing values replaced by the most frequent
# category, then one-hot encoded (unknown categories are ignored)
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Combined transformation
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


### 6. Models: name -> (pipeline, hyper-parameter grid for GridSearchCV)
# Numeric columns of the Bradley-Terry frame (everything that is not categorical)
_bt_numeric_columns = [col for col in df_bt.columns if col not in categorical_features]

models = {}

# Logistic regression on Player_1 - Player_2 differences
models["BradleyTerry_LogReg"] = (
    Pipeline([
        ("preprocessor", ColumnTransformer([
            ("num", StandardScaler(), _bt_numeric_columns),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ])),
        ("clf", LogisticRegression(max_iter=1000, random_state=42)),
    ]),
    {"clf__C": [0.1, 1.0, 10.0]},  # regularisation strengths
)

models["DecisionTree"] = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("clf", DecisionTreeClassifier(random_state=42)),
    ]),
    {"clf__max_depth": [3, 5, 10], "clf__min_samples_split": [2, 5, 10]},
)

models["RandomForest"] = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("clf", RandomForestClassifier(random_state=42)),
    ]),
    {"clf__n_estimators": [50, 150], "clf__max_depth": [5, 10, None]},
)

models["GradientBoosting"] = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]),
    {"clf__n_estimators": [50, 150], "clf__max_depth": [3, 7], "clf__learning_rate": [0.05, 0.1]},
)

models["XGBoost"] = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("clf", XGBClassifier(eval_metric='logloss')),
    ]),
    {"clf__n_estimators": [100, 200], "clf__max_depth": [3, 5], "clf__learning_rate": [0.05, 0.1]},
)


### 7. Training + evaluation
# For every candidate: grid-search on the training split, calibrate with both
# methods (fitted on the training split), score on the held-out split, and
# plot the calibration curves.
results = []
best_global_model = None
best_global_model_name = None  # defined even if no model beats the initial score
best_global_score = 0

for name, (model, params) in models.items():
    print(f"\n🔍 Training {name}...")

    # Pick the split that matches the model's feature set
    if name == "BradleyTerry_LogReg":
        X_train_model, X_test_model = X_train_bt, X_test_bt
        y_train_model, y_test_model = y_train_bt, y_test_bt
    else:
        X_train_model, X_test_model = X_train, X_test
        y_train_model, y_test_model = y_train, y_test

    # Training + hyper-parameter search
    grid = GridSearchCV(model, params, cv=5, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train_model, y_train_model)

    best_model_uncalibrated = grid.best_estimator_

    # Uncalibrated reference on the held-out split
    if hasattr(best_model_uncalibrated, "predict_proba"):
        proba_uncal = best_model_uncalibrated.predict_proba(X_test_model)[:, 1]
        brier_uncal = brier_score_loss(y_test_model, proba_uncal)
    else:
        proba_uncal, brier_uncal = None, None

    # Calibration: method -> (fitted calibrator, test probabilities, Brier score).
    # The calibrators are fitted on the TRAINING split only. They used to be
    # re-fitted on the test split (and always on X_test, even for the
    # Bradley-Terry model) before being scored on that same data, which made
    # the calibration curves and Brier scores meaningless.
    calibrated = {}
    try:
        for method in ("isotonic", "sigmoid"):
            calibrator = CalibratedClassifierCV(best_model_uncalibrated, method=method, cv=5)
            calibrator.fit(X_train_model, y_train_model)
            proba_method = calibrator.predict_proba(X_test_model)[:, 1]
            calibrated[method] = (
                calibrator,
                proba_method,
                brier_score_loss(y_test_model, proba_method),
            )

        # NOTE(review): the calibration method (and, below, the best model)
        # is selected with test-set scores, so the reported test metrics are
        # optimistic. Consider selecting on a validation split instead.
        if calibrated["isotonic"][2] < calibrated["sigmoid"][2]:
            selected_cal = "isotonic"
        else:
            selected_cal = "sigmoid"
        best_model, y_proba, brier_cal = calibrated[selected_cal]
        y_pred = best_model.predict(X_test_model)

        print(f"📐 Calibration choisie pour {name} : {selected_cal} (Brier = {brier_cal:.4f})")

    except Exception as e:
        # Best effort: keep the uncalibrated model when calibration fails.
        print(f"⚠️ Calibration impossible pour {name} : {e}")
        calibrated = {}
        best_model = best_model_uncalibrated
        y_pred = best_model.predict(X_test_model)
        y_proba = proba_uncal
        selected_cal = "non calibré"
        brier_cal = brier_uncal

    # Evaluation
    acc = accuracy_score(y_test_model, y_pred)
    prec = precision_score(y_test_model, y_pred)
    rec = recall_score(y_test_model, y_pred)
    f1 = f1_score(y_test_model, y_pred)
    bal_acc = balanced_accuracy_score(y_test_model, y_pred)
    auc = roc_auc_score(y_test_model, y_proba) if y_proba is not None else 0
    logloss = log_loss(y_test_model, y_proba) if y_proba is not None else 0

    results.append({
        "Modèle": name,
        "Accuracy": acc,
        "Balanced Accuracy": bal_acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "ROC AUC": auc,
        "Log Loss": logloss,
        "Best Params": grid.best_params_,
        "Calibration": selected_cal,
        "Brier Avant": brier_uncal,
        "Brier Après": brier_cal
    })

    # Weighted global score
    score_global = (
        0.2 * acc +
        0.2 * bal_acc +
        0.3 * f1 +
        0.2 * auc +
        0.1 * prec
    )

    print(f"✅ Best Params: {grid.best_params_}")
    print(f"📊 Accuracy: {acc:.3f} | 🎯 F1: {f1:.3f} | 📉 ROC AUC: {auc:.3f} | 🏆 Score global: {score_global:.4f}")

    if score_global > best_global_score:
        best_global_model = best_model
        best_global_model_name = name
        best_global_score = score_global

    # Calibration curves on the held-out split of this model
    plt.figure(figsize=(7, 6))
    if proba_uncal is not None:
        prob_true_uncal, prob_pred_uncal = calibration_curve(y_test_model, proba_uncal, n_bins=10)
        plt.plot(prob_pred_uncal, prob_true_uncal, marker='o', label='Non calibré', color='gray')
    for method, curve_label, curve_color in (
        ("isotonic", "Isotonic", "green"),
        ("sigmoid", "Sigmoid (Platt)", "orange"),
    ):
        if method in calibrated:
            prob_true_cal, prob_pred_cal = calibration_curve(
                y_test_model, calibrated[method][1], n_bins=10
            )
            plt.plot(prob_pred_cal, prob_true_cal, marker='o', label=curve_label, color=curve_color)
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')
    plt.title("📐 Courbes de calibration – Comparaison")
    plt.xlabel("Probabilité prédite")
    plt.ylabel("Proportion réelle de victoires")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Brier scores (only for what could be computed)
    if brier_uncal is not None:
        print(f"Brier Score non calibré : {brier_uncal:.4f}")
    if "isotonic" in calibrated:
        print(f"Brier Score isotonic    : {calibrated['isotonic'][2]:.4f}")
    if "sigmoid" in calibrated:
        print(f"Brier Score sigmoid     : {calibrated['sigmoid'][2]:.4f}")


# 📦 Save the model with the best global score
joblib.dump(best_global_model, "../models/tennis_win_predictor.pkl")

# 💾 Save the evaluation table, best ROC AUC first
results_df = pd.DataFrame(results).sort_values(by="ROC AUC", ascending=False)
results_df.to_csv("../models/evaluation_models.csv", index=False)

print(f"\n🏆 Meilleur modèle global : {best_global_model_name}")
print(f"📈 Score global obtenu : {best_global_score:.4f}")



# 🎯 Evaluate on the test split that matches the selected model
if best_global_model_name == "BradleyTerry_LogReg":
    X_test_eval, y_test_eval = X_test_bt, y_test_bt
else:
    X_test_eval, y_test_eval = X_test, y_test

y_proba_final = best_global_model.predict_proba(X_test_eval)[:, 1]

# ⚡ "High-confidence" matches: probability far from 0.5 on either side
high_conf_threshold = 0.65
mask_high_conf = (y_proba_final > high_conf_threshold) | (y_proba_final < (1 - high_conf_threshold))

# 🧪 Rows of df behind those predictions
df_base_test = df.loc[X_test_eval.index].copy()
df_high_conf = df_base_test.loc[mask_high_conf].copy()
df_high_conf["Proba_Player1"] = y_proba_final[mask_high_conf]
df_high_conf["Predicted_Winner"] = df_high_conf["Proba_Player1"].ge(0.5).astype(int)
actual_outcomes = y_test_eval.iloc[mask_high_conf].values
df_high_conf["Is_Correct"] = df_high_conf["Predicted_Winner"].eq(actual_outcomes).astype(int)

# 📊 Results
print(f"{len(df_high_conf)} matchs à haute confiance (proba > {high_conf_threshold})")
print(f"Taux de réussite sur ces matchs : {df_high_conf['Is_Correct'].mean():.2%}")

# Optional: show a sample
df_high_conf[["Player_1", "Player_2", "Proba_Player1", "Predicted_Winner", "Is_Correct"]].head()


#### Comparaison visuelle de la fiabilité des prédictions

In [None]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Test split that matches the selected model
if best_global_model_name == "BradleyTerry_LogReg":
    X_test_eval, y_test_eval = X_test_bt, y_test_bt
else:
    X_test_eval, y_test_eval = X_test, y_test

# Final predicted probabilities, their Brier score and calibration curve
y_proba_final = best_global_model.predict_proba(X_test_eval)[:, 1]
brier_final = brier_score_loss(y_test_eval, y_proba_final)
prob_true, prob_pred = calibration_curve(y_test_eval, y_proba_final, n_bins=10)

# 🔍 Combined figure: calibration curve (left), probability histogram (right)
fig, (ax_calibration, ax_histogram) = plt.subplots(1, 2, figsize=(14, 5))

# 1. Calibration curve
ax_calibration.plot(prob_pred, prob_true, marker='o', label='Calibré')
ax_calibration.plot([0, 1], [0, 1], '--', color='gray', label='Perfect calibration')
ax_calibration.set_xlabel("Probabilité prédite")
ax_calibration.set_ylabel("Proportion réelle de victoire")
ax_calibration.set_title(f"📐 Calibration - {best_global_model_name}")
ax_calibration.legend()
ax_calibration.grid(True)

# 2. Histogram of the probabilities
sns.histplot(y_proba_final, bins=20, kde=False, color='steelblue', ax=ax_histogram)
ax_histogram.set_title("📊 Distribution des probabilités prédites")
ax_histogram.set_xlabel("Probabilité victoire Player 1")
ax_histogram.set_ylabel("Nombre de matchs")

fig.suptitle(f"🔍 Fiabilité du modèle : {best_global_model_name}\nBrier Score = {brier_final:.4f}", fontsize=14)
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


#### Vérifier les prédictions peu tranchées

In [None]:
import matplotlib.pyplot as plt

# Predict again on the test split to look at the probabilities. The split
# must match the selected model: the Bradley-Terry model was trained on
# df_bt, whose "Odd_Diff" column is not the one found in X_test.
X_test_selected = X_test_bt if best_global_model_name == "BradleyTerry_LogReg" else X_test
y_proba_test = best_global_model.predict_proba(X_test_selected)[:, 1]

plt.hist(y_proba_test, bins=20, edgecolor='k')
plt.title("Distribution des probabilités de victoire (Player_1)")
plt.xlabel("Probabilité prédite")
plt.ylabel("Nombre de matchs")
plt.grid(True)
plt.show()

#### Visualiser les prédictions incertaines

In [None]:
# Histogram of the Player_1 win probabilities with the 0.5 decision threshold
fig, ax = plt.subplots()
ax.hist(y_proba_test, bins=20, edgecolor='k')
ax.axvline(0.5, color='red', linestyle='--')  # decision threshold
ax.set_title("Distribution des probas (Player 1)")
ax.set_xlabel("Proba de victoire Player 1")
ax.set_ylabel("Nombre de matchs")
ax.grid(True)
plt.show()

#### Histogramme des probabilités

In [None]:
# Histogram of the final probabilities with both high-confidence limits
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(y_proba_final, bins=20, edgecolor='black', alpha=0.7)

# High-confidence zones lie outside these two lines
ax.axvline(high_conf_threshold, color='green', linestyle='--', label=f'Haute confiance > {high_conf_threshold}')
ax.axvline(1 - high_conf_threshold, color='red', linestyle='--', label=f'Haute confiance < {1 - high_conf_threshold}')

ax.set_title("Distribution des probabilités prédites (Player 1 gagnant)")
ax.set_xlabel("Probabilité prédite")
ax.set_ylabel("Nombre de matchs")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

#### Diagramme de fiabilité

### Entraîne un modèle de classification pour la classe de nombre de jeux du match

In [None]:
# 🎯 Data: same features as the winner model, target = class of total games
X_gc, y_gc = df[all_features], df["Games_Class"]

Xgc_train, Xgc_test, ygc_train, ygc_test = train_test_split(
    X_gc, y_gc, stratify=y_gc, test_size=0.2, random_state=42
)

# 📦 Candidate models: name -> (pipeline, hyper-parameter grid)
models_gc = {}

models_gc["RandomForest"] = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("clf", RandomForestClassifier(random_state=42)),
    ]),
    {"clf__n_estimators": [50, 150], "clf__max_depth": [5, 10, None]},
)

models_gc["GradientBoosting"] = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]),
    {"clf__n_estimators": [50, 150], "clf__learning_rate": [0.05, 0.1]},
)

# 🔁 Fonction de score global
def compute_classification_score(acc, bal_acc, f1, auc, prec):
    """Combine five metrics into one weighted score.

    Weights: accuracy 0.3, balanced accuracy 0.2, F1 0.2, ROC AUC 0.2 and
    precision 0.1.

    Returns:
        The weighted sum of the five metrics.
    """
    # Terms are accumulated in the same left-to-right order as a plain sum
    # expression, so the floating-point result is unchanged.
    score = 0.3 * acc
    for weight, metric in ((0.2, bal_acc), (0.2, f1), (0.2, auc), (0.1, prec)):
        score = score + weight * metric
    return score

# 🔍 Evaluation of the games-class models
results_gc = []
best_model_gc = None
# Dedicated name: reusing best_global_model_name here overwrote the name of
# the selected WINNER model, which the winner-model cells use to choose
# between X_test and X_test_bt.
best_model_name_gc = None
best_score_gc = float('-inf')

for name, (model, params) in models_gc.items():
    print(f"\n🔍 Entraînement de {name}...")

    grid = GridSearchCV(model, params, cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(Xgc_train, ygc_train)

    candidate_gc = grid.best_estimator_
    y_pred = candidate_gc.predict(Xgc_test)

    # Class probabilities for the ROC AUC. Only the "estimator has no
    # predict_proba" case is tolerated; any other error is a real failure and
    # is no longer swallowed by a bare except.
    try:
        y_proba = candidate_gc.predict_proba(Xgc_test)
    except (AttributeError, NotImplementedError):
        y_proba = None

    # Metrics (macro-averaged over the classes)
    acc = accuracy_score(ygc_test, y_pred)
    bal_acc = balanced_accuracy_score(ygc_test, y_pred)
    f1 = f1_score(ygc_test, y_pred, average="macro")
    prec = precision_score(ygc_test, y_pred, average="macro")

    if y_proba is not None:
        # One indicator column per class, in the classifier's class order
        classes = candidate_gc.named_steps["clf"].classes_
        ygc_test_bin = label_binarize(ygc_test, classes=classes)
        auc = roc_auc_score(ygc_test_bin, y_proba, average='macro', multi_class='ovr')
    else:
        auc = 0  # not available for this model

    score = compute_classification_score(acc, bal_acc, f1, auc, prec)

    print(f"✅ Best Params: {grid.best_params_}")
    print(f"📊 Accuracy: {acc:.3f}")
    print(f"📈 Balanced Accuracy: {bal_acc:.3f}")
    print(f"🎯 F1 (macro): {f1:.3f}")
    print(f"📉 ROC AUC: {auc:.3f}")
    print(f"🏆 Score global: {score:.4f}")

    results_gc.append({
        "Modèle": name,
        "Accuracy": acc,
        "Balanced Accuracy": bal_acc,
        "F1": f1,
        "ROC AUC": auc,
        "Score Global": score,
        "Best Params": grid.best_params_
    })

    if score > best_score_gc:
        best_score_gc = score
        best_model_gc = candidate_gc
        best_model_name_gc = name

# 💾 Save the best model
joblib.dump(best_model_gc, "../models/tennis_total_games_predictor.pkl")

# 📋 Summary
results_df = pd.DataFrame(results_gc).sort_values(by="Score Global", ascending=False)
print("\n📊 Résumé des performances (classe de jeux) :")
print(results_df[["Modèle", "Accuracy", "Balanced Accuracy", "F1", "ROC AUC", "Score Global"]])

# 👉 Best model according to the global score
print(f"\n🏆 Meilleur modèle global : {best_model_name_gc}")
print(f"📈 Score global obtenu : {best_score_gc:.4f}")

### 🔮 Fonctions de prédiction

In [None]:
### 10. 🔮 Predict the outcome of one match from a dictionary
def predict_match_proba(input_dict, model_path="../models/tennis_win_predictor.pkl",
                        h2h_dict=None, recent_form_dict=None):
    """Return the win probability of both players for one match.

    Args:
        input_dict: Match description. It is passed through
            ``enrich_features`` and may already contain derived features.
        model_path: Path of the model file loaded with ``joblib.load``.
        h2h_dict: Optional head-to-head lookup forwarded to
            ``enrich_features``.
        recent_form_dict: Optional recent-form lookup forwarded to
            ``enrich_features``.

    Returns:
        A dict with the keys ``"Probabilité victoire Player_1"`` and
        ``"Probabilité victoire Player_2"``, each rounded to 4 decimals.
    """
    # joblib.load unpickles the file: only load model files you trust.
    model = joblib.load(model_path)
    enriched_input = enrich_features(input_dict, h2h_dict, recent_form_dict)

    # Without a lookup table the H2H / form features cannot be recomputed.
    # Keep the values the caller already supplied instead of letting the
    # enrichment reset them to 0.
    supplied_by_caller = []
    if not h2h_dict:
        supplied_by_caller += ["H2H_P1", "H2H_P2", "H2H_Diff"]
    if not recent_form_dict:
        supplied_by_caller += ["Wins_Last5_P1", "Wins_Last5_P2", "Form_Diff"]
    for feature in supplied_by_caller:
        if feature in input_dict:
            enriched_input[feature] = input_dict[feature]

    X_input = pd.DataFrame([enriched_input])
    prob = model.predict_proba(X_input)[0]
    return {
        "Probabilité victoire Player_1": round(prob[1], 4),
        "Probabilité victoire Player_2": round(prob[0], 4)
    }


### 10. 🔮 Predict the class of total games
def predict_total_games(input_dict, model_path="../models/tennis_total_games_predictor.pkl"):
    """Predict the games class of one match.

    Args:
        input_dict: Feature values of the match; it becomes a one-row
            DataFrame and is handed to the model unchanged.
        model_path: Path of the model file loaded with ``joblib.load``.

    Returns:
        The first (and only) element of the model's prediction.
    """
    estimator = joblib.load(model_path)
    single_row = pd.DataFrame([input_dict])
    return estimator.predict(single_row)[0]

def enrich_features(input_dict, h2h_dict=None, recent_form_dict=None):
    """Return a copy of a match description completed with derived features.

    Args:
        input_dict: Must contain ``Rank_1``, ``Rank_2``, ``Pts_1``, ``Pts_2``,
            ``Odd_1`` and ``Odd_2``. ``Player_1`` and ``Player_2`` are needed
            when a lookup table is given; ``Round`` is optional. The dict is
            not modified.
        h2h_dict: Optional mapping from the sorted pair of player names to
            ``[wins of first name, wins of second name]``.
        recent_form_dict: Optional mapping from player name to the number of
            wins in that player's recent matches.

    Returns:
        A new dict with the input values plus ``Rank_Diff``, ``Pts_Diff``,
        ``Odd_Diff`` (also under the legacy key ``Odds_Diff``),
        ``Odds_Ratio``, ``Avg_Rank``, ``Book_Fav``, ``Round_Ordinal``, the
        three H2H features and the three recent-form features. When a lookup
        table is missing or empty, H2H / form values already present in the
        input are kept and absent ones default to 0, so enriching an
        already-enriched dict does not erase them.
    """
    enriched = input_dict.copy()

    # Standard derived features
    enriched["Rank_Diff"] = enriched["Rank_1"] - enriched["Rank_2"]
    enriched["Pts_Diff"] = enriched["Pts_1"] - enriched["Pts_2"]
    odd_gap = abs(enriched["Odd_1"] - enriched["Odd_2"])
    # The training features name this column "Odd_Diff"; only "Odds_Diff"
    # used to be produced, so the models could not find their input column.
    enriched["Odd_Diff"] = odd_gap
    enriched["Odds_Diff"] = odd_gap  # legacy key kept for existing callers
    enriched["Odds_Ratio"] = enriched["Odd_1"] / enriched["Odd_2"] if enriched["Odd_2"] > 0 else 0
    enriched["Avg_Rank"] = (enriched["Rank_1"] + enriched["Rank_2"]) / 2
    enriched["Book_Fav"] = int(enriched["Odd_1"] < enriched["Odd_2"])

    # Encoded round (0 for an unknown or missing round)
    round_mapping = {
        "Final": 7, "Semifinal": 6, "Quarterfinal": 5,
        "4th Round": 4, "3rd Round": 3, "2nd Round": 2, "1st Round": 1
    }
    enriched["Round_Ordinal"] = round_mapping.get(enriched.get("Round", ""), 0)

    # Head-to-head
    if h2h_dict:
        key = tuple(sorted([enriched["Player_1"], enriched["Player_2"]]))
        h2h = h2h_dict.get(key, [0, 0])
        # The table is stored in sorted-name order: flip it when Player_1 is
        # the second name of the pair.
        if enriched["Player_1"] > enriched["Player_2"]:
            h2h = h2h[::-1]
        enriched["H2H_P1"], enriched["H2H_P2"] = h2h
        enriched["H2H_Diff"] = h2h[0] - h2h[1]
    else:
        enriched.setdefault("H2H_P1", 0)
        enriched.setdefault("H2H_P2", 0)
        enriched.setdefault("H2H_Diff", enriched["H2H_P1"] - enriched["H2H_P2"])

    # Recent form
    if recent_form_dict:
        enriched["Wins_Last5_P1"] = recent_form_dict.get(enriched["Player_1"], 0)
        enriched["Wins_Last5_P2"] = recent_form_dict.get(enriched["Player_2"], 0)
        enriched["Form_Diff"] = enriched["Wins_Last5_P1"] - enriched["Wins_Last5_P2"]
    else:
        enriched.setdefault("Wins_Last5_P1", 0)
        enriched.setdefault("Wins_Last5_P2", 0)
        enriched.setdefault("Form_Diff", enriched["Wins_Last5_P1"] - enriched["Wins_Last5_P2"])

    return enriched


## Prédiction avec modèle optimisé

In [None]:
import joblib
import pandas as pd

# Example match described with the raw input variables
exemple_match = {
    "Player_1": "Safiullin R.",
    "Player_2": "Mensik J.",
    "Rank_1": 71,
    "Rank_2": 54,
    "Pts_1": 851,
    "Pts_2": 1042,
    "Odd_1": 2.57,
    "Odd_2": 1.49,
    "Surface": "Hard",
    "Round": "3rd Round",
    "Best of": 3,
    "Court": "Outdoor"
}


# Lookup tables saved during feature construction
h2h_dict = joblib.load("../models/h2h_dict.pkl")
recent_form_dict = joblib.load("../models/recent_form_dict.pkl")


# 🛠️ Add the derived features
enriched_input = enrich_features(exemple_match, h2h_dict, recent_form_dict)

# 🔢 Winner prediction
probas = predict_match_proba(enriched_input, model_path="../models/tennis_win_predictor.pkl")

# 🔢 Total-games class prediction
predicted_class = predict_total_games(enriched_input, model_path="../models/tennis_total_games_predictor.pkl")



print("\n🎯 Prédiction complète :")
for joueur, proba in probas.items():
    print(f"{joueur} : {proba:.2%}")
print(f"📊 Intervalle prédit pour le nombre de jeux : {predicted_class}")