# 02 - Ajustement du modele Dixon-Coles

**Objectif** : Ajuster le modele Dixon-Coles sur les donnees historiques de Ligue 1 (saison 2023-24) et verifier que les parametres estimes sont coherents.

## Ce qu'on verifie :
1. Le MLE converge correctement
2. Les ratings attack/defense sont coherents avec le classement reel
3. Le parametre rho est dans les bornes attendues (-0.3 a 0)
4. Le home advantage est realiste (0.1 a 0.4)
5. Les predictions de score sont sensees

In [None]:
# Notebook bootstrap: make the project package importable, then load the
# data/plotting stack and the Dixon-Coles model used by the cells below.
import sys
from pathlib import Path

# The parent of the current working directory is taken as the project root and
# pushed to the front of sys.path so that `src.*` imports resolve.
# NOTE(review): assumes the notebook is launched from a direct subdirectory of
# the repository (e.g. notebooks/) — confirm.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Project-local import; must come after the sys.path tweak above.
from src.models.dixon_coles import DixonColesModel, MatchResult

# Apply the seaborn "whitegrid" theme to every figure in this notebook.
sns.set_theme(style='whitegrid')
# Visual confirmation that the setup cell ran without error.
print('OK')

## 1. Charger les donnees d'entrainement

In [None]:
# Load the 2023-24 season CSV (training data) and parse kickoff timestamps.
DATA_DIR = PROJECT_ROOT.joinpath('data', 'raw')
df = pd.read_csv(DATA_DIR / 'ligue1_2023.csv')
df['kickoff'] = pd.to_datetime(df['kickoff'])

# Quick overview: row count, number of distinct home teams, covered date range.
n_matches = len(df)
n_home_teams = df['home_team'].nunique()
first_kickoff = df['kickoff'].min()
last_kickoff = df['kickoff'].max()
print(f'Matchs charges: {n_matches}')
print(f'Equipes: {n_home_teams}')
print(f'Periode: {first_kickoff.date()} -> {last_kickoff.date()}')

# Last expression of the cell: rendered by the notebook as a preview table.
df.head()

## 2. Convertir en MatchResult et fitter

In [None]:
%%time

# Convertir en MatchResult
match_results = [
    MatchResult(
        home_team=row['home_team'],
        away_team=row['away_team'],
        home_goals=int(row['home_score']),
        away_goals=int(row['away_score']),
        date=row['kickoff'].to_pydatetime(),
    )
    for _, row in df.iterrows()
]

print(f'Matchs pour fitting: {len(match_results)}')

# Fitter le modele
model = DixonColesModel(half_life_days=180)
model.fit(match_results)

print(f'\nConvergence: {model._convergence_info}')
print(f'Home advantage: {model.home_advantage:.3f}')
print(f'Rho: {model.rho:.3f}')
print(f'Avg goals: {model.avg_goals:.3f}')
print(f'Equipes fittees: {len(model.teams)}')

## 3. Classement des equipes par force

In [None]:
# Team strength table as reported by the fitted model, printed without the
# DataFrame index.
rankings = model.get_team_rankings()
df_rank = pd.DataFrame(rankings)

ranking_table = df_rank.to_string(index=False)
print('\nClassement par force (attack/defense ratio):\n')
print(ranking_table)

In [None]:
# Scatter plot of attack vs defense rating, one labelled point per team,
# saved to data/results/team_ratings.png and shown inline.
fig, ax = plt.subplots(figsize=(10, 8))

# One scatter call per team (rather than a single vectorised call) so that the
# points keep being drawn individually, each with its own annotation.
for _, row in df_rank.iterrows():
    ax.scatter(row['attack'], row['defense'], s=100, zorder=5)
    ax.annotate(row['team'], (row['attack'], row['defense']),
                textcoords='offset points', xytext=(5, 5), fontsize=8)

# Reference lines at 1.0, the "average" level named in the axis labels below.
ax.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5)
ax.axvline(x=1.0, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Attack Rating (> 1 = above average)')
ax.set_ylabel('Defense Rating (< 1 = better defense)')
ax.set_title('Dixon-Coles: Attack vs Defense Ratings - Ligue 1')

# Quadrant labels, positioned in axes coordinates: bottom-right is high attack
# with a low (i.e. good) defense value, top-left is the opposite corner.
ax.text(0.98, 0.02, 'Fort attaque\nForte defense', transform=ax.transAxes,
        ha='right', va='bottom', fontsize=9, color='green', alpha=0.7)
ax.text(0.02, 0.98, 'Faible attaque\nFaible defense', transform=ax.transAxes,
        ha='left', va='top', fontsize=9, color='red', alpha=0.7)

plt.tight_layout()

# savefig does not create missing directories: make sure data/results exists
# so the cell also works on a fresh checkout.
results_dir = PROJECT_ROOT / 'data' / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(results_dir / 'team_ratings.png', dpi=150)
plt.show()

## 4. Test de prediction

In [None]:
# Predict a few hand-picked fixtures to eyeball whether the outputs are sensible.
test_matchups = [
    ('Paris Saint-Germain', 'Montpellier HSC'),   # heavy favourite
    ('Paris Saint-Germain', 'AS Monaco'),          # top-of-table clash
    ('Montpellier HSC', 'Le Havre AC'),            # bottom of the table
]

# Show a sample of the fitted team names to help spot spelling mismatches.
teams_available = list(model.teams.keys())
print('Equipes disponibles:', teams_available[:5], '...')
print()

for home, away in test_matchups:
    # Only exact name matches are predicted; a fixture with an unknown team is
    # reported and skipped (no fuzzy matching is attempted).
    both_known = home in model.teams and away in model.teams
    if not both_known:
        print(f'\n[SKIP] {home} vs {away} - equipe non trouvee')
        continue

    pred = model.predict(home, away)
    markets = pred.to_dict()

    print(f"\n{home} vs {away}")
    print(f"  Lambda: {markets['lambda_home']:.2f} - {markets['lambda_away']:.2f}")

    result_probs = markets['1x2']
    print(f"  1X2: {result_probs['home']:.1%} / {result_probs['draw']:.1%} / {result_probs['away']:.1%}")

    # The "under 2.5" probability is printed as the complement of "over 2.5".
    over_25 = markets['over_under']['over_25']
    print(f"  O/U 2.5: {over_25:.1%} / {1-over_25:.1%}")

    both_score = markets['btts']
    print(f"  BTTS: {both_score['yes']:.1%} / {both_score['no']:.1%}")

## 5. Verification rapide sur les matchs termines

On predit les matchs de la saison d'entrainement (in-sample) pour verifier que le modele n'est pas completement a cote.

In [None]:
# In-sample sanity check: predict every match of the training season and
# compare the most likely outcome with the real result. This is NOT an
# out-of-sample evaluation.
correct = 0
total = 0
probs_home = []

fixtures = zip(df['home_team'], df['away_team'], df['home_score'], df['away_score'])
for h, a, raw_home_score, raw_away_score in fixtures:
    # Matches involving a team the model does not know are left out.
    if not (h in model.teams and a in model.teams):
        continue

    pred = model.predict(h, a)

    # Most probable outcome; on ties max() keeps the first key in H, D, A order.
    outcome_probs = {'H': pred.home_win, 'D': pred.draw, 'A': pred.away_win}
    predicted = max(outcome_probs, key=outcome_probs.get)

    # Actual outcome derived from the final score.
    hs, as_ = int(raw_home_score), int(raw_away_score)
    if hs > as_:
        actual = 'H'
    elif hs == as_:
        actual = 'D'
    else:
        actual = 'A'

    correct += int(predicted == actual)
    total += 1
    probs_home.append(pred.home_win)

print(f'Accuracy in-sample: {correct/total:.1%} ({correct}/{total})')
print('(Attention: c est du in-sample, le vrai test est dans notebook 05_backtest)')

# Calibration glance: mean predicted home-win probability vs the observed
# home-win rate over the whole DataFrame.
observed_home_rate = df['home_score'].gt(df['away_score']).mean()
print(f'\nProba home win moyenne: {np.mean(probs_home):.3f}')
print(f'Home win rate reel:     {observed_home_rate:.3f}')

## Prochaine etape

Le modele Dixon-Coles est fitte. Passons au notebook **03** pour le modele ELO, puis au **05_backtest** pour la validation walk-forward.