# World Cup Bracket Prediction Model

## Overview
This notebook implements a dual XGBoost regression model to predict expected goals for international football matches. We then use Poisson-based Monte Carlo simulation to generate full bracket predictions with probability estimates.

**Key Approach:**
- Predict goals, not outcomes (a 5-0 and 1-0 are both "wins" but carry different information)
- Use the Poisson distribution to model goal scoring (the standard model for counts of rare, approximately independent events)
- Monte Carlo simulation for robust probability estimates

**Data Sources:**
- International Football Results (2010+)
- FIFA World Rankings
- EA Sports FC Player Stats (FIFA 15-24)

## 1. Setup and Imports

In [None]:
# Install core dependencies
!pip install -q 'pandas>=1.5.0' 'numpy>=1.21.0' 'scipy>=1.9.0' 'xgboost>=2.0.0' \
    'scikit-learn>=1.0.0' 'matplotlib>=3.5.0' 'seaborn>=0.12.0' 'joblib>=1.2.0'

# Install CuPy for GPU-accelerated simulation (try CUDA 12 first, then CUDA 11)
try:
    import cupy
    print("CuPy already installed")
except ImportError:
    print("Installing CuPy for GPU acceleration...")
    import subprocess
    # Try CUDA 12 first (default for newer Colab)
    result = subprocess.run(['pip', 'install', '-q', 'cupy-cuda12x'], capture_output=True)
    if result.returncode != 0:
        # Fallback to CUDA 11
        print("CUDA 12 failed, trying CUDA 11...")
        subprocess.run(['pip', 'install', '-q', 'cupy-cuda11x'], capture_output=True)
    print("CuPy installation complete")

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.stats import poisson
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
import joblib
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Show every column and do not wrap wide frames when printing
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# Fix NumPy's global RNG so repeated runs give the same numbers
np.random.seed(42)

# --- Google Colab / Google Drive Configuration ---
# True  -> read/write under the Google Drive folder below
# False -> read/write under the relative local folders below
USE_GOOGLE_DRIVE = True

# Google Drive folder layout (edit GDRIVE_BASE_PATH to match your Drive)
GDRIVE_BASE_PATH = '/content/drive/MyDrive/world-cup-prediction'
GDRIVE_DATA_PATH = f'{GDRIVE_BASE_PATH}/data'
GDRIVE_MODEL_PATH = f'{GDRIVE_BASE_PATH}/model_artifacts'
GDRIVE_SIMULATIONS_PATH = f'{GDRIVE_BASE_PATH}/simulations'

# Local folder layout
LOCAL_DATA_PATH = 'data'
LOCAL_MODEL_PATH = 'model_artifacts'
LOCAL_SIMULATIONS_PATH = 'simulations'

# Pick the active trio of paths in one step
DATA_PATH, MODEL_PATH, SIMULATIONS_PATH = (
    (GDRIVE_DATA_PATH, GDRIVE_MODEL_PATH, GDRIVE_SIMULATIONS_PATH)
    if USE_GOOGLE_DRIVE
    else (LOCAL_DATA_PATH, LOCAL_MODEL_PATH, LOCAL_SIMULATIONS_PATH)
)

print("Libraries loaded successfully")
print(f"Data path: {DATA_PATH}")
print(f"Model path: {MODEL_PATH}")
print(f"Simulations path: {SIMULATIONS_PATH}")

In [None]:
# --- GPU Configuration ---
# Detect GPU availability and set up array backend (CuPy for GPU, NumPy for CPU)
import subprocess

def check_gpu_available():
    """Return True when the `nvidia-smi` command runs and exits with status 0.

    Returns False when the command exits non-zero or the executable cannot
    be found.
    """
    try:
        exit_status = subprocess.run(['nvidia-smi'], capture_output=True).returncode
    except FileNotFoundError:
        # nvidia-smi is not installed on this machine
        return False
    return exit_status == 0

# Decide once whether the GPU code path is attempted at all.
USE_GPU = check_gpu_available()

# Initialize cp as None for CPU fallback (prevents NameError if GPU unavailable)
cp = None

if USE_GPU:
    try:
        import cupy as cp
        # Test that CuPy can actually use the GPU
        _ = cp.array([1, 2, 3])
        print(f"GPU detected. Using CuPy for vectorized simulation.")
        try:
            gpu_name = cp.cuda.runtime.getDeviceProperties(0)['name'].decode()
            print(f"GPU: {gpu_name}")
        except Exception:
            # The device name is cosmetic; never let it break GPU setup.
            # `Exception` (not a bare except) keeps KeyboardInterrupt and
            # SystemExit propagating.
            print("GPU: Available (name not retrievable)")
        xp = cp  # Use CuPy as array backend
    except Exception as e:
        # Covers a missing CuPy install as well as a CuPy that cannot
        # allocate on the device; both fall back to the CPU path.
        print(f"CuPy import failed: {e}")
        print("Falling back to NumPy (CPU).")
        USE_GPU = False
        cp = None
        xp = np
else:
    print("No GPU detected. Falling back to NumPy (CPU).")
    xp = np

# XGBoost device setting (follows the final value of USE_GPU)
xgb_device = "cuda" if USE_GPU else "cpu"
print(f"XGBoost device: {xgb_device}")

# Helper function to convert GPU arrays to CPU (NumPy)
def to_numpy(arr):
    """Return `arr` as a NumPy array.

    When the GPU backend is active and `arr` exposes a `get` attribute it is
    converted with `cp.asnumpy`; an existing NumPy array is returned as-is;
    anything else goes through `np.asarray`.
    """
    on_gpu = USE_GPU and cp is not None and hasattr(arr, 'get')
    if on_gpu:
        return cp.asnumpy(arr)
    if isinstance(arr, np.ndarray):
        return arr
    return np.asarray(arr)


In [None]:
# Mount Google Drive (only runs if USE_GOOGLE_DRIVE is True).
# When google.colab cannot be imported, ALL three active paths are switched
# back to their local counterparts so nothing later tries to write under
# /content/drive.
if USE_GOOGLE_DRIVE:
    try:
        from google.colab import drive
        drive.mount('/content/drive')

        # Create directories if they don't exist (including the simulations
        # folder, which is one of the configured Drive paths)
        os.makedirs(GDRIVE_DATA_PATH, exist_ok=True)
        os.makedirs(GDRIVE_MODEL_PATH, exist_ok=True)
        os.makedirs(GDRIVE_SIMULATIONS_PATH, exist_ok=True)

        print(f"Google Drive mounted successfully")
        print(f"Data folder: {GDRIVE_DATA_PATH}")
        print(f"Model folder: {GDRIVE_MODEL_PATH}")

        # List files in data directory
        if os.path.exists(GDRIVE_DATA_PATH):
            files = os.listdir(GDRIVE_DATA_PATH)
            if files:
                print(f"Files in data folder: {files}")
            else:
                print("WARNING: Data folder is empty. Please upload the following files:")
                print("  - all_matches.csv")
                print("  - fifa_ranking_2024.csv")
                print("  - players.csv")
                print("  - countries_names.csv")
    except ImportError:
        print("Not running in Google Colab. Using local paths.")
        USE_GOOGLE_DRIVE = False
        DATA_PATH = LOCAL_DATA_PATH
        MODEL_PATH = LOCAL_MODEL_PATH
        # Previously left pointing at the Drive location after the fallback
        SIMULATIONS_PATH = LOCAL_SIMULATIONS_PATH
else:
    print("Using local paths (USE_GOOGLE_DRIVE is False)")

## 2. Data Loading

In [None]:
# Read the four input CSVs from the active data folder
matches_df = pd.read_csv(f'{DATA_PATH}/all_matches.csv')
rankings_df = pd.read_csv(f'{DATA_PATH}/fifa_ranking_2024.csv')
players_df = pd.read_csv(f'{DATA_PATH}/players.csv', low_memory=False)
country_names_df = pd.read_csv(f'{DATA_PATH}/countries_names.csv')

# Report where the data came from and how many rows each table holds
print(f"Loaded data from: {DATA_PATH}")
for _label, _table in (
    ("Matches", matches_df),
    ("Rankings", rankings_df),
    ("Players", players_df),
    ("Country names", country_names_df),
):
    print(f"{_label}: {len(_table):,} rows")

In [None]:
# Preview match data: print the column names, then show the first rows.
# The trailing bare expression is what the notebook renders as the cell output.
print("Matches columns:", matches_df.columns.tolist())
matches_df.head()

In [None]:
# Parse the date columns into datetimes
rankings_df['rank_date'] = pd.to_datetime(rankings_df['rank_date'])
matches_df['date'] = pd.to_datetime(matches_df['date'])

# Keep matches from 2010 onwards (modern football era), oldest first,
# with a fresh 0..n-1 index
matches_df = (
    matches_df[matches_df['date'] >= '2010-01-01']
    .copy()
    .sort_values('date')
    .reset_index(drop=True)
)

# Calendar year of each match, for easier filtering
matches_df['year'] = matches_df['date'].dt.year

print(f"Matches after 2010 filter: {len(matches_df):,}")
print(f"Date range: {matches_df['date'].min()} to {matches_df['date'].max()}")
print(f"\nTournament breakdown:")
print(matches_df['tournament'].value_counts().head(15))

## 3. Country Name Normalization

Country names differ across datasets. We need to normalize them to ensure proper merging.

In [None]:
# Build country name normalization from the provided mapping file
# The countries_names.csv contains original_name -> current_name mappings
name_mapping = dict(zip(country_names_df['original_name'], country_names_df['current_name']))

# Additional manual mappings for common variations.
# These are merged with dict.update() below, so on a key collision the manual
# entry replaces the one loaded from the CSV.
# NOTE(review): lookups are single-pass (see normalize_country_name), so a
# manual target such as 'Swaziland' or 'Czech Republic' is NOT re-mapped if
# the CSV maps that same name onward to something else — confirm the CSV's
# "current" names agree with the targets used here.
ADDITIONAL_MAPPINGS = {
    # FIFA Rankings variations
    'USA': 'United States',
    'Korea Republic': 'South Korea',
    'Korea DPR': 'North Korea',
    'IR Iran': 'Iran',
    'China PR': 'China',
    "Cote d'Ivoire": "Ivory Coast",
    "Côte d'Ivoire": "Ivory Coast",
    'Czechia': 'Czech Republic',
    'Congo DR': 'DR Congo',
    'Viet Nam': 'Vietnam',
    'Russian Federation': 'Russia',
    'Türkiye': 'Turkey',
    
    # EA Sports player nationality variations
    'United States of America': 'United States',
    'Korea': 'South Korea',
    'Republic of Korea': 'South Korea',
    'DPR Korea': 'North Korea',
    "People's Republic of China": 'China',
    'Democratic Republic of Congo': 'DR Congo',
    
    # UK nations (identity entries: they pin these names to themselves,
    # overriding any CSV entry with the same key)
    'England': 'England',
    'Scotland': 'Scotland',
    'Wales': 'Wales',
    'Northern Ireland': 'Northern Ireland',
    
    # Other common variations
    'Republic of Ireland': 'Ireland',
    'Eswatini': 'Swaziland',
    'Timor-Leste': 'East Timor',
    'Trinidad & Tobago': 'Trinidad and Tobago',
}

# Combine mappings (manual entries win on duplicate keys)
name_mapping.update(ADDITIONAL_MAPPINGS)

def normalize_country_name(name):
    """Map a raw country name to its canonical spelling.

    Missing values are returned untouched. Anything else is converted to a
    string, stripped of surrounding whitespace and looked up once in
    `name_mapping`; names without an entry come back stripped but otherwise
    unchanged.
    """
    if not pd.isna(name):
        cleaned = str(name).strip()
        return name_mapping.get(cleaned, cleaned)
    return name

print(f"Total name mappings: {len(name_mapping)}")

In [None]:
# Canonicalise the country columns of every dataset
for _side in ('home_team', 'away_team'):
    matches_df[_side] = matches_df[_side].apply(normalize_country_name)

rankings_df['country_full'] = rankings_df['country_full'].apply(normalize_country_name)

players_df['nationality_name'] = players_df['nationality_name'].apply(normalize_country_name)

# Every team that appears as home or away side
all_teams = set(matches_df['home_team'].unique()).union(matches_df['away_team'].unique())
print(f"Unique teams in matches: {len(all_teams)}")

# How many of those teams also have player records
player_countries = set(players_df['nationality_name'].unique())
matched_teams = all_teams.intersection(player_countries)
print(f"Teams with player data: {len(matched_teams)}")

## 4. Elo Rating Calculation

We calculate Elo ratings for all teams based on historical match results. Elo is a powerful predictor of team strength.

In [None]:
def calculate_elo_ratings(matches_df, k=32, initial_elo=1500):
    """
    Calculate Elo ratings for all teams from match history.

    Rows are processed in the order given, so `matches_df` must already be
    sorted chronologically. The dataframe itself is not modified.

    Args:
        matches_df: frame with 'home_team', 'away_team', 'home_score' and
            'away_score' columns.
        k: Elo K-factor (maximum rating change per match).
        initial_elo: rating assigned to a team the first time it appears.

    Returns:
        (ratings, home_elos, away_elos) where `ratings` maps team -> rating
        after the last row, and the two lists hold each row's PRE-match
        rating for the home and away side (one entry per row, row order).

    Matches with a missing score (e.g. fixtures not yet played) still get
    their pre-match ratings recorded but do not change any rating; they were
    previously scored as draws because every comparison with NaN is False.
    """
    elo = defaultdict(lambda: initial_elo)

    # Store Elo at time of each match
    home_elos = []
    away_elos = []

    # Iterate the four needed columns directly instead of building a Series
    # per row with iterrows().
    for home, away, home_score, away_score in zip(
        matches_df['home_team'],
        matches_df['away_team'],
        matches_df['home_score'],
        matches_df['away_score'],
    ):
        home_elo, away_elo = elo[home], elo[away]

        # Store pre-match Elo
        home_elos.append(home_elo)
        away_elos.append(away_elo)

        # No result to learn from: leave both ratings as they are
        if pd.isna(home_score) or pd.isna(away_score):
            continue

        # Expected scores
        exp_home = 1 / (1 + 10**((away_elo - home_elo) / 400))
        exp_away = 1 - exp_home

        # Actual scores (1=win, 0.5=draw, 0=loss)
        if home_score > away_score:
            actual_home, actual_away = 1, 0
        elif home_score < away_score:
            actual_home, actual_away = 0, 1
        else:
            actual_home, actual_away = 0.5, 0.5

        # Update Elo
        elo[home] += k * (actual_home - exp_home)
        elo[away] += k * (actual_away - exp_away)

    return dict(elo), home_elos, away_elos

# Run the Elo pass over the (date-sorted) match history
elo_ratings, home_elos, away_elos = calculate_elo_ratings(matches_df)

# Attach each match's pre-match ratings and their difference
matches_df['home_elo'] = home_elos
matches_df['away_elo'] = away_elos
matches_df['elo_diff'] = matches_df['home_elo'] - matches_df['away_elo']

print(f"Calculated Elo for {len(elo_ratings)} teams")
print("\nTop 20 teams by Elo:")
# Highest rating first; ties keep their original relative order
top_teams = sorted(elo_ratings.items(), key=lambda pair: -pair[1])[:20]
for rank, (team, rating) in enumerate(top_teams, start=1):
    print(f"{rank:2}. {team:25} {rating:.1f}")

## 5. Player Aggregation

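Aggregate player stats by country and FIFA version. For each country we keep the top 14 players by overall rating (an approximation of the players who actually feature in a match) and map each match year to a FIFA version, so matches are paired with player ratings from the same period and data leakage is limited.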

In [None]:
# Check FIFA versions available: number of player rows per fifa_version,
# listed in ascending version order.
print("FIFA versions in dataset:")
print(players_df['fifa_version'].value_counts().sort_index())

In [None]:
def get_fifa_version_for_year(match_year):
    """
    Translate a match year into the FIFA game version used for player stats.

    The version is the two-digit year plus one (2014 -> 15, 2022 -> 23),
    clamped to the 15..24 range available in the dataset: earlier years use
    FIFA 15 and later years use FIFA 24.
    """
    oldest_version, newest_version = 15, 24
    candidate = match_year - 2000 + 1
    return min(max(candidate, oldest_version), newest_version)

def get_player_stats_with_fallback(player_aggregates, country, target_version):
    """
    Return one country's aggregate row for `target_version`, or the nearest
    substitute.

    Order of preference:
    1. the exact FIFA version
    2. the newest version older than the target (no look-ahead)
    3. the oldest version newer than the target
    4. None when the country has no rows at all
    """
    rows = player_aggregates[player_aggregates['country'] == country]
    if len(rows) == 0:
        return None

    def _first_row(version):
        # First row recorded for this country at the given version
        return rows[rows['fifa_version'] == version].iloc[0]

    # Ascending, so the last "older" entry is the closest one below the
    # target and the first "newer" entry is the closest one above it.
    versions = sorted(rows['fifa_version'].unique())

    exact_rows = rows[rows['fifa_version'] == target_version]
    if len(exact_rows) > 0:
        return exact_rows.iloc[0]

    older = [v for v in versions if v < target_version]
    if older:
        return _first_row(older[-1])

    newer = [v for v in versions if v > target_version]
    if newer:
        return _first_row(newer[0])

    return None

def aggregate_players_by_country_year(players_df, top_n=14):
    """
    Aggregate player stats by country and FIFA version.

    For every (nationality_name, fifa_version) pair, the `top_n` players by
    'overall' rating are selected and summarised.

    Args:
        players_df: frame with 'nationality_name', 'fifa_version', 'overall',
            'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'
            and 'player_positions' columns.
        top_n: number of highest-rated players kept per country/version.

    Returns:
        DataFrame with one row per country/version and the columns listed in
        `output_cols` below. The columns are present even when there is
        nothing to aggregate.
    """
    # Rating columns carried through the aggregation
    stat_cols = ['overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']

    # Position codes are matched as WHOLE tokens. A plain substring search for
    # 'LW'/'RW' also hits 'LWB'/'RWB', which counted wing-backs as attackers.
    attacker_pattern = r'\b(?:ST|CF|LW|RW|LF|RF|CAM)\b'
    defender_pattern = r'\b(?:CB|LB|RB|LWB|RWB|CDM|GK)\b'

    output_cols = [
        'country', 'fifa_version', 'num_players',
        'avg_overall', 'max_overall',
        'avg_pace', 'avg_shooting', 'avg_passing',
        'avg_dribbling', 'avg_defending', 'avg_physic',
        'avg_attack_overall', 'avg_defense_overall',
    ]

    # Filter to valid data
    player_data = players_df[
        ['nationality_name', 'fifa_version'] + stat_cols + ['player_positions']
    ].copy()
    player_data = player_data.dropna(subset=['nationality_name', 'fifa_version', 'overall'])
    player_data['fifa_version'] = player_data['fifa_version'].astype(int)

    aggregations = []

    for (country, version), group in player_data.groupby(['nationality_name', 'fifa_version']):
        # Get top N players by overall rating
        top_players = group.nlargest(top_n, 'overall')

        if len(top_players) == 0:
            continue

        # Basic aggregations
        agg_dict = {
            'country': country,
            'fifa_version': version,
            'num_players': len(top_players),
            'avg_overall': top_players['overall'].mean(),
            'max_overall': top_players['overall'].max(),
            'avg_pace': top_players['pace'].mean(),
            'avg_shooting': top_players['shooting'].mean(),
            'avg_passing': top_players['passing'].mean(),
            'avg_dribbling': top_players['dribbling'].mean(),
            'avg_defending': top_players['defending'].mean(),
            'avg_physic': top_players['physic'].mean(),
        }

        # Attack/defense averages based on listed positions
        # Attackers: ST, CF, LW, RW, LF, RF, CAM
        # Defenders: CB, LB, RB, LWB, RWB, CDM, GK
        # A player listing positions from both groups counts in both.
        positions = top_players['player_positions'].fillna('').astype(str)
        attackers = top_players[positions.str.contains(attacker_pattern, case=False, regex=True)]
        defenders = top_players[positions.str.contains(defender_pattern, case=False, regex=True)]

        # Fall back to the squad-wide average when a group is empty
        agg_dict['avg_attack_overall'] = (
            attackers['overall'].mean() if len(attackers) > 0 else agg_dict['avg_overall']
        )
        agg_dict['avg_defense_overall'] = (
            defenders['overall'].mean() if len(defenders) > 0 else agg_dict['avg_overall']
        )

        aggregations.append(agg_dict)

    return pd.DataFrame(aggregations, columns=output_cols)

# Build the per-country, per-version aggregates
player_aggregates = aggregate_players_by_country_year(players_df)
available_versions = sorted(player_aggregates['fifa_version'].unique())

# Diagnostics: table size, versions present, countries covered
print(f"Player aggregates: {len(player_aggregates)} country-version combinations")
print(f"Available FIFA versions: {available_versions}")
print(f"Countries with data: {player_aggregates['country'].nunique()}")

# Warn when fewer than ten versions are present
if len(available_versions) < 10:
    print("\n".join([
        f"\nWARNING: Only {len(available_versions)} FIFA versions available.",
        "Fallback mechanism will be used for matches outside these years.",
        "This may introduce some data leakage for historical predictions.",
    ]))

player_aggregates.head(10)

In [None]:
# Spot-check that the headline World Cup sides have player aggregates
wc_teams = ['Brazil', 'Argentina', 'France', 'Germany', 'England', 'Spain', 'Netherlands', 'Portugal']

print("Player data coverage for key teams:")
for team in wc_teams:
    team_data = player_aggregates[player_aggregates['country'] == team]
    if len(team_data) == 0:
        print(f"{team}: NO DATA")
        continue
    print(f"{team}: {len(team_data)} versions, avg overall = {team_data['avg_overall'].mean():.1f}")

## 6. Feature Engineering

Create the full feature set by merging Elo ratings, player stats, and calculating recent form features.

In [None]:
def calculate_form_features(matches_df, team, match_date, n_matches=5):
    """
    Summarise a team's most recent results strictly before `match_date`.

    Looks at the team's last `n_matches` games (home or away) and returns
    (average goals scored, average goals conceded, share of games won).
    A team with no earlier games gets the defaults (1.5, 1.5, 0.33).
    (Legacy per-team helper; precompute_all_form_features covers batch use.)
    """
    earlier = matches_df[matches_df['date'] < match_date]

    # View each game from the team's side: goals_for / goals_against
    as_home = earlier.loc[
        earlier['home_team'] == team, ['date', 'home_score', 'away_score']
    ].rename(columns={'home_score': 'goals_for', 'away_score': 'goals_against'})
    as_away = earlier.loc[
        earlier['away_team'] == team, ['date', 'away_score', 'home_score']
    ].rename(columns={'away_score': 'goals_for', 'home_score': 'goals_against'})

    # Newest first, then keep the last N games
    recent = (
        pd.concat([as_home, as_away])
        .sort_values('date', ascending=False)
        .head(n_matches)
    )

    if len(recent) == 0:
        return 1.5, 1.5, 0.33  # Default values

    games_won = (recent['goals_for'] > recent['goals_against']).sum()
    return (
        recent['goals_for'].mean(),
        recent['goals_against'].mean(),
        games_won / len(recent),
    )


# =============================================================================
# OPTIMIZED FEATURE BUILDING (10-50x faster)
# =============================================================================

def precompute_all_form_features(matches_df, n_matches=5):
    """
    Pre-compute rolling form features for every team at every match date.

    Each match contributes two records (one per side). Per team, the records
    are ordered by date and the mean of the previous `n_matches` results is
    taken, excluding the match on that row (shift by one).

    Args:
        matches_df: frame with 'home_team', 'away_team', 'date',
            'home_score' and 'away_score' columns.
        n_matches: size of the rolling window.

    Returns:
        dict mapping (team, date) -> (avg_scored, avg_conceded, win_rate).
        A team's first match, where no history exists, maps to the defaults
        (1.5, 1.5, 0.33). An empty `matches_df` yields an empty dict.
        If a team has several matches on the same date, the last one
        processed wins the (team, date) key.
    """
    print("Pre-computing form features for all teams...")

    # Create expanded match records (each match = 2 records: one per team),
    # reading the needed columns directly rather than via iterrows().
    rows = []
    for home, away, played_on, home_goals, away_goals in zip(
        matches_df['home_team'],
        matches_df['away_team'],
        matches_df['date'],
        matches_df['home_score'],
        matches_df['away_score'],
    ):
        rows.append((home, played_on, home_goals, away_goals))
        rows.append((away, played_on, away_goals, home_goals))

    # Explicit column names keep this working when there are no rows at all
    records_df = pd.DataFrame(
        rows, columns=['team', 'date', 'goals_for', 'goals_against']
    ).sort_values(['team', 'date'])

    form_cache = {}
    default_form = (1.5, 1.5, 0.33)

    # One pass over the groups instead of re-filtering the full frame per team
    for team, team_matches in records_df.groupby('team', sort=False):
        team_matches = team_matches.sort_values('date')

        # Shift by 1 so the current match never contributes to its own form
        won = (team_matches['goals_for'] > team_matches['goals_against']).astype(int)
        scored_rolling = team_matches['goals_for'].shift(1).rolling(n_matches, min_periods=1).mean()
        conceded_rolling = team_matches['goals_against'].shift(1).rolling(n_matches, min_periods=1).mean()
        win_rate_rolling = won.shift(1).rolling(n_matches, min_periods=1).mean()

        for played_on, scored, conceded, win_rate in zip(
            team_matches['date'], scored_rolling, conceded_rolling, win_rate_rolling
        ):
            if pd.isna(scored):
                # No usable history before this match
                form_cache[(team, played_on)] = default_form
            else:
                form_cache[(team, played_on)] = (
                    scored,
                    conceded,
                    win_rate if not pd.isna(win_rate) else 0.33,
                )

    print(f"  Cached form for {len(form_cache):,} (team, date) combinations")
    return form_cache


def create_player_lookup(player_aggregates):
    """Index the aggregate rows in a dict keyed by (country, fifa_version as int)."""
    print("Creating player stats lookup index...")
    # A later row with the same key replaces an earlier one
    lookup = {
        (record['country'], int(record['fifa_version'])): record
        for _, record in player_aggregates.iterrows()
    }
    print(f"  Indexed {len(lookup):,} country-version combinations")
    return lookup


def get_player_stats_fast(lookup, country, target_version):
    """Fetch a country's stats from the lookup dict, falling back to the nearest version.

    An exact (country, version) hit is a single dict access. On a miss the
    keys are scanned for that country's versions, preferring the newest
    earlier version, then the oldest later one. Returns None when the
    country has no entry.
    """
    wanted = (country, target_version)
    if wanted in lookup:
        return lookup[wanted]

    # Miss: collect every version stored for this country
    versions = [version for (name, version) in lookup if name == country]
    if not versions:
        return None

    below = [v for v in versions if v < target_version]
    if below:
        return lookup[(country, max(below))]

    above = [v for v in versions if v > target_version]
    return lookup[(country, min(above))] if above else None


# Smoke-test the form calculation on one team and date
test_team = 'Brazil'
test_date = pd.Timestamp('2022-11-20')
scored, conceded, win_rate = calculate_form_features(matches_df, test_team, test_date)
print("\n".join([
    f"{test_team} form before {test_date.date()}:",
    f"  Avg goals scored: {scored:.2f}",
    f"  Avg goals conceded: {conceded:.2f}",
    f"  Win rate: {win_rate:.1%}",
]))

In [None]:
def build_feature_dataset_fast(matches_df, player_aggregates, form_cache=None, player_lookup=None):
    """
    Build one feature row per match from Elo, player aggregates and recent form.

    Args:
        matches_df: match frame; the loop reads 'home_team', 'away_team',
            'date', 'year', 'home_elo', 'away_elo', 'elo_diff', 'neutral',
            'tournament', 'home_score' and 'away_score' from each row.
        player_aggregates: per-country/version aggregates, used only to build
            `player_lookup` when none is supplied.
        form_cache: optional dict (team, date) -> (scored, conceded, win_rate);
            computed with precompute_all_form_features when None.
        player_lookup: optional dict (country, fifa_version) -> stats row;
            computed with create_player_lookup when None.

    Returns:
        DataFrame with the model features, the two targets ('home_goals',
        'away_goals') and underscore-prefixed metadata columns ('_home_team',
        '_away_team', '_date', '_tournament'). Matches where either side has
        no player stats at any version are skipped and counted in the final
        progress message.

    Key optimizations:
    1. Pre-computed form features (dict lookup per match)
    2. Indexed player stats (dict lookup on an exact version hit)
    3. Progress reporting
    """
    import time
    start_time = time.time()
    
    # Pre-compute caches if not provided
    if form_cache is None:
        form_cache = precompute_all_form_features(matches_df)
    if player_lookup is None:
        player_lookup = create_player_lookup(player_aggregates)
    
    print("Building feature dataset (optimized)...")
    
    features_list = []
    skipped = 0
    total = len(matches_df)
    # Used when a (team, date) pair is absent from form_cache
    default_form = (1.5, 1.5, 0.33)
    
    for i, (idx, match) in enumerate(matches_df.iterrows()):
        # Progress line every 2000 matches, with a linear time-remaining estimate
        if (i + 1) % 2000 == 0:
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed
            remaining = (total - i - 1) / rate
            print(f"  Processed {i+1:,}/{total:,} ({100*(i+1)/total:.0f}%) - {remaining:.0f}s remaining")
        
        home_team = match['home_team']
        away_team = match['away_team']
        match_date = match['date']
        match_year = match['year']
        
        # Player ratings are taken from the FIFA version mapped to the match year
        fifa_version = get_fifa_version_for_year(match_year)
        
        # Player stats lookup (falls back to the nearest version on a miss)
        home_players = get_player_stats_fast(player_lookup, home_team, fifa_version)
        away_players = get_player_stats_fast(player_lookup, away_team, fifa_version)
        
        # Both sides need player stats; otherwise the match is dropped
        if home_players is None or away_players is None:
            skipped += 1
            continue
        
        # Form lookup keyed by (team, match date)
        home_form = form_cache.get((home_team, match_date), default_form)
        away_form = form_cache.get((away_team, match_date), default_form)
        
        # NOTE: key order here becomes the column order of the returned frame;
        # keep it stable if downstream code derives the feature list from it.
        features = {
            'home_elo': match['home_elo'],
            'away_elo': match['away_elo'],
            'elo_diff': match['elo_diff'],
            'home_avg_overall': home_players['avg_overall'],
            'home_max_overall': home_players['max_overall'],
            'home_avg_attack': home_players['avg_attack_overall'],
            'home_avg_defense': home_players['avg_defense_overall'],
            'home_avg_pace': home_players['avg_pace'],
            'home_avg_shooting': home_players['avg_shooting'],
            'home_avg_passing': home_players['avg_passing'],
            'away_avg_overall': away_players['avg_overall'],
            'away_max_overall': away_players['max_overall'],
            'away_avg_attack': away_players['avg_attack_overall'],
            'away_avg_defense': away_players['avg_defense_overall'],
            'away_avg_pace': away_players['avg_pace'],
            'away_avg_shooting': away_players['avg_shooting'],
            'away_avg_passing': away_players['avg_passing'],
            'overall_diff': home_players['avg_overall'] - away_players['avg_overall'],
            'attack_diff': home_players['avg_attack_overall'] - away_players['avg_attack_overall'],
            'defense_diff': home_players['avg_defense_overall'] - away_players['avg_defense_overall'],
            'home_form_scored': home_form[0],
            'home_form_conceded': home_form[1],
            'home_form_win_rate': home_form[2],
            'away_form_scored': away_form[0],
            'away_form_conceded': away_form[1],
            'away_form_win_rate': away_form[2],
            'is_neutral': 1 if match['neutral'] else 0,
            # Substring tests: ANY tournament name containing the text is flagged.
            # NOTE(review): if the data has names such as "FIFA World Cup
            # qualification" or "UEFA Euro qualification", those rows are
            # flagged too — confirm that is intended. Also verify the
            # continental names below match the dataset's exact spelling
            # (accents, "Africa" vs "African").
            'is_world_cup': 1 if 'FIFA World Cup' in str(match['tournament']) else 0,
            'is_continental': 1 if any(x in str(match['tournament']) for x in 
                                       ['UEFA Euro', 'Copa America', 'Africa Cup', 'AFC Asian Cup']) else 0,
            # Regression targets
            'home_goals': match['home_score'],
            'away_goals': match['away_score'],
            # Metadata (underscore prefix marks these as non-features)
            '_home_team': home_team,
            '_away_team': away_team,
            '_date': match_date,
            '_tournament': match['tournament'],
        }
        
        features_list.append(features)
    
    elapsed = time.time() - start_time
    print(f"  Completed in {elapsed:.1f}s! {len(features_list):,} matches, {skipped:,} skipped")
    return pd.DataFrame(features_list)


# Legacy function for backwards compatibility
def build_feature_dataset(matches_df, player_aggregates, elo_ratings_history=None):
    """Backwards-compatible entry point that forwards to build_feature_dataset_fast.

    `elo_ratings_history` is accepted for old call sites and is not used.
    """
    return build_feature_dataset_fast(
        matches_df=matches_df,
        player_aggregates=player_aggregates,
    )


# Build feature dataset using OPTIMIZED version.
# No form cache or player lookup is passed, so both are built inside the call.
print("Building feature dataset...")
feature_df = build_feature_dataset_fast(matches_df, player_aggregates)
print(f"\nFeature dataset: {len(feature_df)} matches with complete features")

In [None]:
# Quick overview of the feature table
print("Feature dataset summary:")
print(f"Shape: {feature_df.shape}")
print(f"\nDate range: {feature_df['_date'].min()} to {feature_df['_date'].max()}")
print(f"\nWorld Cup matches: {feature_df['is_world_cup'].sum()}")

# Model inputs = everything except metadata (underscore prefix) and the targets
_targets = ['home_goals', 'away_goals']
feature_cols = [c for c in feature_df.columns if not c.startswith('_') and c not in _targets]

# Per-column count of missing values; only columns with gaps are listed
print(f"\nMissing values:")
missing = feature_df[feature_cols].isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("None")

## 7. Model Training

Train dual XGBoost regressors: one for home team goals, one for away team goals.

In [None]:
# Prepare features and targets
# Model inputs = every column except metadata (underscore prefix) and the targets
feature_cols = [c for c in feature_df.columns if not c.startswith('_') and c not in ['home_goals', 'away_goals']]

X = feature_df[feature_cols].copy()
y_home = feature_df['home_goals'].copy()
y_away = feature_df['away_goals'].copy()

# Split: use 2022+ as test set (including 2022 World Cup)
train_mask = feature_df['_date'] < '2022-01-01'
test_mask = feature_df['_date'] >= '2022-01-01'

# Fill any remaining NaN values with the TRAINING-period column means.
# Using means over the whole frame would let test-period values leak into
# the training inputs. A column that is entirely NaN in the training period
# stays NaN, which XGBoost treats as missing.
train_means = X[train_mask].mean()
X = X.fillna(train_means)

print(f"Features: {len(feature_cols)}")
print(feature_cols)

X_train, X_test = X[train_mask], X[test_mask]
y_home_train, y_home_test = y_home[train_mask], y_home[test_mask]
y_away_train, y_away_test = y_away[train_mask], y_away[test_mask]

print(f"\nTrain set: {len(X_train)} matches")
print(f"Test set: {len(X_test)} matches")

In [None]:
# Shared hyperparameters for both models (GPU-accelerated with improved settings)
xgb_params = {
    'n_estimators': 2000,           # Increased with early stopping
    'learning_rate': 0.01,          # Lower for more precise learning
    'max_depth': 4,                 # Reduced to prevent overfitting
    'subsample': 0.7,               # Slightly lower for robustness
    'colsample_bytree': 0.7,
    'random_state': 42,
    'device': xgb_device,           # GPU acceleration (cuda or cpu)
    'early_stopping_rounds': 50,    # Stop if no improvement for 50 rounds
}

print(f"Training with device: {xgb_device}")
print(f"Hyperparameters: {xgb_params}")

# Early stopping must not look at the test set: choosing the number of trees
# on the very data that is later used for evaluation makes the reported test
# metrics optimistic. Instead, hold out the most recent slice of the
# TRAINING period as a validation set.
EARLY_STOP_VAL_FRACTION = 0.15
# Order the training rows by match date so the validation slice is the latest one
train_order = feature_df.loc[X_train.index, '_date'].sort_values(kind='stable').index
n_val = max(1, int(len(train_order) * EARLY_STOP_VAL_FRACTION))
fit_idx, val_idx = train_order[:-n_val], train_order[-n_val:]

X_fit, X_val = X_train.loc[fit_idx], X_train.loc[val_idx]
y_home_fit, y_home_val = y_home_train.loc[fit_idx], y_home_train.loc[val_idx]
y_away_fit, y_away_val = y_away_train.loc[fit_idx], y_away_train.loc[val_idx]
print(f"Fitting on {len(X_fit)} matches, early stopping on {len(X_val)} validation matches")

# Train Home Goals Model
model_home = XGBRegressor(**xgb_params)
model_home.fit(
    X_fit, y_home_fit,
    eval_set=[(X_val, y_home_val)],
    verbose=100  # Print progress every 100 trees
)
print(f"Home Goals Model: stopped at {model_home.best_iteration} trees")

# Train Away Goals Model (same parameters)
model_away = XGBRegressor(**xgb_params)
model_away.fit(
    X_fit, y_away_fit,
    eval_set=[(X_val, y_away_val)],
    verbose=100
)
print(f"Away Goals Model: stopped at {model_away.best_iteration} trees")

In [None]:
# Score both regressors on the held-out matches
y_home_pred = model_home.predict(X_test)
y_away_pred = model_away.predict(X_test)

print("Model Performance:")
for _heading, _truth, _guess in (
    ("\nHome Goals Model:", y_home_test, y_home_pred),
    ("\nAway Goals Model:", y_away_test, y_away_pred),
):
    print(_heading)
    print(f"  RMSE: {np.sqrt(mean_squared_error(_truth, _guess)):.3f}")
    print(f"  MAE:  {mean_absolute_error(_truth, _guess):.3f}")

# Match outcome accuracy: compare the sign of the goal difference
# (+1 home win, 0 draw, -1 away win) between actual and predicted scores
actual_outcomes = np.sign(y_home_test.values - y_away_test.values)
pred_outcomes = np.sign(y_home_pred - y_away_pred)
outcome_accuracy = np.mean(actual_outcomes == pred_outcomes)

print(f"\nMatch Outcome Accuracy: {outcome_accuracy:.1%}")

In [None]:
# Feature importance: 15 most important features of each model, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Sorted ascending so the largest bar ends up at the top of the horizontal chart
importance_home = pd.DataFrame({
    'feature': feature_cols,
    'importance': model_home.feature_importances_
}).sort_values('importance', ascending=True).tail(15)

importance_away = pd.DataFrame({
    'feature': feature_cols,
    'importance': model_away.feature_importances_
}).sort_values('importance', ascending=True).tail(15)

for _ax, _table, _title in zip(
    axes,
    (importance_home, importance_away),
    ('Home Goals Model - Feature Importance', 'Away Goals Model - Feature Importance'),
):
    _ax.barh(_table['feature'], _table['importance'])
    _ax.set_title(_title)
    _ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()

## 8. Save Model Artifacts

Save all trained components for later inference without retraining.

In [None]:
def save_model_artifacts(output_dir=None):
    """Save all components needed for inference without retraining.

    Reads the notebook-level objects ``model_home``, ``model_away``,
    ``elo_ratings``, ``player_aggregates``, ``name_mapping``, ``feature_cols``
    and ``matches_df`` and writes them into ``output_dir``.

    Parameters:
    - output_dir: Target directory; defaults to the configured MODEL_PATH.
      It is created if it does not exist.

    Files written:
    - model_home_goals.joblib / model_away_goals.joblib: trained models
    - elo_ratings.json: current Elo rating per team
    - player_aggregates.csv: player aggregates for FIFA version 24 only
    - country_name_map.json: country name mapping
    - feature_columns.json: feature column order used for training
    - recent_form.json: per-team avg_scored / avg_conceded / win_rate
    """
    # Use configured MODEL_PATH if no output_dir specified
    if output_dir is None:
        output_dir = MODEL_PATH

    os.makedirs(output_dir, exist_ok=True)

    def artifact_path(name):
        return os.path.join(output_dir, name)

    def dump_json(obj, name, **json_kwargs):
        with open(artifact_path(name), 'w') as f:
            json.dump(obj, f, **json_kwargs)

    # 1. Save trained XGBoost models
    joblib.dump(model_home, artifact_path('model_home_goals.joblib'))
    joblib.dump(model_away, artifact_path('model_away_goals.joblib'))

    # 2. Save current Elo ratings
    dump_json(elo_ratings, 'elo_ratings.json', indent=2)

    # 3. Save player aggregates. Only FIFA 24 rows are kept, so countries
    #    without a FIFA 24 entry are absent from the saved file.
    latest_players = player_aggregates[player_aggregates['fifa_version'] == 24].copy()
    latest_players.to_csv(artifact_path('player_aggregates.csv'), index=False)

    # 4. Save country name mapping
    dump_json(name_mapping, 'country_name_map.json', indent=2)

    # 5. Save feature column order (written without indentation, as before)
    dump_json(feature_cols, 'feature_columns.json')

    # 6. Save recent form stats. The cutoff is one day after the last match in
    #    the data and is the same for every team, so compute it once instead
    #    of re-scanning the date column on each loop iteration.
    form_cutoff = matches_df['date'].max() + pd.Timedelta(days=1)
    recent_form = {}
    for team in elo_ratings:
        scored, conceded, win_rate = calculate_form_features(
            matches_df, team, form_cutoff)
        recent_form[team] = {
            'avg_scored': scored,
            'avg_conceded': conceded,
            'win_rate': win_rate
        }
    dump_json(recent_form, 'recent_form.json', indent=2)

    print(f"All artifacts saved to {output_dir}")
    if USE_GOOGLE_DRIVE:
        print("Models saved to Google Drive - they will persist after runtime disconnects")
    print(f"Files: {os.listdir(output_dir)}")

save_model_artifacts()

In [None]:
# --- Export Teams Metadata for Web App ---
# This cell exports teams_metadata.json with ISO 2-letter codes for flag URLs

# Team name -> lowercase flag code for the flag CDN (flagcdn.com/w80/{code}.png).
# Entries are two-letter country codes, except the UK home nations, which use
# the hyphenated codes 'gb-eng', 'gb-nir', 'gb-sct' and 'gb-wls'.
# The trailing "Additional variations" entries map alternative spellings of a
# team name to the same code as the primary spelling.
ISO_CODES = {
    'Afghanistan': 'af', 'Albania': 'al', 'Algeria': 'dz', 'Andorra': 'ad',
    'Angola': 'ao', 'Argentina': 'ar', 'Armenia': 'am', 'Australia': 'au',
    'Austria': 'at', 'Azerbaijan': 'az', 'Bahrain': 'bh', 'Bangladesh': 'bd',
    'Belarus': 'by', 'Belgium': 'be', 'Benin': 'bj', 'Bolivia': 'bo',
    'Bosnia and Herzegovina': 'ba', 'Botswana': 'bw', 'Brazil': 'br',
    'Bulgaria': 'bg', 'Burkina Faso': 'bf', 'Burundi': 'bi', 'Cambodia': 'kh',
    'Cameroon': 'cm', 'Canada': 'ca', 'Cape Verde': 'cv', 'Central African Republic': 'cf',
    'Chad': 'td', 'Chile': 'cl', 'China': 'cn', 'Colombia': 'co',
    'Comoros': 'km', 'Congo': 'cg', 'Costa Rica': 'cr', 'Croatia': 'hr',
    'Cuba': 'cu', 'Curaçao': 'cw', 'Cyprus': 'cy', 'Czech Republic': 'cz', 'DR Congo': 'cd',
    'Denmark': 'dk', 'Djibouti': 'dj', 'Dominican Republic': 'do', 'Ecuador': 'ec',
    'Egypt': 'eg', 'El Salvador': 'sv', 'England': 'gb-eng', 'Equatorial Guinea': 'gq',
    'Eritrea': 'er', 'Estonia': 'ee', 'Eswatini': 'sz', 'Ethiopia': 'et',
    'Fiji': 'fj', 'Finland': 'fi', 'France': 'fr', 'Gabon': 'ga',
    'Gambia': 'gm', 'Georgia': 'ge', 'Germany': 'de', 'Ghana': 'gh',
    'Greece': 'gr', 'Guatemala': 'gt', 'Guinea': 'gn', 'Guinea-Bissau': 'gw',
    'Haiti': 'ht', 'Honduras': 'hn', 'Hungary': 'hu', 'Iceland': 'is',
    'India': 'in', 'Indonesia': 'id', 'Iran': 'ir', 'Iraq': 'iq',
    'Ireland': 'ie', 'Israel': 'il', 'Italy': 'it', 'Ivory Coast': 'ci',
    'Jamaica': 'jm', 'Japan': 'jp', 'Jordan': 'jo', 'Kazakhstan': 'kz',
    'Kenya': 'ke', 'Kosovo': 'xk', 'Kuwait': 'kw', 'Kyrgyzstan': 'kg',
    'Laos': 'la', 'Latvia': 'lv', 'Lebanon': 'lb', 'Lesotho': 'ls',
    'Liberia': 'lr', 'Libya': 'ly', 'Liechtenstein': 'li', 'Lithuania': 'lt',
    'Luxembourg': 'lu', 'Madagascar': 'mg', 'Malawi': 'mw', 'Malaysia': 'my',
    'Maldives': 'mv', 'Mali': 'ml', 'Malta': 'mt', 'Mauritania': 'mr',
    'Mauritius': 'mu', 'Mexico': 'mx', 'Moldova': 'md', 'Mongolia': 'mn',
    'Montenegro': 'me', 'Morocco': 'ma', 'Mozambique': 'mz', 'Myanmar': 'mm',
    'Namibia': 'na', 'Nepal': 'np', 'Netherlands': 'nl', 'New Zealand': 'nz',
    'Nicaragua': 'ni', 'Niger': 'ne', 'Nigeria': 'ng', 'North Korea': 'kp',
    'North Macedonia': 'mk', 'Northern Ireland': 'gb-nir', 'Norway': 'no',
    'Oman': 'om', 'Pakistan': 'pk', 'Palestine': 'ps', 'Panama': 'pa',
    'Papua New Guinea': 'pg', 'Paraguay': 'py', 'Peru': 'pe', 'Philippines': 'ph',
    'Poland': 'pl', 'Portugal': 'pt', 'Qatar': 'qa', 'Romania': 'ro',
    'Russia': 'ru', 'Rwanda': 'rw', 'Saudi Arabia': 'sa', 'Scotland': 'gb-sct',
    'Senegal': 'sn', 'Serbia': 'rs', 'Sierra Leone': 'sl', 'Singapore': 'sg',
    'Slovakia': 'sk', 'Slovenia': 'si', 'Solomon Islands': 'sb', 'Somalia': 'so',
    'South Africa': 'za', 'South Korea': 'kr', 'South Sudan': 'ss', 'Spain': 'es',
    'Sri Lanka': 'lk', 'Sudan': 'sd', 'Suriname': 'sr', 'Sweden': 'se',
    'Switzerland': 'ch', 'Syria': 'sy', 'Tajikistan': 'tj', 'Tanzania': 'tz',
    'Thailand': 'th', 'Togo': 'tg', 'Trinidad and Tobago': 'tt', 'Tunisia': 'tn',
    'Turkey': 'tr', 'Turkmenistan': 'tm', 'Uganda': 'ug', 'Ukraine': 'ua',
    'United Arab Emirates': 'ae', 'United States': 'us', 'Uruguay': 'uy',
    'Uzbekistan': 'uz', 'Venezuela': 've', 'Vietnam': 'vn', 'Wales': 'gb-wls',
    'Yemen': 'ye', 'Zambia': 'zm', 'Zimbabwe': 'zw',
    # Additional variations
    'USA': 'us', 'Korea Republic': 'kr', 'Republic of Ireland': 'ie',
    "Cote d'Ivoire": 'ci', 'Czechia': 'cz', 'Türkiye': 'tr',
}

def export_teams_metadata():
    """Export teams metadata with ISO codes for web app flag display.

    Includes every team that has both an Elo rating and FIFA 24 player data.
    Each entry holds ``name``, ``iso_code``, ``elo_rating`` (rounded to one
    decimal) and ``color_code``. The list is sorted by Elo rating, highest
    first, and written to ``{MODEL_PATH}/teams_metadata.json``.

    Returns:
    - The list of team metadata dicts that was written.
    """
    os.makedirs(MODEL_PATH, exist_ok=True)

    # Get teams that have BOTH Elo AND player data (FIFA 24)
    player_countries = set(player_aggregates[player_aggregates['fifa_version'] == 24]['country'].unique())
    available_teams = [t for t in elo_ratings if t in player_countries]

    # Build the name -> colour lookup once rather than filtering the whole
    # frame for every team. setdefault keeps the first row for a name, which
    # matches taking element [0] of the filtered values.
    color_by_team = {}
    for name, code in zip(country_names_df['current_name'], country_names_df['color_code']):
        color_by_team.setdefault(name, code)

    default_color = '#CCCCCC'
    guessed_iso = []

    # Build metadata list
    teams_data = []
    for team in available_teams:
        iso_code = ISO_CODES.get(team)
        if iso_code is None:
            # Best-effort guess only: the first two letters of a name are
            # often NOT the team's real code, so record it for the warning.
            iso_code = team.lower()[:2]
            guessed_iso.append(team)

        # A missing (None/NaN) colour would be serialised as NaN, which is not
        # valid JSON for the web app, so fall back to the default grey.
        color = color_by_team.get(team)
        if pd.isna(color):
            color = default_color

        teams_data.append({
            'name': team,
            'iso_code': iso_code,
            'elo_rating': round(elo_ratings[team], 1),
            'color_code': color
        })

    # Sort by Elo rating (highest first)
    teams_data.sort(key=lambda x: x['elo_rating'], reverse=True)

    filepath = f'{MODEL_PATH}/teams_metadata.json'
    with open(filepath, 'w') as f:
        json.dump(teams_data, f, indent=2)

    if guessed_iso:
        print(f"Warning: no ISO_CODES entry for {len(guessed_iso)} team(s); "
              f"flag code guessed from the name: {sorted(guessed_iso)}")
    print(f"Exported {len(teams_data)} teams to {filepath}")
    return teams_data

# Write the metadata file, then list the ten highest-rated teams
teams_metadata = export_teams_metadata()
print("\nTop 10 teams by Elo:")
for rank, entry in enumerate(teams_metadata[:10], start=1):
    print(f"{rank:2}. {entry['name']:20} (Elo: {entry['elo_rating']}, ISO: {entry['iso_code']})")

In [None]:
# --- Export World Cup Group Presets for Web App ---
# This cell exports group configurations for preset tournaments

def export_wc_presets():
    """Export World Cup group presets for quick loading in web app.

    Writes two JSON files into SIMULATIONS_PATH:
    - wc2022_groups.json: the 2022 tournament (32 teams, 8 groups)
    - wc2026_groups.json: the 2026 tournament (48 teams, 12 groups)

    Every team name is passed through normalize_country_name before saving.

    Returns:
    - Tuple (wc2022_groups_normalized, wc2026_groups_normalized), each a dict
      mapping group letter to a list of normalized team names.
    """
    os.makedirs(SIMULATIONS_PATH, exist_ok=True)

    def write_preset(raw_groups, filename, name, fmt, label, note=None):
        """Normalize one preset's team names, save it, and return the groups."""
        normalized = {
            group: [normalize_country_name(team) for team in teams]
            for group, teams in raw_groups.items()
        }
        payload = {'name': name, 'format': fmt, 'groups': normalized}
        if note is not None:
            payload['note'] = note
        filepath = f'{SIMULATIONS_PATH}/{filename}'
        with open(filepath, 'w') as f:
            json.dump(payload, f, indent=2)
        print(f"Exported {label} groups to {filepath}")
        return normalized

    # 2022 World Cup Groups (32 teams, 8 groups) - Actual tournament
    wc2022_groups = {
        "A": ["Qatar", "Ecuador", "Senegal", "Netherlands"],
        "B": ["England", "Iran", "United States", "Wales"],
        "C": ["Argentina", "Saudi Arabia", "Mexico", "Poland"],
        "D": ["France", "Australia", "Denmark", "Tunisia"],
        "E": ["Spain", "Costa Rica", "Germany", "Japan"],
        "F": ["Belgium", "Canada", "Morocco", "Croatia"],
        "G": ["Brazil", "Serbia", "Switzerland", "Cameroon"],
        "H": ["Portugal", "Ghana", "Uruguay", "South Korea"]
    }

    # 2026 World Cup Groups (48 teams, 12 groups) - Official Draw December 2025
    # (the earlier "December 2024" label was wrong: the final draw took place
    # in December 2025).
    # Playoff spots filled with likely qualifiers:
    # UEFA Playoff D: Wales, UEFA Playoff A: Ukraine, UEFA Playoff B: Turkey, UEFA Playoff C: Greece
    # FIFA Playoff 1: Jamaica, FIFA Playoff 2: Bolivia
    wc2026_groups = {
        "A": ["Mexico", "South Africa", "South Korea", "Wales"],
        "B": ["Canada", "Ukraine", "Qatar", "Switzerland"],
        "C": ["Brazil", "Morocco", "Haiti", "Scotland"],
        "D": ["United States", "Paraguay", "Australia", "Greece"],
        "E": ["Germany", "Curaçao", "Ivory Coast", "Ecuador"],
        "F": ["Netherlands", "Japan", "Turkey", "Tunisia"],
        "G": ["Belgium", "Egypt", "Iran", "New Zealand"],
        "H": ["Spain", "Cape Verde", "Saudi Arabia", "Uruguay"],
        "I": ["France", "Senegal", "Norway", "Bolivia"],
        "J": ["Argentina", "Algeria", "Austria", "Jordan"],
        "K": ["Portugal", "Jamaica", "Uzbekistan", "Colombia"],
        "L": ["England", "Croatia", "Ghana", "Panama"]
    }

    wc2022_groups_normalized = write_preset(
        wc2022_groups, 'wc2022_groups.json',
        name='2022 FIFA World Cup', fmt='32_team', label='2022 World Cup')

    wc2026_groups_normalized = write_preset(
        wc2026_groups, 'wc2026_groups.json',
        name='2026 FIFA World Cup', fmt='48_team', label='2026 World Cup',
        note='Official draw Dec 2025. Playoff spots: Wales, Ukraine, Turkey, Greece (UEFA), Jamaica, Bolivia (FIFA)')

    return wc2022_groups_normalized, wc2026_groups_normalized

# Write both preset files, then echo the groups that were saved
wc22_groups, wc26_groups = export_wc_presets()

# The 2022 groups print in stored order; the 2026 groups print sorted by letter.
for heading, ordered_groups in (
    ("\n2022 World Cup Groups:", wc22_groups.items()),
    ("\n2026 World Cup Official Groups:", sorted(wc26_groups.items())),
):
    print(heading)
    for group_letter, members in ordered_groups:
        print(f"  Group {group_letter}: {', '.join(members)}")

In [None]:
# --- Simulation Results Persistence ---
# Functions to save and load tournament simulation results

def save_simulation_results(champions, finalists, semifinalists, filename, metadata=None):
    """
    Save tournament simulation results to file.

    The file is written as JSON into SIMULATIONS_PATH (created if missing).
    A ``saved_at`` ISO timestamp is added to the stored metadata; the caller's
    ``metadata`` dict is left unmodified.

    Parameters:
    - champions: Counter of championship wins
    - finalists: Counter of finals appearances
    - semifinalists: Counter of semifinal appearances
    - filename: Name of the file (without path)
    - metadata: Optional dict with additional info (n_sims, format, date, etc.)

    Returns:
    - Path of the file that was written
    """
    from datetime import datetime

    os.makedirs(SIMULATIONS_PATH, exist_ok=True)

    filepath = f'{SIMULATIONS_PATH}/{filename}'

    # Work on a copy: stamping the timestamp directly onto `metadata` would
    # leak a 'saved_at' key into the dict the caller passed in.
    stored_metadata = dict(metadata) if metadata else {}
    stored_metadata['saved_at'] = datetime.now().isoformat()

    results = {
        'champions': dict(champions),
        'finalists': dict(finalists),
        'semifinalists': dict(semifinalists),
        'metadata': stored_metadata
    }

    with open(filepath, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Simulation results saved to {filepath}")
    return filepath

def load_simulation_results(filename):
    """
    Read previously saved tournament simulation results.

    Parameters:
    - filename: Name of the file inside SIMULATIONS_PATH (without path)

    Returns:
    - Tuple of (champions, finalists, semifinalists, metadata), where the first
      three are Counters, or None if the file doesn't exist
    """
    filepath = f'{SIMULATIONS_PATH}/{filename}'

    # Guard clause: nothing saved under this name yet.
    if not os.path.exists(filepath):
        print(f"No saved simulation found at {filepath}")
        return None

    with open(filepath, 'r') as f:
        payload = json.load(f)

    champions, finalists, semifinalists = (
        Counter(payload[stage])
        for stage in ('champions', 'finalists', 'semifinalists')
    )
    metadata = payload.get('metadata', {})

    print(f"Loaded simulation results from {filepath}")
    # Echo whichever of the well-known metadata fields are present and non-empty.
    for field, label in (('saved_at', 'Saved at'),
                         ('n_sims', 'Simulations'),
                         ('format', 'Format')):
        if metadata.get(field):
            print(f"  {label}: {metadata[field]}")

    return champions, finalists, semifinalists, metadata

def simulation_exists(filename):
    """Return True when a saved simulation named `filename` is present in SIMULATIONS_PATH."""
    return os.path.exists(f'{SIMULATIONS_PATH}/{filename}')

# Confirm the persistence helpers are defined and show where results will go
print("Simulation persistence functions ready")
print(f"Simulations will be saved to: {SIMULATIONS_PATH}")

## 9. Poisson Match Simulator

Use each team's predicted goals as the rate parameter (λ) of a Poisson distribution. Monte Carlo sampling from those distributions then gives robust win/draw/loss probability estimates.

In [None]:
def simulate_match(home_goals_pred, away_goals_pred, n_sims=10000):
    """
    Monte Carlo a single fixture by sampling Poisson-distributed goal counts.

    Each side's predicted goals (floored at 0.1) is the Poisson rate. Sampling
    goes through the `xp` array module and the samples are brought back with
    `to_numpy`.

    Returns a dict with the home-win / draw / away-win frequencies, the two
    expected-goal inputs as passed in, and the raw simulated goal arrays.
    """
    def draw_goals(expected_goals):
        # Floor the rate so a zero or negative prediction still gives a valid lambda.
        rate = max(0.1, expected_goals)
        return to_numpy(xp.random.poisson(rate, size=n_sims))

    # Home is sampled before away, keeping the random stream order fixed.
    home_sample = draw_goals(home_goals_pred)
    away_sample = draw_goals(away_goals_pred)

    outcome = {
        'home_win_prob': (home_sample > away_sample).mean(),
        'draw_prob': (home_sample == away_sample).mean(),
        'away_win_prob': (home_sample < away_sample).mean(),
    }
    outcome['expected_home_goals'] = home_goals_pred
    outcome['expected_away_goals'] = away_goals_pred
    outcome['simulated_home_goals'] = home_sample
    outcome['simulated_away_goals'] = away_sample
    return outcome

def predict_match(home_team, away_team, model_home, model_away, 
                  player_aggregates, elo_ratings, recent_form,
                  is_neutral=True, is_world_cup=True, n_sims=10000):
    """
    Predict one fixture and return its simulated outcome distribution.

    Builds a one-row feature frame from squad ratings, Elo and recent form,
    asks both goal models for expected goals, and passes those to
    simulate_match. Returns the simulate_match dict extended with 'home_team'
    and 'away_team', or None (after printing a warning) when either side has
    no player data.
    """
    # Squad ratings are requested for FIFA version 24 via the fallback helper.
    latest_version = 24
    squads = {
        'home': get_player_stats_with_fallback(player_aggregates, home_team, latest_version),
        'away': get_player_stats_with_fallback(player_aggregates, away_team, latest_version),
    }
    if squads['home'] is None or squads['away'] is None:
        print(f"Warning: Missing player data for {home_team} or {away_team}")
        return None

    # Unknown teams get a 1500 Elo and a neutral form record.
    elos = {
        'home': elo_ratings.get(home_team, 1500),
        'away': elo_ratings.get(away_team, 1500),
    }
    neutral_form = {'avg_scored': 1.5, 'avg_conceded': 1.5, 'win_rate': 0.33}
    forms = {
        'home': recent_form.get(home_team, neutral_form),
        'away': recent_form.get(away_team, neutral_form),
    }

    # (feature suffix, source key) pairs shared by the home and away sides.
    squad_fields = (
        ('avg_overall', 'avg_overall'),
        ('max_overall', 'max_overall'),
        ('avg_attack', 'avg_attack_overall'),
        ('avg_defense', 'avg_defense_overall'),
        ('avg_pace', 'avg_pace'),
        ('avg_shooting', 'avg_shooting'),
        ('avg_passing', 'avg_passing'),
    )
    form_fields = (
        ('form_scored', 'avg_scored'),
        ('form_conceded', 'avg_conceded'),
        ('form_win_rate', 'win_rate'),
    )
    diff_fields = (
        ('overall_diff', 'avg_overall'),
        ('attack_diff', 'avg_attack_overall'),
        ('defense_diff', 'avg_defense_overall'),
    )

    row = {}
    for side in ('home', 'away'):
        row[f'{side}_elo'] = elos[side]
        for suffix, source_key in squad_fields:
            row[f'{side}_{suffix}'] = squads[side][source_key]
        for suffix, source_key in form_fields:
            row[f'{side}_{suffix}'] = forms[side][source_key]

    row['elo_diff'] = elos['home'] - elos['away']
    for diff_name, source_key in diff_fields:
        row[diff_name] = squads['home'][source_key] - squads['away'][source_key]

    row['is_neutral'] = 1 if is_neutral else 0
    row['is_world_cup'] = 1 if is_world_cup else 0
    row['is_continental'] = 0

    # Selecting by feature_cols puts the columns in the training order.
    features = pd.DataFrame([row])[feature_cols]

    expected_home = model_home.predict(features)[0]
    expected_away = model_away.predict(features)[0]

    result = simulate_match(expected_home, expected_away, n_sims)
    result['home_team'] = home_team
    result['away_team'] = away_team
    return result

print("Poisson simulator ready")

In [None]:
# --- Vectorized Match Simulation ---
# Batch simulate multiple matches simultaneously on GPU

def simulate_matches_vectorized(home_lambdas, away_lambdas, n_sims=10000):
    """
    Batch version of the Poisson match simulator.

    Draws an (n_matches, n_sims) grid of goal counts for each side through the
    `xp` array module and reduces along the simulation axis.

    Args:
        home_lambdas: Array of expected home goals (n_matches,)
        away_lambdas: Array of expected away goals (n_matches,)
        n_sims: Number of simulations per match

    Returns:
        home_win_probs, draw_probs, away_win_probs (n_matches,), each passed
        through `to_numpy`
    """
    grid_shape = (len(home_lambdas), n_sims)

    # Floor every rate at 0.1 so each lambda is a valid Poisson parameter.
    home_rates = xp.maximum(0.1, xp.asarray(home_lambdas))
    away_rates = xp.maximum(0.1, xp.asarray(away_lambdas))

    # One column of rates broadcast across the simulation axis; home is
    # sampled before away.
    home_goals = xp.random.poisson(home_rates[:, None], size=grid_shape)
    away_goals = xp.random.poisson(away_rates[:, None], size=grid_shape)

    outcome_masks = (
        home_goals > away_goals,
        home_goals == away_goals,
        home_goals < away_goals,
    )
    home_wins, draws, away_wins = (mask.mean(axis=1) for mask in outcome_masks)

    return to_numpy(home_wins), to_numpy(draws), to_numpy(away_wins)

print("Vectorized match simulation function ready")


In [None]:
# Read back the recent-form stats saved with the model artifacts
with open(f'{MODEL_PATH}/recent_form.json', 'r') as f:
    recent_form = json.load(f)

# Smoke-test the predictor on a single fixture
result = predict_match(
    'Brazil', 'Argentina', model_home, model_away,
    player_aggregates, elo_ratings, recent_form,
)

# predict_match returns None when player data is missing; print nothing then.
if result:
    summary_lines = [
        f"\n{result['home_team']} vs {result['away_team']}",
        f"Expected Goals: {result['expected_home_goals']:.2f} - {result['expected_away_goals']:.2f}",
        "\nProbabilities:",
        f"  {result['home_team']} wins: {result['home_win_prob']:.1%}",
        f"  Draw:              {result['draw_prob']:.1%}",
        f"  {result['away_team']} wins: {result['away_win_prob']:.1%}",
    ]
    print("\n".join(summary_lines))

## 10. 2022 World Cup Validation

Test the model on the 2022 World Cup to assess accuracy.

In [None]:
# Pull the 2022 World Cup fixtures out of the feature table:
# World Cup matches dated between 1 Nov and 31 Dec 2022 (both inclusive).
wc22_date_window = feature_df['_date'].between('2022-11-01', '2022-12-31')
wc22_matches = feature_df[wc22_date_window & (feature_df['is_world_cup'] == 1)].copy()

print(f"2022 World Cup matches in dataset: {len(wc22_matches)}")

if len(wc22_matches) > 0:
    # Score every fixture with both goal models
    X_wc = wc22_matches[feature_cols]
    wc22_matches['pred_home_goals'] = model_home.predict(X_wc)
    wc22_matches['pred_away_goals'] = model_away.predict(X_wc)

    # Outcome accuracy: compare the sign of the actual and predicted goal margins
    actual = np.sign(wc22_matches['home_goals'] - wc22_matches['away_goals'])
    predicted = np.sign(wc22_matches['pred_home_goals'] - wc22_matches['pred_away_goals'])
    accuracy = np.mean(actual.values == predicted.values)

    wc22_home_rmse = np.sqrt(mean_squared_error(wc22_matches['home_goals'], wc22_matches['pred_home_goals']))
    wc22_away_rmse = np.sqrt(mean_squared_error(wc22_matches['away_goals'], wc22_matches['pred_away_goals']))

    print(f"\nMatch outcome accuracy: {accuracy:.1%}")
    print(f"Home goals RMSE: {wc22_home_rmse:.3f}")
    print(f"Away goals RMSE: {wc22_away_rmse:.3f}")

In [None]:
# Per-match table of predicted vs actual scores and results
if len(wc22_matches) > 0:
    display_cols = ['_home_team', '_away_team', 'home_goals', 'away_goals', 
                    'pred_home_goals', 'pred_away_goals']
    display_headers = ['Home', 'Away', 'Actual Home', 'Actual Away', 'Pred Home', 'Pred Away']
    display_df = wc22_matches[display_cols].rename(
        columns=dict(zip(display_cols, display_headers)))

    # Predictions are rounded to 2 decimals BEFORE the predicted result is
    # derived, so predictions that round to the same value count as a draw.
    display_df[['Pred Home', 'Pred Away']] = display_df[['Pred Home', 'Pred Away']].round(2)

    def _result_labels(home_goals, away_goals):
        # 'Home' / 'Away' for a strict lead, 'Draw' otherwise.
        return np.select([home_goals > away_goals, home_goals < away_goals],
                         ['Home', 'Away'], default='Draw')

    display_df['Actual Result'] = _result_labels(display_df['Actual Home'], display_df['Actual Away'])
    display_df['Pred Result'] = _result_labels(display_df['Pred Home'], display_df['Pred Away'])
    display_df['Correct'] = display_df['Actual Result'] == display_df['Pred Result']

    print("2022 World Cup Predictions vs Actuals:")
    print(display_df.to_string(index=False))

In [None]:
# 2022 World Cup Tournament Simulation (32-team format)
# NOTE: If running for the first time, run the Section 11 cells first so that
# simulate_tournament_vectorized is defined.

# This cell calls simulate_tournament_vectorized, so that is the name that has
# to exist (checking for simulate_tournament could pass and still NameError).
SIMULATION_FUNCS_AVAILABLE = 'simulate_tournament_vectorized' in globals()

# Load 2022 World Cup groups (prefer DATA_PATH, fall back to the working directory)
wc22_json_path = f'{DATA_PATH}/wc22.json' if os.path.exists(f'{DATA_PATH}/wc22.json') else 'wc22.json'
with open(wc22_json_path, 'r') as f:
    wc22_groups = json.load(f)

# Get all 2022 WC teams and normalize their names to match our data
wc22_teams_raw = []
for group, teams in wc22_groups.items():
    wc22_teams_raw.extend(teams)

# IMPORTANT: Normalize team names from wc22.json to match our normalized data
wc22_teams = [normalize_country_name(t) for t in wc22_teams_raw]

print(f"WC22 teams (normalized): {wc22_teams}")

# Countries with FIFA 24 player data, computed once instead of re-filtering
# player_aggregates for every team.
wc22_fifa24_countries = set(
    player_aggregates[player_aggregates['fifa_version'] == 24]['country'].values)

# Filter to teams we have data for
wc22_teams_available = [t for t in wc22_teams
                        if t in elo_ratings and t in wc22_fifa24_countries]

# Debug: show which teams are missing
missing_teams = [t for t in wc22_teams if t not in wc22_teams_available]
if missing_teams:
    print(f"Missing teams: {missing_teams}")
    for t in missing_teams:
        in_elo = t in elo_ratings
        in_players = t in wc22_fifa24_countries
        print(f"  {t}: in_elo={in_elo}, in_players={in_players}")

print(f"2022 World Cup teams with data: {len(wc22_teams_available)}/{len(wc22_teams)}")

# Check for saved simulation results
WC22_SIM_FILE = 'wc2022_simulation.json'
N_SIMS_2022 = 10000  # Increased from 1000 for smoother probability estimates

if len(wc22_teams_available) >= 32:
    # Number of tournaments behind the counts being displayed. It differs from
    # N_SIMS_2022 when results were saved by a run with another setting.
    wc22_n_sims = N_SIMS_2022

    # Try to load existing simulation
    loaded = load_simulation_results(WC22_SIM_FILE) if simulation_exists(WC22_SIM_FILE) else None

    if loaded is not None:
        wc22_champions, wc22_finalists, wc22_semifinalists, wc22_metadata = loaded
        wc22_n_sims = wc22_metadata.get('n_sims') or N_SIMS_2022
        print(f"\nLoaded existing 2022 World Cup simulation")
    elif not SIMULATION_FUNCS_AVAILABLE:
        print("\nWARNING: simulate_tournament_vectorized not defined yet.")
        print("Please run Section 11 cells first, then re-run this cell.")
        print("Or, if you have saved results, they will be loaded automatically.")
        wc22_champions = None
    else:
        print(f"\nRunning 2022 World Cup simulation (32-team format, {N_SIMS_2022} tournaments)...")
        # Use vectorized simulation for GPU acceleration
        wc22_champions, wc22_finalists, wc22_semifinalists = simulate_tournament_vectorized(
            wc22_teams_available, n_tournament_sims=N_SIMS_2022, format='32_team'
        )

        # Save results
        save_simulation_results(
            wc22_champions, wc22_finalists, wc22_semifinalists,
            WC22_SIM_FILE,
            metadata={
                'tournament': '2022 FIFA World Cup',
                'format': '32_team',
                'n_sims': N_SIMS_2022,
                'teams': wc22_teams_available
            }
        )

    # Display results
    if wc22_champions is not None:
        print("\n2022 World Cup Simulation Results:")
        print("-" * 40)
        print("Championship Probability (Top 10):")
        for i, (team, count) in enumerate(wc22_champions.most_common(10), 1):
            # Divide by the run's own simulation count so loaded results from
            # a run with a different n_sims still show correct percentages.
            prob = count / wc22_n_sims * 100
            print(f"{i:2}. {team:20} {prob:5.1f}%")

        print("\nActual Result: Argentina won the 2022 World Cup")
else:
    print(f"Not enough teams with data for 32-team simulation")

## 11. 2026 World Cup Tournament Simulation

Simulate the full 2026 World Cup tournament with bracket predictions.

**Tournament Formats Supported:**
- **32-team format** (2018, 2022): 8 groups of 4, top 2 advance to Round of 16
- **48-team format** (2026): 12 groups of 4, top 2 + 8 best third-place advance to Round of 32

In [None]:
# The 2026 World Cup has 48 teams in 12 groups of 4.
# This cell builds a simplified 48-team field from a hand-picked projection of
# likely qualifiers, topped up by Elo rating where needed.

# Top teams likely to qualify (based on Elo ratings and typical qualifiers)
WC2026_PROJECTED_TEAMS = [
    # CONMEBOL (6-7 spots)
    'Brazil', 'Argentina', 'Uruguay', 'Colombia', 'Ecuador', 'Chile',
    # UEFA (16 spots)
    'France', 'England', 'Spain', 'Germany', 'Netherlands', 'Portugal',
    'Belgium', 'Italy', 'Croatia', 'Switzerland', 'Denmark', 'Austria',
    'Poland', 'Serbia', 'Ukraine', 'Sweden',
    # CONCACAF (6-7 spots including hosts)
    'United States', 'Mexico', 'Canada', 'Costa Rica', 'Jamaica', 'Panama',
    # AFC (8-9 spots)
    'Japan', 'South Korea', 'Iran', 'Australia', 'Saudi Arabia', 'Qatar',
    'United Arab Emirates', 'Iraq',
    # CAF (9-10 spots)
    'Morocco', 'Senegal', 'Nigeria', 'Egypt', 'Cameroon', 'Algeria',
    'Tunisia', 'Ivory Coast', 'Ghana', 'Mali',
    # OFC (1-2 spots)
    'New Zealand',
]

# Usable teams are those with FIFA 24 player data AND an Elo rating
fifa24_player_rows = player_aggregates[player_aggregates['fifa_version'] == 24]
available_teams = set(fifa24_player_rows['country'].unique()) & set(elo_ratings.keys())

qualified_teams = [t for t in WC2026_PROJECTED_TEAMS if t in available_teams]
print(f"Projected qualified teams with data: {len(qualified_teams)}")

# Top up to 48 with the highest-Elo usable teams not already selected
if len(qualified_teams) < 48:
    remaining = sorted(
        [(t, elo_ratings[t]) for t in available_teams if t not in qualified_teams],
        key=lambda team_and_elo: team_and_elo[1], reverse=True
    )
    open_slots = 48 - len(qualified_teams)
    qualified_teams += [t for t, _ in remaining[:open_slots]]

# Cap the field at exactly 48 entries
qualified_teams = qualified_teams[:48]
print(f"Final team count: {len(qualified_teams)}")

In [None]:
def simulate_group_stage(groups, model_home, model_away, player_aggregates, 
                         elo_ratings, recent_form, n_sims=1000):
    """
    Simulate group stage matches and return standings.

    Every pair of teams in a group plays once (round robin). For each match,
    predict_match produces `n_sims` simulated scorelines and ONE of them is
    drawn uniformly at random as the match result. Points, wins, goals for and
    goal difference all come from that single scoreline, so they are always
    consistent with each other. The chance of each outcome equals the
    win/draw/loss frequency reported by predict_match.

    Previously the outcome was sampled but goals were taken from the expected
    goals regardless of the outcome, so an upset winner could be credited a
    negative goal difference for a match it won.

    Parameters:
    - groups: dict mapping group name -> list of team names
    - model_home, model_away, player_aggregates, elo_ratings, recent_form:
      passed through to predict_match
    - n_sims: scorelines simulated per match (must be at least 1)

    Returns:
    - dict mapping group name -> list of (team, stats) tuples sorted best
      first by points, then goal difference, then goals for. `stats` has the
      keys 'points', 'gd', 'gf' and 'wins'.

    Matches for which predict_match returns None are skipped.
    """
    group_results = {}

    for group_name, teams in groups.items():
        # Initialize points and goal difference
        standings = {team: {'points': 0, 'gd': 0, 'gf': 0, 'wins': 0} for team in teams}

        # Play all group matches (round robin)
        for team_a, team_b in itertools.combinations(teams, 2):
            result = predict_match(team_a, team_b, model_home, model_away,
                                   player_aggregates, elo_ratings, recent_form,
                                   is_neutral=True, is_world_cup=True, n_sims=n_sims)

            if result is None:
                continue

            # Draw one simulated scoreline; the same index is used for both
            # sides so the pair is a scoreline that was actually simulated.
            sim_a = result['simulated_home_goals']
            sim_b = result['simulated_away_goals']
            pick = np.random.randint(len(sim_a))
            goals_a = int(sim_a[pick])
            goals_b = int(sim_b[pick])

            for team, scored, conceded in ((team_a, goals_a, goals_b),
                                           (team_b, goals_b, goals_a)):
                row = standings[team]
                row['gf'] += scored
                row['gd'] += scored - conceded
                if scored > conceded:
                    row['points'] += 3
                    row['wins'] += 1
                elif scored == conceded:
                    row['points'] += 1

        # Sort by points, then goal difference, then goals for
        sorted_teams = sorted(standings.items(), 
                              key=lambda x: (x[1]['points'], x[1]['gd'], x[1]['gf']), 
                              reverse=True)
        group_results[group_name] = sorted_teams

    return group_results

def simulate_knockout_match(team_a, team_b, model_home, model_away, player_aggregates,
                            elo_ratings, recent_form, n_sims=1000):
    """
    Simulate a knockout match (no draws allowed) and return the winner's name.

    The match is predicted on neutral ground as a World Cup fixture. A draw is
    treated as a coin flip: half of the draw probability is credited to each
    side, so team_a advances with probability home_win_prob + draw_prob / 2.

    If predict_match returns None, the team with the higher Elo rating
    advances (unknown teams count as 1500); team_b advances on equal ratings.
    """
    result = predict_match(team_a, team_b, model_home, model_away,
                           player_aggregates, elo_ratings, recent_form,
                           is_neutral=True, is_world_cup=True, n_sims=n_sims)

    if result is None:
        # Fallback: use Elo to decide
        return team_a if elo_ratings.get(team_a, 1500) > elo_ratings.get(team_b, 1500) else team_b

    # Split the draw probability evenly between the two teams. (A normalized
    # win probability used to be computed first, but it was overwritten by
    # this value before being used, so that dead code has been removed.)
    adj_home_win = result['home_win_prob'] + result['draw_prob'] / 2

    if np.random.random() < adj_home_win:
        return team_a
    return team_b

print("Knockout simulation functions ready")

In [None]:
# =============================================================================
# VECTORIZED TOURNAMENT SIMULATION (GPU-Accelerated)
# =============================================================================
# These functions replace the serial simulation with parallel GPU operations

def build_single_match_features(home_team, away_team, player_aggregates, elo_ratings, recent_form):
    """
    Assemble the feature row for one fixture.

    Player statistics are looked up for FIFA version 24 through
    ``get_player_stats_with_fallback``. Elo defaults to 1500 and recent form
    defaults to 1.5 scored / 1.5 conceded / 0.33 win rate when a team is
    missing from the respective mapping. The context flags are fixed to a
    neutral-venue World Cup match.

    Returns:
        dict of feature name -> value, or ``None`` when player statistics are
        unavailable for either team.
    """
    fifa_version = 24

    # Both lookups run before the availability check.
    squad_home = get_player_stats_with_fallback(player_aggregates, home_team, fifa_version)
    squad_away = get_player_stats_with_fallback(player_aggregates, away_team, fifa_version)
    if squad_home is None or squad_away is None:
        return None

    rating_home = elo_ratings.get(home_team, 1500)
    rating_away = elo_ratings.get(away_team, 1500)

    # Neutral form values used for teams with no recent-form entry.
    unknown_form = {'avg_scored': 1.5, 'avg_conceded': 1.5, 'win_rate': 0.33}
    form_home = recent_form.get(home_team, unknown_form)
    form_away = recent_form.get(away_team, unknown_form)

    # Built section by section; insertion order matches the original row layout.
    features = {
        'home_elo': rating_home,
        'away_elo': rating_away,
        'elo_diff': rating_home - rating_away,
    }
    features.update({
        'home_avg_overall': squad_home['avg_overall'],
        'home_max_overall': squad_home['max_overall'],
        'home_avg_attack': squad_home['avg_attack_overall'],
        'home_avg_defense': squad_home['avg_defense_overall'],
        'home_avg_pace': squad_home['avg_pace'],
        'home_avg_shooting': squad_home['avg_shooting'],
        'home_avg_passing': squad_home['avg_passing'],
    })
    features.update({
        'away_avg_overall': squad_away['avg_overall'],
        'away_max_overall': squad_away['max_overall'],
        'away_avg_attack': squad_away['avg_attack_overall'],
        'away_avg_defense': squad_away['avg_defense_overall'],
        'away_avg_pace': squad_away['avg_pace'],
        'away_avg_shooting': squad_away['avg_shooting'],
        'away_avg_passing': squad_away['avg_passing'],
    })
    features.update({
        'overall_diff': squad_home['avg_overall'] - squad_away['avg_overall'],
        'attack_diff': squad_home['avg_attack_overall'] - squad_away['avg_attack_overall'],
        'defense_diff': squad_home['avg_defense_overall'] - squad_away['avg_defense_overall'],
    })
    features.update({
        'home_form_scored': form_home['avg_scored'],
        'home_form_conceded': form_home['avg_conceded'],
        'home_form_win_rate': form_home['win_rate'],
        'away_form_scored': form_away['avg_scored'],
        'away_form_conceded': form_away['avg_conceded'],
        'away_form_win_rate': form_away['win_rate'],
    })
    features.update({
        'is_neutral': 1,
        'is_world_cup': 1,
        'is_continental': 0,
    })
    return features


def build_all_match_features(matchups, player_aggregates, elo_ratings, recent_form, feature_cols):
    """
    Build the model input frame for every matchup in one pass.

    Intended to run once before the simulation loop. Matchups for which
    ``build_single_match_features`` returns ``None`` are dropped.

    Returns:
        (features, valid_matchups): a DataFrame restricted to ``feature_cols``
        with one row per retained matchup, and the retained
        ``(home_team, away_team)`` tuples in the same order. When nothing is
        retained the result is ``(pd.DataFrame(), [])``.
    """
    kept = []  # (matchup, feature-row) pairs, in input order
    for home_team, away_team in matchups:
        row = build_single_match_features(
            home_team, away_team, player_aggregates, elo_ratings, recent_form
        )
        if row is None:
            continue
        kept.append(((home_team, away_team), row))

    if not kept:
        return pd.DataFrame(), []

    valid_matchups = [pairing for pairing, _ in kept]
    frame = pd.DataFrame([row for _, row in kept])
    return frame[feature_cols], valid_matchups


def simulate_all_groups_vectorized(groups, home_preds, away_preds, n_sims):
    """
    Play every group-stage fixture for all simulated tournaments at once.

    Each pair of teams in a group meets once. Goals for both sides are drawn
    from Poisson distributions (rate floored at 0.1) as length-``n_sims``
    vectors, so one draw covers that fixture in every simulated tournament.

    Args:
        groups: dict of group_name -> list of team names
        home_preds: dict (team_a, team_b) -> expected goals for team_a
        away_preds: dict (team_a, team_b) -> expected goals for team_b
        n_sims: number of tournament simulations

    Returns:
        group_standings: dict of group_name -> (n_sims, n_teams) array; row s
            holds indices into the group's team list, best-ranked first
        teams_by_group: dict of group_name -> list of team names
    """
    fallback_rate = 1.5  # used when neither ordering of a fixture was predicted
    group_standings = {}
    teams_by_group = {}

    for group_name, members in groups.items():
        teams_by_group[group_name] = members
        size = len(members)

        # Per-simulation tallies, one column per team in the group.
        points = xp.zeros((n_sims, size), dtype=xp.int64)
        goal_diff = xp.zeros((n_sims, size), dtype=xp.float64)
        goals_for = xp.zeros((n_sims, size), dtype=xp.float64)

        # Single round-robin: every unordered pair plays exactly once.
        for first, second in itertools.combinations(range(size), 2):
            fixture = (members[first], members[second])
            mirrored = fixture[::-1]

            if fixture in home_preds:
                rate_first = home_preds[fixture]
                rate_second = away_preds[fixture]
            elif mirrored in home_preds:
                # Prediction stored with the teams swapped: swap the rates back.
                rate_first = away_preds[mirrored]
                rate_second = home_preds[mirrored]
            else:
                rate_first = fallback_rate
                rate_second = fallback_rate

            scored_first = xp.random.poisson(max(0.1, rate_first), size=n_sims).astype(xp.float64)
            scored_second = xp.random.poisson(max(0.1, rate_second), size=n_sims).astype(xp.float64)

            # Outcome masks as int64 so they can be added to the points tally.
            first_won = (scored_first > scored_second).astype(xp.int64)
            level = (scored_first == scored_second).astype(xp.int64)
            second_won = (scored_first < scored_second).astype(xp.int64)

            points[:, first] += 3 * first_won + level
            points[:, second] += 3 * second_won + level
            goal_diff[:, first] += scored_first - scored_second
            goal_diff[:, second] += scored_second - scored_first
            goals_for[:, first] += scored_first
            goals_for[:, second] += scored_second

        # Fold points, goal difference and goals scored into one weighted key
        # (points weighted most, goals scored least) and rank descending.
        sort_key = points.astype(xp.float64) * 10000 + goal_diff * 10 + goals_for * 0.01
        group_standings[group_name] = xp.argsort(-sort_key, axis=1)

    return group_standings, teams_by_group


def simulate_knockout_match_vectorized(team_a_indices, team_b_indices, teams_list,
                                        home_preds, away_preds, n_sims):
    """
    Decide one knockout tie in each of ``n_sims`` simulations.

    ``team_a_indices`` / ``team_b_indices`` may be per-simulation sequences of
    indices into ``teams_list`` or a single index shared by all simulations.
    Despite the name, the simulations are processed one at a time in a loop.

    Returns:
        int32 array of length ``n_sims`` holding the winning team's index.
    """
    a_varies = hasattr(team_a_indices, '__iter__')
    b_varies = hasattr(team_b_indices, '__iter__')
    winners = xp.zeros(n_sims, dtype=xp.int32)

    for sim in range(n_sims):
        idx_a = int(team_a_indices[sim] if a_varies else team_a_indices)
        idx_b = int(team_b_indices[sim] if b_varies else team_b_indices)
        fixture = (teams_list[idx_a], teams_list[idx_b])
        mirrored = fixture[::-1]

        # Expected goals, accepting a prediction stored in either team order.
        if fixture in home_preds:
            rate_a, rate_b = home_preds[fixture], away_preds[fixture]
        elif mirrored in home_preds:
            rate_a, rate_b = away_preds[mirrored], home_preds[mirrored]
        else:
            rate_a, rate_b = 1.5, 1.5

        goals_a = xp.random.poisson(max(0.1, rate_a))
        goals_b = xp.random.poisson(max(0.1, rate_b))

        if goals_a == goals_b:
            # Level after the simulated match: shootout-style draw, tilted
            # toward the side with the higher expected goals.
            a_advances = xp.random.random() < 0.5 + (rate_a - rate_b) * 0.05
        else:
            a_advances = goals_a > goals_b

        winners[sim] = idx_a if a_advances else idx_b

    return winners


def _expected_goals_for_pair(team_a, team_b, home_preds, away_preds):
    """Return (rate_a, rate_b) for a fixture, accepting either stored team order; 1.5 each if unknown."""
    if (team_a, team_b) in home_preds:
        return home_preds[(team_a, team_b)], away_preds[(team_a, team_b)]
    if (team_b, team_a) in home_preds:
        # Stored with the teams swapped: swap the rates back.
        return away_preds[(team_b, team_a)], home_preds[(team_b, team_a)]
    return 1.5, 1.5


def _play_knockout_tie(team_a, team_b, home_preds, away_preds):
    """Draw Poisson goals for both sides (NumPy RNG) and return the winner; level scores are a 50/50 flip."""
    rate_a, rate_b = _expected_goals_for_pair(team_a, team_b, home_preds, away_preds)
    goals_a = np.random.poisson(max(0.1, rate_a))
    goals_b = np.random.poisson(max(0.1, rate_b))
    if goals_a > goals_b:
        return team_a
    if goals_b > goals_a:
        return team_b
    # Penalty shootout
    return team_a if np.random.random() < 0.5 else team_b


def simulate_tournament_vectorized(teams, n_tournament_sims=10000, format='48_team'):
    """
    Run many tournament simulations with a vectorized group stage.

    Steps:
    1. Expected goals for every pair of participating teams are predicted once.
    2. Groups are filled by serpentine seeding on Elo (same groups every run).
    3. The group stage is simulated for all tournaments at once via
       ``simulate_all_groups_vectorized`` (CuPy or NumPy, depending on ``xp``).
    4. Knockout rounds are played per tournament in a Python loop on the CPU.

    Args:
        teams: iterable of team names; the top ``n_teams`` by Elo are used.
        n_tournament_sims: number of tournaments to simulate.
        format: '32_team' (8 groups) or '48_team' (12 groups). The name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        (champions, finalists, semifinalists): Counters of team -> number of
        simulations in which the team won the final / won a semi-final /
        won a quarter-final.

    Raises:
        ValueError: if ``format`` is not one of the two supported values.

    Note:
        For the 48-team format the group standings carry no points, so the
        third-placed teams are handed to ``create_48_team_bracket`` in group
        order rather than ranked by performance.
    """
    print(f"Running vectorized tournament simulation ({format}, {n_tournament_sims} tournaments)...")
    print(f"Using {'GPU (CuPy)' if USE_GPU else 'CPU (NumPy)'} for simulation")

    sorted_teams = sorted(teams, key=lambda t: elo_ratings.get(t, 1500), reverse=True)

    # Set format parameters
    if format == '32_team':
        n_groups, n_teams = 8, 32
    elif format == '48_team':
        n_groups, n_teams = 12, 48
    else:
        raise ValueError(f"Unknown format: {format}")
    group_letters = [chr(65 + i) for i in range(n_groups)]  # A-H or A-L

    # Keep only the strongest n_teams entrants (no padding is performed).
    sorted_teams = sorted_teams[:n_teams]
    if len(sorted_teams) < n_teams:
        print(f"Warning: Only {len(sorted_teams)} teams available")

    # 1. Pre-compute all match predictions ONCE
    print("Pre-computing match predictions...")
    all_matchups = list(itertools.combinations(sorted_teams, 2))
    match_features, valid_matchups = build_all_match_features(
        all_matchups, player_aggregates, elo_ratings, recent_form, feature_cols
    )

    if len(match_features) == 0:
        print("Error: No valid matchups found")
        return Counter(), Counter(), Counter()

    home_goals_pred = model_home.predict(match_features)
    away_goals_pred = model_away.predict(match_features)

    # Lookup tables: (team_a, team_b) -> expected goals for each side
    home_preds = dict(zip(valid_matchups, home_goals_pred))
    away_preds = dict(zip(valid_matchups, away_goals_pred))

    print(f"Computed predictions for {len(home_preds)} matchups")

    # 2. Create groups (same for all simulations - serpentine seeding)
    groups = {f'Group {letter}': [] for letter in group_letters}
    for i, team in enumerate(sorted_teams):
        pot = i // n_groups
        if pot % 2 == 0:
            group_idx = i % n_groups
        else:
            group_idx = (n_groups - 1) - (i % n_groups)
        groups[f'Group {group_letters[group_idx]}'].append(team)

    # 3. Vectorized group stage
    print("Simulating group stage...")
    group_standings, teams_by_group = simulate_all_groups_vectorized(
        groups, home_preds, away_preds, n_tournament_sims
    )

    # 4. Knockout rounds, one tournament at a time
    print("Simulating knockout rounds...")
    champions = Counter()
    finalists = Counter()
    semifinalists = Counter()

    # Convert to numpy for easier indexing
    group_standings_np = {k: to_numpy(v) for k, v in group_standings.items()}

    # Rounds played after the opening knockout round and before the final.
    later_rounds = ['R16', 'QF', 'SF'] if format == '48_team' else ['QF', 'SF']

    for sim_idx in range(n_tournament_sims):
        # The bracket builders index groups by bare letter ('A', 'B', ...),
        # so key this simulation's standings by letter, not by 'Group A'.
        sim_group_results = {}
        for letter in group_letters:
            group_name = f'Group {letter}'
            rankings = group_standings_np[group_name][sim_idx]
            team_list = teams_by_group[group_name]
            sim_group_results[letter] = [(team_list[int(idx)], {}) for idx in rankings]

        # Create bracket
        if format == '32_team':
            bracket_pairs = create_32_team_bracket(sim_group_results)
        else:
            # Third-placed teams in group order; points/GD are placeholders.
            third_place = [
                (standings[2][0], 0, 0, f'Group {letter}')
                for letter, standings in sim_group_results.items()
                if len(standings) > 2
            ]
            bracket_pairs = create_48_team_bracket(sim_group_results, third_place)

        # Opening knockout round (R32 for 48 teams, R16 for 32 teams)
        current_round = [
            _play_knockout_tie(pair[0], pair[1], home_preds, away_preds)
            for pair in bracket_pairs
        ]

        # Intermediate rounds: adjacent winners meet; an odd team out is dropped.
        for round_name in later_rounds:
            next_round = []
            for i in range(0, len(current_round) - 1, 2):
                winner = _play_knockout_tie(
                    current_round[i], current_round[i + 1], home_preds, away_preds
                )
                next_round.append(winner)

                if round_name == 'QF':
                    semifinalists[winner] += 1
                elif round_name == 'SF':
                    finalists[winner] += 1

            current_round = next_round

        # Final
        if len(current_round) >= 2:
            champion = _play_knockout_tie(
                current_round[0], current_round[1], home_preds, away_preds
            )
            champions[champion] += 1

        # Progress update
        if (sim_idx + 1) % 1000 == 0:
            print(f"  Completed {sim_idx + 1}/{n_tournament_sims} simulations")

    print("Simulation complete!")
    return champions, finalists, semifinalists

print("Vectorized tournament simulation functions ready")


In [None]:
def create_32_team_bracket(group_results):
    """
    Create the Round of 16 pairings for the 32-team format.

    Pairings, in returned order:
        Left side:  1A vs 2B, 1C vs 2D, 1E vs 2F, 1G vs 2H
        Right side: 1B vs 2A, 1D vs 2C, 1F vs 2E, 1H vs 2G

    Args:
        group_results: dict of group -> standings, where standings is a list
            of ``(team, stats)`` tuples ordered best first. Groups may be
            keyed by bare letter (``'A'``) or with a ``'Group '`` prefix
            (``'Group A'``), which is how the simulators in this file name
            them.

    Returns:
        List of eight ``(group_winner, runner_up)`` tuples.

    Raises:
        KeyError: if any of groups A-H is missing.
    """
    prefix = 'Group '

    def _letter(name):
        # Normalise 'Group A' -> 'A'; leave any other key untouched.
        if isinstance(name, str) and name.startswith(prefix):
            return name[len(prefix):]
        return name

    by_letter = {_letter(name): standings for name, standings in group_results.items()}

    def _winner(letter):
        return by_letter[letter][0][0]

    def _runner_up(letter):
        return by_letter[letter][1][0]

    neighbours = [('A', 'B'), ('C', 'D'), ('E', 'F'), ('G', 'H')]

    # Left bracket: winner of the first group vs runner-up of its neighbour.
    pairs = [(_winner(first), _runner_up(second)) for first, second in neighbours]
    # Right bracket: the mirrored fixtures.
    pairs.extend((_winner(second), _runner_up(first)) for first, second in neighbours)

    return pairs


def create_48_team_bracket(group_results, best_third_place):
    """
    Create the Round of 32 pairings for the 48-team format.

    Structure: 12 group winners + 12 runners-up + 8 third-placed teams = 32.
    This is a simplified bracket, not the official FIFA slot table:

    - Matches 1-8: winners of A-H meet the runner-up of the neighbouring
      group (A<->B, C<->D, E<->F, G<->H). If that runner-up has already been
      used, the next unused third-placed team is substituted.
    - Matches 9-12: winners of I-L meet the runner-up of the neighbouring
      group (I<->J, K<->L).
    - Remaining matches: runners-up and third-placed teams not yet used are
      paired off in order.

    Args:
        group_results: dict of group -> standings (list of ``(team, stats)``
            tuples, best first). Groups may be keyed by bare letter (``'A'``)
            or with a ``'Group '`` prefix (``'Group A'``).
        best_third_place: sequence of tuples whose first element is a team
            name, already ordered best first; only the first 8 are used.

    Returns:
        List of at most 16 ``(team, team)`` tuples.

    Raises:
        KeyError: if any of groups A-L is missing.
    """
    prefix = 'Group '

    def _letter(name):
        # Normalise 'Group A' -> 'A'; leave any other key untouched.
        if isinstance(name, str) and name.startswith(prefix):
            return name[len(prefix):]
        return name

    by_letter = {_letter(name): standings for name, standings in group_results.items()}

    # Extract group winners and runners-up
    winners = {group: standings[0][0] for group, standings in by_letter.items()}
    runners_up = {group: standings[1][0] for group, standings in by_letter.items()}

    # Best 8 third-place teams (caller supplies them already sorted)
    third_place_teams = [entry[0] for entry in best_third_place[:8]]

    used_teams = set()
    pairs = []
    third_idx = 0

    # Matches 1-8: winners A-H vs the neighbouring group's runner-up,
    # falling back to a third-placed team if that runner-up is taken.
    neighbour_of = [('A', 'B'), ('B', 'A'), ('C', 'D'), ('D', 'C'),
                    ('E', 'F'), ('F', 'E'), ('G', 'H'), ('H', 'G')]
    for group, opponent_group in neighbour_of:
        if runners_up[opponent_group] not in used_teams:
            opponent = runners_up[opponent_group]
        elif third_idx < len(third_place_teams):
            opponent = third_place_teams[third_idx]
            third_idx += 1
        else:
            # No opponent available: this winner is left without a fixture.
            continue
        pairs.append((winners[group], opponent))
        used_teams.add(winners[group])
        used_teams.add(opponent)

    # Matches 9-12: winners I-L vs the neighbouring group's runner-up
    for group, opponent_group in [('I', 'J'), ('J', 'I'), ('K', 'L'), ('L', 'K')]:
        pairs.append((winners[group], runners_up[opponent_group]))
        used_teams.update((winners[group], runners_up[opponent_group]))

    # Remaining matches: leftover runners-up, then leftover third-placed teams
    unused_runners = [runners_up[g] for g in sorted(by_letter)
                      if runners_up[g] not in used_teams]
    unused_third = [t for t in third_place_teams if t not in used_teams]

    remaining = unused_runners + unused_third
    for i in range(0, len(remaining) - 1, 2):
        pairs.append((remaining[i], remaining[i + 1]))

    # Cap at 16 pairs for the Round of 32
    return pairs[:16]


def simulate_tournament(teams, n_tournament_sims=100, format='48_team'):
    """
    Run the full (serial) tournament simulation multiple times.

    Each run seeds the groups by Elo (serpentine), simulates the group stage
    with ``simulate_group_stage``, builds the knockout bracket and plays every
    knockout tie with ``simulate_knockout_match``.

    Parameters:
    - teams: list of team names; the top ``n_teams`` by Elo are used
    - n_tournament_sims: number of tournament simulations to run
    - format: '32_team' (8 groups, used in 2018/2022) or '48_team' (12 groups,
      used in 2026). The name shadows the builtin but is kept for callers.

    Returns:
        (champions, finalists, semifinalists): Counters of team -> number of
        simulations in which the team won the final / won a semi-final /
        won a quarter-final.

    Raises:
        ValueError: if ``format`` is not one of the two supported values.
    """
    sorted_teams = sorted(teams, key=lambda t: elo_ratings.get(t, 1500), reverse=True)

    # Set format parameters
    if format == '32_team':
        # Top 2 from each group = 16 teams -> Round of 16
        n_groups, n_teams, use_third_place = 8, 32, False
    elif format == '48_team':
        # Top 2 (24) + 8 best third = 32 teams -> Round of 32
        n_groups, n_teams, use_third_place = 12, 48, True
    else:
        raise ValueError(f"Unknown format: {format}. Use '32_team' or '48_team'")

    # Validate team count. Nothing is padded: a short list simply leaves
    # some groups under-filled.
    if len(sorted_teams) < n_teams:
        print(f"Warning: Only {len(sorted_teams)} teams provided, expected {n_teams}. "
              "Proceeding with the available teams.")
    sorted_teams = sorted_teams[:n_teams]

    # Serpentine seeding depends only on the Elo order, so compute it once.
    seeded_groups = {f'Group {chr(65+i)}': [] for i in range(n_groups)}
    for i, team in enumerate(sorted_teams):
        pot = i // n_groups
        if pot % 2 == 0:
            group_idx = i % n_groups
        else:
            group_idx = (n_groups - 1) - (i % n_groups)
        seeded_groups[f'Group {chr(65+group_idx)}'].append(team)

    # The opening knockout round is played straight from the bracket pairs;
    # these are the rounds between it and the final.
    # For 48-team: R32 -> R16 -> QF -> SF -> F
    # For 32-team: R16 -> QF -> SF -> F
    if format == '48_team':
        later_rounds = ['Round of 16', 'Quarter Finals', 'Semi Finals']
    else:
        later_rounds = ['Quarter Finals', 'Semi Finals']

    champions = Counter()
    finalists = Counter()
    semifinalists = Counter()

    prefix = 'Group '

    for _ in range(n_tournament_sims):
        # Fresh copies so the group stage cannot alter the seeding template.
        groups = {name: list(members) for name, members in seeded_groups.items()}

        # Simulate group stage
        group_results = simulate_group_stage(groups, model_home, model_away,
                                             player_aggregates, elo_ratings, recent_form)

        # Rank third-placed teams by points, then goal difference (48-team only)
        third_place = []
        if use_third_place:
            third_place = [
                (standings[2][0], standings[2][1]['points'], standings[2][1]['gd'], group_name)
                for group_name, standings in group_results.items()
                if len(standings) > 2
            ]
            third_place.sort(key=lambda x: (x[1], x[2]), reverse=True)

        # The bracket builders index groups by bare letter ('A', 'B', ...),
        # so strip the 'Group ' prefix used for the group names above.
        bracket_groups = {
            (name[len(prefix):] if name.startswith(prefix) else name): standings
            for name, standings in group_results.items()
        }

        if format == '32_team':
            bracket_pairs = create_32_team_bracket(bracket_groups)
        else:  # 48_team
            bracket_pairs = create_48_team_bracket(bracket_groups, third_place)

        # Opening knockout round (Round of 32, or Round of 16 for 32-team)
        current_round = [
            simulate_knockout_match(pair[0], pair[1],
                                    model_home, model_away, player_aggregates,
                                    elo_ratings, recent_form)
            for pair in bracket_pairs
        ]

        # Intermediate rounds: adjacent winners meet; an odd team out is dropped.
        for round_name in later_rounds:
            next_round = []
            for i in range(0, len(current_round) - 1, 2):
                winner = simulate_knockout_match(current_round[i], current_round[i+1],
                                                 model_home, model_away, player_aggregates,
                                                 elo_ratings, recent_form)
                next_round.append(winner)

                # Track semifinalists
                if round_name == 'Quarter Finals':
                    semifinalists[winner] += 1
                # Track finalists
                if round_name == 'Semi Finals':
                    finalists[winner] += 1

            current_round = next_round

        # Final
        if len(current_round) >= 2:
            champion = simulate_knockout_match(current_round[0], current_round[1],
                                               model_home, model_away, player_aggregates,
                                               elo_ratings, recent_form)
            champions[champion] += 1

    return champions, finalists, semifinalists

# Configuration for 2026 simulation
WC26_SIM_FILE = 'wc2026_simulation.json'
N_SIMS_2026 = 10000  # Increased from 1000 for smoother probability estimates
FORCE_RERUN_2026 = False  # Set to True to recompute even if saved results exist

# Reuse saved results when allowed and present; `loaded` stays None whenever
# a fresh simulation is required (forced rerun, no file, or a falsy load).
loaded = None
if not FORCE_RERUN_2026 and simulation_exists(WC26_SIM_FILE):
    loaded = load_simulation_results(WC26_SIM_FILE) or None
    if loaded is not None:
        champions, finalists, semifinalists, wc26_metadata = loaded
        print(f"\nLoaded existing 2026 World Cup simulation")

# Run new simulation if not loaded
if loaded is None:
    print("Running 2026 World Cup simulation...")
    print(f"Format: 48 teams, 12 groups, {N_SIMS_2026} tournaments")
    print("This may take a few minutes (using GPU-accelerated vectorized simulation)...\n")

    # Vectorized simulation path
    champions, finalists, semifinalists = simulate_tournament_vectorized(
        qualified_teams,
        n_tournament_sims=N_SIMS_2026,
        format='48_team',
    )

    # Persist the counters together with a description of this run
    wc26_run_metadata = {
        'tournament': '2026 FIFA World Cup (Projected)',
        'format': '48_team',
        'n_sims': N_SIMS_2026,
        'teams': qualified_teams
    }
    save_simulation_results(
        champions, finalists, semifinalists,
        WC26_SIM_FILE,
        metadata=wc26_run_metadata
    )

In [None]:
# Simulation save/load functions are defined in Section 8

In [None]:
# Display results
# Percentages are relative to the configured simulation count.
n_sims = N_SIMS_2026  # Use the actual simulation count, not hardcoded value!

heavy_rule = "=" * 50
light_rule = "-" * 40

print(heavy_rule)
print("2026 WORLD CUP PREDICTION RESULTS")
print(heavy_rule)

# Champions: top 20, with a text bar of one '*' per 2 percentage points
print("\nChampionship Probability (Top 20):")
print(light_rule)
for rank, (team, wins) in enumerate(champions.most_common(20), start=1):
    pct = wins / n_sims * 100
    stars = '*' * int(pct / 2)
    print(f"{rank:2}. {team:20} {pct:5.1f}% {stars}")

# Finalists and semifinalists share one layout: top 15, no bar
for heading, tally in (
    ("\n\nFinalist Probability (Top 15):", finalists),
    ("\n\nSemifinalist Probability (Top 15):", semifinalists),
):
    print(heading)
    print(light_rule)
    for rank, (team, wins) in enumerate(tally.most_common(15), start=1):
        pct = wins / n_sims * 100
        print(f"{rank:2}. {team:20} {pct:5.1f}%")

## 12. Visualization

In [None]:
# Championship probability visualization
# Two side-by-side horizontal bar charts: simulated title odds and Elo ratings.
fig, axes = plt.subplots(1, 2, figsize=(14, 8))
champ_ax, elo_ax = axes

# Top 15 championship probabilities
top_15 = champions.most_common(15)
teams = [name for name, _ in top_15]
probs = [wins / n_sims * 100 for _, wins in top_15]

# barh receives the lists in reverse order, with a matching reversed colour ramp
teams_reversed = teams[::-1]
probs_reversed = probs[::-1]
colors = plt.cm.Blues(np.linspace(0.3, 0.9, len(teams)))[::-1]
champ_ax.barh(teams_reversed, probs_reversed, color=colors)
champ_ax.set_xlabel('Championship Probability (%)')
champ_ax.set_title('2026 World Cup - Championship Probability')
# Value label just past the end of each bar
for row, pct in enumerate(probs_reversed):
    champ_ax.text(pct + 0.5, row, f'{pct:.1f}%', va='center', fontsize=9)

# Elo ratings for comparison
rated = [(name, elo_ratings.get(name, 1500)) for name in qualified_teams]
elo_top = sorted(rated, key=lambda entry: entry[1], reverse=True)[:15]
elo_teams = [name for name, _ in elo_top]
elo_vals = [rating for _, rating in elo_top]

elo_teams_reversed = elo_teams[::-1]
elo_vals_reversed = elo_vals[::-1]
colors = plt.cm.Reds(np.linspace(0.3, 0.9, len(elo_teams)))[::-1]
elo_ax.barh(elo_teams_reversed, elo_vals_reversed, color=colors)
elo_ax.set_xlabel('Elo Rating')
elo_ax.set_title('Current Elo Ratings (Top 15)')
for row, rating in enumerate(elo_vals_reversed):
    elo_ax.text(rating + 5, row, f'{rating:.0f}', va='center', fontsize=9)

# Save next to the other model artifacts, then show inline
plt.tight_layout()
viz_path = f'{MODEL_PATH}/wc2026_predictions.png'
plt.savefig(viz_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"\nVisualization saved to {viz_path}")

In [None]:
# Sample head-to-head predictions for key matchups
# Each tuple is (home, away) as passed to predict_match.
key_matchups = [
    ('Brazil', 'Argentina'),
    ('France', 'England'),
    ('Germany', 'Spain'),
    ('Netherlands', 'Portugal'),
    ('United States', 'Mexico'),
    ('Brazil', 'France'),
]

print("\nKey Matchup Predictions:")
print("=" * 70)

for home, away in key_matchups:
    result = predict_match(home, away, model_home, model_away,
                           player_aggregates, elo_ratings, recent_form)
    # Fixtures without a usable prediction are silently skipped.
    if not result:
        continue

    expected_home = result['expected_home_goals']
    expected_away = result['expected_away_goals']
    print(f"\n{home} vs {away}")
    print(f"Expected Score: {expected_home:.1f} - {expected_away:.1f}")
    print(f"  {home} wins: {result['home_win_prob']:.1%}")
    print(f"  Draw:        {result['draw_prob']:.1%}")
    print(f"  {away} wins: {result['away_win_prob']:.1%}")

In [None]:
# Final Summary
# Recap of dataset sizes, test-set error and the headline 2026 prediction.
summary_rule = "=" * 60

print("\n" + summary_rule)
print("MODEL SUMMARY")
print(summary_rule)

n_player_countries = len(player_aggregates['country'].unique())
print("\nData:")
print(f"  - Training matches: {len(X_train):,} (2010-2021)")
print(f"  - Test matches: {len(X_test):,} (2022+)")
print(f"  - Features: {len(feature_cols)}")
print(f"  - Teams with Elo: {len(elo_ratings)}")
print(f"  - Countries with player data: {n_player_countries}")

home_rmse = np.sqrt(mean_squared_error(y_home_test, y_home_pred))
away_rmse = np.sqrt(mean_squared_error(y_away_test, y_away_pred))
print("\nModel Performance:")
print(f"  - Home Goals RMSE: {home_rmse:.3f}")
print(f"  - Away Goals RMSE: {away_rmse:.3f}")
print(f"  - Match Outcome Accuracy: {outcome_accuracy:.1%}")

print("\n2026 World Cup Prediction:")
favorite_name, favorite_wins = champions.most_common(1)[0]
print(f"  - Favorite: {favorite_name} ({favorite_wins/n_sims*100:.1f}%)")
top_3 = champions.most_common(3)
top_3_text = ', '.join(f'{name} ({wins/n_sims*100:.1f}%)' for name, wins in top_3)
print(f"  - Top 3: {top_3_text}")

print("\n" + summary_rule)
print(f"Model artifacts saved to: {MODEL_PATH}")
if USE_GOOGLE_DRIVE:
    print("(Saved to Google Drive - persistent storage)")
print(summary_rule)

## 13. Export Web App Data

Export data needed for the World Cup Predictor web application:
- `teams_metadata.json`: List of teams with ISO codes for flag images
- `wc2022_groups.json`: 2022 World Cup group assignments  
- `wc2026_groups.json`: 2026 World Cup projected group assignments

In [None]:
# ISO 2-letter country codes for flag CDN (flagcdn.com/w80/{code}.png)
# Maps a team/country name to the lowercase code used in the flag URL.
# Most values are 2-letter codes; the UK home nations use longer
# 'gb-xxx' codes (e.g. 'gb-eng'), so consumers must not assume length 2.
# Names missing from this table fall back to a guess in get_iso_code().
ISO_CODES = {
    'Afghanistan': 'af', 'Albania': 'al', 'Algeria': 'dz', 'Andorra': 'ad',
    'Angola': 'ao', 'Argentina': 'ar', 'Armenia': 'am', 'Australia': 'au',
    'Austria': 'at', 'Azerbaijan': 'az', 'Bahrain': 'bh', 'Bangladesh': 'bd',
    'Belarus': 'by', 'Belgium': 'be', 'Benin': 'bj', 'Bolivia': 'bo',
    'Bosnia and Herzegovina': 'ba', 'Botswana': 'bw', 'Brazil': 'br',
    'Bulgaria': 'bg', 'Burkina Faso': 'bf', 'Burundi': 'bi', 'Cambodia': 'kh',
    'Cameroon': 'cm', 'Canada': 'ca', 'Cape Verde': 'cv', 'Central African Republic': 'cf',
    'Chad': 'td', 'Chile': 'cl', 'China': 'cn', 'Colombia': 'co',
    'Comoros': 'km', 'Congo': 'cg', 'Costa Rica': 'cr', 'Croatia': 'hr',
    'Cuba': 'cu', 'Curaçao': 'cw', 'Cyprus': 'cy', 'Czech Republic': 'cz', 'DR Congo': 'cd',
    'Denmark': 'dk', 'Djibouti': 'dj', 'Dominican Republic': 'do', 'Ecuador': 'ec',
    'Egypt': 'eg', 'El Salvador': 'sv', 'England': 'gb-eng', 'Equatorial Guinea': 'gq',
    'Eritrea': 'er', 'Estonia': 'ee', 'Eswatini': 'sz', 'Ethiopia': 'et',
    'Fiji': 'fj', 'Finland': 'fi', 'France': 'fr', 'Gabon': 'ga',
    'Gambia': 'gm', 'Georgia': 'ge', 'Germany': 'de', 'Ghana': 'gh',
    'Greece': 'gr', 'Guatemala': 'gt', 'Guinea': 'gn', 'Guinea-Bissau': 'gw',
    'Haiti': 'ht', 'Honduras': 'hn', 'Hungary': 'hu', 'Iceland': 'is',
    'India': 'in', 'Indonesia': 'id', 'Iran': 'ir', 'Iraq': 'iq',
    'Ireland': 'ie', 'Israel': 'il', 'Italy': 'it', 'Ivory Coast': 'ci',
    'Jamaica': 'jm', 'Japan': 'jp', 'Jordan': 'jo', 'Kazakhstan': 'kz',
    'Kenya': 'ke', 'Kosovo': 'xk', 'Kuwait': 'kw', 'Kyrgyzstan': 'kg',
    'Laos': 'la', 'Latvia': 'lv', 'Lebanon': 'lb', 'Lesotho': 'ls',
    'Liberia': 'lr', 'Libya': 'ly', 'Liechtenstein': 'li', 'Lithuania': 'lt',
    'Luxembourg': 'lu', 'Madagascar': 'mg', 'Malawi': 'mw', 'Malaysia': 'my',
    'Maldives': 'mv', 'Mali': 'ml', 'Malta': 'mt', 'Mauritania': 'mr',
    'Mauritius': 'mu', 'Mexico': 'mx', 'Moldova': 'md', 'Mongolia': 'mn',
    'Montenegro': 'me', 'Morocco': 'ma', 'Mozambique': 'mz', 'Myanmar': 'mm',
    'Namibia': 'na', 'Nepal': 'np', 'Netherlands': 'nl', 'New Zealand': 'nz',
    'Nicaragua': 'ni', 'Niger': 'ne', 'Nigeria': 'ng', 'North Korea': 'kp',
    'North Macedonia': 'mk', 'Northern Ireland': 'gb-nir', 'Norway': 'no',
    'Oman': 'om', 'Pakistan': 'pk', 'Palestine': 'ps', 'Panama': 'pa',
    'Papua New Guinea': 'pg', 'Paraguay': 'py', 'Peru': 'pe', 'Philippines': 'ph',
    'Poland': 'pl', 'Portugal': 'pt', 'Qatar': 'qa', 'Romania': 'ro',
    'Russia': 'ru', 'Rwanda': 'rw', 'Saudi Arabia': 'sa', 'Scotland': 'gb-sct',
    'Senegal': 'sn', 'Serbia': 'rs', 'Sierra Leone': 'sl', 'Singapore': 'sg',
    'Slovakia': 'sk', 'Slovenia': 'si', 'Solomon Islands': 'sb', 'Somalia': 'so',
    'South Africa': 'za', 'South Korea': 'kr', 'South Sudan': 'ss', 'Spain': 'es',
    'Sri Lanka': 'lk', 'Sudan': 'sd', 'Suriname': 'sr', 'Sweden': 'se',
    'Switzerland': 'ch', 'Syria': 'sy', 'Tajikistan': 'tj', 'Tanzania': 'tz',
    'Thailand': 'th', 'Togo': 'tg', 'Trinidad and Tobago': 'tt', 'Tunisia': 'tn',
    'Turkey': 'tr', 'Turkmenistan': 'tm', 'Uganda': 'ug', 'Ukraine': 'ua',
    'United Arab Emirates': 'ae', 'United States': 'us', 'Uruguay': 'uy',
    'Uzbekistan': 'uz', 'Venezuela': 've', 'Vietnam': 'vn', 'Wales': 'gb-wls',
    'Yemen': 'ye', 'Zambia': 'zm', 'Zimbabwe': 'zw',
    # Additional variations
    # Alternate spellings of teams already listed above, mapped to the same code.
    'USA': 'us', 'Korea Republic': 'kr', 'Republic of Ireland': 'ie',
    "Cote d'Ivoire": 'ci', 'Czechia': 'cz', 'Türkiye': 'tr',
}

def get_iso_code(country_name):
    """Return the flag code registered for a country name.

    The code is read from ``ISO_CODES``. When the name has no entry there,
    the first two characters of the lower-cased name are returned instead.
    """
    # Best-effort guess for unmapped names; not guaranteed to be a real code.
    guessed_code = country_name.lower()[:2]
    return ISO_CODES.get(country_name, guessed_code)

def get_flag_url(country_name, width=80):
    """Build the flagcdn.com PNG URL for a country's flag.

    ``width`` fills the ``w{width}`` path segment of the URL and the file
    name is the code returned by ``get_iso_code`` for ``country_name``.
    """
    return f"https://flagcdn.com/w{width}/{get_iso_code(country_name)}.png"

# Sanity check: report how many name -> code entries are defined. The count
# includes the alias spellings (e.g. 'USA', 'Türkiye'), so it can exceed the
# number of distinct countries.
print(f"ISO codes defined for {len(ISO_CODES)} countries")

In [None]:
# Export teams_metadata.json for the web app
def export_teams_metadata():
    """Export team metadata (ISO code, Elo rating, flag URL) for the web app.

    Only teams that have both an Elo rating (a key of ``elo_ratings``) and
    FIFA 24 player data (a ``country`` value in the ``player_aggregates`` rows
    where ``fifa_version == 24``) are exported. The list is sorted by Elo
    rating, highest first, and written to ``{MODEL_PATH}/teams_metadata.json``.
    ``MODEL_PATH`` is created if it does not exist yet.

    Returns:
        list[dict]: One dict per team with the keys ``name``, ``iso_code``,
        ``elo_rating`` (rounded to one decimal place) and ``flag_url``.
    """
    # Countries that have player data for FIFA 24
    fifa24_rows = player_aggregates[player_aggregates['fifa_version'] == 24]
    player_countries = set(fifa24_rows['country'].unique())

    # Build metadata for teams present in both data sources
    teams_data = [
        {
            'name': team,
            'iso_code': get_iso_code(team),
            'elo_rating': round(rating, 1),
            'flag_url': get_flag_url(team),
        }
        for team, rating in elo_ratings.items()
        if team in player_countries
    ]

    # get_iso_code() silently guesses a code for names missing from ISO_CODES;
    # surface those teams so a wrong flag does not go unnoticed.
    unmapped = [entry['name'] for entry in teams_data if entry['name'] not in ISO_CODES]
    if unmapped:
        print(f"Warning: no ISO_CODES entry for {len(unmapped)} team(s), "
              f"using guessed codes: {', '.join(unmapped)}")

    # Sort by Elo rating (highest first)
    teams_data.sort(key=lambda entry: entry['elo_rating'], reverse=True)

    # Save to model artifacts; make sure the target directory exists first
    os.makedirs(MODEL_PATH, exist_ok=True)
    output_path = f'{MODEL_PATH}/teams_metadata.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(teams_data, f, indent=2)

    print(f"Exported {len(teams_data)} teams to {output_path}")
    print("Top 10 teams by Elo:")
    for rank, entry in enumerate(teams_data[:10], 1):
        print(f"  {rank}. {entry['name']} ({entry['elo_rating']}) - {entry['iso_code']}")

    return teams_data


teams_metadata = export_teams_metadata()

In [None]:
# Export World Cup groups for web app presets

def export_wc_groups():
    """Export World Cup group configurations as JSON presets for the web app.

    Writes up to two files into ``SIMULATIONS_PATH`` (created if missing):

    * ``wc2022_groups.json`` - built from ``wc22.json``, which is looked up in
      ``DATA_PATH`` first and in the current directory otherwise. When the
      file is not found a warning is printed and this export is skipped.
    * ``wc2026_groups.json`` - built from the hard-coded draw below.

    Every team name is passed through ``normalize_country_name`` before it is
    written. Nothing is returned.
    """
    os.makedirs(SIMULATIONS_PATH, exist_ok=True)

    # --- 2022 World Cup Groups ---
    # Load from existing wc22.json file (DATA_PATH copy preferred)
    data_dir_candidate = f'{DATA_PATH}/wc22.json'
    wc22_path = data_dir_candidate if os.path.exists(data_dir_candidate) else 'wc22.json'

    if os.path.exists(wc22_path):
        # Read explicitly as UTF-8 so non-ASCII team names do not depend on
        # the platform's locale encoding.
        with open(wc22_path, 'r', encoding='utf-8') as f:
            wc22_raw_groups = json.load(f)

        # Normalize team names to match our data; "Group A" becomes just "A"
        wc2022_groups = {
            group_name.replace("Group ", ""): [normalize_country_name(t) for t in teams]
            for group_name, teams in wc22_raw_groups.items()
        }

        wc2022_data = {
            'name': '2022 FIFA World Cup',
            'format': '32_team',
            'groups': wc2022_groups
        }

        output_path = f'{SIMULATIONS_PATH}/wc2022_groups.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(wc2022_data, f, indent=2)
        print(f"Exported 2022 World Cup groups to {output_path}")
    else:
        print(f"Warning: wc22.json not found at {wc22_path}")

    # --- 2026 World Cup Groups (Official Draw - December 2025) ---
    # Using actual draw results. Playoff spots filled with likely qualifiers.
    # UEFA Playoff winners: Wales (D), Ukraine (A), Turkey (B), Greece (C)
    # FIFA Playoff 1: Jamaica, FIFA Playoff 2: Bolivia
    # NOTE(review): the playoff placeholders are guesses, not results - update
    # them once the playoffs have been decided.

    wc2026_raw_groups = {
        'A': ['Mexico', 'South Africa', 'South Korea', 'Wales'],           # Wales = UEFA Playoff D
        'B': ['Canada', 'Ukraine', 'Qatar', 'Switzerland'],                # Ukraine = UEFA Playoff A
        'C': ['Brazil', 'Morocco', 'Haiti', 'Scotland'],
        'D': ['United States', 'Paraguay', 'Australia', 'Greece'],         # Greece = UEFA Playoff C
        'E': ['Germany', 'Curaçao', 'Ivory Coast', 'Ecuador'],             # Curaçao normalized
        'F': ['Netherlands', 'Japan', 'Turkey', 'Tunisia'],                # Turkey = UEFA Playoff B
        'G': ['Belgium', 'Egypt', 'Iran', 'New Zealand'],
        'H': ['Spain', 'Cape Verde', 'Saudi Arabia', 'Uruguay'],           # Cabo Verde = Cape Verde
        'I': ['France', 'Senegal', 'Norway', 'Bolivia'],                   # Bolivia = FIFA Playoff 2
        'J': ['Argentina', 'Algeria', 'Austria', 'Jordan'],
        'K': ['Portugal', 'Jamaica', 'Uzbekistan', 'Colombia'],            # Jamaica = FIFA Playoff 1
        'L': ['England', 'Croatia', 'Ghana', 'Panama'],
    }

    # Normalize team names to match our data
    wc2026_groups = {
        group_name: [normalize_country_name(t) for t in teams]
        for group_name, teams in wc2026_raw_groups.items()
    }

    wc2026_data = {
        'name': '2026 FIFA World Cup',
        'format': '48_team',
        'groups': wc2026_groups,
        'note': 'Official draw Dec 2025. Playoff spots: Wales, Ukraine, Turkey, Greece (UEFA), Jamaica, Bolivia (FIFA)'
    }

    output_path = f'{SIMULATIONS_PATH}/wc2026_groups.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(wc2026_data, f, indent=2)
    print(f"Exported 2026 World Cup groups to {output_path}")

    # Show group assignments
    print("\n2026 World Cup Official Draw Groups:")
    for group_name in sorted(wc2026_groups):
        print(f"  Group {group_name}: {', '.join(wc2026_groups[group_name])}")


export_wc_groups()

In [None]:
# Print a summary of everything that was exported for the web app
banner = "=" * 60

print("\n" + banner)
print("WEB APP EXPORT SUMMARY")
print(banner)

# One (header line, folder) pair per export directory; each folder's entries
# are listed alphabetically together with their size in bytes.
export_sections = (
    (f"\nModel Artifacts ({MODEL_PATH}):", MODEL_PATH),
    (f"\nSimulations ({SIMULATIONS_PATH}):", SIMULATIONS_PATH),
)
for header_line, folder in export_sections:
    print(header_line)
    for entry in sorted(os.listdir(folder)):
        n_bytes = os.path.getsize(f'{folder}/{entry}')
        print(f"  - {entry} ({n_bytes:,} bytes)")

# Closing instructions: where each folder belongs in the web app backend
print("\n" + banner)
print("Copy these folders to the web app backend directory:")
print(f"  {MODEL_PATH} -> backend/model_artifacts/")
print(f"  {SIMULATIONS_PATH} -> backend/simulations/")
print(banner)