# World Cup Bracket Prediction Model

## Overview
This notebook implements a dual XGBoost regression model to predict expected goals for international football matches. We then use Poisson-based Monte Carlo simulation to generate full bracket predictions with probability estimates.

**Key Approach:**
- Predict goals, not outcomes (a 5-0 and 1-0 are both "wins" but carry different information)
- Use a Poisson distribution to model goal scoring (the standard model for counts of rare, independent events)
- Monte Carlo simulation for robust probability estimates

**Data Sources:**
- International Football Results (2010+)
- FIFA World Rankings
- EA Sports FC Player Stats (FIFA 15-24)

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.stats import poisson
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
import joblib
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Show every column and never wrap wide frames when displaying DataFrames.
for _option in ('display.max_columns', 'display.width'):
    pd.set_option(_option, None)

# Seed NumPy's global generator so the random draws below are repeatable.
np.random.seed(42)

# --- Storage configuration (Google Colab / Google Drive vs. local) ---
# Flip to True when running in Google Colab with the data on Drive.
USE_GOOGLE_DRIVE = False

# Google Drive layout (adjust to your own folder structure).
GDRIVE_BASE_PATH = '/content/drive/MyDrive/world-cup-prediction'
GDRIVE_DATA_PATH = f'{GDRIVE_BASE_PATH}/data'
GDRIVE_MODEL_PATH = f'{GDRIVE_BASE_PATH}/model_artifacts'

# Local layout, relative to the working directory.
LOCAL_DATA_PATH = 'data'
LOCAL_MODEL_PATH = 'model_artifacts'

# Resolve the active locations from the flag above.
DATA_PATH = GDRIVE_DATA_PATH if USE_GOOGLE_DRIVE else LOCAL_DATA_PATH
MODEL_PATH = GDRIVE_MODEL_PATH if USE_GOOGLE_DRIVE else LOCAL_MODEL_PATH

print("Libraries loaded successfully")
print(f"Data path: {DATA_PATH}")
print(f"Model path: {MODEL_PATH}")

In [None]:
# Mount Google Drive when USE_GOOGLE_DRIVE is set; otherwise keep local paths.
if not USE_GOOGLE_DRIVE:
    print("Using local paths (USE_GOOGLE_DRIVE is False)")
else:
    try:
        from google.colab import drive
        drive.mount('/content/drive')

        # Make sure both target folders exist on the mounted Drive.
        for _folder in (GDRIVE_DATA_PATH, GDRIVE_MODEL_PATH):
            os.makedirs(_folder, exist_ok=True)

        print("Google Drive mounted successfully")
        print(f"Data folder: {GDRIVE_DATA_PATH}")
        print(f"Model folder: {GDRIVE_MODEL_PATH}")

        # Report what is already uploaded, or list what is still needed.
        if os.path.exists(GDRIVE_DATA_PATH):
            files = os.listdir(GDRIVE_DATA_PATH)
            if not files:
                print("WARNING: Data folder is empty. Please upload the following files:")
                print("  - all_matches.csv")
                print("  - fifa_ranking_2024.csv")
                print("  - players.csv")
                print("  - countries_names.csv")
            else:
                print(f"Files in data folder: {files}")
    except ImportError:
        # The google.colab import failed: revert to the local configuration.
        print("Not running in Google Colab. Using local paths.")
        USE_GOOGLE_DRIVE = False
        DATA_PATH = LOCAL_DATA_PATH
        MODEL_PATH = LOCAL_MODEL_PATH

## 2. Data Loading

In [None]:
# Load the four input datasets from the configured data directory
# (DATA_PATH is either the local folder or the Google Drive folder).
matches_df = pd.read_csv(f'{DATA_PATH}/all_matches.csv')
rankings_df = pd.read_csv(f'{DATA_PATH}/fifa_ranking_2024.csv')
# low_memory=False: parse the file in one pass rather than in chunks, which
# avoids mixed-dtype inference on wide player tables.
players_df = pd.read_csv(f'{DATA_PATH}/players.csv', low_memory=False)
country_names_df = pd.read_csv(f'{DATA_PATH}/countries_names.csv')

# Row counts as a quick sanity check that every file was read.
print(f"Loaded data from: {DATA_PATH}")
print(f"Matches: {len(matches_df):,} rows")
print(f"Rankings: {len(rankings_df):,} rows")
print(f"Players: {len(players_df):,} rows")
print(f"Country names: {len(country_names_df):,} rows")

In [None]:
# Preview match data: list the available columns, then show the first rows
# (the bare expression is rendered by the notebook as the cell output).
print("Matches columns:", matches_df.columns.tolist())
matches_df.head()

In [None]:
# Parse the date columns into proper timestamps.
rankings_df['rank_date'] = pd.to_datetime(rankings_df['rank_date'])
matches_df['date'] = pd.to_datetime(matches_df['date'])

# Keep the modern football era only (2010 onwards), in chronological order.
_modern_era = matches_df['date'] >= '2010-01-01'
matches_df = (
    matches_df.loc[_modern_era]
    .copy()
    .sort_values('date')
    .reset_index(drop=True)
)

# Calendar year of each match, for easier filtering later on.
matches_df['year'] = matches_df['date'].dt.year

print(f"Matches after 2010 filter: {len(matches_df):,}")
print(f"Date range: {matches_df['date'].min()} to {matches_df['date'].max()}")
print("\nTournament breakdown:")
print(matches_df['tournament'].value_counts().head(15))

## 3. Country Name Normalization

Country names differ across datasets. We need to normalize them to ensure proper merging.

In [None]:
# Hand-maintained aliases for spellings that differ between the datasets.
ADDITIONAL_MAPPINGS = {
    # FIFA Rankings variations
    'USA': 'United States',
    'Korea Republic': 'South Korea',
    'Korea DPR': 'North Korea',
    'IR Iran': 'Iran',
    'China PR': 'China',
    "Cote d'Ivoire": "Ivory Coast",
    "Côte d'Ivoire": "Ivory Coast",
    'Czechia': 'Czech Republic',
    'Congo DR': 'DR Congo',
    'Viet Nam': 'Vietnam',
    'Russian Federation': 'Russia',
    'Türkiye': 'Turkey',

    # EA Sports player nationality variations
    'United States of America': 'United States',
    'Korea': 'South Korea',
    'Republic of Korea': 'South Korea',
    'DPR Korea': 'North Korea',
    "People's Republic of China": 'China',
    'Democratic Republic of Congo': 'DR Congo',

    # UK nations (identity entries: they pin these names even if the CSV
    # mapping would rewrite them)
    'England': 'England',
    'Scotland': 'Scotland',
    'Wales': 'Wales',
    'Northern Ireland': 'Northern Ireland',

    # Other common variations
    'Republic of Ireland': 'Ireland',
    'Eswatini': 'Swaziland',
    'Timor-Leste': 'East Timor',
    'Trinidad & Tobago': 'Trinidad and Tobago',
}

# The base mapping comes from countries_names.csv (original_name -> current_name);
# the manual aliases are layered on top and win whenever both define a name.
name_mapping = {
    **dict(zip(country_names_df['original_name'], country_names_df['current_name'])),
    **ADDITIONAL_MAPPINGS,
}

def normalize_country_name(name):
    """Map a raw country name onto its canonical spelling.

    Missing values are returned untouched. Anything else is converted to a
    string, stripped of surrounding whitespace and looked up in
    ``name_mapping``; names without an entry come back stripped but
    otherwise unchanged.
    """
    if not pd.isna(name):
        cleaned = str(name).strip()
        return name_mapping.get(cleaned, cleaned)
    return name

# Size of the combined mapping (CSV entries plus manual aliases).
print(f"Total name mappings: {len(name_mapping)}")

In [None]:
# Bring every dataset onto the canonical country names.
for _frame, _columns in (
    (matches_df, ('home_team', 'away_team')),
    (rankings_df, ('country_full',)),
    (players_df, ('nationality_name',)),
):
    for _column in _columns:
        _frame[_column] = _frame[_column].apply(normalize_country_name)

# Every team that appears on either side of a fixture.
all_teams = set(matches_df['home_team'].unique()).union(matches_df['away_team'].unique())
print(f"Unique teams in matches: {len(all_teams)}")

# How many of those teams also appear as a player nationality?
player_countries = set(players_df['nationality_name'].unique())
matched_teams = all_teams.intersection(player_countries)
print(f"Teams with player data: {len(matched_teams)}")

## 4. Elo Rating Calculation

We calculate Elo ratings for all teams based on historical match results. Elo is a powerful predictor of team strength.

In [None]:
def calculate_elo_ratings(matches_df, k=32, initial_elo=1500):
    """
    Calculate Elo ratings for all teams from match history.

    Matches are processed in the row order of ``matches_df``, so the frame
    must already be sorted chronologically for the ratings to be meaningful.

    Args:
        matches_df: DataFrame with ``home_team``, ``away_team``,
            ``home_score`` and ``away_score`` columns.
        k: Elo K-factor, i.e. the maximum rating change per match.
        initial_elo: Rating assigned to a team the first time it appears.

    Returns:
        A tuple ``(ratings, home_elos, away_elos)``: ``ratings`` maps each
        team to its rating after the last match, and the two lists hold the
        pre-match rating of the home and away side for every row, in row
        order, so they can be attached to the frame as columns.
    """
    elo = defaultdict(lambda: initial_elo)

    # Pre-match Elo of each side, one entry per row
    home_elos = []
    away_elos = []

    fixtures = zip(matches_df['home_team'], matches_df['away_team'],
                   matches_df['home_score'], matches_df['away_score'])
    for home, away, home_score, away_score in fixtures:
        home_elo, away_elo = elo[home], elo[away]

        # Always record the pre-match ratings so the lists stay row-aligned
        home_elos.append(home_elo)
        away_elos.append(away_elo)

        # A row without a result (e.g. an unplayed fixture) carries no
        # information. Without this guard both comparisons below are False
        # for NaN and the match would be rated as a draw.
        if pd.isna(home_score) or pd.isna(away_score):
            continue

        # Expected scores from the standard logistic Elo curve
        exp_home = 1 / (1 + 10**((away_elo - home_elo) / 400))
        exp_away = 1 - exp_home

        # Actual scores (1=win, 0.5=draw, 0=loss)
        if home_score > away_score:
            actual_home, actual_away = 1, 0
        elif home_score < away_score:
            actual_home, actual_away = 0, 1
        else:
            actual_home, actual_away = 0.5, 0.5

        # Move each rating towards the observed result
        elo[home] += k * (actual_home - exp_home)
        elo[away] += k * (actual_away - exp_away)

    return dict(elo), home_elos, away_elos

# Run the Elo pass over the full match history.
elo_ratings, home_elos, away_elos = calculate_elo_ratings(matches_df)

# Attach each side's pre-match rating, plus the home-minus-away gap.
matches_df['home_elo'], matches_df['away_elo'] = home_elos, away_elos
matches_df['elo_diff'] = matches_df['home_elo'].sub(matches_df['away_elo'])

print(f"Calculated Elo for {len(elo_ratings)} teams")
print("\nTop 20 teams by Elo:")
top_teams = sorted(elo_ratings.items(), key=lambda entry: entry[1], reverse=True)[:20]
for position, (team, rating) in enumerate(top_teams, start=1):
    print(f"{position:2}. {team:25} {rating:.1f}")

## 5. Player Aggregation

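Aggregate player stats by country and FIFA version. We average the top 14 players per country by overall rating (typical squad selection) and align the FIFA version with the match year to limit data leakage. Matches before 2014 fall back to FIFA 15, the earliest version available, so those rows still use ratings published after the match.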

In [None]:
# Check FIFA versions available: number of player rows per fifa_version,
# ordered by version number.
print("FIFA versions in dataset:")
print(players_df['fifa_version'].value_counts().sort_index())

In [None]:
def get_fifa_version_for_year(match_year):
    """
    Map a match year onto the FIFA game version used for player ratings.

    The version is ``match_year - 2000 + 1`` (e.g. 2022 -> FIFA 23), clamped
    to the versions present in the dataset: years that would map below
    FIFA 15 return 15, years that would map above FIFA 24 return 24.
    """
    oldest_version, newest_version = 15, 24
    version = match_year - 2000 + 1
    return min(max(version, oldest_version), newest_version)

# Position codes that count towards the attack / defence averages.
_ATTACK_POSITIONS = frozenset({'ST', 'CF', 'LW', 'RW', 'LF', 'RF', 'CAM'})
_DEFENSE_POSITIONS = frozenset({'CB', 'LB', 'RB', 'LWB', 'RWB', 'CDM', 'GK'})


def _plays_any(positions, wanted):
    """Return a boolean mask: does each player list at least one code in `wanted`?"""
    # Positions are compared as whole comma-separated codes. A substring or
    # regex search would let 'LW' match 'LWB' and 'RW' match 'RWB', counting
    # wing-backs as attackers.
    tokens = positions.fillna('').astype(str).str.upper().str.split(',')
    return tokens.apply(
        lambda codes: any(code.strip() in wanted for code in codes)
    ).astype(bool)


def aggregate_players_by_country_year(players_df, top_n=14):
    """
    Aggregate player stats by country and FIFA version.

    For every (nationality_name, fifa_version) pair the ``top_n`` players by
    ``overall`` rating are selected and summarised.

    Args:
        players_df: Player table with ``nationality_name``, ``fifa_version``,
            ``overall``, ``pace``, ``shooting``, ``passing``, ``dribbling``,
            ``defending``, ``physic`` and ``player_positions`` columns.
            ``player_positions`` is treated as a comma-separated list of
            position codes (case-insensitive).
        top_n: Number of highest-rated players to keep per group.

    Returns:
        DataFrame with one row per country/version: ``country``,
        ``fifa_version``, ``num_players``, ``avg_overall``, ``max_overall``,
        the mean of each of the six attribute columns (``avg_<stat>``), and
        ``avg_attack_overall`` / ``avg_defense_overall``. The last two are the
        mean overall of the selected players listing an attacking / defensive
        position, and fall back to ``avg_overall`` when no selected player
        lists such a position.
    """
    stat_cols = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']

    # Keep only the columns we need and rows with the grouping keys present
    player_data = players_df[['nationality_name', 'fifa_version', 'overall',
                               *stat_cols, 'player_positions']].copy()
    player_data = player_data.dropna(subset=['nationality_name', 'fifa_version', 'overall'])
    player_data['fifa_version'] = player_data['fifa_version'].astype(int)

    aggregations = []

    for (country, version), group in player_data.groupby(['nationality_name', 'fifa_version']):
        # Get top N players by overall rating
        top_players = group.nlargest(top_n, 'overall')

        if len(top_players) == 0:
            continue

        # Basic aggregations
        agg_dict = {
            'country': country,
            'fifa_version': version,
            'num_players': len(top_players),
            'avg_overall': top_players['overall'].mean(),
            'max_overall': top_players['overall'].max(),
        }
        for col in stat_cols:
            agg_dict[f'avg_{col}'] = top_players[col].mean()

        # Attack/defence averages based on listed positions. A player who
        # lists both kinds of position counts towards both averages.
        positions = top_players['player_positions']
        attackers = top_players[_plays_any(positions, _ATTACK_POSITIONS)]
        defenders = top_players[_plays_any(positions, _DEFENSE_POSITIONS)]

        agg_dict['avg_attack_overall'] = (
            attackers['overall'].mean() if len(attackers) > 0 else agg_dict['avg_overall'])
        agg_dict['avg_defense_overall'] = (
            defenders['overall'].mean() if len(defenders) > 0 else agg_dict['avg_overall'])

        aggregations.append(agg_dict)

    return pd.DataFrame(aggregations)

# Aggregate player data: one row per (country, FIFA version), summarising
# the top-rated players (default top_n=14).
player_aggregates = aggregate_players_by_country_year(players_df)
print(f"Player aggregates: {len(player_aggregates)} country-version combinations")
# Bare expression: rendered by the notebook as the cell output.
player_aggregates.head(10)

In [None]:
# Spot-check that the strongest sides all have player aggregates.
wc_teams = ['Brazil', 'Argentina', 'France', 'Germany', 'England', 'Spain', 'Netherlands', 'Portugal']

print("Player data coverage for key teams:")
for team in wc_teams:
    team_data = player_aggregates.loc[player_aggregates['country'] == team]
    if len(team_data) == 0:
        print(f"{team}: NO DATA")
        continue
    print(f"{team}: {len(team_data)} versions, avg overall = {team_data['avg_overall'].mean():.1f}")

## 6. Feature Engineering

Create the full feature set by merging Elo ratings, player stats, and calculating recent form features.

In [None]:
def calculate_form_features(matches_df, team, match_date, n_matches=5):
    """
    Summarise a team's recent form strictly before ``match_date``.

    The team's home and away fixtures dated before ``match_date`` are put on
    a common goals_for / goals_against footing and the ``n_matches`` most
    recent ones are averaged.

    Returns:
        ``(avg_scored, avg_conceded, win_rate)`` over those matches, or the
        neutral defaults ``(1.5, 1.5, 0.33)`` when the team has no earlier
        match.
    """
    earlier = matches_df['date'] < match_date

    # Home fixtures: the team's goals are in home_score.
    as_home = matches_df.loc[
        earlier & (matches_df['home_team'] == team),
        ['date', 'home_score', 'away_score'],
    ].rename(columns={'home_score': 'goals_for', 'away_score': 'goals_against'})

    # Away fixtures: the team's goals are in away_score.
    as_away = matches_df.loc[
        earlier & (matches_df['away_team'] == team),
        ['date', 'away_score', 'home_score'],
    ].rename(columns={'away_score': 'goals_for', 'home_score': 'goals_against'})

    # Most recent first, then keep the last N.
    recent = (
        pd.concat([as_home, as_away])
        .sort_values('date', ascending=False)
        .head(n_matches)
    )

    if len(recent) == 0:
        return 1.5, 1.5, 0.33  # Default values

    goals_for, goals_against = recent['goals_for'], recent['goals_against']
    wins = (goals_for > goals_against).sum()

    return goals_for.mean(), goals_against.mean(), wins / len(recent)

# Test form calculation: smoke-test on one team, using only matches dated
# before 2022-11-20.
test_date = pd.Timestamp('2022-11-20')
test_team = 'Brazil'
scored, conceded, win_rate = calculate_form_features(matches_df, test_team, test_date)
print(f"{test_team} form before {test_date.date()}:")
print(f"  Avg goals scored: {scored:.2f}")
print(f"  Avg goals conceded: {conceded:.2f}")
print(f"  Win rate: {win_rate:.1%}")

In [None]:
# Tournament-name fragments that mark a continental championship. Both the
# plain and the accented / long-form spellings are listed so the flag does
# not depend on how the source dataset happens to spell the competition.
_CONTINENTAL_TOURNAMENTS = (
    'UEFA Euro', 'Copa America', 'Copa América',
    'Africa Cup', 'African Cup of Nations', 'AFC Asian Cup',
)


def _tournament_flags(tournament):
    """Return (is_world_cup, is_continental) 0/1 flags for a tournament name."""
    name = str(tournament)
    # Qualifiers share the competition's name as a prefix (e.g. a name such as
    # "FIFA World Cup qualification") but are not tournament matches. At
    # inference time is_world_cup=1 means an actual World Cup fixture, so
    # qualifiers must not carry the flag during training.
    # NOTE(review): assumes qualifier rows contain "qualif" in the tournament
    # name - confirm against all_matches.csv.
    if 'qualif' in name.lower():
        return 0, 0
    is_world_cup = int('FIFA World Cup' in name)
    is_continental = int(any(tag in name for tag in _CONTINENTAL_TOURNAMENTS))
    return is_world_cup, is_continental


def build_feature_dataset(matches_df, player_aggregates, elo_ratings_history=None):
    """
    Build the full feature dataset for training.

    Args:
        matches_df: Match table with ``home_team``, ``away_team``, ``date``,
            ``year``, ``home_elo``, ``away_elo``, ``elo_diff``, ``neutral``,
            ``tournament``, ``home_score`` and ``away_score`` columns.
        player_aggregates: Output of ``aggregate_players_by_country_year``.
        elo_ratings_history: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with one row per match for which BOTH teams have player
        aggregates in the FIFA version mapped from the match year. Columns
        are the model features, the two targets (``home_goals``,
        ``away_goals``) and metadata columns prefixed with ``_``. Matches
        lacking player data for either side are dropped.
    """
    # Index the aggregates once instead of filtering the frame for every
    # match. setdefault keeps the first row per key, matching a filter
    # followed by .iloc[0].
    squad_lookup = {}
    for _, squad in player_aggregates.iterrows():
        squad_lookup.setdefault((squad['country'], squad['fifa_version']), squad)

    features_list = []

    for _, match in matches_df.iterrows():
        home_team = match['home_team']
        away_team = match['away_team']
        match_date = match['date']

        # Player ratings come from the FIFA version mapped to the match year
        fifa_version = get_fifa_version_for_year(match['year'])

        home_players = squad_lookup.get((home_team, fifa_version))
        away_players = squad_lookup.get((away_team, fifa_version))

        # Skip if no player data for either team
        if home_players is None or away_players is None:
            continue

        # Recent form uses only matches strictly before this one
        home_scored, home_conceded, home_win_rate = calculate_form_features(
            matches_df, home_team, match_date)
        away_scored, away_conceded, away_win_rate = calculate_form_features(
            matches_df, away_team, match_date)

        is_world_cup, is_continental = _tournament_flags(match['tournament'])

        # Key order matters: it defines the feature column order used for
        # training and saved alongside the model.
        features = {
            # Elo features
            'home_elo': match['home_elo'],
            'away_elo': match['away_elo'],
            'elo_diff': match['elo_diff'],

            # Player aggregate features - Home
            'home_avg_overall': home_players['avg_overall'],
            'home_max_overall': home_players['max_overall'],
            'home_avg_attack': home_players['avg_attack_overall'],
            'home_avg_defense': home_players['avg_defense_overall'],
            'home_avg_pace': home_players['avg_pace'],
            'home_avg_shooting': home_players['avg_shooting'],
            'home_avg_passing': home_players['avg_passing'],

            # Player aggregate features - Away
            'away_avg_overall': away_players['avg_overall'],
            'away_max_overall': away_players['max_overall'],
            'away_avg_attack': away_players['avg_attack_overall'],
            'away_avg_defense': away_players['avg_defense_overall'],
            'away_avg_pace': away_players['avg_pace'],
            'away_avg_shooting': away_players['avg_shooting'],
            'away_avg_passing': away_players['avg_passing'],

            # Diff features
            'overall_diff': home_players['avg_overall'] - away_players['avg_overall'],
            'attack_diff': home_players['avg_attack_overall'] - away_players['avg_attack_overall'],
            'defense_diff': home_players['avg_defense_overall'] - away_players['avg_defense_overall'],

            # Form features
            'home_form_scored': home_scored,
            'home_form_conceded': home_conceded,
            'home_form_win_rate': home_win_rate,
            'away_form_scored': away_scored,
            'away_form_conceded': away_conceded,
            'away_form_win_rate': away_win_rate,

            # Match context
            'is_neutral': 1 if match['neutral'] else 0,
            'is_world_cup': is_world_cup,
            'is_continental': is_continental,

            # Targets
            'home_goals': match['home_score'],
            'away_goals': match['away_score'],

            # Metadata (for analysis, not training)
            '_home_team': home_team,
            '_away_team': away_team,
            '_date': match_date,
            '_tournament': match['tournament'],
        }

        features_list.append(features)

    return pd.DataFrame(features_list)

# Build feature dataset (this takes a few minutes: recent form is recomputed
# from the match history for both teams of every match).
print("Building feature dataset... (this may take a few minutes)")
feature_df = build_feature_dataset(matches_df, player_aggregates)
# Only matches where both teams have player aggregates are kept.
print(f"\nFeature dataset: {len(feature_df)} matches with complete features")

In [None]:
# Quick overview of the assembled feature table.
print("Feature dataset summary:")
print(f"Shape: {feature_df.shape}")
print(f"\nDate range: {feature_df['_date'].min()} to {feature_df['_date'].max()}")
print(f"\nWorld Cup matches: {feature_df['is_world_cup'].sum()}")

# Model inputs are every column that is neither metadata ('_' prefix) nor a target.
_target_cols = ['home_goals', 'away_goals']
feature_cols = [
    c for c in feature_df.columns
    if not (c.startswith('_') or c in _target_cols)
]

# Per-column NaN counts, listing only the columns that have any.
print("\nMissing values:")
missing = feature_df[feature_cols].isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "None")

## 7. Model Training

Train dual XGBoost regressors: one for home team goals, one for away team goals.

In [None]:
# Prepare features and targets: inputs are every column that is neither
# metadata ('_' prefix) nor one of the two goal targets.
feature_cols = [c for c in feature_df.columns if not c.startswith('_') and c not in ['home_goals', 'away_goals']]

X = feature_df[feature_cols].copy()
y_home = feature_df['home_goals'].copy()
y_away = feature_df['away_goals'].copy()

print(f"Features: {len(feature_cols)}")
print(feature_cols)

# Split chronologically: use 2022+ as test set (including 2022 World Cup)
train_mask = feature_df['_date'] < '2022-01-01'
test_mask = feature_df['_date'] >= '2022-01-01'

# Fill any remaining NaN values with column means computed on the TRAINING
# rows only. Using the mean of the whole table would leak test-period
# information into the features the model is trained on.
train_means = X[train_mask].mean()
X = X.fillna(train_means)

X_train, X_test = X[train_mask], X[test_mask]
y_home_train, y_home_test = y_home[train_mask], y_home[test_mask]
y_away_train, y_away_test = y_away[train_mask], y_away[test_mask]

print(f"\nTrain set: {len(X_train)} matches")
print(f"Test set: {len(X_test)} matches")

In [None]:
# Both regressors share one hyper-parameter set; only the target differs.
_XGB_PARAMS = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}

# Home goals regressor. The test split is passed as eval_set only.
model_home = XGBRegressor(**_XGB_PARAMS)
model_home.fit(
    X_train,
    y_home_train,
    eval_set=[(X_test, y_home_test)],
    verbose=False,
)

print("Home Goals Model trained")

# Away goals regressor, same setup with the away target.
model_away = XGBRegressor(**_XGB_PARAMS)
model_away.fit(
    X_train,
    y_away_train,
    eval_set=[(X_test, y_away_test)],
    verbose=False,
)

print("Away Goals Model trained")

In [None]:
# Score both regressors on the held-out (2022+) matches.
y_home_pred = model_home.predict(X_test)
y_away_pred = model_away.predict(X_test)

_home_rmse = np.sqrt(mean_squared_error(y_home_test, y_home_pred))
_home_mae = mean_absolute_error(y_home_test, y_home_pred)
_away_rmse = np.sqrt(mean_squared_error(y_away_test, y_away_pred))
_away_mae = mean_absolute_error(y_away_test, y_away_pred)

print("Model Performance:")
print("\nHome Goals Model:")
print(f"  RMSE: {_home_rmse:.3f}")
print(f"  MAE:  {_home_mae:.3f}")

print("\nAway Goals Model:")
print(f"  RMSE: {_away_rmse:.3f}")
print(f"  MAE:  {_away_mae:.3f}")

# Outcome = sign of the goal difference (+1 home win, 0 draw, -1 away win).
# A draw is only "predicted" when both regressors output exactly the same value.
actual_outcomes = np.sign(y_home_test.values - y_away_test.values)
pred_outcomes = np.sign(y_home_pred - y_away_pred)
outcome_accuracy = np.mean(actual_outcomes == pred_outcomes)

print(f"\nMatch Outcome Accuracy: {outcome_accuracy:.1%}")

In [None]:
def _top_importances(model, n=15):
    """Return the n most important features of `model`, least important first."""
    table = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_,
    })
    return table.sort_values('importance', ascending=True).tail(n)


# Top-15 importance tables, one per regressor.
importance_home = _top_importances(model_home)
importance_away = _top_importances(model_away)

# Side-by-side horizontal bar charts: home model left, away model right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
_panels = (
    (axes[0], importance_home, 'Home Goals Model - Feature Importance'),
    (axes[1], importance_away, 'Away Goals Model - Feature Importance'),
)
for _ax, _table, _title in _panels:
    _ax.barh(_table['feature'], _table['importance'])
    _ax.set_title(_title)
    _ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()

## 8. Save Model Artifacts

Save all trained components for later inference without retraining.

In [None]:
def save_model_artifacts(output_dir=None):
    """
    Write every artifact needed for inference into ``output_dir``.

    Falls back to the configured ``MODEL_PATH`` when no directory is given
    and creates the directory if necessary. Saved, in this order: the two
    trained models (joblib), the current Elo ratings, the FIFA-24 player
    aggregates (CSV), the country-name mapping, the feature column order and
    each rated team's recent form as of the day after the last match.
    """
    output_dir = MODEL_PATH if output_dir is None else output_dir
    os.makedirs(output_dir, exist_ok=True)

    def write_json(path, payload, **dump_kwargs):
        # Small wrapper so every JSON artifact is written the same way.
        with open(path, 'w') as handle:
            json.dump(payload, handle, **dump_kwargs)

    # 1. Trained XGBoost models
    joblib.dump(model_home, f'{output_dir}/model_home_goals.joblib')
    joblib.dump(model_away, f'{output_dir}/model_away_goals.joblib')

    # 2. Current Elo ratings
    write_json(f'{output_dir}/elo_ratings.json', elo_ratings, indent=2)

    # 3. Player aggregates (FIFA 24 rows only, the latest version)
    is_latest = player_aggregates['fifa_version'] == 24
    latest_players = player_aggregates[is_latest].copy()
    latest_players.to_csv(f'{output_dir}/player_aggregates.csv', index=False)

    # 4. Country name mapping
    write_json(f'{output_dir}/country_name_map.json', name_mapping, indent=2)

    # 5. Feature column order
    write_json(f'{output_dir}/feature_columns.json', feature_cols)

    # 6. Recent form per rated team, measured just after the final match so
    #    that every match in the history is eligible.
    cutoff = matches_df['date'].max() + pd.Timedelta(days=1)
    recent_form = {}
    for team in elo_ratings:
        scored, conceded, win_rate = calculate_form_features(matches_df, team, cutoff)
        recent_form[team] = {
            'avg_scored': scored,
            'avg_conceded': conceded,
            'win_rate': win_rate,
        }
    write_json(f'{output_dir}/recent_form.json', recent_form, indent=2)

    print(f"All artifacts saved to {output_dir}")
    if USE_GOOGLE_DRIVE:
        print("Models saved to Google Drive - they will persist after runtime disconnects")
    print(f"Files: {os.listdir(output_dir)}")


save_model_artifacts()

## 9. Poisson Match Simulator

Use the predicted goals as the rate parameter (lambda) of a Poisson distribution for each team. Monte Carlo sampling from those distributions then gives win/draw/loss probability estimates.

In [None]:
def simulate_match(home_goals_pred, away_goals_pred, n_sims=10000):
    """
    Monte Carlo simulation of one match from two predicted goal counts.

    Each prediction is used as the rate of an independent Poisson
    distribution (floored at 0.1 so the rate is always positive), and
    ``n_sims`` scorelines are drawn.

    Returns:
        Dict with the simulated ``home_win_prob`` / ``draw_prob`` /
        ``away_win_prob``, the raw (unfloored) predictions as
        ``expected_home_goals`` / ``expected_away_goals``, and the sampled
        goal arrays as ``simulated_home_goals`` / ``simulated_away_goals``.
    """
    # Floor the rates: a regressor can output zero or negative goals.
    rate_home, rate_away = (max(0.1, pred) for pred in (home_goals_pred, away_goals_pred))

    # Home goals are drawn first, then away goals.
    home_goals = poisson.rvs(mu=rate_home, size=n_sims)
    away_goals = poisson.rvs(mu=rate_away, size=n_sims)

    # Share of simulations ending in each outcome.
    outcome_probs = {
        'home_win_prob': (home_goals > away_goals).mean(),
        'draw_prob': (home_goals == away_goals).mean(),
        'away_win_prob': (home_goals < away_goals).mean(),
    }

    return {
        **outcome_probs,
        'expected_home_goals': home_goals_pred,
        'expected_away_goals': away_goals_pred,
        'simulated_home_goals': home_goals,
        'simulated_away_goals': away_goals,
    }

def predict_match(home_team, away_team, model_home, model_away, 
                  player_aggregates, elo_ratings, recent_form,
                  is_neutral=True, is_world_cup=True, n_sims=10000):
    """
    Predict one fixture and simulate its outcome distribution.

    Builds a single-row feature frame from the FIFA-24 player aggregates,
    the supplied Elo ratings (1500 when a team is unrated) and recent form
    (neutral defaults when a team has none), predicts both goal counts and
    passes them to ``simulate_match``.

    Returns:
        The ``simulate_match`` result extended with ``home_team`` and
        ``away_team``, or ``None`` (after printing a warning) when either
        team has no FIFA-24 player aggregates.
    """
    # Inference always uses the latest FIFA version.
    latest_version = 24
    latest_squads = player_aggregates[player_aggregates['fifa_version'] == latest_version]
    home_rows = latest_squads[latest_squads['country'] == home_team]
    away_rows = latest_squads[latest_squads['country'] == away_team]

    if min(len(home_rows), len(away_rows)) == 0:
        print(f"Warning: Missing player data for {home_team} or {away_team}")
        return None

    home_squad, away_squad = home_rows.iloc[0], away_rows.iloc[0]

    # Team strength, with a neutral rating for unrated teams.
    home_elo = elo_ratings.get(home_team, 1500)
    away_elo = elo_ratings.get(away_team, 1500)

    # Recent form, with neutral defaults for teams without history.
    default_form = {'avg_scored': 1.5, 'avg_conceded': 1.5, 'win_rate': 0.33}
    home_form = recent_form.get(home_team, default_form)
    away_form = recent_form.get(away_team, default_form)

    row = {
        'home_elo': home_elo,
        'away_elo': away_elo,
        'elo_diff': home_elo - away_elo,
        'home_avg_overall': home_squad['avg_overall'],
        'home_max_overall': home_squad['max_overall'],
        'home_avg_attack': home_squad['avg_attack_overall'],
        'home_avg_defense': home_squad['avg_defense_overall'],
        'home_avg_pace': home_squad['avg_pace'],
        'home_avg_shooting': home_squad['avg_shooting'],
        'home_avg_passing': home_squad['avg_passing'],
        'away_avg_overall': away_squad['avg_overall'],
        'away_max_overall': away_squad['max_overall'],
        'away_avg_attack': away_squad['avg_attack_overall'],
        'away_avg_defense': away_squad['avg_defense_overall'],
        'away_avg_pace': away_squad['avg_pace'],
        'away_avg_shooting': away_squad['avg_shooting'],
        'away_avg_passing': away_squad['avg_passing'],
        'overall_diff': home_squad['avg_overall'] - away_squad['avg_overall'],
        'attack_diff': home_squad['avg_attack_overall'] - away_squad['avg_attack_overall'],
        'defense_diff': home_squad['avg_defense_overall'] - away_squad['avg_defense_overall'],
        'home_form_scored': home_form['avg_scored'],
        'home_form_conceded': home_form['avg_conceded'],
        'home_form_win_rate': home_form['win_rate'],
        'away_form_scored': away_form['avg_scored'],
        'away_form_conceded': away_form['avg_conceded'],
        'away_form_win_rate': away_form['win_rate'],
        'is_neutral': 1 if is_neutral else 0,
        'is_world_cup': 1 if is_world_cup else 0,
        'is_continental': 0,
    }

    # Column order must match the order the models were trained with.
    features = pd.DataFrame([row])[feature_cols]

    # Expected goals for each side, then the Poisson simulation on top.
    home_goals_pred = model_home.predict(features)[0]
    away_goals_pred = model_away.predict(features)[0]

    result = simulate_match(home_goals_pred, away_goals_pred, n_sims)
    result['home_team'] = home_team
    result['away_team'] = away_team
    return result


print("Poisson simulator ready")

In [None]:
# Load recent form for predictions (written to MODEL_PATH by
# save_model_artifacts).
with open(f'{MODEL_PATH}/recent_form.json', 'r') as f:
    recent_form = json.load(f)

# Test prediction: with the defaults this is a neutral-venue World Cup match.
result = predict_match('Brazil', 'Argentina', model_home, model_away,
                       player_aggregates, elo_ratings, recent_form)

# predict_match returns None when player data is missing for either team.
if result:
    print(f"\n{result['home_team']} vs {result['away_team']}")
    print(f"Expected Goals: {result['expected_home_goals']:.2f} - {result['expected_away_goals']:.2f}")
    print(f"\nProbabilities:")
    print(f"  {result['home_team']} wins: {result['home_win_prob']:.1%}")
    print(f"  Draw:              {result['draw_prob']:.1%}")
    print(f"  {result['away_team']} wins: {result['away_win_prob']:.1%}")

## 10. 2022 World Cup Validation

Test the model on the 2022 World Cup to assess accuracy.

In [None]:
# Select the World Cup rows played in November-December 2022.
_wc22_window = feature_df['_date'].between('2022-11-01', '2022-12-31')
wc22_matches = feature_df.loc[_wc22_window & (feature_df['is_world_cup'] == 1)].copy()

print(f"2022 World Cup matches in dataset: {len(wc22_matches)}")

if len(wc22_matches) > 0:
    # Predict both goal counts for every selected match.
    X_wc = wc22_matches[feature_cols]
    wc22_matches['pred_home_goals'] = model_home.predict(X_wc)
    wc22_matches['pred_away_goals'] = model_away.predict(X_wc)

    # Outcome = sign of the goal difference (+1 home win, 0 draw, -1 away win).
    actual = np.sign(wc22_matches['home_goals'] - wc22_matches['away_goals'])
    predicted = np.sign(wc22_matches['pred_home_goals'] - wc22_matches['pred_away_goals'])
    accuracy = np.mean(actual.values == predicted.values)

    _wc_home_rmse = np.sqrt(mean_squared_error(wc22_matches['home_goals'], wc22_matches['pred_home_goals']))
    _wc_away_rmse = np.sqrt(mean_squared_error(wc22_matches['away_goals'], wc22_matches['pred_away_goals']))

    print(f"\nMatch outcome accuracy: {accuracy:.1%}")
    print(f"Home goals RMSE: {_wc_home_rmse:.3f}")
    print(f"Away goals RMSE: {_wc_away_rmse:.3f}")

In [None]:
# Match-by-match table of predicted versus actual scorelines.
if len(wc22_matches) > 0:

    def _outcome_label(home, away):
        """Label a scoreline from the home side's point of view."""
        if home > away:
            return 'Home'
        if home < away:
            return 'Away'
        return 'Draw'

    display_cols = ['_home_team', '_away_team', 'home_goals', 'away_goals', 
                    'pred_home_goals', 'pred_away_goals']
    display_df = wc22_matches[display_cols].rename(columns={
        '_home_team': 'Home',
        '_away_team': 'Away',
        'home_goals': 'Actual Home',
        'away_goals': 'Actual Away',
        'pred_home_goals': 'Pred Home',
        'pred_away_goals': 'Pred Away',
    })
    for _col in ('Pred Home', 'Pred Away'):
        display_df[_col] = display_df[_col].round(2)

    # Result columns: predicted results are derived from the ROUNDED values.
    display_df['Actual Result'] = [
        _outcome_label(h, a)
        for h, a in zip(display_df['Actual Home'], display_df['Actual Away'])
    ]
    display_df['Pred Result'] = [
        _outcome_label(h, a)
        for h, a in zip(display_df['Pred Home'], display_df['Pred Away'])
    ]
    display_df['Correct'] = display_df['Actual Result'] == display_df['Pred Result']

    print("2022 World Cup Predictions vs Actuals:")
    print(display_df.to_string(index=False))

In [None]:
# Simulate 2022 World Cup with actual teams (32-team format)
# NOTE: simulate_tournament is defined in a later cell (section 11); that cell
# must have been executed before this one, otherwise this raises NameError.
wc22_n_sims = 100  # number of simulated tournaments; also the probability denominator

# Load 2022 World Cup groups (prefer the data directory, fall back to the CWD)
wc22_json_path = f'{DATA_PATH}/wc22.json'
if not os.path.exists(wc22_json_path):
    wc22_json_path = 'wc22.json'
with open(wc22_json_path, 'r', encoding='utf-8') as f:
    wc22_groups = json.load(f)

# Get all 2022 WC teams (flatten the group -> teams mapping)
wc22_teams = [team for group_teams in wc22_groups.values() for team in group_teams]

# Filter to teams we have data for. The FIFA-24 country set is built once
# instead of re-filtering the DataFrame for every team.
fifa24_countries = set(
    player_aggregates.loc[player_aggregates['fifa_version'] == 24, 'country']
)
wc22_teams_available = [
    t for t in wc22_teams if t in elo_ratings and t in fifa24_countries
]

print(f"2022 World Cup teams with data: {len(wc22_teams_available)}/{len(wc22_teams)}")
print(f"Teams: {wc22_teams_available}")

# Run 32-team tournament simulation
if len(wc22_teams_available) >= 32:
    print(f"\nRunning 2022 World Cup simulation (32-team format, {wc22_n_sims} tournaments)...")
    wc22_champions, wc22_finalists, wc22_semifinalists = simulate_tournament(
        wc22_teams_available, n_tournament_sims=wc22_n_sims, format='32_team'
    )

    print("\n2022 World Cup Simulation Results:")
    print("-" * 40)
    print("Championship Probability (Top 10):")
    for i, (team, count) in enumerate(wc22_champions.most_common(10), 1):
        prob = count / wc22_n_sims * 100
        print(f"{i:2}. {team:20} {prob:5.1f}%")

    print("\nActual Result: Argentina won the 2022 World Cup")
else:
    print("Not enough teams with data for 32-team simulation")

## 11. 2026 World Cup Tournament Simulation

Simulate the full 2026 World Cup tournament with bracket predictions.

**Tournament Formats Supported:**
- **32-team format** (2018, 2022): 8 groups of 4; the top 2 from each group advance to the Round of 16
- **48-team format** (2026): 12 groups of 4; the top 2 from each group plus the 8 best third-placed teams advance to the Round of 32

In [None]:
# The 2026 World Cup has 48 teams in 12 groups of 4.
# Until the real draw is known we work from a projected field based on
# current rankings - a simplified stand-in for the actual groups.

# Projected qualifiers per confederation (Elo ratings and typical qualifiers)
WC2026_PROJECTED_TEAMS = [
    # CONMEBOL (6-7 spots)
    'Brazil', 'Argentina', 'Uruguay', 'Colombia', 'Ecuador', 'Chile',
    # UEFA (16 spots)
    'France', 'England', 'Spain', 'Germany', 'Netherlands', 'Portugal',
    'Belgium', 'Italy', 'Croatia', 'Switzerland', 'Denmark', 'Austria',
    'Poland', 'Serbia', 'Ukraine', 'Sweden',
    # CONCACAF (6-7 spots including hosts)
    'United States', 'Mexico', 'Canada', 'Costa Rica', 'Jamaica', 'Panama',
    # AFC (8-9 spots)
    'Japan', 'South Korea', 'Iran', 'Australia', 'Saudi Arabia', 'Qatar',
    'United Arab Emirates', 'Iraq',
    # CAF (9-10 spots)
    'Morocco', 'Senegal', 'Nigeria', 'Egypt', 'Cameroon', 'Algeria',
    'Tunisia', 'Ivory Coast', 'Ghana', 'Mali',
    # OFC (1-2 spots)
    'New Zealand',
]

# A team is usable only if it has both FIFA-24 player aggregates and an Elo rating
fifa24_rows = player_aggregates[player_aggregates['fifa_version'] == 24]
available_teams = set(fifa24_rows['country'].unique()) & set(elo_ratings.keys())

qualified_teams = [team for team in WC2026_PROJECTED_TEAMS if team in available_teams]
print(f"Projected qualified teams with data: {len(qualified_teams)}")

# Top the field up to 48 with the highest-rated remaining teams, if short
shortfall = 48 - len(qualified_teams)
if shortfall > 0:
    candidates = [team for team in available_teams if team not in qualified_teams]
    candidates.sort(key=lambda team: elo_ratings[team], reverse=True)
    qualified_teams.extend(candidates[:shortfall])

# Cap the field at 48 in case more projected teams than slots have data
qualified_teams = qualified_teams[:48]
print(f"Final team count: {len(qualified_teams)}")

In [None]:
def _sample_scoreline(outcome, home_xg, away_xg, batch=256):
    """Draw an integer scoreline that agrees with an already-decided outcome.

    ``outcome`` is +1 (first team wins), 0 (draw) or -1 (second team wins).
    Goals are drawn from independent Poisson distributions whose means are
    the expected goals, and the first draw whose result matches ``outcome``
    is kept. If none of the ``batch`` draws matches, the smallest scoreline
    with that outcome (1-0, 0-0 or 0-1) is returned.
    """
    # Negative or NaN rates are invalid for a Poisson draw; treat them as 0.
    lam_home = float(home_xg) if home_xg > 0 else 0.0
    lam_away = float(away_xg) if away_xg > 0 else 0.0

    home_goals = np.random.poisson(lam_home, size=batch)
    away_goals = np.random.poisson(lam_away, size=batch)
    hits = np.flatnonzero(np.sign(home_goals - away_goals) == outcome)
    if hits.size:
        first = hits[0]
        return int(home_goals[first]), int(away_goals[first])
    return {1: (1, 0), 0: (0, 0), -1: (0, 1)}[outcome]


def simulate_group_stage(groups, model_home, model_away, player_aggregates,
                         elo_ratings, recent_form, n_sims=1000):
    """
    Simulate one round-robin group stage and return the standings.

    Parameters:
    - groups: mapping of group name -> list of team names
    - model_home, model_away, player_aggregates, elo_ratings, recent_form:
      passed through unchanged to predict_match
    - n_sims: forwarded to predict_match as its n_sims argument

    Returns:
    - dict mapping group name -> list of (team, stats) tuples sorted best
      first by points, then goal difference, then goals scored. stats has
      the keys 'points', 'gd', 'gf' and 'wins'.

    Each pairing is played once on neutral ground as a World Cup match. The
    outcome is sampled from the predicted win/draw/loss probabilities, then a
    concrete scoreline consistent with that outcome is sampled so that the
    goal-difference and goals-scored tie-breakers reflect the simulated
    result (a winner always has a positive margin, a draw a zero margin).
    Pairings for which predict_match returns None are skipped and award no
    points.
    """
    group_results = {}

    for group_name, teams in groups.items():
        standings = {team: {'points': 0, 'gd': 0, 'gf': 0, 'wins': 0} for team in teams}

        # Round robin: every unordered pair of teams meets exactly once
        for team_a, team_b in itertools.combinations(teams, 2):
            result = predict_match(team_a, team_b, model_home, model_away,
                                   player_aggregates, elo_ratings, recent_form,
                                   is_neutral=True, is_world_cup=True, n_sims=n_sims)

            if result is None:
                continue

            # Sample the match outcome from the predicted probabilities
            rand = np.random.random()
            if rand < result['home_win_prob']:
                outcome = 1
            elif rand < result['home_win_prob'] + result['draw_prob']:
                outcome = 0
            else:
                outcome = -1

            goals_a, goals_b = _sample_scoreline(
                outcome, result['expected_home_goals'], result['expected_away_goals']
            )

            if outcome > 0:
                standings[team_a]['points'] += 3
                standings[team_a]['wins'] += 1
            elif outcome < 0:
                standings[team_b]['points'] += 3
                standings[team_b]['wins'] += 1
            else:
                standings[team_a]['points'] += 1
                standings[team_b]['points'] += 1

            for team, scored, conceded in ((team_a, goals_a, goals_b),
                                           (team_b, goals_b, goals_a)):
                standings[team]['gf'] += scored
                standings[team]['gd'] += scored - conceded

        # Sort by points, then goal difference, then goals scored
        group_results[group_name] = sorted(
            standings.items(),
            key=lambda item: (item[1]['points'], item[1]['gd'], item[1]['gf']),
            reverse=True,
        )

    return group_results

def simulate_knockout_match(team_a, team_b, model_home, model_away, player_aggregates,
                            elo_ratings, recent_form, n_sims=1000):
    """
    Simulate a knockout match (no draws allowed) and return the winner's name.

    Parameters:
    - team_a, team_b: names of the two sides; team_a is passed to
      predict_match as the first ("home") team
    - model_home, model_away, player_aggregates, elo_ratings, recent_form:
      passed through unchanged to predict_match
    - n_sims: forwarded to predict_match as its n_sims argument

    The match is predicted on neutral ground as a World Cup match. A drawn
    result is resolved as a coin flip: half of the draw probability is added
    to each side's win probability before the winner is sampled.
    """
    result = predict_match(team_a, team_b, model_home, model_away,
                           player_aggregates, elo_ratings, recent_form,
                           is_neutral=True, is_world_cup=True, n_sims=n_sims)

    if result is None:
        # Fallback: the higher Elo wins (unknown teams default to 1500);
        # on equal ratings team_b advances.
        return team_a if elo_ratings.get(team_a, 1500) > elo_ratings.get(team_b, 1500) else team_b

    # Split the draw probability evenly between the two teams
    adj_home_win = result['home_win_prob'] + result['draw_prob'] / 2

    return team_a if np.random.random() < adj_home_win else team_b

print("Knockout simulation functions ready")

In [None]:
def simulate_tournament(teams, n_tournament_sims=100, format='48_team'):
    """
    Run full tournament simulation multiple times.

    Parameters:
    - teams: list of team names
    - n_tournament_sims: number of tournament simulations to run
    - format: '32_team' (8 groups, used in 2018/2022) or '48_team' (12 groups, used in 2026)

    Returns:
    - (champions, finalists, semifinalists): three Counters mapping team name
      to the number of simulations in which the team won the final, won a
      semi-final, or won a quarter-final respectively.

    Raises:
    - ValueError: if format is not one of the two supported values.

    Uses the notebook-level model_home, model_away, player_aggregates,
    elo_ratings and recent_form. Teams are seeded by Elo (1500 when unknown)
    and only the n highest-rated teams required by the format are used.
    """
    # Format parameters: group count, field size, whether third-placed teams
    # can advance, and the knockout rounds played before the final.
    if format == '32_team':
        n_groups, n_teams, use_third_place = 8, 32, False
        # Top 2 from each group = 16 teams -> Round of 16
        knockout_rounds = ['Round of 16', 'Quarter Finals', 'Semi Finals']
    elif format == '48_team':
        n_groups, n_teams, use_third_place = 12, 48, True
        # Top 2 (24) + 8 best third = 32 teams -> Round of 32
        knockout_rounds = ['Round of 32', 'Round of 16', 'Quarter Finals', 'Semi Finals']
    else:
        raise ValueError(f"Unknown format: {format}. Use '32_team' or '48_team'")

    sorted_teams = sorted(teams, key=lambda t: elo_ratings.get(t, 1500), reverse=True)[:n_teams]
    if len(sorted_teams) < n_teams:
        # No padding is possible here; the simulation continues with what it has.
        print(f"Warning: Only {len(sorted_teams)} teams provided, expected {n_teams}. "
              f"Groups will be smaller than 4 teams and some may be empty.")

    # Serpentine seeding: pots are dealt left-to-right, then right-to-left, ...
    # The draw depends only on the seeding, so it is built once for all sims.
    group_names = [f'Group {chr(65 + i)}' for i in range(n_groups)]
    groups = {name: [] for name in group_names}
    for i, team in enumerate(sorted_teams):
        pot, slot = divmod(i, n_groups)
        group_idx = slot if pot % 2 == 0 else (n_groups - 1) - slot
        groups[group_names[group_idx]].append(team)

    champions = Counter()
    finalists = Counter()
    semifinalists = Counter()

    for _ in range(n_tournament_sims):
        group_results = simulate_group_stage(groups, model_home, model_away,
                                             player_aggregates, elo_ratings, recent_form)

        # Winner and runner-up of every group advance; slicing (rather than
        # indexing) keeps under-filled groups from raising IndexError.
        advancing = []
        third_place = []
        for standings in group_results.values():
            advancing.extend(team for team, _ in standings[:2])
            if use_third_place and len(standings) > 2:
                team, record = standings[2]
                third_place.append((team, record['points'], record['gd'], record['gf']))

        # 48-team format: the 8 best third-placed teams also advance, ranked
        # by the same criteria as the group tables (points, gd, gf).
        if use_third_place:
            third_place.sort(key=lambda entry: entry[1:], reverse=True)
            advancing.extend(entry[0] for entry in third_place[:8])

        # Knockout bracket is a random draw of the advancing teams
        np.random.shuffle(advancing)
        current_round = advancing

        for round_name in knockout_rounds:
            winners = []
            # Adjacent teams meet; with an odd count the last team has no
            # opponent and does not progress.
            for team_a, team_b in zip(current_round[::2], current_round[1::2]):
                winner = simulate_knockout_match(team_a, team_b,
                                                 model_home, model_away, player_aggregates,
                                                 elo_ratings, recent_form)
                winners.append(winner)

                if round_name == 'Quarter Finals':
                    semifinalists[winner] += 1  # QF winners reach the semi-finals
                elif round_name == 'Semi Finals':
                    finalists[winner] += 1  # SF winners reach the final

            current_round = winners

        # Final
        if len(current_round) >= 2:
            champion = simulate_knockout_match(current_round[0], current_round[1],
                                               model_home, model_away, player_aggregates,
                                               elo_ratings, recent_form)
            champions[champion] += 1

    return champions, finalists, semifinalists

# Kick off the 2026 simulation: announce it, then run 100 tournaments
for banner_line in (
    "Running 2026 World Cup simulation (100 tournaments)...",
    "Format: 48 teams, 12 groups",
    "This may take a few minutes...\n",
):
    print(banner_line)

champions, finalists, semifinalists = simulate_tournament(
    qualified_teams,
    n_tournament_sims=100,
    format='48_team',
)

In [None]:
# Display results
# Denominator for the percentages below: the number of simulated tournaments
n_sims = 100

banner = "=" * 50
print(banner)
print("2026 WORLD CUP PREDICTION RESULTS")
print(banner)

# (heading, tally, number of rows, whether to append a '*' bar)
result_sections = (
    ("\nChampionship Probability (Top 20):", champions, 20, True),
    ("\n\nFinalist Probability (Top 15):", finalists, 15, False),
    ("\n\nSemifinalist Probability (Top 15):", semifinalists, 15, False),
)

for heading, tally, top_n, with_bar in result_sections:
    print(heading)
    print("-" * 40)
    for rank, (team, count) in enumerate(tally.most_common(top_n), start=1):
        share = count / n_sims * 100
        row = f"{rank:2}. {team:20} {share:5.1f}%"
        if with_bar:
            # One '*' per two percentage points
            row += " " + '*' * int(share / 2)
        print(row)

## 12. Visualization

In [None]:
# Championship probability visualization
def _ranked_barh(ax, labels, values, cmap, value_fmt, text_offset):
    """Draw a horizontal bar chart with the first entry on top and a value label next to each bar."""
    shades = cmap(np.linspace(0.3, 0.9, len(labels)))[::-1]
    # barh draws from the bottom up, so reverse to put the first entry on top
    ax.barh(labels[::-1], values[::-1], color=shades)
    for row, value in enumerate(values[::-1]):
        ax.text(value + text_offset, row, value_fmt.format(value), va='center', fontsize=9)


fig, axes = plt.subplots(1, 2, figsize=(14, 8))

# Left panel: top 15 championship probabilities
top_15 = champions.most_common(15)
teams = [team for team, _ in top_15]
probs = [wins / n_sims * 100 for _, wins in top_15]

_ranked_barh(axes[0], teams, probs, plt.cm.Blues, '{:.1f}%', 0.5)
axes[0].set_xlabel('Championship Probability (%)')
axes[0].set_title('2026 World Cup - Championship Probability')

# Right panel: Elo ratings of the 15 highest-rated qualified teams, for comparison
elo_top = sorted(((t, elo_ratings.get(t, 1500)) for t in qualified_teams),
                 key=lambda pair: pair[1], reverse=True)[:15]
elo_teams = [team for team, _ in elo_top]
elo_vals = [rating for _, rating in elo_top]

_ranked_barh(axes[1], elo_teams, elo_vals, plt.cm.Reds, '{:.0f}', 5)
axes[1].set_xlabel('Elo Rating')
axes[1].set_title('Current Elo Ratings (Top 15)')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs(MODEL_PATH, exist_ok=True)
viz_path = f'{MODEL_PATH}/wc2026_predictions.png'
plt.savefig(viz_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"\nVisualization saved to {viz_path}")

In [None]:
# Head-to-head predictions for a handful of marquee pairings
key_matchups = [
    ('Brazil', 'Argentina'),
    ('France', 'England'),
    ('Germany', 'Spain'),
    ('Netherlands', 'Portugal'),
    ('United States', 'Mexico'),
    ('Brazil', 'France'),
]

print("\nKey Matchup Predictions:")
print("=" * 70)

for home_side, away_side in key_matchups:
    prediction = predict_match(home_side, away_side, model_home, model_away,
                               player_aggregates, elo_ratings, recent_form)
    # Pairings without a usable prediction are silently left out of the report
    if not prediction:
        continue

    exp_home = prediction['expected_home_goals']
    exp_away = prediction['expected_away_goals']
    print(f"\n{home_side} vs {away_side}")
    print(f"Expected Score: {exp_home:.1f} - {exp_away:.1f}")
    print(f"  {home_side} wins: {prediction['home_win_prob']:.1%}")
    print(f"  Draw:        {prediction['draw_prob']:.1%}")
    print(f"  {away_side} wins: {prediction['away_win_prob']:.1%}")

In [None]:
# Final Summary: data sizes, test-set performance and the headline 2026 prediction
rule = "=" * 60
print("\n" + rule)
print("MODEL SUMMARY")
print(rule)

print("\nData:")
print(f"  - Training matches: {len(X_train):,} (2010-2021)")
print(f"  - Test matches: {len(X_test):,} (2022+)")
print(f"  - Features: {len(feature_cols)}")
print(f"  - Teams with Elo: {len(elo_ratings)}")
n_player_countries = len(player_aggregates['country'].unique())
print(f"  - Countries with player data: {n_player_countries}")

print("\nModel Performance:")
home_rmse = np.sqrt(mean_squared_error(y_home_test, y_home_pred))
print(f"  - Home Goals RMSE: {home_rmse:.3f}")
away_rmse = np.sqrt(mean_squared_error(y_away_test, y_away_pred))
print(f"  - Away Goals RMSE: {away_rmse:.3f}")
print(f"  - Match Outcome Accuracy: {outcome_accuracy:.1%}")

print("\n2026 World Cup Prediction:")
# The favorite is simply the first entry of the top-3 list
top_3 = champions.most_common(3)
favorite, favorite_titles = top_3[0]
print(f"  - Favorite: {favorite} ({favorite_titles/n_sims*100:.1f}%)")
podium = ', '.join(f'{team} ({titles/n_sims*100:.1f}%)' for team, titles in top_3)
print(f"  - Top 3: {podium}")

print("\n" + rule)
print(f"Model artifacts saved to: {MODEL_PATH}")
if USE_GOOGLE_DRIVE:
    print("(Saved to Google Drive - persistent storage)")
print(rule)