# US Open 2025 Prediction Analysis

This notebook provides detailed analysis and visualization of the US Open 2025 predictions.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


import sys
import os
sys.path.append('../src')

from prediction.us_open_2025_predictor import USOpen2025Predictor
from modeling.tournament_predictor import TournamentPredictor

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

%matplotlib inline

## 1. Load Data and Generate Predictions

In [2]:
# Initialize predictor
predictor = USOpen2025Predictor()

# Run full prediction
predictions_df, report = predictor.run_full_prediction()

print("Predictions shape:", predictions_df.shape)
print("\nColumns:", list(predictions_df.columns))
print("\nFirst few rows:")
predictions_df.head()

Starting 2025 US Open prediction process...
Loading processed data...
File not found: data/processed/processed_players.csv
File not found: data/processed/processed_us_open_history.csv
File not found: data/processed/processed_major_championships.csv
File not found: data/processed/processed_pga_tour_recent.csv
File not found: data/processed/processed_current_rankings.csv
File not found: data/processed/processed_current_skills.csv
Getting current US Open field...
Could not retrieve current field: All arrays must be of the same length
Using top-ranked players as field...
Created simulated field: 156 players
Training prediction models...
No training data available
Could not train models
Predictions shape: (0, 0)

Columns: []

First few rows:


## 2. Prediction Report

In [3]:
print(report)

Error: Could not train models


## 3. Visualization of Predictions

In [4]:
# Find probability columns
prob_cols = [col for col in predictions_df.columns if 'prob' in col]
print("Probability columns:", prob_cols)

if prob_cols:
    # Create subplots for different predictions
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('US Open 2025 Prediction Distributions', fontsize=16)
    
    for i, col in enumerate(prob_cols[:4]):
        ax = axes[i//2, i%2]
        
        # Histogram of probabilities
        ax.hist(predictions_df[col] * 100, bins=30, alpha=0.7, edgecolor='black')
        ax.set_xlabel('Probability (%)')
        ax.set_ylabel('Number of Players')
        ax.set_title(col.replace('_prob_xgboost', '').replace('_', ' ').title())
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

Probability columns: []


## 4. Top Candidates Analysis

In [5]:
# Top win candidates
win_col = [col for col in predictions_df.columns if 'won_prob' in col]

if win_col:
    top_winners = predictions_df.nlargest(20, win_col[0])
    
    # Create interactive bar chart
    fig = px.bar(
        top_winners, 
        x=win_col[0], 
        y='player_name' if 'player_name' in top_winners.columns else 'player_id',
        orientation='h',
        title='Top 20 Win Candidates - US Open 2025',
        labels={win_col[0]: 'Win Probability', 'player_name': 'Player'}
    )
    
    fig.update_layout(height=600, yaxis={'categoryorder':'total ascending'})
    fig.show()
    
    # Display table
    print("\nTop 20 Win Candidates:")
    display_cols = ['player_name', win_col[0]] if 'player_name' in top_winners.columns else ['player_id', win_col[0]]
    top_winners[display_cols].head(20)

## 5. Model Performance Analysis

In [6]:
# Analyze model performance if available
if hasattr(predictor.predictor, 'models') and predictor.predictor.models:
    print("Model Performance Summary:")
    print("=" * 40)
    
    for target, models in predictor.predictor.models.items():
        print(f"\n{target.upper()}:")
        for model_type, model_info in models.items():
            metrics = model_info.get('metrics', {})
            cv_mean = model_info.get('cv_mean', 0)
            cv_std = model_info.get('cv_std', 0)
            
            print(f"  {model_type}:")
            for metric, value in metrics.items():
                print(f"    {metric}: {value:.4f}")
            print(f"    CV Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
else:
    print("No model performance data available")

No model performance data available


## 6. Feature Importance Analysis

In [7]:
# Feature importance for top 10 prediction
try:
    feature_importance = predictor.predictor.get_feature_importance('top_10', 'xgboost', top_n=15)
    
    if not feature_importance.empty:
        # Create horizontal bar chart
        fig = px.bar(
            feature_importance, 
            x='importance', 
            y='feature',
            orientation='h',
            title='Top 15 Most Important Features for Top 10 Prediction',
            labels={'importance': 'Feature Importance', 'feature': 'Feature'}
        )
        
        fig.update_layout(height=500, yaxis={'categoryorder':'total ascending'})
        fig.show()
        
        print("\nTop 15 Features:")
        display(feature_importance)
    else:
        print("No feature importance data available")
        
except Exception as e:
    print(f"Could not generate feature importance: {e}")

Could not generate feature importance: Model not found for target=top_10, model_type=xgboost


## 7. Correlation Analysis

In [8]:
# Correlation between different prediction targets
prob_cols = [col for col in predictions_df.columns if 'prob' in col]

if len(prob_cols) > 1:
    # Calculate correlation matrix
    corr_matrix = predictions_df[prob_cols].corr()
    
    # Create heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
                square=True, fmt='.3f')
    plt.title('Correlation Between Prediction Targets')
    plt.tight_layout()
    plt.show()
    
    print("\nCorrelation Matrix:")
    display(corr_matrix)

## 8. Risk vs Reward Analysis

In [9]:
# Compare win probability vs top 10 probability
win_col = [col for col in predictions_df.columns if 'won_prob' in col]
top10_col = [col for col in predictions_df.columns if 'top_10_prob' in col]

if win_col and top10_col:
    # Create scatter plot
    fig = px.scatter(
        predictions_df, 
        x=top10_col[0], 
        y=win_col[0],
        hover_name='player_name' if 'player_name' in predictions_df.columns else 'player_id',
        title='Win Probability vs Top 10 Probability',
        labels={
            top10_col[0]: 'Top 10 Probability',
            win_col[0]: 'Win Probability'
        }
    )
    
    fig.update_layout(height=600)
    fig.show()
    
    # Identify value picks (high top 10 probability, lower win probability)
    predictions_df['value_score'] = predictions_df[top10_col[0]] / (predictions_df[win_col[0]] + 0.001)
    value_picks = predictions_df.nlargest(10, 'value_score')
    
    print("\nTop 10 Value Picks (High Top 10 Probability, Lower Win Probability):")
    display_cols = ['player_name', top10_col[0], win_col[0], 'value_score'] if 'player_name' in value_picks.columns else ['player_id', top10_col[0], win_col[0], 'value_score']
    display(value_picks[display_cols])

## 9. Export Results

In [10]:
# Save detailed analysis results
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save predictions with analysis
analysis_file = f'../data/predictions/us_open_2025_analysis_{timestamp}.csv'
predictions_df.to_csv(analysis_file, index=False)

print(f"Analysis results saved to: {analysis_file}")
print(f"Total players analyzed: {len(predictions_df)}")

# Summary statistics
print("\nSummary Statistics:")
for col in prob_cols:
    target_name = col.replace('_prob_xgboost', '').replace('_prob_lightgbm', '').replace('_prob_random_forest', '')
    mean_prob = predictions_df[col].mean() * 100
    max_prob = predictions_df[col].max() * 100
    std_prob = predictions_df[col].std() * 100
    print(f"{target_name.upper():<15} Mean: {mean_prob:5.1f}%  Max: {max_prob:5.1f}%  Std: {std_prob:5.1f}%")

Analysis results saved to: ../data/predictions/us_open_2025_analysis_20250612_131958.csv
Total players analyzed: 0

Summary Statistics:
