In [7]:
%pip install xgboost

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import xgboost as xgb
import json
from datetime import datetime
import matplotlib.pyplot as plt

# Load the per-category CSV exports. The directory and the file name are
# passed to os.path.join as separate components so the join actually does
# something (the original passed one pre-joined string).
DATA_DIR = "data"

defensive_df      = pd.read_csv(os.path.join(DATA_DIR, "defensive_2019_2023.csv"))
receiving_df      = pd.read_csv(os.path.join(DATA_DIR, "receiving_2019_2023.csv"))
rushing_df        = pd.read_csv(os.path.join(DATA_DIR, "rushing_2019_2023.csv"))
passing_df        = pd.read_csv(os.path.join(DATA_DIR, "passing_2019_2023.csv"))
fumbles_df        = pd.read_csv(os.path.join(DATA_DIR, "fumbles_2019_2023.csv"))
interceptions_df  = pd.read_csv(os.path.join(DATA_DIR, "interceptions_2019_2023.csv"))
kickreturns_df    = pd.read_csv(os.path.join(DATA_DIR, "kickreturns_2019_2023.csv"))
# NOTE(review): this file name is singular ("puntreturn"), unlike
# "kickreturns" above -- confirm it matches the file on disk.
puntreturns_df    = pd.read_csv(os.path.join(DATA_DIR, "puntreturn_2019_2023.csv"))
kicking_df        = pd.read_csv(os.path.join(DATA_DIR, "kicking_2019_2023.csv"))
punting_df        = pd.read_csv(os.path.join(DATA_DIR, "punting_2019_2023.csv"))


# ============================================================================
# ECE CALCULATION (from Walsh & Joshi, 2024)
# ============================================================================

def calculate_ece(y_true, y_pred_proba, n_bins=10):
    """
    Calculate Expected Calibration Error

    Predictions are assigned to ``n_bins`` equal-width bins on [0, 1]. Each
    non-empty bin contributes the absolute gap between its mean predicted
    probability and its observed positive rate, weighted by the share of
    samples that fall in the bin.

    Args:
        y_true: True binary labels (0 or 1); any array-like (list, ndarray,
            Series)
        y_pred_proba: Predicted probabilities [0, 1], same shape as y_true
        n_bins: Number of bins for calibration (default 10)

    Returns:
        ece: Expected Calibration Error (0.0 for empty input)
        bin_data: DataFrame with per-bin statistics, one row per non-empty
            bin; the columns are present even when there are no rows

    Raises:
        ValueError: If y_true and y_pred_proba differ in shape
    """
    columns = ['bin', 'bin_lower', 'bin_upper', 'count',
               'avg_confidence', 'avg_accuracy', 'calibration_error']

    # Boolean-mask indexing below needs real arrays; plain lists would raise.
    labels = np.asarray(y_true, dtype=float)
    probs = np.asarray(y_pred_proba, dtype=float)
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_true and y_pred_proba must have the same shape, "
            f"got {labels.shape} and {probs.shape}"
        )

    n_samples = labels.size
    if n_samples == 0:
        return 0.0, pd.DataFrame(columns=columns)

    # Create bins
    bins = np.linspace(0, 1, n_bins + 1)
    # np.digitize is 1-based and puts a probability of exactly 1.0 past the
    # last edge; the clip folds it back into the final bin.
    bin_indices = np.clip(np.digitize(probs, bins) - 1, 0, n_bins - 1)

    ece = 0.0
    bin_data = []

    for i in range(n_bins):
        bin_mask = bin_indices == i
        bin_size = int(bin_mask.sum())
        if bin_size == 0:
            continue

        bin_confidence = probs[bin_mask].mean()
        bin_accuracy = labels[bin_mask].mean()
        gap = abs(bin_confidence - bin_accuracy)

        # ECE contribution: gap weighted by the fraction of samples in the bin
        ece += (bin_size / n_samples) * gap

        bin_data.append({
            'bin': i,
            'bin_lower': bins[i],
            'bin_upper': bins[i + 1],
            'count': bin_size,
            'avg_confidence': bin_confidence,
            'avg_accuracy': bin_accuracy,
            'calibration_error': gap
        })

    return ece, pd.DataFrame(bin_data, columns=columns)


def plot_calibration_curve(y_true, y_pred_proba, n_bins=10, title="Calibration Curve"):
    """Draw a calibration (reliability) plot and return ``(fig, ece)``.

    The per-bin statistics come from ``calculate_ece``; the figure is left
    open so the caller decides whether to show, save or close it.
    """
    ece, per_bin = calculate_ece(y_true, y_pred_proba, n_bins)

    figure, axes = plt.subplots(figsize=(8, 6))

    # Diagonal reference line: predicted probability == observed frequency.
    axes.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

    # One marker per non-empty bin; nothing to draw when every bin is empty.
    if not per_bin.empty:
        model_label = f'Model (ECE={ece:.4f})'
        axes.plot(
            per_bin['avg_confidence'],
            per_bin['avg_accuracy'],
            'o-',
            label=model_label,
        )

    axes.set_title(title)
    axes.set_xlabel('Predicted Probability')
    axes.set_ylabel('Actual Frequency')
    axes.legend()
    axes.grid(alpha=0.3)

    return figure, ece


# ============================================================================
# DATA PREPARATION
# ============================================================================

def prepare_player_prop_data(df, stat_col='YDS', prop_threshold=50, additional_stat_cols=None):
    """Prepare data for binary prop prediction.

    Builds the target ``prop_outcome`` (1 when ``stat_col`` is strictly above
    ``prop_threshold``) and a set of per-player history features. Every
    history feature is shifted by one game, so a row only sees games that
    come before it in the (athlete_id, date) ordering.

    Args:
        df: Game-log DataFrame. Must contain ``date``, ``athlete_id``,
            ``season``, ``team``, ``opposing_team``, ``position``,
            ``home_away`` and ``stat_col``.
        stat_col: Column the prop is defined on.
        prop_threshold: Line for the over/under outcome.
        additional_stat_cols: Extra stat columns to build rolling means for.
            When None, every column not in the identifier/metadata list is
            used.

    Returns:
        Tuple ``(df, feature_cols, le_team, le_opp, le_position)``: the
        prepared frame sorted by (athlete_id, date) with NaNs replaced by 0,
        the ordered list of feature column names, and the fitted
        LabelEncoders for team, opponent and position.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    required_cols = ['date', 'athlete_id', 'season', 'team', 'opposing_team',
                     'position', 'home_away', stat_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Input data is missing required columns: {missing_cols}")

    df = df.copy()

    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['athlete_id', 'date']).reset_index(drop=True)

    if additional_stat_cols is None:
        exclude_cols = ['game_id', 'date', 'season', 'team', 'home_away',
                       'opposing_team', 'athlete_id', 'display_name', 'position', 'position_abbr']
        additional_stat_cols = [col for col in df.columns if col not in exclude_cols]

    # stat_col is always coerced, even when the caller's additional_stat_cols
    # does not list it; otherwise string-typed values would reach the
    # threshold comparison below.
    numeric_cols = list(dict.fromkeys([stat_col, *additional_stat_cols]))
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Rows whose target stat could not be parsed cannot be labelled.
    df = df.dropna(subset=[stat_col])
    df['prop_outcome'] = (df[stat_col] > prop_threshold).astype(int)

    # Encode categorical
    le_team = LabelEncoder()
    le_opp = LabelEncoder()
    le_position = LabelEncoder()

    df['team_encoded'] = le_team.fit_transform(df['team'].fillna('UNK'))
    df['opposing_team_encoded'] = le_opp.fit_transform(df['opposing_team'].fillna('UNK'))
    df['position_encoded'] = le_position.fit_transform(df['position'].fillna('UNK'))
    df['home_away_encoded'] = (df['home_away'] == 'home').astype(int)

    feature_cols = ['team_encoded', 'opposing_team_encoded', 'position_encoded',
                    'home_away_encoded', 'season']

    stat_prefix = stat_col.lower()
    by_player = 'athlete_id'
    by_player_season = ['athlete_id', 'season']

    def lagged_rolling_mean(column, window):
        # Mean of up to `window` games, shifted so the current game is excluded.
        return df.groupby(by_player)[column].transform(
            lambda x: x.rolling(window, min_periods=1).mean().shift(1)
        )

    def lagged_season_mean(column):
        # Season-to-date mean, shifted so the current game is excluded.
        return df.groupby(by_player_season)[column].transform(
            lambda x: x.expanding().mean().shift(1)
        )

    # Rolling features
    for window in [3, 5, 8]:
        col_name = f'{stat_prefix}_rolling_{window}'
        df[col_name] = lagged_rolling_mean(stat_col, window)
        feature_cols.append(col_name)

    for add_stat in additional_stat_cols:
        if add_stat != stat_col and add_stat in df.columns:
            for window in [3, 8]:
                col_name = f'{add_stat.lower()}_rolling_{window}'
                df[col_name] = lagged_rolling_mean(add_stat, window)
                feature_cols.append(col_name)

    df['prop_hit_rate_8'] = lagged_rolling_mean('prop_outcome', 8)
    feature_cols.append('prop_hit_rate_8')

    df[f'{stat_prefix}_season_avg'] = lagged_season_mean(stat_col)
    feature_cols.append(f'{stat_prefix}_season_avg')

    df['prop_season_hit_rate'] = lagged_season_mean('prop_outcome')
    feature_cols.append('prop_season_hit_rate')

    df[f'{stat_prefix}_std_8'] = df.groupby(by_player)[stat_col].transform(
        lambda x: x.rolling(8, min_periods=2).std().shift(1)
    )
    feature_cols.append(f'{stat_prefix}_std_8')

    # Recent form relative to the season-to-date average.
    df['trend'] = df[f'{stat_prefix}_rolling_3'] - df[f'{stat_prefix}_season_avg']
    feature_cols.append('trend')

    # game_num is 1-based within a season; career_games counts prior games (0-based).
    df['game_num'] = df.groupby(by_player_season).cumcount() + 1
    feature_cols.append('game_num')

    df['career_games'] = df.groupby(by_player).cumcount()
    feature_cols.append('career_games')

    # Rows with no prior history have NaN features; they are stored as 0.
    df = df.fillna(0)
    # Drop repeated names (e.g. a stat listed twice) while keeping order.
    feature_cols = list(dict.fromkeys(feature_cols))

    return df, feature_cols, le_team, le_opp, le_position


Note: you may need to restart the kernel to use updated packages.


In [8]:
# === XGBoost Binary Classification ===
# X_train / X_test / y_train / y_test are expected to exist already in the
# notebook session; this cell only wraps them for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = dict(
    objective='binary:logistic',       # Binary classification
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric=['logloss', 'auc'],    # Log loss + AUC
    seed=42,
    scale_pos_weight=1,                # Adjust if imbalanced
)

# Train with early stopping, monitored on both sets
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50,
)

# Probabilities first, then hard labels at the 0.5 cut-off
y_pred_train_proba = model.predict(dtrain)
y_pred_test_proba = model.predict(dtest)
y_pred_train = (y_pred_train_proba > 0.5).astype(int)
y_pred_test = (y_pred_test_proba > 0.5).astype(int)


def _report_split(heading, labels, probabilities, hard_predictions):
    """Print log loss, ROC-AUC and accuracy for one data split."""
    rule = "=" * 50
    print("\n" + rule)
    print(heading)
    print(rule)
    print(f"Log Loss:  {log_loss(labels, probabilities):.4f}")
    print(f"ROC-AUC:   {roc_auc_score(labels, probabilities):.4f}")
    print(f"Accuracy:  {accuracy_score(labels, hard_predictions):.4f}")


# === Evaluation Metrics ===
_report_split("=== TRAINING METRICS ===", y_train, y_pred_train_proba, y_pred_train)
_report_split("=== TEST METRICS ===", y_test, y_pred_test_proba, y_pred_test)

# Target: AUC > 0.60 to validate task difficulty
print(
    "\nTask validated: AUC > 0.60"
    if roc_auc_score(y_test, y_pred_test_proba) > 0.60
    else "\nX Warning: AUC < 0.60, task may be too difficult or need more features"
)


[0]	train-logloss:0.48413	train-auc:0.81667	test-logloss:0.48949	test-auc:0.79884
[50]	train-logloss:0.34614	train-auc:0.87260	test-logloss:0.41529	test-auc:0.79983
[55]	train-logloss:0.34195	train-auc:0.87670	test-logloss:0.41599	test-auc:0.79948

=== TRAINING METRICS ===
Log Loss:  0.3419
ROC-AUC:   0.8767
Accuracy:  0.8506

=== TEST METRICS ===
Log Loss:  0.4160
ROC-AUC:   0.7995
Accuracy:  0.8135

✓ Task validated: AUC > 0.60


In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import xgboost as xgb
import json
from datetime import datetime
import matplotlib.pyplot as plt

print("="*70)
print("SECTION 1: DATA LOADING")
print("="*70)

# Load data. The directory and the file name are passed to os.path.join as
# separate components so the join actually does something (the original
# passed one pre-joined string).
DATA_DIR = "data"

defensive_df      = pd.read_csv(os.path.join(DATA_DIR, "defensive_2019_2023.csv"))
receiving_df      = pd.read_csv(os.path.join(DATA_DIR, "receiving_2019_2023.csv"))
rushing_df        = pd.read_csv(os.path.join(DATA_DIR, "rushing_2019_2023.csv"))
passing_df        = pd.read_csv(os.path.join(DATA_DIR, "passing_2019_2023.csv"))
fumbles_df        = pd.read_csv(os.path.join(DATA_DIR, "fumbles_2019_2023.csv"))
interceptions_df  = pd.read_csv(os.path.join(DATA_DIR, "interceptions_2019_2023.csv"))
kickreturns_df    = pd.read_csv(os.path.join(DATA_DIR, "kickreturns_2019_2023.csv"))
# NOTE(review): this file name is singular ("puntreturn"), unlike
# "kickreturns" above -- confirm it matches the file on disk.
puntreturns_df    = pd.read_csv(os.path.join(DATA_DIR, "puntreturn_2019_2023.csv"))
kicking_df        = pd.read_csv(os.path.join(DATA_DIR, "kicking_2019_2023.csv"))
punting_df        = pd.read_csv(os.path.join(DATA_DIR, "punting_2019_2023.csv"))

# Quick shape check on the frames the prop models are trained from.
print("Loaded 10 datasets")
print(f"   Receiving: {len(receiving_df)} rows, {len(receiving_df.columns)} cols")
print(f"   Rushing:   {len(rushing_df)} rows, {len(rushing_df.columns)} cols")
print(f"   Passing:   {len(passing_df)} rows, {len(passing_df.columns)} cols")
print(f"   Defensive: {len(defensive_df)} rows, {len(defensive_df.columns)} cols")
print(f"\nReceiving columns: {list(receiving_df.columns[:10])}")
print("\nSample receiving data:")
print(receiving_df.head(3))


# ============================================================================
print("\n" + "="*70)
print("SECTION 2: ECE CALCULATION FUNCTIONS")
print("="*70)
# ============================================================================

def calculate_ece(y_true, y_pred_proba, n_bins=10):
    """
    Calculate Expected Calibration Error

    Predictions are assigned to ``n_bins`` equal-width bins on [0, 1]. Each
    non-empty bin contributes the absolute gap between its mean predicted
    probability and its observed positive rate, weighted by the share of
    samples that fall in the bin.

    Args:
        y_true: True binary labels (0 or 1); any array-like (list, ndarray,
            Series)
        y_pred_proba: Predicted probabilities [0, 1], same shape as y_true
        n_bins: Number of bins for calibration (default 10)

    Returns:
        ece: Expected Calibration Error (0.0 for empty input)
        bin_data: DataFrame with per-bin statistics, one row per non-empty
            bin; the columns are present even when there are no rows

    Raises:
        ValueError: If y_true and y_pred_proba differ in shape
    """
    columns = ['bin', 'bin_lower', 'bin_upper', 'count',
               'avg_confidence', 'avg_accuracy', 'calibration_error']

    # Boolean-mask indexing below needs real arrays; plain lists would raise.
    labels = np.asarray(y_true, dtype=float)
    probs = np.asarray(y_pred_proba, dtype=float)
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_true and y_pred_proba must have the same shape, "
            f"got {labels.shape} and {probs.shape}"
        )

    n_samples = labels.size
    if n_samples == 0:
        return 0.0, pd.DataFrame(columns=columns)

    bins = np.linspace(0, 1, n_bins + 1)
    # np.digitize is 1-based and puts a probability of exactly 1.0 past the
    # last edge; the clip folds it back into the final bin.
    bin_indices = np.clip(np.digitize(probs, bins) - 1, 0, n_bins - 1)

    ece = 0.0
    bin_data = []

    for i in range(n_bins):
        bin_mask = bin_indices == i
        bin_size = int(bin_mask.sum())
        if bin_size == 0:
            continue

        bin_confidence = probs[bin_mask].mean()
        bin_accuracy = labels[bin_mask].mean()
        gap = abs(bin_confidence - bin_accuracy)

        # Gap weighted by the fraction of samples in the bin
        ece += (bin_size / n_samples) * gap

        bin_data.append({
            'bin': i,
            'bin_lower': bins[i],
            'bin_upper': bins[i + 1],
            'count': bin_size,
            'avg_confidence': bin_confidence,
            'avg_accuracy': bin_accuracy,
            'calibration_error': gap
        })

    return ece, pd.DataFrame(bin_data, columns=columns)


def plot_calibration_curve(y_true, y_pred_proba, n_bins=10, title="Calibration Curve"):
    """Draw a calibration (reliability) plot and return ``(fig, ece)``.

    The per-bin statistics come from ``calculate_ece``; the figure is left
    open so the caller decides whether to show, save or close it.
    """
    ece, per_bin = calculate_ece(y_true, y_pred_proba, n_bins)

    figure, axes = plt.subplots(figsize=(8, 6))

    # Diagonal reference line: predicted probability == observed frequency.
    axes.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

    # One marker per non-empty bin; nothing to draw when every bin is empty.
    if not per_bin.empty:
        model_label = f'Model (ECE={ece:.4f})'
        axes.plot(
            per_bin['avg_confidence'],
            per_bin['avg_accuracy'],
            'o-',
            label=model_label,
        )

    axes.set_title(title)
    axes.set_xlabel('Predicted Probability')
    axes.set_ylabel('Actual Frequency')
    axes.legend()
    axes.grid(alpha=0.3)

    return figure, ece

# Test ECE function on a tiny hand-made example.
# The demo inputs get their own names so they do not overwrite the
# notebook-level y_test / y_pred_test arrays that the XGBoost evaluation
# cell reads.
print("Testing ECE calculation...")
ece_demo_labels = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])
ece_demo_probs = np.array([0.9, 0.1, 0.8, 0.7, 0.3, 0.85, 0.2, 0.15, 0.75, 0.95])
test_ece, test_bins = calculate_ece(ece_demo_labels, ece_demo_probs, n_bins=5)
print(f"ECE function works! Test ECE: {test_ece:.4f}")
print(f"\n Test bins:\n{test_bins}")

SECTION 1: DATA LOADING
✅ Loaded 10 datasets
   Receiving: 22412 rows, 15 cols
   Rushing:   11534 rows, 14 cols
   Passing:   3450 rows, 17 cols
   Defensive: 54411 rows, 16 cols

Receiving columns: ['game_id', 'date', 'season', 'team', 'home_away', 'opposing_team', 'athlete_id', 'display_name', 'position', 'REC']

Sample receiving data:
     game_id        date  season                team home_away  \
0  401127972  2019-09-08    2019  Indianapolis Colts      away   
1  401127972  2019-09-08    2019  Indianapolis Colts      away   
2  401127972  2019-09-08    2019  Indianapolis Colts      away   

          opposing_team  athlete_id    display_name position  REC  YDS   AVG  \
0  Los Angeles Chargers       14924     T.Y. Hilton       WR    8   87  10.9   
1  Los Angeles Chargers     3728254       Deon Cain       WR    2   35  17.5   
2  Los Angeles Chargers     2977609  Devin Funchess       TE    3   32  10.7   

   TD  LONG  TGTS  
0   2    19     9  
1   0    25     2  
2   0    16

In [10]:

# ----------------------------------------------------------------------------
# Section 3 banner
# ----------------------------------------------------------------------------
print("\n" + "=" * 70, "SECTION 3: DATA PREPARATION FUNCTIONS", "=" * 70, sep="\n")

def prepare_player_prop_data(df, stat_col='YDS', prop_threshold=50, additional_stat_cols=None):
    """Prepare data for binary prop prediction.

    Builds the target ``prop_outcome`` (1 when ``stat_col`` is strictly above
    ``prop_threshold``) and a set of per-player history features. Every
    history feature is shifted by one game, so a row only sees games that
    come before it in the (athlete_id, date) ordering.

    Args:
        df: Game-log DataFrame. Must contain ``date``, ``athlete_id``,
            ``season``, ``team``, ``opposing_team``, ``position``,
            ``home_away`` and ``stat_col``.
        stat_col: Column the prop is defined on.
        prop_threshold: Line for the over/under outcome.
        additional_stat_cols: Extra stat columns to build rolling means for.
            When None, every column not in the identifier/metadata list is
            used.

    Returns:
        Tuple ``(df, feature_cols, le_team, le_opp, le_position)``: the
        prepared frame sorted by (athlete_id, date) with NaNs replaced by 0,
        the ordered list of feature column names, and the fitted
        LabelEncoders for team, opponent and position.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    required_cols = ['date', 'athlete_id', 'season', 'team', 'opposing_team',
                     'position', 'home_away', stat_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Input data is missing required columns: {missing_cols}")

    df = df.copy()

    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['athlete_id', 'date']).reset_index(drop=True)

    if additional_stat_cols is None:
        exclude_cols = ['game_id', 'date', 'season', 'team', 'home_away',
                       'opposing_team', 'athlete_id', 'display_name', 'position', 'position_abbr']
        additional_stat_cols = [col for col in df.columns if col not in exclude_cols]

    # stat_col is always coerced, even when the caller's additional_stat_cols
    # does not list it; otherwise string-typed values would reach the
    # threshold comparison below.
    numeric_cols = list(dict.fromkeys([stat_col, *additional_stat_cols]))
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Rows whose target stat could not be parsed cannot be labelled.
    df = df.dropna(subset=[stat_col])
    df['prop_outcome'] = (df[stat_col] > prop_threshold).astype(int)

    le_team = LabelEncoder()
    le_opp = LabelEncoder()
    le_position = LabelEncoder()

    df['team_encoded'] = le_team.fit_transform(df['team'].fillna('UNK'))
    df['opposing_team_encoded'] = le_opp.fit_transform(df['opposing_team'].fillna('UNK'))
    df['position_encoded'] = le_position.fit_transform(df['position'].fillna('UNK'))
    df['home_away_encoded'] = (df['home_away'] == 'home').astype(int)

    feature_cols = ['team_encoded', 'opposing_team_encoded', 'position_encoded',
                    'home_away_encoded', 'season']

    stat_prefix = stat_col.lower()
    by_player = 'athlete_id'
    by_player_season = ['athlete_id', 'season']

    def lagged_rolling_mean(column, window):
        # Mean of up to `window` games, shifted so the current game is excluded.
        return df.groupby(by_player)[column].transform(
            lambda x: x.rolling(window, min_periods=1).mean().shift(1)
        )

    def lagged_season_mean(column):
        # Season-to-date mean, shifted so the current game is excluded.
        return df.groupby(by_player_season)[column].transform(
            lambda x: x.expanding().mean().shift(1)
        )

    # Rolling features
    for window in [3, 5, 8]:
        col_name = f'{stat_prefix}_rolling_{window}'
        df[col_name] = lagged_rolling_mean(stat_col, window)
        feature_cols.append(col_name)

    for add_stat in additional_stat_cols:
        if add_stat != stat_col and add_stat in df.columns:
            for window in [3, 8]:
                col_name = f'{add_stat.lower()}_rolling_{window}'
                df[col_name] = lagged_rolling_mean(add_stat, window)
                feature_cols.append(col_name)

    df['prop_hit_rate_8'] = lagged_rolling_mean('prop_outcome', 8)
    feature_cols.append('prop_hit_rate_8')

    df[f'{stat_prefix}_season_avg'] = lagged_season_mean(stat_col)
    feature_cols.append(f'{stat_prefix}_season_avg')

    df['prop_season_hit_rate'] = lagged_season_mean('prop_outcome')
    feature_cols.append('prop_season_hit_rate')

    df[f'{stat_prefix}_std_8'] = df.groupby(by_player)[stat_col].transform(
        lambda x: x.rolling(8, min_periods=2).std().shift(1)
    )
    feature_cols.append(f'{stat_prefix}_std_8')

    # Recent form relative to the season-to-date average.
    df['trend'] = df[f'{stat_prefix}_rolling_3'] - df[f'{stat_prefix}_season_avg']
    feature_cols.append('trend')

    # game_num is 1-based within a season; career_games counts prior games (0-based).
    df['game_num'] = df.groupby(by_player_season).cumcount() + 1
    feature_cols.append('game_num')

    df['career_games'] = df.groupby(by_player).cumcount()
    feature_cols.append('career_games')

    # Rows with no prior history have NaN features; they are stored as 0.
    df = df.fillna(0)
    # Drop repeated names (e.g. a stat listed twice) while keeping order.
    feature_cols = list(dict.fromkeys(feature_cols))

    return df, feature_cols, le_team, le_opp, le_position

# Test data preparation on a small slice of the receiving data.
# The reported input size is taken from the slice itself, so the message
# stays correct if the frame holds fewer rows than requested.
print("Testing data preparation on receiving data...")
prep_sample = receiving_df.head(1000)
test_prepared, test_features, _, _, _ = prepare_player_prop_data(
    prep_sample,
    stat_col='YDS',
    prop_threshold=50
)
print("Data preparation works!")
print(f"Input rows: {len(prep_sample)}, Output rows: {len(test_prepared)}")
print(f"Number of features: {len(test_features)}")
print(f"Features: {test_features[:10]}...")
print(f"Prop outcome distribution: {test_prepared['prop_outcome'].value_counts().to_dict()}")
print("\nSample prepared data:")
print(test_prepared[['display_name', 'YDS', 'prop_outcome', 'yds_rolling_3', 'yds_season_avg']].head(5))


SECTION 3: DATA PREPARATION FUNCTIONS
Testing data preparation on receiving data...
✅ Data preparation works!
   Input rows: 1000, Output rows: 1000
   Number of features: 25
   Features: ['team_encoded', 'opposing_team_encoded', 'position_encoded', 'home_away_encoded', 'season', 'yds_rolling_3', 'yds_rolling_5', 'yds_rolling_8', 'rec_rolling_3', 'rec_rolling_8']...
   Prop outcome distribution: {0: 764, 1: 236}

Sample prepared data:
       display_name  YDS  prop_outcome  yds_rolling_3  yds_season_avg
0      Jason Witten   15             0       0.000000        0.000000
1      Jason Witten   25             0      15.000000       15.000000
2      Jason Witten   54             1      20.000000       20.000000
3      Jason Witten   50             0      31.333333       31.333333
4  Larry Fitzgerald  113             1       0.000000        0.000000


In [15]:
# ----------------------------------------------------------------------------
# Section 4 banner
# ----------------------------------------------------------------------------
print("\n" + "=" * 70, "SECTION 4: MODEL TRAINING FUNCTION", "=" * 70, sep="\n")

def train_prop_model(df, stat_col, threshold, category_name, 
                     additional_stats=None, params=None):
    """Train prop model with ECE evaluation.

    Prepares features with ``prepare_player_prop_data``, fits an XGBoost
    binary classifier on the first 80% of the prepared rows, evaluates it on
    the remaining 20% (log loss, ROC-AUC, accuracy, ECE) and writes a
    calibration plot to ``plots/``.

    Args:
        df: Raw game-log DataFrame for one stat category.
        stat_col: Column the prop is defined on.
        threshold: Over/under line.
        category_name: Label used in log output and in the plot file name.
        additional_stats: Extra stat columns forwarded to the preparation step.
        params: XGBoost parameter dict; a default binary:logistic setup is
            used when None.

    Returns:
        None when fewer than 100 prepared rows are available; otherwise a
        dict with keys 'model', 'feature_cols', 'train_metrics',
        'test_metrics', 'importance', 'calibration', 'encoders',
        'prepared_df' and 'metadata'.
    """
    
    print(f"\n{'='*70}")
    print(f"Training: {category_name} - {stat_col} Over {threshold}")
    print('='*70)
    
    prepared_df, feature_cols, le_team, le_opp, le_position = prepare_player_prop_data(
        df, stat_col, threshold, additional_stats
    )
    
    if len(prepared_df) < 100:
        print(f"Insufficient data: {len(prepared_df)} samples. Skipping.")
        return None
    
    X = prepared_df[feature_cols].values
    y = prepared_df['prop_outcome'].values
    
    # NOTE(review): prepared_df is ordered by (athlete_id, date), so this
    # positional split holds out the last athletes in that ordering rather
    # than the most recent games -- confirm this is the intended hold-out.
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    
    print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
    print(f"Train hit rate: {y_train.mean():.2%}, Test hit rate: {y_test.mean():.2%}")
    
    # Attach the real feature names so the booster, its importance scores and
    # the inference DMatrix (which is built with feature_names=feature_cols)
    # all agree on column identity.
    feature_names = list(feature_cols)
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
    
    if params is None:
        params = {
            'objective': 'binary:logistic',
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'eval_metric': ['logloss', 'auc'],
            'seed': 42,
            'scale_pos_weight': 1
        }
    
    # NOTE(review): early stopping is monitored on the same split that is
    # reported as the test set below, so the test metrics are not fully
    # independent of model selection.
    evals = [(dtrain, 'train'), (dtest, 'test')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    y_pred_train_proba = model.predict(dtrain)
    y_pred_test_proba = model.predict(dtest)
    y_pred_train = (y_pred_train_proba > 0.5).astype(int)
    y_pred_test = (y_pred_test_proba > 0.5).astype(int)
    
    train_ece, train_bin_data = calculate_ece(y_train, y_pred_train_proba)
    test_ece, test_bin_data = calculate_ece(y_test, y_pred_test_proba)
    
    train_metrics = {
        'log_loss': log_loss(y_train, y_pred_train_proba),
        'roc_auc': roc_auc_score(y_train, y_pred_train_proba),
        'accuracy': accuracy_score(y_train, y_pred_train),
        'ece': train_ece
    }
    
    test_metrics = {
        'log_loss': log_loss(y_test, y_pred_test_proba),
        'roc_auc': roc_auc_score(y_test, y_pred_test_proba),
        'accuracy': accuracy_score(y_test, y_pred_test),
        'ece': test_ece
    }
    
    print(f"\nTRAIN | Log Loss: {train_metrics['log_loss']:.4f} | "
          f"AUC: {train_metrics['roc_auc']:.4f} | Acc: {train_metrics['accuracy']:.4f} | "
          f"ECE: {train_metrics['ece']:.4f}")
    print(f"TEST  | Log Loss: {test_metrics['log_loss']:.4f} | "
          f"AUC: {test_metrics['roc_auc']:.4f} | Acc: {test_metrics['accuracy']:.4f} | "
          f"ECE: {test_metrics['ece']:.4f}")
    
    if test_metrics['ece'] < 0.10:
        print("Well calibrated: ECE < 0.10")
    elif test_metrics['ece'] < 0.15:
        print("Moderate calibration: 0.10 < ECE < 0.15")
    else:
        print("Poor calibration: ECE > 0.15")
    
    if test_metrics['roc_auc'] > 0.60:
        print("Task validated: AUC > 0.60")
    else:
        print("Warning: AUC < 0.60")
    
    # get_score is keyed by feature name now that the DMatrix carries names;
    # features never used in a split are absent. Explicit columns keep the
    # sort valid even if the dict comes back empty.
    importance = model.get_score(importance_type='gain')
    importance_df = pd.DataFrame(
        [{'feature': name, 'gain': gain} for name, gain in importance.items()],
        columns=['feature', 'gain']
    ).sort_values('gain', ascending=False)
    
    print(f"\n🔝 Top 5 Features:")
    for row in importance_df.head(5).itertuples(index=False):
        print(f"   {row.feature}: {row.gain:.2f}")
    
    fig, _ = plot_calibration_curve(
        y_test, y_pred_test_proba, 
        title=f"{category_name} {stat_col} O{threshold} - Calibration"
    )
    os.makedirs("plots", exist_ok=True)
    fig.savefig(f"plots/calibration_{category_name}_{stat_col}_{threshold}.png", dpi=150, bbox_inches='tight')
    # Close the figure so repeated training calls do not accumulate open figures.
    plt.close(fig)
    
    return {
        'model': model,
        'feature_cols': feature_cols,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'importance': importance_df,
        'calibration': {
            'train_bins': train_bin_data,
            'test_bins': test_bin_data
        },
        'encoders': {
            'team': le_team,
            'opponent': le_opp,
            'position': le_position
        },
        'prepared_df': prepared_df,
        'metadata': {
            'category': category_name,
            'stat': stat_col,
            'threshold': threshold,
            'train_size': len(X_train),
            'test_size': len(X_test),
            'train_hit_rate': float(y_train.mean()),
            'test_hit_rate': float(y_test.mean())
        }
    }

# Smoke-test the training function on the first 2000 receiving rows.
print("Testing single model training on small subset...")
test_model_result = train_prop_model(
    df=receiving_df.head(2000),
    stat_col='YDS',
    threshold=50,
    category_name='receiving_test',
    additional_stats=['REC', 'YDS', 'TD', 'TGTS'],
)

# train_prop_model returns None when it skips training.
if not test_model_result:
    print("Single model training failed")
else:
    print(f"\nSingle model training works!")
    print(f"   Model type: {type(test_model_result['model'])}")
    print(f"   Test AUC: {test_model_result['test_metrics']['roc_auc']:.4f}")
    print(f"   Test ECE: {test_model_result['test_metrics']['ece']:.4f}")


# ----------------------------------------------------------------------------
# Section 5 banner
# ----------------------------------------------------------------------------
print("\n" + "=" * 70, "SECTION 5: INFERENCE FUNCTION", "=" * 70, sep="\n")

def predict_player_prop(player_name, stat, threshold, trained_models, 
                        receiving_df, rushing_df, passing_df, category=None):
    """
    Predict probability for a specific player prop
    
    Args:
        player_name: "Xavier Worthy"
        stat: "YDS"
        threshold: 65
        trained_models: Dictionary of trained models keyed by
            "{category}_{stat}_over_{threshold}"; each value must provide
            'model', 'feature_cols', 'prepared_df' and 'test_metrics'
        receiving_df, rushing_df, passing_df: DataFrames. Not read by this
            function (features come from the model's stored 'prepared_df');
            kept so existing callers keep working
        category: Optional explicit category ('receiving', 'rushing',
            'passing'). When None it is inferred from ``stat``; 'YDS' is
            ambiguous and then defaults to 'receiving', so pass this to
            reach rushing or passing yardage models
    
    Returns:
        dict with probabilities and recommendation, or a dict with a single
        'error' key when the category, model or player cannot be resolved
    """
    
    # Determine category from stat unless the caller named it explicitly
    if category is None:
        stat_to_category = {
            'REC': 'receiving',
            'TGTS': 'receiving',
            'YDS': 'receiving',  # Default to receiving; override via `category`
            'CAR': 'rushing',
            'TD': 'passing',
            'INT': 'passing',
        }
        category = stat_to_category.get(stat)
        if category is None:
            return {"error": "Cannot determine category from stat"}
    
    # Build prop name
    prop_name = f"{category}_{stat}_over_{threshold}"
    if prop_name not in trained_models:
        return {"error": f"No trained model for {prop_name}. Available models: {list(trained_models.keys())}"}
    
    model_data = trained_models[prop_name]
    model = model_data['model']
    feature_cols = model_data['feature_cols']
    prepared_df = model_data['prepared_df']
    
    # Find player in prepared data
    player_data = prepared_df[prepared_df['display_name'] == player_name]
    
    if player_data.empty:
        return {"error": f"Player '{player_name}' not found in {category} data"}
    
    # Get most recent game (last row for this player in prepared_df order)
    # NOTE(review): the history features on this row were shifted by one game
    # during preparation, so they exclude this row's own game; the score is
    # for the latest recorded game rather than an upcoming one -- confirm.
    latest_game = player_data.iloc[-1]
    
    # A single row pulled from a mixed-type frame is object-dtype; cast to
    # float so the DMatrix receives a plain numeric array.
    X_input = latest_game[feature_cols].to_numpy(dtype=float).reshape(1, -1)
    dmatrix = xgb.DMatrix(X_input, feature_names=list(feature_cols))
    
    # Predict
    prob_over = float(model.predict(dmatrix)[0])
    prob_under = 1 - prob_over
    
    # Only recommend a side when its probability clears 0.55
    if prob_over > 0.55:
        recommendation = 'BET OVER'
    elif prob_under > 0.55:
        recommendation = 'BET UNDER'
    else:
        recommendation = 'NO BET'
    
    stat_prefix = stat.lower()
    return {
        'player': player_name,
        'prop': f"{stat} Over {threshold}",
        'prob_over': prob_over,
        'prob_under': float(prob_under),
        'recommendation': recommendation,
        'confidence': float(max(prob_over, prob_under)),
        'recent_avg': float(latest_game.get(f'{stat_prefix}_rolling_3', 0)),
        'season_avg': float(latest_game.get(f'{stat_prefix}_season_avg', 0)),
        'model_ece': float(model_data['test_metrics']['ece']),
        'model_auc': float(model_data['test_metrics']['roc_auc'])
    }


# Smoke-test inference using the model trained in the previous step.
print("Testing inference function...")
if not test_model_result:
    print("Skipping inference test (model training failed)")
else:
    # Pick the player from the prepared data (not raw receiving_df), so the
    # lookup inside predict_player_prop is guaranteed to find rows.
    prepared_df = test_model_result['prepared_df']

    # Prefer a player with more than five games; otherwise take the most
    # frequent one.
    player_counts = prepared_df['display_name'].value_counts()
    frequent_players = player_counts[player_counts > 5]
    if len(frequent_players) > 0:
        sample_player = frequent_players.index[0]
    else:
        sample_player = player_counts.index[0]

    print(f"Testing with player: {sample_player}")
    print(f"   Player has {player_counts[sample_player]} games in prepared data")

    # Minimal registry holding just the smoke-test model.
    test_trained_models = {'receiving_YDS_over_50': test_model_result}

    test_prediction = predict_player_prop(
        player_name=sample_player,
        stat='YDS',
        threshold=50,
        trained_models=test_trained_models,
        receiving_df=receiving_df,
        rushing_df=rushing_df,
        passing_df=passing_df,
    )

    if 'error' not in test_prediction:
        print(f"Inference function works!")
        print(f"Test player: {sample_player}")
        print(f"Prediction:")
        for key, value in test_prediction.items():
            # Floats are shown with four decimals, everything else verbatim.
            shown = f"{value:.4f}" if isinstance(value, float) else value
            print(f"      {key}: {shown}")
    else:
        print(f"Inference failed: {test_prediction['error']}")



SECTION 4: MODEL TRAINING FUNCTION
Testing single model training on small subset...

Training: receiving_test - YDS Over 50
Training samples: 1600, Test samples: 400
Train hit rate: 23.56%, Test hit rate: 18.00%

📊 TRAIN | Log Loss: 0.2365 | AUC: 0.9721 | Acc: 0.9244 | ECE: 0.1001
📊 TEST  | Log Loss: 0.4255 | AUC: 0.7328 | Acc: 0.8100 | ECE: 0.0453
✅ Well calibrated: ECE < 0.10
✅ Task validated: AUC > 0.60

🔝 Top 5 Features:
   yds_rolling_8: 4.74
   tgts_rolling_8: 4.50
   yds_season_avg: 4.00
   tgts_rolling_3: 3.90
   position_encoded: 3.57

✅ Single model training works!
   Model type: <class 'xgboost.core.Booster'>
   Test AUC: 0.7328
   Test ECE: 0.0453

SECTION 5: INFERENCE FUNCTION
Testing inference function...
Testing with player: Larry Fitzgerald
   Player has 9 games in prepared data
✅ Inference function works!
   Test player: Larry Fitzgerald
   Prediction:
      player: Larry Fitzgerald
      prop: YDS Over 50
      prob_over: 0.1135
      prob_under: 0.8

In [19]:
# ----------------------------------------------------------------------------
# Section 6 banner
# ----------------------------------------------------------------------------
print("\n" + "=" * 70, "SECTION 6: MULTI-PROP TRAINING CONFIGURATION", "=" * 70, sep="\n")

# Each entry is (category, source frame, stat column, line, extra stat columns).
# The table is expanded per category so the shared frame and stat list are
# written once; every entry still gets its own list object.
PROPS_CONFIG = [
    (category_label, source_frame, stat_name, line, list(extra_stats))
    for category_label, source_frame, extra_stats, lines in (
        ('receiving', receiving_df, ('REC', 'YDS', 'TD', 'TGTS'),
         [('YDS', 50), ('YDS', 65), ('YDS', 75), ('REC', 5)]),
        ('rushing', rushing_df, ('CAR', 'YDS', 'TD'),
         [('YDS', 50), ('YDS', 75)]),
        ('passing', passing_df, ('YDS', 'TD', 'INT'),
         [('YDS', 250), ('TD', 1.5)]),
    )
    for stat_name, line in lines
]

print(f"Configuration loaded")
print(f"   Total props to train: {len(PROPS_CONFIG)}")
print(f"\nProps breakdown:")
for i, entry in enumerate(PROPS_CONFIG, 1):
    print(f"   {i}. {entry[0]} - {entry[2]} Over {entry[3]}")

print(
    "\nReady to start full training loop!",
    "   This will train all models and may take several minutes.",
    "   Proceed to Section 7 to start training.",
    sep="\n",
)


# ============================================================================
print("\n" + "="*70)
print("SECTION 7: FULL MULTI-PROP TRAINING LOOP")
print("="*70)
# The loop below is live, so the banner says what actually happens instead of
# asking the reader to uncomment it.
print("Training every prop in PROPS_CONFIG")
print("="*70)
# ============================================================================

# Maps prop name -> result dict returned by train_prop_model.
trained_models = {}
# One flat metrics row per successfully trained prop.
results_summary = []

# Created once up front instead of on every iteration.
os.makedirs("models", exist_ok=True)

for category, df, stat_col, threshold, additional_stats in PROPS_CONFIG:
    prop_name = f"{category}_{stat_col}_over_{threshold}"
    
    try:
        result = train_prop_model(
            df=df,
            stat_col=stat_col,
            threshold=threshold,
            category_name=category,
            additional_stats=additional_stats
        )
        
        # train_prop_model returns None when it skips a prop.
        if result is not None:
            trained_models[prop_name] = result
            
            result['model'].save_model(f"models/xgb_{prop_name}.json")
            
            results_summary.append({
                'prop': prop_name,
                'category': category,
                'stat': stat_col,
                'threshold': threshold,
                'test_auc': result['test_metrics']['roc_auc'],
                'test_accuracy': result['test_metrics']['accuracy'],
                'test_logloss': result['test_metrics']['log_loss'],
                'test_ece': result['test_metrics']['ece'],
                'train_samples': result['metadata']['train_size'],
                'test_samples': result['metadata']['test_size']
            })
    
    except Exception as e:
        # Best-effort batch: one failing prop is reported (with its exception
        # type) and the loop moves on to the next one.
        print(f"Error training {prop_name}: {type(e).__name__}: {e}")

# ============================================================================
# SECTION 8: SUMMARY AND RESULTS
#
# Builds a DataFrame from results_summary, prints aggregate and per-model
# rankings, and writes the table to models/training_summary_with_ece.csv.
# ============================================================================

print("\n" + "="*70)
print("TRAINING COMPLETE - SUMMARY")
print("="*70)

results_df = pd.DataFrame(results_summary)

if results_df.empty:
    # No prop produced a result, so the frame has no 'test_ece'/'test_auc'
    # columns; sorting or aggregating on them would raise KeyError. Keep the
    # derived names defined so later code can still reference them.
    results_df_by_ece = results_df.copy()
    results_df_by_auc = results_df.copy()
    print("\nTotal models trained: 0")
    print("No models were trained successfully; skipping metrics summary.")
else:
    results_df_by_ece = results_df.sort_values('test_ece')
    results_df_by_auc = results_df.sort_values('test_auc', ascending=False)

    print(f"\nTotal models trained: {len(results_df)}")
    print(f"Models with AUC > 0.60: {(results_df['test_auc'] > 0.60).sum()}")
    print(f"Models with ECE < 0.10: {(results_df['test_ece'] < 0.10).sum()}")
    print(f"Average Test AUC: {results_df['test_auc'].mean():.4f}")
    print(f"Average Test ECE: {results_df['test_ece'].mean():.4f}")

    print("\n" + "="*70)
    print("BEST CALIBRATED MODELS (Lowest ECE)")
    print("="*70)
    print(results_df_by_ece[['prop', 'test_ece', 'test_auc', 'test_accuracy']].head(10).to_string(index=False))

    print("\n" + "="*70)
    print("BEST DISCRIMINATIVE MODELS (Highest AUC)")
    print("="*70)
    print(results_df_by_auc[['prop', 'test_auc', 'test_ece', 'test_accuracy']].head(10).to_string(index=False))

    # Do not rely on an earlier step having created the output directory.
    os.makedirs("models", exist_ok=True)
    results_df.to_csv("models/training_summary_with_ece.csv", index=False)
    print("\n Summary saved to models/training_summary_with_ece.csv")

# ============================================================================
# SECTION 9: EXAMPLE PREDICTIONS
#
# Runs the inference function for two frequently occurring receivers as a
# smoke test of the trained models.
# ============================================================================

print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)

# Use the five most frequent display names so the chosen players are
# guaranteed to exist in the receiving data.
sample_players = receiving_df['display_name'].value_counts().head(5).index.tolist()
print(f"Sample players available: {sample_players}")

for player in sample_players[:2]:
    prediction = predict_player_prop(
        player_name=player,
        stat="YDS",
        threshold=65,
        trained_models=trained_models,
        receiving_df=receiving_df,
        rushing_df=rushing_df,
        passing_df=passing_df
    )
    print(f"\n{player}:")
    # NOTE(review): assumes predict_player_prop returns only JSON-serializable
    # values (plain Python floats/strings, not NumPy scalars) - confirm.
    print(json.dumps(prediction, indent=2))

print("\n" + "="*70)
print("ALL TESTING COMPLETE!")
print("="*70)


# Full training already ran in Section 7 of this cell, so the closing message
# no longer tells the reader to uncomment it.
print("\nAll sections ran successfully!")
print("   Full training ran in Section 7; models were saved under models/.")


SECTION 6: MULTI-PROP TRAINING CONFIGURATION
✅ Configuration loaded
   Total props to train: 8

Props breakdown:
   1. receiving - YDS Over 50
   2. receiving - YDS Over 65
   3. receiving - YDS Over 75
   4. receiving - REC Over 5
   5. rushing - YDS Over 50
   6. rushing - YDS Over 75
   7. passing - YDS Over 250
   8. passing - TD Over 1.5

⚠️  Ready to start full training loop!
   This will train all models and may take several minutes.
   Proceed to Section 7 to start training.

SECTION 7: FULL MULTI-PROP TRAINING LOOP
Uncomment the code below to run full training

Training: receiving - YDS Over 50
Training samples: 17929, Test samples: 4483
Train hit rate: 20.43%, Test hit rate: 20.72%

📊 TRAIN | Log Loss: 0.3128 | AUC: 0.9049 | Acc: 0.8692 | ECE: 0.0441
📊 TEST  | Log Loss: 0.4129 | AUC: 0.8059 | Acc: 0.8099 | ECE: 0.0373
✅ Well calibrated: ECE < 0.10
✅ Task validated: AUC > 0.60

🔝 Top 5 Features:
   yds_rolling_8: 44.41
   tgts_rolling_8: 14.41
   position_e