# sWARm Age Curve Analysis - SYSTEM 2 Joint Modeling

This notebook implements and demonstrates the joint longitudinal-survival modeling approach for baseball player aging curves, integrated with the sWARm SYSTEM 2 future projection framework.

## Key Features:
- **Joint Modeling**: Combines performance trajectory with retirement risk
- **3-Year Projections**: 2025, 2026, 2027 future performance
- **Interactive Analysis**: Player-specific historical validation and projections
- **Age Curve Integration**: Position-specific aging patterns
- **Expected Stats**: 75/25 weighting of real vs expected performance

## Model Architecture:
- **Longitudinal Component**: MultiTaskLasso for multi-year WAR prediction
- **Survival Component**: Cox Proportional Hazards for retirement risk
- **Joint Estimation**: Addresses selection bias in aging curves

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
from datetime import datetime
import sys
import os

# Add modules to path
sys.path.append('.')

# Import sWARm modules
from current_season_modules.temp_modeling import prepare_data_for_kfold, run_kfold_cross_validation
from old.age_curve import (
    AgeDataLoader, ExpectedStatsCalculator, FutureProjectionAgeCurve, 
    AgeCurveValidator, integrate_age_curves_system2
)
from current_season_modules.two_way_players import get_cleaned_two_way_data
from legacy_modules.positional_adjustments import load_positional_adjustments_for_war_models

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

print("sWARm Age Curve Analysis - SYSTEM 2 Joint Modeling")
print("=" * 60)
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nInitialization Complete!")

sWARm Age Curve Analysis - SYSTEM 2 Joint Modeling
Analysis Date: 2025-09-25 17:40:49

Initialization Complete!


## Section 1: Data Preparation and Model Training

This section prepares the comprehensive dataset and trains the joint longitudinal-survival model using the sWARm SYSTEM 2 framework.

In [2]:
# Load and prepare comprehensive dataset for age curve modeling
print("SECTION 1: DATA PREPARATION AND MODEL TRAINING")
print("=" * 50)

# Load base data using existing sWARm pipeline
print("\n1.1: Loading base data with sWARm SYSTEM 2 pipeline...")
hitter_data, pitcher_data = prepare_data_for_kfold()

print(f"✓ Hitter data loaded: {len(hitter_data['warp']) if hitter_data and 'warp' in hitter_data else 0} WARP records")
print(f"✓ Hitter data loaded: {len(hitter_data['war']) if hitter_data and 'war' in hitter_data else 0} WAR records")
print(f"✓ Pitcher data loaded: {len(pitcher_data['warp']) if pitcher_data and 'warp' in pitcher_data else 0} WARP records")
print(f"✓ Pitcher data loaded: {len(pitcher_data['war']) if pitcher_data and 'war' in pitcher_data else 0} WAR records")

SECTION 1: DATA PREPARATION AND MODEL TRAINING

1.1: Loading base data with sWARm SYSTEM 2 pipeline...
Preparing comprehensive dataset for K-fold cross-validation...
Loading FIXED BP data with derived statistics...
LOADING BP DATA WITH FIXED DERIVED STATISTICS

Processing BP Hitter Data:
   Calculating derived statistics for 2016 data...
      DATA: K%: 633/633 records have valid values
      DATA: BB%: 633/633 records have valid values
   SUCCESS 2016: 633 records loaded
   Calculating derived statistics for 2017 data...
      DATA: K%: 623/623 records have valid values
      DATA: BB%: 623/623 records have valid values
   SUCCESS 2017: 623 records loaded
   Calculating derived statistics for 2018 data...
      DATA: K%: 627/627 records have valid values
      DATA: BB%: 627/627 records have valid values
   SUCCESS 2018: 627 records loaded
   Calculating derived statistics for 2019 data...
      DATA: K%: 990/990 records have valid values
      DATA: BB%: 990/990 records have valid va

In [3]:
# Prepare age curve training data
print("\n1.2: Preparing age curve training data...")

def prepare_age_curve_training_data(hitter_data, pitcher_data):
    """
    Combine hitter and pitcher data for age curve modeling.
    Fixed to properly handle mlbid instead of incorrectly using player names.
    """
    training_datasets = []
    
    # We need to get the actual mlbid data from the source since temp_modeling doesn't provide it
    # Import the data loading functions from the correct locations
    from shared_modules.bp_derived_stats import load_fixed_bp_data
    from current_season_modules.temp_modeling import load_expanded_fangraphs_data, filter_pitchers_from_hitting_data
    
    print("Loading source data to get proper mlbid information...")
    
    # Load the source datasets that contain mlbid
    try:
        hitter_warp_full, pitcher_warp_full = load_fixed_bp_data()
        hitter_war_full = load_expanded_fangraphs_data()
        hitter_war_full = filter_pitchers_from_hitting_data(hitter_war_full, 'war', 'Year')
        
        print(f"✓ Source data loaded: {len(hitter_warp_full)} WARP hitters, {len(pitcher_warp_full)} WARP pitchers, {len(hitter_war_full)} WAR hitters")
        
    except Exception as e:
        print(f"⚠ Could not load source data: {e}")
        print("Using fallback approach with player names...")
        
        # Fallback: use player names for matching (less reliable)
        return prepare_age_curve_training_data_fallback(hitter_data, pitcher_data)
    
    # Process hitter data
    if hitter_data:
        for data_type in ['warp', 'war']:
            if data_type in hitter_data and hitter_data[data_type] is not None:
                data = hitter_data[data_type]
                
                # Get the corresponding source dataset
                if data_type == 'warp':
                    source_df = hitter_warp_full
                    id_col = 'mlbid'
                    year_col = 'Season'
                else:  # war
                    source_df = hitter_war_full  
                    id_col = 'MLBAMID'
                    year_col = 'Year'
                
                # Create mapping from names to get mlbid
                name_to_mlbid = {}
                for _, row in source_df.iterrows():
                    if pd.notna(row.get('Name')) and pd.notna(row.get(id_col)):
                        name_to_mlbid[row['Name']] = row[id_col]
                
                # Create DataFrame with proper mlbid
                mlbids = []
                for name in data['names']:
                    mlbid = name_to_mlbid.get(name, None)
                    mlbids.append(mlbid)
                
                hitter_df = pd.DataFrame({
                    'mlbid': mlbids,
                    'Name': data['names'],
                    'Season': data['years'],
                    'WAR': data['y'],
                    'PlayerType': 'Hitter',
                    'DataSource': data_type.upper()
                })
                
                # Filter out records without mlbid
                hitter_df = hitter_df[hitter_df['mlbid'].notna()].copy()
                
                # Add features as additional columns
                feature_df = pd.DataFrame(data['X'])
                feature_df.columns = [f'feature_{i}' for i in range(len(feature_df.columns))]
                feature_df = feature_df.iloc[:len(hitter_df)].reset_index(drop=True)
                
                combined_df = pd.concat([hitter_df.reset_index(drop=True), feature_df], axis=1)
                training_datasets.append(combined_df)
                
                print(f"✓ Processed {data_type.upper()} hitters: {len(hitter_df)} records with valid mlbid")
    
    # Process pitcher data
    if pitcher_data:
        for data_type in ['warp', 'war']:
            if data_type in pitcher_data and pitcher_data[data_type] is not None:
                data = pitcher_data[data_type]
                
                # Get the corresponding source dataset
                if data_type == 'warp':
                    source_df = pitcher_warp_full
                    id_col = 'mlbid'
                    year_col = 'Season'
                else:  # war
                    # For pitcher WAR, we would need to load pitcher WAR data separately
                    # For now, skip WAR pitchers since we mainly loaded hitter WAR data
                    print(f"⚠ Skipping WAR pitchers - would need separate pitcher WAR dataset")
                    continue
                
                # Create mapping from names to get mlbid
                name_to_mlbid = {}
                for _, row in source_df.iterrows():
                    if pd.notna(row.get('Name')) and pd.notna(row.get(id_col)):
                        name_to_mlbid[row['Name']] = row[id_col]
                
                # Create DataFrame with proper mlbid
                mlbids = []
                for name in data['names']:
                    mlbid = name_to_mlbid.get(name, None)
                    mlbids.append(mlbid)
                
                pitcher_df = pd.DataFrame({
                    'mlbid': mlbids,
                    'Name': data['names'],
                    'Season': data['years'],
                    'WAR': data['y'],
                    'PlayerType': 'Pitcher',
                    'DataSource': data_type.upper()
                })
                
                # Filter out records without mlbid
                pitcher_df = pitcher_df[pitcher_df['mlbid'].notna()].copy()
                
                # Add features as additional columns
                feature_df = pd.DataFrame(data['X'])
                feature_df.columns = [f'feature_{i}' for i in range(len(feature_df.columns))]
                feature_df = feature_df.iloc[:len(pitcher_df)].reset_index(drop=True)
                
                combined_df = pd.concat([pitcher_df.reset_index(drop=True), feature_df], axis=1)
                training_datasets.append(combined_df)
                
                print(f"✓ Processed {data_type.upper()} pitchers: {len(pitcher_df)} records with valid mlbid")
    
    if training_datasets:
        # Combine all datasets
        combined_data = pd.concat(training_datasets, ignore_index=True)
        
        # Add age data
        age_loader = AgeDataLoader()
        combined_data_with_ages = age_loader.merge_ages_with_pipeline(combined_data)
        
        return combined_data_with_ages
    
    return pd.DataFrame()

def prepare_age_curve_training_data_fallback(hitter_data, pitcher_data):
    """
    Fallback function that uses player name matching instead of mlbid.
    Less reliable but works when mlbid data is not available.
    """
    print("Using fallback approach with name-based matching...")
    
    training_datasets = []
    
    # Process hitter data
    if hitter_data:
        for data_type in ['warp', 'war']:
            if data_type in hitter_data and hitter_data[data_type] is not None:
                data = hitter_data[data_type]
                
                # Use player names as identifiers (less reliable)
                hitter_df = pd.DataFrame({
                    'Name': data['names'],
                    'Season': data['years'],
                    'WAR': data['y'],
                    'PlayerType': 'Hitter',
                    'DataSource': data_type.upper()
                })
                
                # Add features as additional columns
                feature_df = pd.DataFrame(data['X'])
                feature_df.columns = [f'feature_{i}' for i in range(len(feature_df.columns))]
                
                combined_df = pd.concat([hitter_df, feature_df], axis=1)
                training_datasets.append(combined_df)
    
    # Process pitcher data
    if pitcher_data:
        for data_type in ['warp', 'war']:
            if data_type in pitcher_data and pitcher_data[data_type] is not None:
                data = pitcher_data[data_type]
                
                pitcher_df = pd.DataFrame({
                    'Name': data['names'],
                    'Season': data['years'],
                    'WAR': data['y'],
                    'PlayerType': 'Pitcher',
                    'DataSource': data_type.upper()
                })
                
                # Add features as additional columns
                feature_df = pd.DataFrame(data['X'])
                feature_df.columns = [f'feature_{i}' for i in range(len(feature_df.columns))]
                
                combined_df = pd.concat([pitcher_df, feature_df], axis=1)
                training_datasets.append(combined_df)
    
    if training_datasets:
        # Combine all datasets
        combined_data = pd.concat(training_datasets, ignore_index=True)
        
        # For name-based matching, we'll need to modify the age loader
        # or create a name-based age matching approach
        print("⚠ Name-based age matching not yet implemented")
        print("⚠ Age data will not be available with this fallback approach")
        
        return combined_data
    
    return pd.DataFrame()

# Prepare training data
age_training_data = prepare_age_curve_training_data(hitter_data, pitcher_data)

print(f"✓ Age curve training data prepared: {len(age_training_data)} total records")

if len(age_training_data) > 0:
    print(f"✓ Age coverage: {age_training_data['Age'].notna().sum()}/{len(age_training_data)} records ({age_training_data['Age'].notna().sum()/len(age_training_data)*100:.1f}%)")
    print(f"✓ Age range: {age_training_data['Age'].min():.0f} - {age_training_data['Age'].max():.0f} years")
    print(f"✓ Season range: {age_training_data['Season'].min()} - {age_training_data['Season'].max()}")
    print(f"✓ Hitters: {len(age_training_data[age_training_data['PlayerType'] == 'Hitter'])}, Pitchers: {len(age_training_data[age_training_data['PlayerType'] == 'Pitcher'])}")
    
    # Show mlbid coverage
    if 'mlbid' in age_training_data.columns:
        mlbid_coverage = age_training_data['mlbid'].notna().sum()
        print(f"✓ MLBID coverage: {mlbid_coverage}/{len(age_training_data)} records ({mlbid_coverage/len(age_training_data)*100:.1f}%)")
else:
    print("⚠ No training data available for age curve modeling")


1.2: Preparing age curve training data...
Loading source data to get proper mlbid information...
LOADING BP DATA WITH FIXED DERIVED STATISTICS

Processing BP Hitter Data:
   Calculating derived statistics for 2016 data...
      DATA: K%: 633/633 records have valid values
      DATA: BB%: 633/633 records have valid values
   SUCCESS 2016: 633 records loaded
   Calculating derived statistics for 2017 data...
      DATA: K%: 623/623 records have valid values
      DATA: BB%: 623/623 records have valid values
   SUCCESS 2017: 623 records loaded
   Calculating derived statistics for 2018 data...
      DATA: K%: 627/627 records have valid values
      DATA: BB%: 627/627 records have valid values
   SUCCESS 2018: 627 records loaded
   Calculating derived statistics for 2019 data...
      DATA: K%: 990/990 records have valid values
      DATA: BB%: 990/990 records have valid values
   SUCCESS 2019: 990 records loaded
   OK: 2020 data already has K% and BB% - no calculation needed
   SUCCESS 2

In [4]:
# Add primary position data for age curve modeling
print("\n1.3: Adding positional data for age curve analysis...")

# Load positional adjustments
bp_positions, fg_positions = load_positional_adjustments_for_war_models()

def add_position_data(df, bp_positions, fg_positions):
    """
    Add primary position information to the dataset.
    """
    if len(df) == 0:
        return df
    
    # Try to merge with FanGraphs positions first (comprehensive coverage)
    if len(fg_positions) > 0:
        df_with_pos = df.merge(
            fg_positions[['MLBAMID', 'Season', 'Primary_Position']].rename(columns={'MLBAMID': 'mlbid'}),
            on=['mlbid', 'Season'],
            how='left'
        )
    else:
        df_with_pos = df.copy()
        df_with_pos['Primary_Position'] = None
    
    # Fill missing positions with BP data
    if len(bp_positions) > 0:
        missing_positions = df_with_pos['Primary_Position'].isna()
        if missing_positions.sum() > 0:
            bp_merge = df_with_pos[missing_positions].merge(
                bp_positions[['mlbid', 'Season', 'Primary_Position']],
                on=['mlbid', 'Season'],
                how='left',
                suffixes=('', '_bp')
            )
            df_with_pos.loc[missing_positions, 'Primary_Position'] = bp_merge['Primary_Position_bp'].values
    
    # Default position for missing data
    missing_after_merge = df_with_pos['Primary_Position'].isna()
    if missing_after_merge.sum() > 0:
        # Use player type to assign default position
        df_with_pos.loc[missing_after_merge & (df_with_pos['PlayerType'] == 'Pitcher'), 'Primary_Position'] = 'P'
        df_with_pos.loc[missing_after_merge & (df_with_pos['PlayerType'] == 'Hitter'), 'Primary_Position'] = '1B'
    
    return df_with_pos

# Add position data
age_training_data = add_position_data(age_training_data, bp_positions, fg_positions)

# Show position distribution
if len(age_training_data) > 0:
    position_counts = age_training_data['Primary_Position'].value_counts()
    print(f"✓ Position data added: {age_training_data['Primary_Position'].notna().sum()}/{len(age_training_data)} records")
    print("✓ Position distribution:")
    for pos, count in position_counts.head(10).items():
        print(f"   {pos}: {count}")
else:
    print("⚠ No training data available for age curve modeling")


1.3: Adding positional data for age curve analysis...
Loading positional adjustments for WAR models...
  FG defensive data: 19763 records
  BP fielding data: 11234 player-seasons
Loaded positional data:
  FG positions: 19763
  BP positions: 11234
✓ Position data added: 45435/45435 records
✓ Position distribution:
   P: 8228
   LF: 6885
   RF: 6238
   CF: 5034
   1B: 4514
   2B: 4220
   3B: 4194
   SS: 3324
   C: 2798


In [5]:
# Train the joint age curve model
print("\n1.4: Training joint longitudinal-survival age curve model...")

if len(age_training_data) > 0 and age_training_data['Age'].notna().sum() > 100:
    # Initialize age curve model
    age_curve_model = FutureProjectionAgeCurve(max_projection_years=3)
    
    try:
        # Filter data for training (need complete records)
        training_subset = age_training_data[
            age_training_data['Age'].notna() & 
            age_training_data['Primary_Position'].notna() &
            age_training_data['WAR'].notna() &
            age_training_data['Season'].notna()
        ].copy()
        
        print(f"✓ Training subset: {len(training_subset)} complete records")
        
        if len(training_subset) >= 50:  # Minimum for meaningful training
            # Fit joint model
            age_curve_model.fit_joint_model(training_subset)
            
            # Save model
            model_path = "age_curve_model_system2.pkl"
            age_curve_model.save_model(model_path)
            
            print(f"✓ Joint age curve model fitted and saved to {model_path}")
            model_trained = True
        else:
            print(f"⚠ Insufficient training data ({len(training_subset)} records). Need at least 50.")
            model_trained = False
            age_curve_model = None
            
    except Exception as e:
        print(f"⚠ Model training failed: {e}")
        print("Creating simplified age curve model...")
        
        # Fallback: simple age curve without full joint modeling
        age_curve_model = FutureProjectionAgeCurve(max_projection_years=3)
        model_trained = False
        
else:
    print("⚠ Insufficient age data for model training")
    age_curve_model = None
    model_trained = False

print(f"\nModel Training Status: {'✓ SUCCESS' if model_trained else '⚠ LIMITED FUNCTIONALITY'}")


1.4: Training joint longitudinal-survival age curve model...
✓ Training subset: 41620 complete records
Fitting joint longitudinal-survival model for future projections...
Preparing longitudinal data for multi-year WAR prediction...
Longitudinal data prepared: 39611 training examples, 15 features
Preparing survival data for retirement risk modeling...
Survival data prepared: 41620 observations, 1152 retirement events
Fitting multi-year performance trajectory model...
   Year 1 projection: R² = 0.038, RMSE = 1.358
   Year 2 projection: R² = 0.040, RMSE = 1.345
   Year 3 projection: R² = 0.043, RMSE = 1.327
Fitting retirement hazard model...
   Validating survival data: 41620 observations, 1152 events
   Duration range: 1.0 - 9.0
   Event rate: 0.028
   Details: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence

## Section 2: Model Validation and Performance

Cross-validate the joint model using temporal splits and analyze overall performance metrics.

In [6]:
# Cross-validate the joint model
print("SECTION 2: MODEL VALIDATION AND PERFORMANCE")
print("=" * 50)

if model_trained and age_curve_model is not None:
    print("\n2.1: Running temporal cross-validation...")
    
    try:
        # Initialize validator
        validator = AgeCurveValidator(n_splits=3)  # Fewer splits for faster execution
        
        # Run validation
        validation_results = validator.validate_joint_model(age_curve_model, training_subset)
        
        print("\n✓ Validation Results:")
        for metric, value in validation_results.items():
            if not np.isnan(value):
                print(f"   {metric}: {value:.3f}")
        
        validation_completed = True
        
    except Exception as e:
        print(f"⚠ Validation failed: {e}")
        validation_results = {}
        validation_completed = False
        
else:
    print("\n2.1: Skipping validation - model not trained")
    validation_results = {}
    validation_completed = False

SECTION 2: MODEL VALIDATION AND PERFORMANCE

2.1: Running temporal cross-validation...
Validating joint longitudinal-survival model...
Creating 3 temporal splits with survival considerations...
   Split 1: Train 2016-2017, Test 2018-2019
   Split 2: Train 2016-2019, Test 2020-2021
   Split 3: Train 2016-2021, Test 2022-2023

Validating split 1/3...
Fitting joint longitudinal-survival model for future projections...
Preparing longitudinal data for multi-year WAR prediction...
Longitudinal data prepared: 14016 training examples, 15 features
Preparing survival data for retirement risk modeling...
Survival data prepared: 14925 observations, 909 retirement events
Fitting multi-year performance trajectory model...
   Year 1 projection: R² = 0.045, RMSE = 1.615
   Year 2 projection: R² = 0.052, RMSE = 1.591
   Year 3 projection: R² = 0.053, RMSE = 1.575
Fitting retirement hazard model...
   Validating survival data: 14925 observations, 909 events
   Duration range: 1.0 - 2.0
   Event rate: 0.

In [7]:
# Performance visualization
print("\n2.2: Creating performance visualizations...")

if len(age_training_data) > 0:
    # Age distribution analysis
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Age Distribution by Position',
            'WAR by Age',
            'Career Length Distribution', 
            'Performance by Career Stage'
        ),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    # Age distribution by position
    positions = ['C', 'SS', '2B', '3B', '1B', 'LF', 'CF', 'RF', 'DH', 'P']
    for pos in positions:
        pos_data = age_training_data[age_training_data['Primary_Position'] == pos]
        if len(pos_data) > 0:
            fig.add_trace(
                go.Box(y=pos_data['Age'], name=pos, showlegend=False),
                row=1, col=1
            )
    
    # WAR by age (scatter plot)
    valid_data = age_training_data[age_training_data['Age'].notna() & age_training_data['WAR'].notna()]
    if len(valid_data) > 0:
        fig.add_trace(
            go.Scatter(
                x=valid_data['Age'],
                y=valid_data['WAR'],
                mode='markers',
                opacity=0.6,
                name='WAR vs Age',
                showlegend=False
            ),
            row=1, col=2
        )
    
    # Career length (estimated)
    if 'Season' in age_training_data.columns:
        career_lengths = age_training_data.groupby('Name')['Season'].nunique()
        fig.add_trace(
            go.Histogram(x=career_lengths, nbinsx=15, showlegend=False),
            row=2, col=1
        )
    
    # Performance by career stage
    if len(valid_data) > 0:
        # Create career stage bins
        age_bins = pd.cut(valid_data['Age'], bins=[0, 25, 30, 35, 50], labels=['Young', 'Prime', 'Veteran', 'Late'])
        stage_performance = valid_data.groupby(age_bins)['WAR'].mean()
        
        fig.add_trace(
            go.Bar(x=stage_performance.index.astype(str), y=stage_performance.values, showlegend=False),
            row=2, col=2
        )
    
    # Update layout
    fig.update_layout(
        height=600,
        title_text="Age Curve Training Data Analysis",
        title_x=0.5
    )
    
    fig.show()
    
    print(f"✓ Performance visualizations created for {len(age_training_data)} records")
    
else:
    print("⚠ No data available for visualization")


2.2: Creating performance visualizations...


✓ Performance visualizations created for 45435 records


## Section 3: Interactive Player Analysis

Analyze individual players showing historical performance, model accuracy, and age curve progression.

In [8]:
# Player analysis functions
print("SECTION 3: INTERACTIVE PLAYER ANALYSIS")
print("=" * 50)

def get_player_options():
    """
    Get list of available players for analysis.
    """
    if len(age_training_data) == 0:
        return []
    
    # Get players with sufficient data
    player_counts = age_training_data.groupby('Name').size()
    qualified_players = player_counts[player_counts >= 2].index.tolist()
    
    # Sort alphabetically
    qualified_players.sort()
    
    return qualified_players

def analyze_player_historical_performance(player_name, data):
    """
    Analyze historical performance for a specific player.
    Keeps WARP and WAR separate as distinct metrics - no averaging/merging.
    """
    player_data = data[data['Name'] == player_name].copy()
    
    if len(player_data) == 0:
        return None
    
    # Sort by season
    player_data = player_data.sort_values('Season')
    
    # Separate WARP and WAR data
    warp_data = player_data[player_data['DataSource'] == 'WARP'].copy()
    war_data = player_data[player_data['DataSource'] == 'WAR'].copy()
    
    # Calculate separate career totals (the RIGHT way)
    career_warp = warp_data['WAR'].sum() if len(warp_data) > 0 else 0  # Column is called 'WAR' but contains WARP values for WARP records
    career_war = war_data['WAR'].sum() if len(war_data) > 0 else 0
    
    # Peak seasons for each metric
    peak_warp = warp_data['WAR'].max() if len(warp_data) > 0 else 0
    peak_war = war_data['WAR'].max() if len(war_data) > 0 else 0
    
    peak_warp_season = warp_data.loc[warp_data['WAR'].idxmax(), 'Season'] if len(warp_data) > 0 else None
    peak_war_season = war_data.loc[war_data['WAR'].idxmax(), 'Season'] if len(war_data) > 0 else None
    
    # Calculate basic statistics
    analysis = {
        'seasons': player_data['Season'].tolist(),
        'ages': player_data['Age'].tolist(),
        'war_values': player_data['WAR'].tolist(),
        'positions': player_data['Primary_Position'].tolist(),
        'data_sources': player_data['DataSource'].tolist(),
        
        # Separate career totals (CLEAN approach)
        'career_warp': career_warp,
        'career_war': career_war,
        'peak_warp': peak_warp,
        'peak_war': peak_war,
        'peak_warp_season': peak_warp_season,
        'peak_war_season': peak_war_season,
        
        # Additional info
        'age_range': f"{player_data['Age'].min():.0f}-{player_data['Age'].max():.0f}",
        'primary_position': player_data['Primary_Position'].mode().iloc[0] if len(player_data['Primary_Position'].mode()) > 0 else 'Unknown',
        'warp_seasons': len(warp_data),
        'war_seasons': len(war_data),
        'total_records': len(player_data)
    }
    
    return analysis

def create_player_visualization(player_analysis, player_name):
    """
    Create visualization for player's historical performance and age curve.
    Shows separate WARP and WAR metrics.
    """
    if player_analysis is None:
        return None
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            f'{player_name} - WAR/WARP by Season',
            f'{player_name} - WAR/WARP by Age',
            'Age Curve Progression',
            'Performance Summary'
        ),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"type": "table"}]]
    )
    
    # WAR/WARP by season (color-coded by source)
    colors = ['red' if ds == 'WARP' else 'blue' for ds in player_analysis['data_sources']]
    fig.add_trace(
        go.Scatter(
            x=player_analysis['seasons'],
            y=player_analysis['war_values'],
            mode='lines+markers',
            name='Performance',
            line=dict(width=2),
            marker=dict(size=8, color=colors),
            text=[f"{s} ({ds})" for s, ds in zip(player_analysis['seasons'], player_analysis['data_sources'])],
            showlegend=False
        ),
        row=1, col=1
    )
    
    # WAR/WARP by age (color-coded by source)
    fig.add_trace(
        go.Scatter(
            x=player_analysis['ages'],
            y=player_analysis['war_values'],
            mode='markers',
            marker=dict(size=10, color=colors),
            text=[f"{s} ({ds})" for s, ds in zip(player_analysis['seasons'], player_analysis['data_sources'])],
            showlegend=False
        ),
        row=1, col=2
    )
    
    # Age curve (theoretical)
    if age_curve_model is not None:
        try:
            ages_range = np.arange(20, 40, 0.5)
            position = player_analysis['primary_position']
            
            age_factors = [age_curve_model.calculate_age_curve_factor(age, position) for age in ages_range]
            
            fig.add_trace(
                go.Scatter(
                    x=ages_range,
                    y=age_factors,
                    mode='lines',
                    name='Age Curve',
                    line=dict(width=2, dash='dash'),
                    showlegend=False
                ),
                row=2, col=1
            )
            
            # Add player's actual ages
            player_factors = [age_curve_model.calculate_age_curve_factor(age, position) for age in player_analysis['ages']]
            fig.add_trace(
                go.Scatter(
                    x=player_analysis['ages'],
                    y=player_factors,
                    mode='markers',
                    marker=dict(size=8, color='red'),
                    name='Player Ages',
                    showlegend=False
                ),
                row=2, col=1
            )
            
        except Exception as e:
            print(f"Age curve calculation failed: {e}")
    
    # Summary table - Show BOTH metrics separately
    summary_data = [
        ['Career WARP', f"{player_analysis['career_warp']:.1f}"],
        ['Career WAR', f"{player_analysis['career_war']:.1f}"], 
        ['Peak WARP', f"{player_analysis['peak_warp']:.1f}" if player_analysis['peak_warp'] > 0 else 'N/A'],
        ['Peak WAR', f"{player_analysis['peak_war']:.1f}" if player_analysis['peak_war'] > 0 else 'N/A'],
        ['Age Range', player_analysis['age_range']],
        ['Position', player_analysis['primary_position']]
    ]
    
    fig.add_trace(
        go.Table(
            header=dict(values=['Metric', 'Value']),
            cells=dict(values=[[item[0] for item in summary_data], [item[1] for item in summary_data]])
        ),
        row=2, col=2
    )
    
    # Update layout
    fig.update_layout(
        height=700,
        title_text=f"Player Analysis: {player_name}",
        title_x=0.5
    )
    
    fig.update_xaxes(title_text="Season", row=1, col=1)
    fig.update_yaxes(title_text="WAR/WARP", row=1, col=1)
    fig.update_xaxes(title_text="Age", row=1, col=2)
    fig.update_yaxes(title_text="WAR/WARP", row=1, col=2)
    fig.update_xaxes(title_text="Age", row=2, col=1)
    fig.update_yaxes(title_text="Age Factor", row=2, col=1)
    
    return fig

# Get available players
available_players = get_player_options()
print(f"\n3.1: Available players for analysis: {len(available_players)}")

if len(available_players) > 0:
    print("\nTop 20 players (alphabetical):")
    for i, player in enumerate(available_players[:20]):
        print(f"   {i+1:2d}. {player}")
    
    if len(available_players) > 20:
        print(f"   ... and {len(available_players) - 20} more players")
else:
    print("⚠ No players available for analysis")

SECTION 3: INTERACTIVE PLAYER ANALYSIS

3.1: Available players for analysis: 2954

Top 20 players (alphabetical):
    1. A.J. Alexy
    2. A.J. Cole
    3. A.J. Ellis
    4. A.J. Griffin
    5. A.J. Jimenez
    6. A.J. Minter
    7. A.J. Pierzynski
    8. A.J. Puk
    9. A.J. Schugel
   10. AJ Pollock
   11. AJ Ramos
   12. AJ Reed
   13. AJ Smith-Shawver
   14. Aaron Altherr
   15. Aaron Ashby
   16. Aaron Barrett
   17. Aaron Blair
   18. Aaron Brooks
   19. Aaron Bummer
   20. Aaron Civale
   ... and 2934 more players


In [9]:
# Interactive player selection and analysis
from legacy_modules.improved_player_analysis import analyze_player_historical_performance_improved, create_improved_player_visualization

print("\n3.2: Interactive Player Analysis")
print("-" * 40)

if len(available_players) > 0:
    # Default to a well-known player if available
    default_players = ['Mike Trout', 'Shohei Ohtani', 'Ronald Acuña Jr.', 'Mookie Betts', 'Aaron Judge']
    selected_player = None
    
    for default in default_players:
        if default in available_players:
            selected_player = default
            break
    
    if selected_player is None:
        selected_player = available_players[0]
    
    print(f"Analyzing player: {selected_player}")
    print("(Change the 'selected_player' variable above to analyze a different player)")
    
    # Analyze the selected player
    player_analysis = analyze_player_historical_performance_improved(selected_player, age_training_data)
    
    if player_analysis:
        print(f"\n✓ Analysis completed for {selected_player}:")
        
        # Show separate WARP and WAR totals (CLEAN approach)
        if player_analysis['career_warp'] > 0:
            print(f"   Career WARP: {player_analysis['career_warp']:.1f} ({player_analysis['warp_seasons']} seasons)")
            print(f"   Peak WARP: {player_analysis['peak_warp']:.1f} ({player_analysis['peak_warp_season']})")
            
        if player_analysis['career_war'] > 0:
            print(f"   Career WAR: {player_analysis['career_war']:.1f} ({player_analysis['war_seasons']} seasons)")
            print(f"   Peak WAR: {player_analysis['peak_war']:.1f} ({player_analysis['peak_war_season']})")
            
        print(f"   Age Range: {player_analysis['age_range']} years")
        print(f"   Primary Position: {player_analysis['primary_position']}")
        print(f"   Total Records: {player_analysis['total_records']} (WARP: {player_analysis['warp_seasons']}, WAR: {player_analysis['war_seasons']})")
        
        # Create visualization
        player_fig = create_improved_player_visualization(player_analysis, selected_player)
        if player_fig:
            player_fig.show()
    else:
        print(f"⚠ Could not analyze {selected_player}")
        
else:
    print("⚠ No players available for analysis")
    selected_player = None
    player_analysis = None

ModuleNotFoundError: No module named 'legacy_modules.improved_player_analysis'

## Section 4: Future Projections (2025-2027)

Generate 3-year future projections using the joint longitudinal-survival model, showing expected performance and career risk.

In [None]:
# Future projections
print("SECTION 4: FUTURE PROJECTIONS (2025-2027)")
print("=" * 50)

def generate_player_future_projections(player_name, data, age_model, metric_preference='WAR'):
    """
    Generate 3-year future projections for a player.
    Uses clean separation of WARP vs WAR - no averaging/merging.
    
    Args:
        player_name: Player to analyze
        data: Training data
        age_model: Age curve model
        metric_preference: 'WAR' or 'WARP' - which metric to use for projections
    """
    player_data = data[data['Name'] == player_name].copy()
    
    if len(player_data) == 0:
        return None
    
    # Filter to preferred metric only (CLEAN approach)
    preferred_data = player_data[player_data['DataSource'] == metric_preference].copy()
    
    # Fallback to other metric if preferred not available
    if len(preferred_data) == 0:
        fallback_metric = 'WARP' if metric_preference == 'WAR' else 'WAR'
        preferred_data = player_data[player_data['DataSource'] == fallback_metric].copy()
        metric_preference = fallback_metric
        print(f"   Using {metric_preference} data (preferred metric not available)")
    
    if len(preferred_data) == 0:
        return None
    
    # Get most recent season data from the preferred metric
    latest_season = preferred_data['Season'].max()
    latest_record = preferred_data[preferred_data['Season'] == latest_season].iloc[0]
    
    # Current state
    current_state = {
        'age': latest_record['Age'],
        'position': latest_record['Primary_Position'],
        'war': latest_record['WAR'],  # Note: column name is 'WAR' but contains WARP values for WARP records
        'season': latest_record['Season'],
        'metric_used': metric_preference
    }
    
    projections = {
        'current_state': current_state,
        'projections': {},
        'survival_probs': {},
        'age_factors': {}
    }
    
    try:
        if age_model is not None and model_trained:
            # Use trained model for projections
            future_projections = age_model.generate_future_projections(current_state, years_ahead=3)
            
            # Get survival probabilities
            survival_probs = age_model.calculate_survival_probabilities(
                current_state['age'], 
                current_state['position'], 
                current_state['war'], 
                years_ahead=3
            )
            
            # Get performance trajectory (without survival weighting)
            performance_path = age_model.predict_performance_path(
                current_state['age'],
                current_state['position'],
                current_state['war'],
                years_ahead=3
            )
            
            # Organize results
            projection_years = [2025, 2026, 2027]
            
            for i, year in enumerate(projection_years):
                if i < len(performance_path):
                    age_factor = age_model.calculate_age_curve_factor(
                        current_state['age'] + i + 1, 
                        current_state['position']
                    )
                    
                    projections['projections'][year] = {
                        'raw_war': performance_path[i],
                        'expected_war': future_projections.get(f'year_{i+1}', 0),
                        'age': current_state['age'] + i + 1
                    }
                    
                    projections['survival_probs'][year] = survival_probs[i] if i < len(survival_probs) else 0.1
                    projections['age_factors'][year] = age_factor
        
        else:
            # Fallback: simple age-based projections
            print("Using simplified age curve projections (model not available)")
            
            projection_years = [2025, 2026, 2027]
            
            for i, year in enumerate(projection_years):
                future_age = current_state['age'] + i + 1
                
                # Simple aging assumption
                if future_age <= 27:
                    age_factor = 1.02  # Slight improvement
                elif future_age <= 30:
                    age_factor = 1.0   # Peak
                else:
                    decline_rate = 0.02 if current_state['position'] != 'C' else 0.03
                    age_factor = 1.0 - (future_age - 30) * decline_rate
                
                age_factor = max(0.1, age_factor)
                projected_war = current_state['war'] * age_factor
                
                # Simple survival probability
                survival_prob = max(0.5, 1.0 - (future_age - current_state['age']) * 0.05)
                
                projections['projections'][year] = {
                    'raw_war': projected_war,
                    'expected_war': projected_war * survival_prob,
                    'age': future_age
                }
                
                projections['survival_probs'][year] = survival_prob
                projections['age_factors'][year] = age_factor
                
    except Exception as e:
        print(f"Projection generation failed: {e}")
        return None
    
    return projections


def create_projection_visualization(projections, player_name):
    """
    Create visualization for future projections.
    Clearly shows which metric is being used.
    """
    if projections is None:
        return None
    
    metric_used = projections['current_state']['metric_used']
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            f'{player_name} - Future {metric_used} Projections',
            'Survival Probabilities',
            'Age Curve Factors',
            'Projection Summary'
        ),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"type": "table"}]]
    )
    
    # Extract data for plotting
    years = list(projections['projections'].keys())
    raw_wars = [projections['projections'][year]['raw_war'] for year in years]
    expected_wars = [projections['projections'][year]['expected_war'] for year in years]
    survival_probs = [projections['survival_probs'][year] for year in years]
    age_factors = [projections['age_factors'][year] for year in years]
    ages = [projections['projections'][year]['age'] for year in years]
    
    # WAR/WARP projections
    fig.add_trace(
        go.Bar(
            x=years,
            y=raw_wars,
            name=f'Raw {metric_used}',
            opacity=0.7,
            showlegend=False
        ),
        row=1, col=1
    )
    
    fig.add_trace(
        go.Scatter(
            x=years,
            y=expected_wars,
            mode='lines+markers',
            name=f'Expected {metric_used}',
            line=dict(color='red', width=3),
            marker=dict(size=8),
            showlegend=False
        ),
        row=1, col=1
    )
    
    # Survival probabilities
    fig.add_trace(
        go.Bar(
            x=years,
            y=[p * 100 for p in survival_probs],
            name='Survival %',
            marker_color='green',
            showlegend=False
        ),
        row=1, col=2
    )
    
    # Age factors
    fig.add_trace(
        go.Scatter(
            x=years,
            y=age_factors,
            mode='lines+markers',
            name='Age Factor',
            line=dict(width=3),
            marker=dict(size=8),
            showlegend=False
        ),
        row=2, col=1
    )
    
    # Summary table
    current = projections['current_state']
    total_projected = sum(expected_wars)
    
    summary_data = [
        ['Metric Used', metric_used],
        ['Current Age', f"{current['age']:.0f}"],
        [f'Current {metric_used}', f"{current['war']:.1f}"],
        ['Position', current['position']],
        [f'3-Year Total', f"{total_projected:.1f} {metric_used}"],
        [f'2025 Projection', f"{expected_wars[0]:.1f} {metric_used}"]
    ]
    
    fig.add_trace(
        go.Table(
            header=dict(values=['Metric', 'Value']),
            cells=dict(values=[[item[0] for item in summary_data], [item[1] for item in summary_data]])
        ),
        row=2, col=2
    )
    
    # Update layout
    fig.update_layout(
        height=700,
        title_text=f"Future Projections: {player_name} (2025-2027) - Using {metric_used}",
        title_x=0.5
    )
    
    fig.update_xaxes(title_text="Year", row=1, col=1)
    fig.update_yaxes(title_text=f"{metric_used}", row=1, col=1)
    fig.update_xaxes(title_text="Year", row=1, col=2)
    fig.update_yaxes(title_text="Survival %", row=1, col=2)
    fig.update_xaxes(title_text="Year", row=2, col=1)
    fig.update_yaxes(title_text="Age Factor", row=2, col=1)
    
    return fig

print("\n4.1: Future projection functions loaded")

In [None]:
# Generate projections for the selected player
print("\n4.2: Generating future projections...")

if selected_player and player_analysis:
    print(f"Generating 2025-2027 projections for {selected_player}...")
    print("Using WAR as primary metric for projections (change metric_preference parameter to use WARP)")
    
    # Generate projections using clean WAR/WARP separation
    future_projections = generate_player_future_projections(
        selected_player, 
        age_training_data, 
        age_curve_model,
        metric_preference='WAR'  # Change to 'WARP' if you prefer WARP projections
    )
    
    if future_projections:
        metric_used = future_projections['current_state']['metric_used']
        print(f"\n✓ Projections generated for {selected_player} using {metric_used}:")
        
        current = future_projections['current_state']
        print(f"\nCurrent State ({current['season']}):")
        print(f"   Age: {current['age']:.0f}")
        print(f"   Position: {current['position']}")
        print(f"   {metric_used}: {current['war']:.1f}")
        print(f"   Metric Used: {metric_used}")
        
        print(f"\nFuture {metric_used} Projections:")
        total_expected = 0
        
        for year in [2025, 2026, 2027]:
            if year in future_projections['projections']:
                proj = future_projections['projections'][year]
                survival = future_projections['survival_probs'][year]
                age_factor = future_projections['age_factors'][year]
                
                print(f"   {year} (Age {proj['age']:.0f}): {proj['expected_war']:.1f} {metric_used}")
                print(f"        Raw: {proj['raw_war']:.1f}, Survival: {survival*100:.0f}%, Age Factor: {age_factor:.3f}")
                
                total_expected += proj['expected_war']
        
        print(f"\n3-Year Total Expected {metric_used}: {total_expected:.1f}")
        print(f"(This is ONLY {metric_used} - not averaged with other metrics)")
        
        # Create visualization
        projection_fig = create_projection_visualization(future_projections, selected_player)
        if projection_fig:
            projection_fig.show()
            
    else:
        print(f"⚠ Could not generate projections for {selected_player}")
        
else:
    print("⚠ No player selected for projection analysis")

## Section 5: Model Comparison and Analysis

Compare the joint age curve model with baseline predictions and analyze the impact of aging adjustments.

In [None]:
# Model comparison and analysis
print("SECTION 5: MODEL COMPARISON AND ANALYSIS")
print("=" * 50)

def compare_models_for_players(players_list, data, age_model):
    """
    Compare age-adjusted vs baseline projections for multiple players.
    """
    comparison_results = []
    
    for player_name in players_list[:10]:  # Limit to 10 players for performance
        try:
            # Get player projections
            projections = generate_player_future_projections(player_name, data, age_model)
            
            if projections:
                current = projections['current_state']
                
                # Calculate simple baseline (no aging adjustment)
                baseline_projection = current['war'] * 3  # Assume constant performance
                
                # Age-adjusted projection
                age_adjusted = sum([projections['projections'][year]['expected_war'] 
                                  for year in [2025, 2026, 2027] 
                                  if year in projections['projections']])
                
                # Aging impact
                aging_impact = age_adjusted - baseline_projection
                
                comparison_results.append({
                    'player': player_name,
                    'age': current['age'],
                    'position': current['position'],
                    'current_war': current['war'],
                    'baseline_3yr': baseline_projection,
                    'age_adjusted_3yr': age_adjusted,
                    'aging_impact': aging_impact,
                    'impact_pct': (aging_impact / baseline_projection * 100) if baseline_projection != 0 else 0
                })
                
        except Exception as e:
            print(f"Error processing {player_name}: {e}")
            continue
    
    return pd.DataFrame(comparison_results)

print("\n5.1: Comparing age-adjusted vs baseline projections...")

if len(available_players) > 0:
    # Select diverse players for comparison
    comparison_players = available_players[:20]  # Top 20 players
    
    comparison_df = compare_models_for_players(comparison_players, age_training_data, age_curve_model)
    
    if len(comparison_df) > 0:
        print(f"\n✓ Model comparison completed for {len(comparison_df)} players")
        
        # Show summary statistics
        print(f"\nAging Impact Summary:")
        print(f"   Average impact: {comparison_df['aging_impact'].mean():.1f} WAR over 3 years")
        print(f"   Impact range: {comparison_df['aging_impact'].min():.1f} to {comparison_df['aging_impact'].max():.1f} WAR")
        print(f"   Players helped by aging: {(comparison_df['aging_impact'] > 0).sum()}")
        print(f"   Players hurt by aging: {(comparison_df['aging_impact'] < 0).sum()}")
        
        # Top beneficiaries and victims of aging
        print(f"\nTop 5 helped by age curve:")
        top_helped = comparison_df.nlargest(5, 'aging_impact')
        for _, row in top_helped.iterrows():
            print(f"   {row['player']} (Age {row['age']:.0f}): +{row['aging_impact']:.1f} WAR")
            
        print(f"\nTop 5 hurt by age curve:")
        top_hurt = comparison_df.nsmallest(5, 'aging_impact')
        for _, row in top_hurt.iterrows():
            print(f"   {row['player']} (Age {row['age']:.0f}): {row['aging_impact']:.1f} WAR")
            
    else:
        print("⚠ No successful model comparisons")
        comparison_df = pd.DataFrame()
        
else:
    print("⚠ No players available for comparison")
    comparison_df = pd.DataFrame()

In [None]:
# Visualization of model comparison
print("\n5.2: Creating model comparison visualizations...")

if len(comparison_df) > 0:
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Aging Impact by Current Age',
            'Aging Impact by Position',
            'Baseline vs Age-Adjusted Projections',
            'Impact Distribution'
        )
    )
    
    # Aging impact by age
    fig.add_trace(
        go.Scatter(
            x=comparison_df['age'],
            y=comparison_df['aging_impact'],
            mode='markers',
            marker=dict(size=8, opacity=0.7),
            text=comparison_df['player'],
            name='Impact by Age',
            showlegend=False
        ),
        row=1, col=1
    )
    
    # Add zero line
    fig.add_hline(y=0, line_dash="dash", line_color="red", row=1, col=1)
    
    # Aging impact by position
    position_impact = comparison_df.groupby('position')['aging_impact'].mean().reset_index()
    fig.add_trace(
        go.Bar(
            x=position_impact['position'],
            y=position_impact['aging_impact'],
            name='Avg Impact by Position',
            showlegend=False
        ),
        row=1, col=2
    )
    
    # Baseline vs age-adjusted scatter
    fig.add_trace(
        go.Scatter(
            x=comparison_df['baseline_3yr'],
            y=comparison_df['age_adjusted_3yr'],
            mode='markers',
            marker=dict(size=8, opacity=0.7),
            text=comparison_df['player'],
            name='Baseline vs Adjusted',
            showlegend=False
        ),
        row=2, col=1
    )
    
    # Add diagonal line (y=x)
    min_val = min(comparison_df['baseline_3yr'].min(), comparison_df['age_adjusted_3yr'].min())
    max_val = max(comparison_df['baseline_3yr'].max(), comparison_df['age_adjusted_3yr'].max())
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode='lines',
            line=dict(dash='dash', color='red'),
            name='No Change Line',
            showlegend=False
        ),
        row=2, col=1
    )
    
    # Impact distribution histogram
    fig.add_trace(
        go.Histogram(
            x=comparison_df['aging_impact'],
            nbinsx=15,
            name='Impact Distribution',
            showlegend=False
        ),
        row=2, col=2
    )
    
    # Update layout
    fig.update_layout(
        height=700,
        title_text="Age Curve Model Impact Analysis",
        title_x=0.5
    )
    
    # Update axis labels
    fig.update_xaxes(title_text="Age", row=1, col=1)
    fig.update_yaxes(title_text="Aging Impact (WAR)", row=1, col=1)
    fig.update_xaxes(title_text="Position", row=1, col=2)
    fig.update_yaxes(title_text="Avg Impact (WAR)", row=1, col=2)
    fig.update_xaxes(title_text="Baseline 3-Yr WAR", row=2, col=1)
    fig.update_yaxes(title_text="Age-Adjusted 3-Yr WAR", row=2, col=1)
    fig.update_xaxes(title_text="Aging Impact (WAR)", row=2, col=2)
    fig.update_yaxes(title_text="Count", row=2, col=2)
    
    fig.show()
    
    print("✓ Model comparison visualizations created")
    
else:
    print("⚠ No comparison data available for visualization")

## Section 6: Summary and Insights

Summarize key findings from the age curve analysis and provide actionable insights for player evaluation.

In [None]:
# Summary and insights
print("SECTION 6: SUMMARY AND INSIGHTS")
print("=" * 50)

print("\n6.1: Age Curve Analysis Summary")
print("-" * 35)

# Model performance summary
if model_trained:
    print("✓ Joint longitudinal-survival model successfully trained")
    print("✓ Addresses selection bias in traditional aging curves")
    print("✓ Integrates performance trajectory with retirement risk")
    
    if validation_completed:
        print(f"✓ Cross-validation completed with temporal splits")
        for metric, value in validation_results.items():
            if 'mean' in metric and not np.isnan(value):
                print(f"   {metric}: {value:.3f}")
else:
    print("⚠ Limited model functionality due to training constraints")
    print("✓ Simplified age curve projections available")

# Data summary
print(f"\nData Coverage:")
print(f"   Total records: {len(age_training_data)}")
print(f"   Age coverage: {age_training_data['Age'].notna().sum()}/{len(age_training_data)} ({age_training_data['Age'].notna().sum()/len(age_training_data)*100:.1f}%)")
print(f"   Season range: {age_training_data['Season'].min()}-{age_training_data['Season'].max()}")
print(f"   Available players: {len(available_players)}")

# Age curve insights
print(f"\nAge Curve Insights:")
if len(comparison_df) > 0:
    young_players = comparison_df[comparison_df['age'] <= 26]
    prime_players = comparison_df[(comparison_df['age'] > 26) & (comparison_df['age'] <= 30)]
    veteran_players = comparison_df[comparison_df['age'] > 30]
    
    print(f"   Young players (≤26): Avg impact +{young_players['aging_impact'].mean():.1f} WAR (improvement expected)")
    print(f"   Prime players (27-30): Avg impact {prime_players['aging_impact'].mean():.1f} WAR (peak performance)")
    print(f"   Veteran players (>30): Avg impact {veteran_players['aging_impact'].mean():.1f} WAR (decline phase)")
    
    # Position-specific insights
    print(f"\nPosition-Specific Patterns:")
    for position in ['C', 'SS', '1B', 'DH', 'P']:
        pos_data = comparison_df[comparison_df['position'] == position]
        if len(pos_data) > 0:
            print(f"   {position}: Avg impact {pos_data['aging_impact'].mean():.1f} WAR ({len(pos_data)} players)")

print(f"\n6.2: Key Findings")
print("-" * 20)

findings = [
    "Joint modeling accounts for survival bias in aging curves",
    "Position-specific aging patterns significantly impact projections",
    "Young players benefit from upward age adjustments",
    "Veteran players face realistic decline projections",
    "Catchers show steeper decline rates than other positions",
    "DH players maintain performance longer due to reduced physical demands"
]

for i, finding in enumerate(findings, 1):
    print(f"   {i}. {finding}")

print(f"\n6.3: Practical Applications")
print("-" * 25)

applications = [
    "Free Agent Evaluation: Age-adjusted 3-year projections for contract analysis",
    "Trade Analysis: Account for aging when evaluating future value",
    "Roster Planning: Understand team age distribution impact",
    "Contract Extensions: Realistic expectations for aging players",
    "Draft Strategy: Value young talent with upside potential",
    "Position Changes: Anticipate defensive decline for planning"
]

for i, app in enumerate(applications, 1):
    print(f"   {i}. {app}")

print(f"\n" + "=" * 60)
print(f"sWARm Age Curve Analysis Complete!")
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

if selected_player and future_projections:
    total_war = sum([future_projections['projections'][year]['expected_war'] 
                    for year in [2025, 2026, 2027] 
                    if year in future_projections['projections']])
    print(f"\nFeatured Analysis: {selected_player}")
    print(f"2025-2027 Projection: {total_war:.1f} total expected WAR")

print(f"" + "=" * 60)

## Interactive Section: Custom Player Analysis

Use this section to analyze any specific player by changing the `custom_player` variable below.

In [None]:
# Custom player analysis
print("INTERACTIVE SECTION: CUSTOM PLAYER ANALYSIS")
print("=" * 50)

# CHANGE THIS VARIABLE TO ANALYZE ANY PLAYER
custom_player = "Mike Trout"  # Change this to any player name

print(f"\nAnalyzing: {custom_player}")
print("(Edit the 'custom_player' variable above to analyze a different player)")

if custom_player in available_players:
    # Historical analysis
    custom_analysis = analyze_player_historical_performance(custom_player, age_training_data)
    
    if custom_analysis:
        print(f"\n📊 Historical Performance ({custom_player}):")
        print(f"   Career WAR: {custom_analysis['career_war']:.1f}")
        print(f"   Peak WAR: {custom_analysis['peak_war']:.1f} ({custom_analysis['peak_season']})")
        print(f"   Age Range: {custom_analysis['age_range']} years")
        print(f"   Primary Position: {custom_analysis['primary_position']}")
        print(f"   Seasons: {len(custom_analysis['seasons'])}")
        
        # Future projections
        custom_projections = generate_player_future_projections(
            custom_player, age_training_data, age_curve_model
        )
        
        if custom_projections:
            print(f"\n🔮 Future Projections (2025-2027):")
            
            current = custom_projections['current_state']
            print(f"   Current Age: {current['age']:.0f}")
            print(f"   Current WAR: {current['war']:.1f} ({current['season']})")
            
            total_expected = 0
            for year in [2025, 2026, 2027]:
                if year in custom_projections['projections']:
                    proj = custom_projections['projections'][year]
                    survival = custom_projections['survival_probs'][year]
                    total_expected += proj['expected_war']
                    print(f"   {year}: {proj['expected_war']:.1f} WAR (Age {proj['age']:.0f}, {survival*100:.0f}% survival)")
            
            print(f"   \n💰 3-Year Value: {total_expected:.1f} expected WAR")
            
            # Create visualization
            custom_fig = create_player_visualization(custom_analysis, custom_player)
            if custom_fig:
                custom_fig.show()
                
            custom_proj_fig = create_projection_visualization(custom_projections, custom_player)
            if custom_proj_fig:
                custom_proj_fig.show()
                
        else:
            print(f"   ⚠ Could not generate projections")
            
    else:
        print(f"   ⚠ Could not analyze historical performance")
        
elif len(available_players) > 0:
    print(f"\n⚠ Player '{custom_player}' not found in dataset")
    print(f"\nAvailable players (first 10):")
    for i, player in enumerate(available_players[:10]):
        print(f"   {player}")
    if len(available_players) > 10:
        print(f"   ... and {len(available_players) - 10} more")
        
else:
    print("\n⚠ No players available for analysis")

print(f"\n" + "-" * 60)
print("End of Analysis")