In [60]:
# ALL-IN-ONE FULLY OPTIMIZED MODE - COMPLETE PIPELINE
# Includes fuzzy matching, caching, and all models with catcher framing + enhanced visualizations

# ===== IMPORTS =====
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from cleanedDataParser import *
import xgboost as xgb
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Input
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

print("Imports loaded")

Imports loaded


In [None]:
# ===== HELPER FUNCTIONS =====
def print_metrics(name, y_true, y_pred):
    """Print the R2 score and RMSE of a set of predictions on one line.

    Args:
        name: Label printed in front of the metrics.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
    """
    score = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} - R2: {score:.4f}, RMSE: {root_mse:.4f}")

def plot_results(title, y_true, y_pred, player_names=None):
    """Show a predicted-vs-actual scatter plot with per-player hover details.

    Args:
        title: Figure title.
        y_true: Sequence of observed values.
        y_pred: Sequence of predicted values, aligned with ``y_true``.
        player_names: Optional labels for the points; when omitted, generic
            ``Player_<index>`` labels are generated.
    """
    if player_names is None:
        player_names = [f"Player_{i}" for i in range(len(y_true))]

    # Absolute and relative error per point, shown in the hover tooltip.
    # A zero actual value gets a 0% relative error instead of dividing by zero.
    abs_errors = []
    pct_errors = []
    for actual, pred in zip(y_true, y_pred):
        abs_errors.append(abs(actual - pred))
        pct_errors.append(abs(actual - pred) / abs(actual) * 100 if actual != 0 else 0)

    frame = pd.DataFrame({
        'Predicted': y_pred,
        'Actual': y_true,
        'Player': player_names,
        'Error': abs_errors,
        'Error_Pct': pct_errors
    })

    hover_formats = {
        'Predicted': ':.3f',
        'Actual': ':.3f',
        'Error': ':.3f',
        'Error_Pct': ':.1f%'
    }
    fig = px.scatter(
        frame,
        x='Predicted',
        y='Actual',
        hover_name='Player',
        hover_data=hover_formats,
        title=title,
        labels={'Error_Pct': 'Error %'}
    )

    # Diagonal reference line: points on it are perfect predictions.
    lowest = min(min(y_true), min(y_pred))
    highest = max(max(y_true), max(y_pred))
    fig.add_shape(
        type="line",
        x0=lowest, y0=lowest,
        x1=highest, y1=highest,
        line=dict(color="red", width=2, dash="dash"),
        name="Perfect Prediction"
    )

    fig.show()

def plot_training_history(history, title):
    """Show training and validation curves for loss and mean absolute error.

    Args:
        history: Object whose ``history`` attribute maps the keys ``loss``,
            ``val_loss``, ``mae`` and ``val_mae`` to per-epoch values.
        title: Text appended to the figure title.
    """
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Loss', 'Mean Absolute Error'))

    # (subplot column, train key, train label, validation key, validation label)
    panels = (
        (1, 'loss', 'Train Loss', 'val_loss', 'Val Loss'),
        (2, 'mae', 'Train MAE', 'val_mae', 'Val MAE'),
    )
    for column, train_key, train_label, val_key, val_label in panels:
        fig.add_trace(go.Scatter(y=history.history[train_key], name=train_label), row=1, col=column)
        fig.add_trace(go.Scatter(y=history.history[val_key], name=val_label), row=1, col=column)

    fig.update_layout(title=f"Training History - {title}")
    fig.show()

def select_best_models_by_category(model_results):
    """Pick the model with the highest mean R2 from each model category.

    Args:
        model_results: Object whose ``results`` dict maps keys of the form
            ``<model>_<player_type>_<metric>`` to dicts holding ``y_true``
            and ``y_pred``.

    Returns:
        List of model names, at most one per category, in category order.
    """
    # Group the R2 of every stored prediction set under its model name,
    # which is the part of the key before the first underscore.
    r2_by_model = {}
    for key, stored in model_results.results.items():
        model_name = key.partition('_')[0]
        r2_by_model.setdefault(model_name, []).append(
            r2_score(stored['y_true'], stored['y_pred'])
        )

    avg_scores = {}
    for model_name, scores in r2_by_model.items():
        avg_scores[model_name] = np.mean(scores)

    categories = (
        ('Linear', ['linear', 'lasso', 'elasticnet']),
        ('Tree-based', ['randomforest', 'xgboost']),
        ('Instance-based', ['knn']),
        ('Neural Network', ['keras']),
    )

    selected_models = []
    for category, members in categories:
        candidates = [member for member in members if member in avg_scores]
        if not candidates:
            continue  # no model of this category was run
        best_model = max(candidates, key=avg_scores.get)
        selected_models.append(best_model)
        print(f"📊 Best {category} model: {best_model.upper()} (R² = {avg_scores[best_model]:.4f})")

    return selected_models

def create_keras_model(input_dim, name="model"):
    """Build and compile a small fully connected regression network.

    Args:
        input_dim: Number of input features.
        name: Model name, also used as the prefix of each layer name.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE loss,
        MAE metric) with a single linear output unit.
    """
    layers = tf.keras.layers
    stack = [
        # Explicit Input layer rather than passing input_dim to the first Dense layer.
        layers.Input(shape=(input_dim,)),
        layers.Dense(32, activation='relu', name=f'{name}_dense1'),
        layers.Dropout(0.3, name=f'{name}_dropout1'),
        layers.Dense(16, activation='relu', name=f'{name}_dense2'),
        layers.Dropout(0.2, name=f'{name}_dropout2'),
        layers.Dense(1, activation='linear', name=f'{name}_output'),
    ]
    network = tf.keras.Sequential(stack, name=name)
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

def validate_and_clean_data(X, y):
    """Replace infinite/NaN values with medians and cap extreme feature outliers.

    Each feature column has its infinities and NaNs replaced by the column
    median, then is clipped to within 5 standard deviations of its mean.
    Infinite and NaN targets are replaced by the median target.

    Empty inputs are returned unchanged instead of raising, so callers may
    pass datasets for which no rows were matched.

    Args:
        X: 2-D array-like of numeric features (rows = samples), or empty.
        y: 1-D array-like of numeric targets, or empty.

    Returns:
        Tuple ``(X_clean, y_clean)`` as plain Python lists.

    Raises:
        ValueError: If ``X`` is non-empty but not two-dimensional.
    """
    # Coerce to float so integer or object inputs (e.g. containing None)
    # can be tested with isinf/isnan.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    if X.size:
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (samples x features), got {X.ndim} dimension(s)"
            )

        # Treat infinities as missing; np.where returns a fresh array, so the
        # caller's data is never modified.
        X = np.where(np.isinf(X), np.nan, X)

        for col in range(X.shape[1]):
            column = X[:, col]  # view into X; assignments below write through

            # Fill missing values with the median of the observed values.
            column[np.isnan(column)] = np.nanmedian(column)

            # Cap extreme outliers (beyond 5 standard deviations).
            mean_val = np.mean(column)
            std_val = np.std(column)
            if std_val > 0:
                X[:, col] = np.clip(column, mean_val - 5 * std_val, mean_val + 5 * std_val)

    if y.size:
        y = np.where(np.isinf(y), np.nan, y)
        y = np.where(np.isnan(y), np.nanmedian(y), y)

    return X.tolist(), y.tolist()

print("Helper functions loaded")

# ===== DATA PREPARATION =====
def _first_row_values(frame, key_column, key, value_columns):
    """Return ``value_columns`` of the first row where ``key_column == key``, or None if no row matches."""
    rows = frame[frame[key_column] == key]
    if rows.empty:
        return None
    # Use only the first match: flattening every matching row would yield a
    # feature vector of the wrong length when a name appears more than once.
    return rows[value_columns].values[0].tolist()


def data_preparation():
    """Load the cleaned data sets and build feature/target lists for modelling.

    Hitter features are K, BB, AVG, OBP, SLG plus a baserunning value and a
    defensive value (0 when a player has no entry). Pitcher features are
    IP, BB, K, HR, ERA. Players are linked across data sets through the name
    mappings returned by ``create_name_mapping``. When a matched name occurs
    on several rows of the stats table, the first row is used.

    Returns:
        A 12-tuple:
        ``(x_warp, y_warp, x_war, y_war, a_warp, b_warp, a_war, b_war,
        hitter_names_warp, hitter_names_war, pitcher_names_warp,
        pitcher_names_war)`` where ``x_*``/``a_*`` are hitter/pitcher feature
        rows, ``y_*``/``b_*`` the matching WARP or WAR targets, and the name
        lists hold the WARP-table player name of each row.
    """
    print("Loading and preparing data...")
    hitter_data = clean_sorted_hitter()
    hitter_pred_data = clean_warp_hitter()
    pitcher_data = clean_sorted_pitcher()
    pitcher_pred_data = clean_warp_pitcher()
    war_values = clean_war()
    # NOTE(review): both are used via .get(name, 0) below, so they are assumed
    # to be dict-like; the original comments say these loaders use caching and
    # that the defensive values include catcher framing — confirm in
    # cleanedDataParser.
    baserunning_values = clean_sorted_baserunning()
    defensive_values = clean_defensive_players()

    print(f"Loaded data - Hitters: {len(hitter_data)}, WARP hitters: {len(hitter_pred_data)}, WAR: {len(war_values)}")
    print(f"Baserunning values: {len(baserunning_values)}, Defensive values: {len(defensive_values)}")

    print("Creating name mappings...")
    warp_to_hitter_map = create_name_mapping(hitter_pred_data['Name'].tolist(), hitter_data['Hitters'].tolist())
    warp_to_war_map = create_name_mapping(hitter_pred_data['Name'].tolist(), war_values['Name'].tolist())

    x_warp, y_warp, x_war, y_war = [], [], [], []
    hitter_names_warp, hitter_names_war = [], []

    for _, row in hitter_pred_data.iterrows():
        warp_name = row['Name']
        hitter_match = warp_to_hitter_map.get(warp_name)
        if not hitter_match:
            continue

        stats = _first_row_values(hitter_data, 'Hitters', hitter_match,
                                  ['K', 'BB', 'AVG', 'OBP', 'SLG'])
        if stats is None:
            continue

        # Players without a baserunning/defensive entry contribute 0.
        stats.append(baserunning_values.get(hitter_match, 0))
        stats.append(defensive_values.get(hitter_match, 0))

        x_warp.append(stats)
        y_warp.append(row['WARP'])
        hitter_names_warp.append(warp_name)

        # The WAR data set is the subset of WARP hitters also found in war_values.
        war_match = warp_to_war_map.get(warp_name)
        if war_match:
            war_row = war_values[war_values['Name'] == war_match]
            if not war_row.empty:
                x_war.append(stats)
                y_war.append(war_row['Total WAR'].iloc[0])
                hitter_names_war.append(warp_name)

    # Clean the data before proceeding
    print("Cleaning data for infinite/NaN values...")
    x_warp, y_warp = validate_and_clean_data(x_warp, y_warp)
    x_war, y_war = validate_and_clean_data(x_war, y_war)

    print(f"Successfully matched {len(x_warp)} hitters with 7 features (5 hitting + baserunning + defense with framing)")

    # Pitcher processing
    pitcher_warp_to_main = create_name_mapping(pitcher_pred_data['Name'].tolist(), pitcher_data['Pitchers'].tolist())
    pitcher_warp_to_war = create_name_mapping(pitcher_pred_data['Name'].tolist(), war_values['Name'].tolist())

    a_warp, b_warp, a_war, b_war = [], [], [], []
    pitcher_names_warp, pitcher_names_war = [], []

    for _, row in pitcher_pred_data.iterrows():
        warp_name = row['Name']
        pitcher_match = pitcher_warp_to_main.get(warp_name)
        if not pitcher_match:
            continue

        stats = _first_row_values(pitcher_data, 'Pitchers', pitcher_match,
                                  ['IP', 'BB', 'K', 'HR', 'ERA'])
        if stats is None:
            continue

        a_warp.append(stats)
        b_warp.append(row['WARP'])
        pitcher_names_warp.append(warp_name)

        war_match = pitcher_warp_to_war.get(warp_name)
        if war_match:
            war_row = war_values[war_values['Name'] == war_match]
            # Pitcher WAR is only collected when the table has a 'Primary WAR' column.
            if not war_row.empty and 'Primary WAR' in war_row.columns:
                a_war.append(stats)
                b_war.append(war_row['Primary WAR'].iloc[0])
                pitcher_names_war.append(warp_name)

    # Clean pitcher data too
    a_warp, b_warp = validate_and_clean_data(a_warp, b_warp)
    a_war, b_war = validate_and_clean_data(a_war, b_war)

    print(f"Successfully matched {len(a_warp)} pitchers")
    return (x_warp, y_warp, x_war, y_war, a_warp, b_warp, a_war, b_war,
            hitter_names_warp, hitter_names_war, pitcher_names_warp, pitcher_names_war)

# ===== MODEL RESULTS CLASS =====
class ModelResults:
    """In-memory store of test-set predictions, keyed by model, player type and metric."""

    def __init__(self):
        # Maps "<model>_<player_type>_<metric>" to a dict with the keys
        # 'y_true', 'y_pred' and 'player_names'.
        self.results = dict()

    def store_results(self, model_name, player_type, metric_type, y_true, y_pred, player_names):
        """Record one prediction set, replacing any earlier entry with the same key.

        Args:
            model_name: Model identifier (first part of the key).
            player_type: Player group identifier (second part of the key).
            metric_type: Target metric identifier (third part of the key).
            y_true: Observed target values.
            y_pred: Predicted target values.
            player_names: Player labels aligned with the values.
        """
        entry = dict(y_true=y_true, y_pred=y_pred, player_names=player_names)
        self.results[f"{model_name}_{player_type}_{metric_type}"] = entry


# Shared instance that the model-running functions write their results to.
model_results = ModelResults()

# ===== TRAIN/TEST SPLIT =====
def prepare_train_test_splits():
    """Prepare the data and split each data set 75/25 into train and test parts.

    Every split uses ``random_state=1``. When no pitcher WAR rows exist, the
    pitcher WARP split is reused in its place.

    Returns:
        A 20-tuple: train/test features and targets for hitter WARP, hitter
        WAR, pitcher WARP and pitcher WAR (four values each, in that order),
        followed by the test-set player names for the same four data sets.
    """
    (x_warp, y_warp, x_war, y_war, a_warp, b_warp, a_war, b_war,
     hitter_names_warp, hitter_names_war, pitcher_names_warp, pitcher_names_war) = data_preparation()

    def split(features, targets, names):
        # Yields [X_train, X_test, y_train, y_test, names_train, names_test].
        return train_test_split(
            features, targets, names, test_size=0.25, train_size=0.75, random_state=1
        )

    hitter_warp = split(x_warp, y_warp, hitter_names_warp)
    hitter_war = split(x_war, y_war, hitter_names_war)
    pitcher_warp = split(a_warp, b_warp, pitcher_names_warp)

    if len(a_war) > 0:
        pitcher_war = split(a_war, b_war, pitcher_names_war)
    else:
        # No pitcher WAR data: fall back to the pitcher WARP split.
        pitcher_war = pitcher_warp

    groups = (hitter_warp, hitter_war, pitcher_warp, pitcher_war)
    # Validate the shape eagerly, as tuple-unpacking each split would.
    for group in groups:
        _x_train, _x_test, _y_train, _y_test, _names_train, _names_test = group

    return (
        *hitter_warp[:4],
        *hitter_war[:4],
        *pitcher_warp[:4],
        *pitcher_war[:4],
        hitter_warp[5], hitter_war[5], pitcher_warp[5], pitcher_war[5],
    )

# ===== MODEL FUNCTIONS =====
def run_basic_regressions(data_splits):
    """Fit and evaluate linear, lasso and elastic-net regressions on every data set.

    For each model and each non-empty data set, the model is fitted on the
    training part, evaluated on the test part, plotted, and the predictions
    are stored in the module-level ``model_results``.

    Args:
        data_splits: The 20-tuple returned by ``prepare_train_test_splits``.
    """
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test,
     h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test) = data_splits

    # (player type, metric, X_train, X_test, y_train, y_test, test names)
    datasets = (
        ('hitter', 'warp', x_warp_train, x_warp_test, y_warp_train, y_warp_test, h_names_warp_test),
        ('hitter', 'war', x_war_train, x_war_test, y_war_train, y_war_test, h_names_war_test),
        ('pitcher', 'warp', a_warp_train, a_warp_test, b_warp_train, b_warp_test, p_names_warp_test),
        ('pitcher', 'war', a_war_train, a_war_test, b_war_train, b_war_test, p_names_war_test),
    )

    estimators = {
        'linear': LinearRegression(),
        'lasso': Lasso(),
        'elasticnet': ElasticNet(),
    }

    for name, model in estimators.items():
        print(f"=== {name.upper()} REGRESSION ===")

        for player_type, metric, X_train, X_test, y_train, y_test, names_test in datasets:
            if len(X_train) == 0:
                continue  # nothing to train on for this data set

            # The same estimator instance is refitted for each data set.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            print_metrics(f"{name} {player_type} {metric}", y_test, y_pred)
            plot_results(f"{player_type} {metric} ({name})", y_test, y_pred, names_test)
            model_results.store_results(name, player_type, metric, y_test, y_pred, names_test)

def run_advanced_models(data_splits):
    """Fit and evaluate KNN, random-forest and XGBoost regressors on every data set.

    For each model and each non-empty data set, the model is fitted on the
    training part, evaluated on the test part, plotted, and the predictions
    are stored in the module-level ``model_results``.

    Args:
        data_splits: The 20-tuple returned by ``prepare_train_test_splits``.
    """
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test,
     h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test) = data_splits

    knn = KNeighborsRegressor(n_neighbors=3, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators=50, max_depth=8, random_state=1, n_jobs=-1)
    boosted = xgb.XGBRegressor(n_estimators=50, max_depth=4, learning_rate=0.1, random_state=1, n_jobs=-1)
    estimators = (('knn', knn), ('randomforest', forest), ('xgboost', boosted))

    # (player type, metric, X_train, X_test, y_train, y_test, test names)
    datasets = (
        ('hitter', 'warp', x_warp_train, x_warp_test, y_warp_train, y_warp_test, h_names_warp_test),
        ('hitter', 'war', x_war_train, x_war_test, y_war_train, y_war_test, h_names_war_test),
        ('pitcher', 'warp', a_warp_train, a_warp_test, b_warp_train, b_warp_test, p_names_warp_test),
        ('pitcher', 'war', a_war_train, a_war_test, b_war_train, b_war_test, p_names_war_test),
    )

    for name, model in estimators:
        print(f"=== {name.upper()} ===")

        for player_type, metric, X_train, X_test, y_train, y_test, names_test in datasets:
            if len(X_train) == 0:
                continue  # nothing to train on for this data set

            # The same estimator instance is refitted for each data set.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            print_metrics(f"{name} {player_type} {metric}", y_test, y_pred)
            plot_results(f"{player_type} {metric} ({name})", y_test, y_pred, names_test)
            model_results.store_results(name, player_type, metric, y_test, y_pred, names_test)

def run_neural_network(data_splits):
    """Train and evaluate a Keras network on every non-empty data set.

    Features are standardised with a scaler fitted on the training part.
    20% of the training rows are held out as validation data for early
    stopping. Metrics, plots and the training history are shown, and the
    predictions are stored in the module-level ``model_results`` under the
    model name ``keras``.

    Args:
        data_splits: The 20-tuple returned by ``prepare_train_test_splits``.
    """
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test,
     h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test) = data_splits

    # One scaler and one callback object are shared; the scaler is refitted per data set.
    scaler = StandardScaler()
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True, verbose=0
    )

    print("=== KERAS NEURAL NETWORK ===")

    # (player type, metric, X_train, X_test, y_train, y_test, test names)
    tasks = (
        ('hitter', 'warp', x_warp_train, x_warp_test, y_warp_train, y_warp_test, h_names_warp_test),
        ('hitter', 'war', x_war_train, x_war_test, y_war_train, y_war_test, h_names_war_test),
        ('pitcher', 'warp', a_warp_train, a_warp_test, b_warp_train, b_warp_test, p_names_warp_test),
        ('pitcher', 'war', a_war_train, a_war_test, b_war_train, b_war_test, p_names_war_test),
    )

    for player_type, metric, X_train, X_test, y_train, y_test, names_test in tasks:
        if len(X_train) == 0:
            continue  # nothing to train on for this data set

        print(f"Training Neural Network for {player_type} {metric}...")

        # The scaler needs numpy arrays.
        train_array = np.array(X_train)
        test_array = np.array(X_test)
        target_array = np.array(y_train)

        # Fit the scaling on training rows only, then apply it to the test rows.
        train_scaled = scaler.fit_transform(train_array)
        test_scaled = scaler.transform(test_array)

        model = create_keras_model(input_dim=train_scaled.shape[1], name=f"{player_type}_{metric}")

        # Hold out 20% of the training rows for validation / early stopping.
        fit_X, val_X, fit_y, val_y = train_test_split(
            train_scaled, target_array, test_size=0.2, random_state=1
        )

        # Keras is fed float32 arrays.
        fit_inputs = fit_X.astype(np.float32)
        fit_targets = fit_y.astype(np.float32)
        validation = (val_X.astype(np.float32), val_y.astype(np.float32))
        history = model.fit(
            fit_inputs,
            fit_targets,
            validation_data=validation,
            epochs=50, batch_size=32, callbacks=[early_stopping], verbose=0
        )

        raw_predictions = model.predict(test_scaled.astype(np.float32), verbose=0)
        y_pred = raw_predictions.flatten()
        print_metrics(f"Keras {player_type} {metric}", y_test, y_pred)
        plot_results(f"{player_type} {metric} (Keras Neural Network)", y_test, y_pred, names_test)
        plot_training_history(history, f"{player_type} {metric}")
        model_results.store_results("keras", player_type, metric, y_test, y_pred, names_test)

print("All model functions loaded")

✅ Helper functions loaded
✅ All model functions loaded


In [62]:
def _pct_error(delta, actual):
    """Return |delta| as a percentage of |actual|; infinity when actual is zero."""
    if actual == 0:
        return float('inf')
    return abs(delta) / abs(actual) * 100


def _collect_delta_rows(model_results, model_name, group_key, group_label):
    """Build one row per player present in both the WAR and WARP results of a model/player group."""
    stored = model_results.results
    war_key = f"{model_name}_{group_key}_war"
    warp_key = f"{model_name}_{group_key}_warp"
    if war_key not in stored or warp_key not in stored:
        return []

    war_data = stored[war_key]
    warp_data = stored[warp_key]

    # Case-insensitive lookup from player name to position in the WAR results.
    war_index = {player.lower(): idx for idx, player in enumerate(war_data['player_names'])}

    rows = []
    for i, player in enumerate(warp_data['player_names']):
        war_idx = war_index.get(player.lower())
        if war_idx is None:
            continue  # player is not in the WAR test set

        war_actual = war_data['y_true'][war_idx]
        war_pred = war_data['y_pred'][war_idx]
        warp_actual = warp_data['y_true'][i]
        warp_pred = warp_data['y_pred'][i]

        # Delta = actual - predicted, so a positive delta is an under-prediction.
        war_delta = war_actual - war_pred
        warp_delta = warp_actual - warp_pred
        war_error_pct = _pct_error(war_delta, war_actual)
        warp_error_pct = _pct_error(warp_delta, warp_actual)

        rows.append({
            'player': player,
            'model': model_name,
            'war_delta': war_delta,
            'warp_delta': warp_delta,
            'war_actual': war_actual,
            'war_pred': war_pred,
            'warp_actual': warp_actual,
            'warp_pred': warp_pred,
            'war_error_pct': war_error_pct,
            'warp_error_pct': warp_error_pct,
            'in_accuracy_zone': war_error_pct <= 10 and warp_error_pct <= 10,
            'player_type': group_label
        })
    return rows


def _add_delta_one_zone(fig, row, col):
    """Draw the ±1 delta cross and square on one subplot; add their legend entries on the first subplot."""
    # Vertical lines (WAR = ±1)
    fig.add_vline(x=1, line_width=3, line_dash="dot", line_color="orange", row=row, col=col)
    fig.add_vline(x=-1, line_width=3, line_dash="dot", line_color="orange", row=row, col=col)
    # Horizontal lines (WARP = ±1)
    fig.add_hline(y=1, line_width=3, line_dash="dot", line_color="orange", row=row, col=col)
    fig.add_hline(y=-1, line_width=3, line_dash="dot", line_color="orange", row=row, col=col)

    # Square where both deltas are within ±1.
    fig.add_shape(
        type="rect",
        x0=-1, y0=-1, x1=1, y1=1,
        line=dict(color="green", width=2, dash="dash"),
        fillcolor="green",
        opacity=0.1,
        row=row, col=col
    )

    # The lines and the rectangle are not traces, so empty dummy traces are
    # added once to give them legend entries.
    if row == 1 and col == 1:
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None],
                mode='lines',
                line=dict(color='orange', width=3, dash='dot'),
                name='Delta 1 Cross (WAR≤1 OR WARP≤1)',
                showlegend=True
            ),
            row=row, col=col
        )
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None],
                mode='lines',
                line=dict(color='green', width=2, dash='dash'),
                name='Delta 1 Square (WAR≤1 AND WARP≤1)',
                showlegend=True
            ),
            row=row, col=col
        )


def _add_delta_scatter(fig, points, row, col, name, detail, color, showlegend=None):
    """Add one marker trace of WAR delta vs WARP delta to a subplot."""
    fig.add_trace(
        go.Scatter(
            x=points['war_delta'],
            y=points['warp_delta'],
            mode='markers',
            name=name,
            text=points['player'],
            hovertemplate='<b>%{text}</b><br>' +
                          'WAR Delta: %{x:.3f}<br>' +
                          'WARP Delta: %{y:.3f}<br>' +
                          detail + '<extra></extra>',
            marker=dict(color=color, size=8, opacity=0.7),
            showlegend=showlegend
        ),
        row=row, col=col
    )


def plot_quadrant_analysis(model_results, model_names=None):
    """Plot WAR vs WARP prediction errors per player and print summary statistics.

    For every selected model, players present in both the WAR and the WARP
    test results are plotted at (WAR delta, WARP delta), where
    delta = actual - predicted. Four subplots are shown: all players by
    model, all players by position, hitters only and pitchers only. Each
    subplot marks the ±1 delta lines and the square where both deltas are
    within ±1. A per-model text summary is printed afterwards.

    Args:
        model_results: Object whose ``results`` dict holds entries keyed
            ``<model>_<hitter|pitcher>_<war|warp>`` with ``y_true``,
            ``y_pred`` and ``player_names``.
        model_names: Models to include. When None, the best model of each
            category is chosen with ``select_best_models_by_category``.

    Returns:
        None. Prints a message and returns early when no player appears in
        both result sets.
    """
    # Auto-select best models if none specified
    if model_names is None:
        model_names = select_best_models_by_category(model_results)
        print(f"🎯 Auto-selected models: {[m.upper() for m in model_names]}")

    # Collect hitter rows, then pitcher rows, for each model.
    analysis_data = []
    for model_name in model_names:
        for group_key, group_label in (('hitter', 'Hitter'), ('pitcher', 'Pitcher')):
            analysis_data.extend(
                _collect_delta_rows(model_results, model_name, group_key, group_label)
            )

    if not analysis_data:
        print("No matching data found for quadrant analysis")
        return

    df = pd.DataFrame(analysis_data)

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'All Players (by Model)',
            'All Players (by Position)',
            'Hitters Only',
            'Pitchers Only'
        ),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    colors_model = {'linear': 'blue', 'randomforest': 'green', 'keras': 'red', 'lasso': 'orange', 'elasticnet': 'purple', 'knn': 'brown', 'xgboost': 'pink'}
    colors_position = {'Hitter': 'blue', 'Pitcher': 'red'}

    # Plot 1: All players colored by model (these traces keep their legend entries)
    for model in df['model'].unique():
        _add_delta_scatter(
            fig, df[df['model'] == model], 1, 1,
            name=f'{model.title()}',
            detail=f'Model: {model}',
            color=colors_model.get(model, 'gray')
        )
    _add_delta_one_zone(fig, 1, 1)

    # Plot 2: All players colored by position
    for pos in df['player_type'].unique():
        _add_delta_scatter(
            fig, df[df['player_type'] == pos], 1, 2,
            name=f'{pos}s',
            detail=f'Position: {pos}',
            color=colors_position[pos],
            showlegend=False
        )
    _add_delta_one_zone(fig, 1, 2)

    # Plot 3: Hitters only
    hitters = df[df['player_type'] == 'Hitter']
    for model in hitters['model'].unique():
        _add_delta_scatter(
            fig, hitters[hitters['model'] == model], 2, 1,
            name=f'H-{model.title()}',
            detail=f'Model: {model}',
            color=colors_model.get(model, 'gray'),
            showlegend=False
        )
    _add_delta_one_zone(fig, 2, 1)

    # Plot 4: Pitchers only
    pitchers = df[df['player_type'] == 'Pitcher']
    for model in pitchers['model'].unique():
        _add_delta_scatter(
            fig, pitchers[pitchers['model'] == model], 2, 2,
            name=f'P-{model.title()}',
            detail=f'Model: {model}',
            color=colors_model.get(model, 'gray'),
            showlegend=False
        )
    _add_delta_one_zone(fig, 2, 2)

    # Add quadrant lines (x=0, y=0)
    for row in [1, 2]:
        for col in [1, 2]:
            fig.add_vline(x=0, line_width=1, line_dash="dash", line_color="gray", row=row, col=col)
            fig.add_hline(y=0, line_width=1, line_dash="dash", line_color="gray", row=row, col=col)

    fig.update_layout(
        title="Prediction Delta Analysis: WAR vs WARP Errors<br><sub>Orange cross = Delta 1 official margins (WAR≤1 OR WARP≤1) | Green square = Both within ±1 (WAR≤1 AND WARP≤1)</sub>",
        height=800,
        showlegend=True
    )

    fig.update_xaxes(title_text="WAR Delta (Actual - Predicted)")
    fig.update_yaxes(title_text="WARP Delta (Actual - Predicted)")

    fig.show()

    # Print enhanced summary statistics
    print("=== ENHANCED QUADRANT & ACCURACY ANALYSIS ===")
    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        total = len(model_data)

        war_under = model_data['war_delta'] > 0
        war_over = model_data['war_delta'] < 0
        warp_under = model_data['warp_delta'] > 0
        warp_over = model_data['warp_delta'] < 0

        # Quadrants of the (WAR delta, WARP delta) plane. A positive delta
        # means actual > predicted, i.e. the model under-predicted.
        q1 = int((war_under & warp_under).sum())
        q2 = int((war_over & warp_under).sum())
        q3 = int((war_over & warp_over).sum())
        q4 = int((war_under & warp_over).sum())

        # Players whose WAR and WARP errors are both at most 10% of the actual value.
        in_zone = model_data['in_accuracy_zone']
        accuracy_zone = len(model_data[in_zone])

        # Players within an absolute error of 1 on each metric.
        war_within = model_data['war_delta'].abs() <= 1.0
        warp_within = model_data['warp_delta'].abs() <= 1.0
        war_delta_1 = int(war_within.sum())
        warp_delta_1 = int(warp_within.sum())
        both_delta_1 = int((war_within & warp_within).sum())
        either_delta_1 = int((war_within | warp_within).sum())

        print(f"\n{model.upper()} MODEL ({total} players):")
        print(f"  ACCURACY ZONE (≤10% error both): {accuracy_zone} ({accuracy_zone/total*100:.1f}%)")
        print(f"  DELTA 1 CROSS (WAR≤1 OR WARP≤1): {either_delta_1} ({either_delta_1/total*100:.1f}%)")
        print(f"  DELTA 1 INTERSECTION (WAR≤1 AND WARP≤1): {both_delta_1} ({both_delta_1/total*100:.1f}%)")
        print(f"  WAR ONLY (≤1 error): {war_delta_1} ({war_delta_1/total*100:.1f}%)")
        print(f"  WARP ONLY (≤1 error): {warp_delta_1} ({warp_delta_1/total*100:.1f}%)")
        # Labels follow the delta sign convention (positive delta = under-prediction).
        print(f"  Q1 (Both Under-pred): {q1} ({q1/total*100:.1f}%)")
        print(f"  Q2 (WAR Over, WARP Under): {q2} ({q2/total*100:.1f}%)")
        print(f"  Q3 (Both Over-pred): {q3} ({q3/total*100:.1f}%)")
        print(f"  Q4 (WAR Under, WARP Over): {q4} ({q4/total*100:.1f}%)")

        # Show up to three example players per accuracy criterion.
        accurate_players = model_data[in_zone]['player'].tolist()
        war_accurate = model_data[war_within]['player'].tolist()
        warp_accurate = model_data[warp_within]['player'].tolist()

        if accurate_players:
            print(f"  Sample accurate predictions: {', '.join(accurate_players[:3])}{'...' if len(accurate_players) > 3 else ''}")
        if war_accurate:
            print(f"  Sample WAR-accurate (≤1): {', '.join(war_accurate[:3])}{'...' if len(war_accurate) > 3 else ''}")
        if warp_accurate:
            print(f"  Sample WARP-accurate (≤1): {', '.join(warp_accurate[:3])}{'...' if len(warp_accurate) > 3 else ''}")

In [63]:
# ===== EXECUTE THE COMPLETE PIPELINE =====
# Driver cell: reset the caches, build the train/test splits, then run every
# model family in order and finish with the quadrant analysis. Any failure is
# reported (message + traceback) instead of propagating out of the cell.
try:
    # Start from empty caches so the splits are rebuilt from fresh data.
    clear_all_cache()

    print("\nPreparing data with fuzzy matching and caching...")
    data_splits = prepare_train_test_splits()
    print("Data preparation complete!")

    # (banner, action) pairs, executed strictly in this order. Each action is
    # a zero-argument lambda so that every name it touches is looked up only
    # when the stage actually runs, just like straight-line code would.
    _stages = (
        ("\n1. Running Basic Regression Models...",
         lambda: run_basic_regressions(data_splits)),
        ("\n2. Running Advanced Models...",
         lambda: run_advanced_models(data_splits)),
        ("\n3. Running Neural Network...",
         lambda: run_neural_network(data_splits)),
        # Quadrant analysis reads `model_results`, which is defined outside
        # this cell; per the original note it auto-selects the best model from
        # each category and includes the delta 1 analysis.
        ("\n4. Generating Enhanced Quadrant Analysis...",
         lambda: plot_quadrant_analysis(model_results)),
    )
    for _banner, _stage in _stages:
        print(_banner)
        _stage()

except Exception as e:
    # Cell-level boundary: show what went wrong and where, then stop quietly.
    import traceback

    print(f"\n❌ Error: {e}")
    traceback.print_exc()

Cleared all 2 cache files

Preparing data with fuzzy matching and caching...
Loading and preparing data...
Aggregated hitter data: 361331 game records -> 1805 qualified players (10+ games)
Aggregated pitcher data: 143447 game records -> 1814 unique players
Processing baserunning data (this may take a moment)...
Cached baserunning data (2039 players)
Processing defensive data (this may take a moment)...
Loaded framing data for 49 catchers
Cached defensive data (2713 players)
Loaded data - Hitters: 1805, WARP hitters: 463, WAR: 1508
Baserunning values: 2039, Defensive values: 2713
Creating name mappings...
Using in-memory cached mapping: 463 -> 1805
Using in-memory cached mapping: 463 -> 1508
Cleaning data for infinite/NaN values...
Successfully matched 460 hitters with 7 features (5 hitting + baserunning + defense with framing)
Using in-memory cached mapping: 472 -> 1814
Using in-memory cached mapping: 472 -> 1508
Successfully matched 463 pitchers
Data preparation complete!

1. Running Basic Regression Models...

linear hitter war - R2: 0.1142, RMSE: 1.5897


linear pitcher warp - R2: 0.4814, RMSE: 0.7936


linear pitcher war - R2: 0.4819, RMSE: 0.9388


=== LASSO REGRESSION ===
lasso hitter warp - R2: 0.1791, RMSE: 1.2368


lasso hitter war - R2: 0.0648, RMSE: 1.6335


lasso pitcher warp - R2: 0.4441, RMSE: 0.8217


lasso pitcher war - R2: 0.4426, RMSE: 0.9737


=== ELASTICNET REGRESSION ===
elasticnet hitter warp - R2: 0.1794, RMSE: 1.2365


elasticnet hitter war - R2: 0.0641, RMSE: 1.6341


elasticnet pitcher warp - R2: 0.4638, RMSE: 0.8070


elasticnet pitcher war - R2: 0.4642, RMSE: 0.9547



2. Running Advanced Models...
=== KNN ===
knn hitter warp - R2: -0.0913, RMSE: 1.4260


knn hitter war - R2: -0.1683, RMSE: 1.8258


knn pitcher warp - R2: 0.2441, RMSE: 0.9581


knn pitcher war - R2: 0.3017, RMSE: 1.0899


=== RANDOMFOREST ===
randomforest hitter warp - R2: 0.1869, RMSE: 1.2309


randomforest hitter war - R2: 0.1061, RMSE: 1.5970


randomforest pitcher warp - R2: 0.3759, RMSE: 0.8706


randomforest pitcher war - R2: 0.3945, RMSE: 1.0150


=== XGBOOST ===
xgboost hitter warp - R2: 0.1652, RMSE: 1.2472


xgboost hitter war - R2: 0.1437, RMSE: 1.5631


xgboost pitcher warp - R2: 0.2913, RMSE: 0.9278


xgboost pitcher war - R2: 0.3976, RMSE: 1.0123



3. Running Neural Network...
=== KERAS NEURAL NETWORK ===
Training Neural Network for hitter warp...
Keras hitter warp - R2: 0.1730, RMSE: 1.2413


Training Neural Network for hitter war...
Keras hitter war - R2: -0.9621, RMSE: 2.3661


Training Neural Network for pitcher warp...
Keras pitcher warp - R2: -0.6432, RMSE: 1.4127


Training Neural Network for pitcher war...
Keras pitcher war - R2: -0.0444, RMSE: 1.3329



4. Generating Enhanced Quadrant Analysis...
📊 Best Linear model: LINEAR (R² = 0.3096)
📊 Best Tree-based model: RANDOMFOREST (R² = 0.2659)
📊 Best Instance-based model: KNN (R² = 0.0715)
📊 Best Neural Network model: KERAS (R² = -0.3692)
🎯 Auto-selected models: ['LINEAR', 'RANDOMFOREST', 'KNN', 'KERAS']


=== ENHANCED QUADRANT & ACCURACY ANALYSIS ===

LINEAR MODEL (231 players):
  ACCURACY ZONE (≤10% error both): 2 (0.9%)
  DELTA 1 CROSS (WAR≤1 OR WARP≤1): 187 (81.0%)
  DELTA 1 INTERSECTION (WAR≤1 AND WARP≤1): 144 (62.3%)
  WAR ONLY (≤1 error): 157 (68.0%)
  WARP ONLY (≤1 error): 174 (75.3%)
  Q1 (Both Over-pred): 87 (37.7%)
  Q2 (WAR Under, WARP Over): 35 (15.2%)
  Q3 (Both Under-pred): 81 (35.1%)
  Q4 (WAR Over, WARP Under): 28 (12.1%)
  Sample accurate predictions: José Iglesias, Max Scherzer
  Sample WAR-accurate (≤1): Mike Tauchman, Abraham Almonte, Austin Barnes...
  Sample WARP-accurate (≤1): Mike Tauchman, Abraham Almonte, Austin Barnes...

RANDOMFOREST MODEL (231 players):
  ACCURACY ZONE (≤10% error both): 5 (2.2%)
  DELTA 1 CROSS (WAR≤1 OR WARP≤1): 186 (80.5%)
  DELTA 1 INTERSECTION (WAR≤1 AND WARP≤1): 135 (58.4%)
  WAR ONLY (≤1 error): 150 (64.9%)
  WARP ONLY (≤1 error): 171 (74.0%)
  Q1 (Both Over-pred): 86 (37.2%)
  Q2 (WAR Under, WARP Over): 37 (16.0%)
  Q3 (Both Under-pr