In [13]:
# Imports and Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from cleanedDataParser import *
import xgboost as xgb
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [None]:
# Helper Functions
def print_metrics(name, y_true, y_pred):
    """Print the R2 score and RMSE of ``y_pred`` against ``y_true``.

    Args:
        name: Label printed in front of the two metrics.
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"{name} - R2: {r2:.4f}, RMSE: {rmse:.4f}")


def plot_results(title, y_true, y_pred):
    """Show a Plotly scatter of predicted (x axis) versus actual (y axis) values.

    Args:
        title: Figure title.
        y_true: Actual target values, plotted on the y axis.
        y_pred: Predicted target values, plotted on the x axis.
    """
    frame = pd.DataFrame({'Predicted': y_pred, 'Actual': y_true})
    px.scatter(frame, x='Predicted', y='Actual', title=title).show()


def plot_training_history(history, title):
    """Plot training/validation loss and MAE curves side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing the
            keys ``'loss'``, ``'val_loss'``, ``'mae'`` and ``'val_mae'``.
        title: Text appended to the figure title.
    """
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Loss', 'Mean Absolute Error'))

    # (history key, legend label, subplot column): loss curves go in the left
    # panel, MAE curves in the right panel.
    panels = (
        ('loss', 'Train Loss', 1),
        ('val_loss', 'Val Loss', 1),
        ('mae', 'Train MAE', 2),
        ('val_mae', 'Val MAE', 2),
    )
    for key, label, column in panels:
        fig.add_trace(go.Scatter(y=history.history[key], name=label), row=1, col=column)

    fig.update_layout(title=f"Training History - {title}")
    fig.show()


def create_keras_model(input_dim, name="model"):
    """Build and compile a small feed-forward regression network.

    Architecture: Dense(32, relu) -> Dropout(0.3) -> Dense(16, relu) ->
    Dropout(0.2) -> Dense(1, linear). Every layer name is prefixed with
    ``name``. The model is compiled with the Adam optimizer, MSE loss and an
    MAE metric.

    Args:
        input_dim: Number of input features.
        name: Model name, also used as the prefix of each layer name.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers  # pyright: ignore[reportAttributeAccessIssue]
    stack = [
        layers.Dense(32, activation='relu', input_dim=input_dim, name=f'{name}_input'),
        layers.Dropout(0.3, name=f'{name}_dropout1'),
        layers.Dense(16, activation='relu', name=f'{name}_hidden1'),
        layers.Dropout(0.2, name=f'{name}_dropout2'),
        layers.Dense(1, activation='linear', name=f'{name}_output'),
    ]

    network = tf.keras.Sequential(stack, name=name)  # pyright: ignore[reportAttributeAccessIssue]
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

In [None]:
# Data Preparation with Baserunning and Defense
def data_preparation():
    """Load the parsed datasets and build feature/target lists for modelling.

    Hitters: for every row of the WARP hitter table whose name maps to a row
    of the hitter stats table, the feature vector is
    ``[K, BB, AVG, OBP, SLG, baserunning, defense]``. Baserunning and defense
    default to 0 when the player is missing from those lookups. The WARP
    target is the row's ``WARP``. The WAR target is ``Total WAR`` from the WAR
    table when the name maps to a row there; otherwise the row's ``WARP`` is
    used as a stand-in, so ``y_war`` can contain WARP values.

    Pitchers: the feature vector is ``[IP, BB, K, HR, ERA]`` and the WARP
    target is the row's ``WARP``. A pitcher is added to the WAR lists only
    when a WAR row is found and the WAR table has a ``Primary WAR`` column;
    unlike hitters there is no WARP stand-in.

    When a player name matches several rows of a stats table, only the first
    matching row is used, so every feature vector has a fixed length.

    Returns:
        A 12-tuple of lists: ``(x_warp, y_warp, x_war, y_war, a_warp, b_warp,
        a_war, b_war, hitter_names_warp, hitter_names_war,
        pitcher_names_warp, pitcher_names_war)``. ``x_*``/``a_*`` hold hitter
        and pitcher feature vectors, ``y_*``/``b_*`` the matching targets, and
        the ``*_names_*`` lists the WARP-table name of each sample.

    Raises:
        ValueError: If no hitter, or no pitcher, could be matched.
    """
    print("Loading and preparing data...")

    # The loaders and create_name_mapping are not defined in this notebook;
    # presumably they come from the cleanedDataParser star import.
    hitter_data = clean_sorted_hitter()
    hitter_pred_data = clean_warp_hitter()
    pitcher_data = clean_sorted_pitcher()
    pitcher_pred_data = clean_warp_pitcher()
    war_values = clean_war()
    # Used below as name -> value lookups through .get(name, default).
    baserunning_values = clean_sorted_baserunning()
    defensive_values = clean_defensive_players()

    # NOTE(review): the recorded run printed 361331 hitter rows against 463
    # WARP hitters - confirm clean_sorted_hitter() really returns one row per
    # player; the first-row selection below only guards the vector length.
    print(f"Loaded data - Hitters: {len(hitter_data)}, WARP hitters: {len(hitter_pred_data)}, WAR: {len(war_values)}")
    print(f"Baserunning values: {len(baserunning_values)}, Defensive values: {len(defensive_values)}")

    print("Creating name mappings...")
    warp_to_hitter_map = create_name_mapping(
        hitter_pred_data['Name'].tolist(),
        hitter_data['Hitters'].tolist()
    )
    warp_to_war_map = create_name_mapping(
        hitter_pred_data['Name'].tolist(),
        war_values['Name'].tolist()
    )

    hitter_feature_cols = ['K', 'BB', 'AVG', 'OBP', 'SLG']
    x_warp, y_warp, x_war, y_war = [], [], [], []
    hitter_names_warp, hitter_names_war = [], []

    for _, row in hitter_pred_data.iterrows():
        warp_name = row['Name']

        hitter_match = warp_to_hitter_map.get(warp_name)
        if not hitter_match:
            continue
        player_stats = hitter_data[hitter_data['Hitters'] == hitter_match]
        if player_stats.empty:
            continue

        # Take exactly one row: flattening every matching row would produce a
        # longer (ragged) vector whenever a name occurs more than once.
        stats = player_stats[hitter_feature_cols].iloc[0].tolist()
        stats.append(baserunning_values.get(hitter_match, 0))
        stats.append(defensive_values.get(hitter_match, 0))

        x_warp.append(stats)
        y_warp.append(row['WARP'])
        hitter_names_warp.append(warp_name)

        # WAR target: real WAR when a row is found, else WARP as a stand-in.
        war_target = row['WARP']
        war_match = warp_to_war_map.get(warp_name)
        if war_match:
            war_row = war_values[war_values['Name'] == war_match]
            if not war_row.empty:
                war_target = war_row['Total WAR'].iloc[0]
        x_war.append(stats)
        y_war.append(war_target)
        hitter_names_war.append(warp_name)

    print(f"Successfully matched {len(x_warp)} hitters with 7 features (5 hitting + baserunning + defense)")

    pitcher_warp_to_main = create_name_mapping(
        pitcher_pred_data['Name'].tolist(),
        pitcher_data['Pitchers'].tolist()
    )
    pitcher_warp_to_war = create_name_mapping(
        pitcher_pred_data['Name'].tolist(),
        war_values['Name'].tolist()
    )

    pitcher_feature_cols = ['IP', 'BB', 'K', 'HR', 'ERA']
    a_warp, b_warp, a_war, b_war = [], [], [], []
    pitcher_names_warp, pitcher_names_war = [], []

    for _, row in pitcher_pred_data.iterrows():
        warp_name = row['Name']

        pitcher_match = pitcher_warp_to_main.get(warp_name)
        if not pitcher_match:
            continue
        player_stats = pitcher_data[pitcher_data['Pitchers'] == pitcher_match]
        if player_stats.empty:
            continue

        # Same single-row rule as for hitters.
        stats = player_stats[pitcher_feature_cols].iloc[0].tolist()
        a_warp.append(stats)
        b_warp.append(row['WARP'])
        pitcher_names_warp.append(warp_name)

        # Pitchers without a usable WAR row are left out of the WAR lists.
        war_match = pitcher_warp_to_war.get(warp_name)
        if war_match:
            war_row = war_values[war_values['Name'] == war_match]
            if not war_row.empty and 'Primary WAR' in war_row.columns:
                a_war.append(stats)
                b_war.append(war_row['Primary WAR'].iloc[0])
                pitcher_names_war.append(warp_name)

    print(f"Successfully matched {len(a_warp)} pitchers")

    # Fail early: an empty dataset cannot be split or fitted downstream.
    if len(x_warp) == 0:
        raise ValueError("No hitter data matched! Check your data files.")
    if len(a_warp) == 0:
        raise ValueError("No pitcher data matched! Check your data files.")

    print("Data preparation completed successfully!")
    return (x_warp, y_warp, x_war, y_war, a_warp, b_warp, a_war, b_war,
            hitter_names_warp, hitter_names_war, pitcher_names_warp, pitcher_names_war)

In [16]:
# Train/Test Split
def prepare_train_test_splits():
    """Run ``data_preparation()`` and split every dataset 75/25 into train/test.

    All splits use ``random_state=1``. The pitcher WAR dataset is split only
    when it has at least two samples; with fewer, a 75/25 split would leave
    the train set empty, so the pitcher WARP split is reused in its place.

    Returns:
        A 20-tuple: train/test features and targets for hitter WARP, hitter
        WAR, pitcher WARP and pitcher WAR (four values each, ordered
        ``X_train, X_test, y_train, y_test``), followed by the test-set player
        names for the same four datasets.
    """
    (x_warp, y_warp, x_war, y_war, a_warp, b_warp, a_war, b_war,
     hitter_names_warp, hitter_names_war, pitcher_names_warp, pitcher_names_war) = data_preparation()

    def _split(features, targets, names):
        # One place for the split settings so all four datasets are treated alike.
        return train_test_split(
            features, targets, names, test_size=0.25, train_size=0.75, random_state=1
        )

    # Hitter splits (train-set names are not needed by any caller).
    x_warp_train, x_warp_test, y_warp_train, y_warp_test, _, h_names_warp_test = _split(
        x_warp, y_warp, hitter_names_warp
    )
    x_war_train, x_war_test, y_war_train, y_war_test, _, h_names_war_test = _split(
        x_war, y_war, hitter_names_war
    )

    # Pitcher splits
    a_warp_train, a_warp_test, b_warp_train, b_warp_test, _, p_names_warp_test = _split(
        a_warp, b_warp, pitcher_names_warp
    )

    if len(a_war) > 1:
        a_war_train, a_war_test, b_war_train, b_war_test, _, p_names_war_test = _split(
            a_war, b_war, pitcher_names_war
        )
    else:
        # Zero or one pitcher WAR sample cannot be split: fall back to the
        # pitcher WARP split so downstream code still receives four datasets.
        a_war_train, a_war_test, b_war_train, b_war_test = a_warp_train, a_warp_test, b_warp_train, b_warp_test
        p_names_war_test = p_names_warp_test

    print("Train/test splits completed!")
    return (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
            x_war_train, x_war_test, y_war_train, y_war_test,
            a_warp_train, a_warp_test, b_warp_train, b_warp_test,
            a_war_train, a_war_test, b_war_train, b_war_test,
            h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test)

In [17]:
# Player Lookup and Analysis
class ModelResults:
    """In-memory store of per-model test-set predictions with lookup by player name."""

    def __init__(self):
        # Maps "<model>_<player_type>_<metric>" to the record written by store_results().
        self.results = {}
        # Not read or written by any method of this class; kept so existing
        # code that touches the attribute keeps working.
        self.player_names = {}

    def store_results(self, model_name, player_type, metric_type, y_true, y_pred, player_names):
        """Store one model's predictions for later lookup.

        A second call with the same model/player type/metric replaces the
        earlier record.

        Args:
            model_name: Model identifier, e.g. ``"linear"``.
            player_type: ``"hitter"`` or ``"pitcher"``.
            metric_type: ``"warp"`` or ``"war"``.
            y_true: Actual values, indexable by position.
            y_pred: Predicted values, aligned with ``y_true``.
            player_names: Player names, aligned with ``y_true``.
        """
        key = f"{model_name}_{player_type}_{metric_type}"
        self.results[key] = {
            'y_true': y_true,
            'y_pred': y_pred,
            'player_names': player_names,
            # Keep the components explicitly: parsing them back out of the key
            # goes wrong as soon as a model name contains an underscore.
            'model': str(model_name),
            'player_type': str(player_type),
            'metric': str(metric_type),
        }

    def lookup_player(self, player_name, model_name=None):
        """Return every stored result whose player name matches ``player_name``.

        Matching is case-insensitive and succeeds when either name contains
        the other.

        Args:
            player_name: Name, or part of a name, to search for.
            model_name: If given, only results of this model are returned
                (compared case-insensitively).

        Returns:
            A list of dicts with keys ``model``, ``player_type``, ``metric``,
            ``player_name``, ``actual``, ``predicted``, ``error`` (absolute)
            and ``error_pct`` (``inf`` when the actual value is 0).
        """
        results = []
        search_name = player_name.lower()

        for key, data in self.results.items():
            if 'model' in data:
                model = data['model']
                player_type = data['player_type']
                metric = data['metric']
            else:
                # Record inserted without the explicit fields: recover them
                # from the key, splitting from the right so that only the
                # model part may contain underscores.
                model, player_type, metric = key.rsplit('_', 2)

            # Skip if a specific model was requested and this is not it
            if model_name and model.lower() != model_name.lower():
                continue

            for i, stored_name in enumerate(data['player_names']):
                name = stored_name.lower()
                if search_name in name or name in search_name:
                    actual = data['y_true'][i]
                    predicted = data['y_pred'][i]
                    error = abs(actual - predicted)
                    # A percentage error is undefined for an actual value of 0.
                    error_pct = (error / abs(actual) * 100) if actual != 0 else float('inf')

                    results.append({
                        'model': model,
                        'player_type': player_type,
                        'metric': metric,
                        'player_name': stored_name,
                        'actual': actual,
                        'predicted': predicted,
                        'error': error,
                        'error_pct': error_pct
                    })

        return results

    def display_player_results(self, player_name, model_name=None):
        """Print a formatted table of a player's results, grouped by player type and metric.

        Only the player types ``hitter``/``pitcher`` and the metrics
        ``warp``/``war`` are displayed. The heading uses the name of the first
        match.

        Args:
            player_name: Name, or part of a name, to search for.
            model_name: Optional model filter passed to ``lookup_player``.
        """
        results = self.lookup_player(player_name, model_name)

        if not results:
            print(f"No results found for player: {player_name}")
            return

        print(f"\n=== RESULTS FOR {results[0]['player_name'].upper()} ===")

        for player_type in ['hitter', 'pitcher']:
            type_results = [r for r in results if r['player_type'] == player_type]
            if not type_results:
                continue

            print(f"\n{player_type.upper()} PERFORMANCE:")

            for metric in ['warp', 'war']:
                metric_results = [r for r in type_results if r['metric'] == metric]
                if not metric_results:
                    continue

                print(f"\n  {metric.upper()}:")
                print(f"    {'Model':<15} {'Actual':<8} {'Predicted':<10} {'Error':<8} {'Error %':<8}")
                print(f"    {'-'*55}")

                for result in metric_results:
                    print(f"    {result['model']:<15} {result['actual']:<8.3f} {result['predicted']:<10.3f} "
                          f"{result['error']:<8.3f} {result['error_pct']:<8.1f}%")

# Notebook-wide results store that the run_* functions write into.
model_results = ModelResults()

In [None]:
# Basic Regression Models (Linear, Lasso, ElasticNet) - OPTIMIZED
def run_basic_regressions(data_splits=None):
    """Fit Linear, Lasso and ElasticNet regressions on all four datasets.

    For each model and dataset this prints R2/RMSE, shows a predicted-vs-actual
    scatter plot and stores the test-set predictions in ``model_results``.

    Args:
        data_splits: The 20-tuple returned by ``prepare_train_test_splits()``.
            When ``None`` the splits are computed here.
    """
    if data_splits is None:
        data_splits = prepare_train_test_splits()

    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test,
     h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test) = data_splits

    estimators = {
        'linear': LinearRegression(),
        'lasso': Lasso(),
        'elasticnet': ElasticNet(),
    }

    # (player type, metric, X_train, X_test, y_train, y_test, test-set names);
    # the table is the same for every model, so it is built once.
    datasets = (
        ('hitter', 'warp', x_warp_train, x_warp_test, y_warp_train, y_warp_test, h_names_warp_test),
        ('hitter', 'war', x_war_train, x_war_test, y_war_train, y_war_test, h_names_war_test),
        ('pitcher', 'warp', a_warp_train, a_warp_test, b_warp_train, b_warp_test, p_names_warp_test),
        ('pitcher', 'war', a_war_train, a_war_test, b_war_train, b_war_test, p_names_war_test),
    )

    for name, model in estimators.items():
        print(f"=== {name.upper()} REGRESSION ===")

        # The same estimator object is refitted on each dataset in turn.
        for player_type, metric, X_train, X_test, y_train, y_test, names_test in datasets:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            print_metrics(f"{name} {player_type} {metric}", y_test, y_pred)
            plot_results(f"{player_type} {metric} ({name})", y_test, y_pred)
            model_results.store_results(name, player_type, metric, y_test, y_pred, names_test)

In [None]:
# Advanced Models (KNN, Random Forest, XGBoost) - OPTIMIZED
def run_advanced_models(data_splits=None):
    """Fit KNN, Random Forest and XGBoost regressors on all four datasets.

    For each model and dataset this prints R2/RMSE, shows a predicted-vs-actual
    scatter plot and stores the test-set predictions in ``model_results``.

    Args:
        data_splits: The 20-tuple returned by ``prepare_train_test_splits()``.
            When ``None`` the splits are computed here.
    """
    if data_splits is None:
        data_splits = prepare_train_test_splits()

    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test,
     h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test) = data_splits

    # Every estimator is created with n_jobs=-1; the two tree ensembles also
    # pin random_state=1.
    estimators = {
        'knn': KNeighborsRegressor(n_neighbors=3, n_jobs=-1),
        'randomforest': RandomForestRegressor(n_estimators=50, max_depth=8, random_state=1, n_jobs=-1),
        'xgboost': xgb.XGBRegressor(n_estimators=50, max_depth=4, learning_rate=0.1, random_state=1, n_jobs=-1),
    }

    # (player type, metric, X_train, X_test, y_train, y_test, test-set names);
    # the table is the same for every model, so it is built once.
    datasets = (
        ('hitter', 'warp', x_warp_train, x_warp_test, y_warp_train, y_warp_test, h_names_warp_test),
        ('hitter', 'war', x_war_train, x_war_test, y_war_train, y_war_test, h_names_war_test),
        ('pitcher', 'warp', a_warp_train, a_warp_test, b_warp_train, b_warp_test, p_names_warp_test),
        ('pitcher', 'war', a_war_train, a_war_test, b_war_train, b_war_test, p_names_war_test),
    )

    for name, model in estimators.items():
        print(f"=== {name.upper()} ===")

        # The same estimator object is refitted on each dataset in turn.
        for player_type, metric, X_train, X_test, y_train, y_test, names_test in datasets:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            print_metrics(f"{name} {player_type} {metric}", y_test, y_pred)
            plot_results(f"{player_type} {metric} ({name})", y_test, y_pred)
            model_results.store_results(name, player_type, metric, y_test, y_pred, names_test)

In [None]:
# Neural Network (Keras/TensorFlow) - OPTIMIZED
def run_neural_network(data_splits=None, seed=1):
    """Train and evaluate one Keras regression network per dataset.

    For each of the four datasets the features are standardised, 20% of the
    training rows are held out for validation, a fresh model from
    ``create_keras_model`` is trained for up to 50 epochs with early stopping
    on ``val_loss``, and the test-set metrics, scatter plot, training curves
    and stored results are produced.

    Args:
        data_splits: The 20-tuple returned by ``prepare_train_test_splits()``.
            When ``None`` the splits are computed here.
        seed: Seed passed to ``tf.keras.utils.set_random_seed`` before any
            model is built, so repeated runs start from the same random
            state. Defaults to 1, the ``random_state`` used by the other
            models in this notebook. Pass ``None`` to leave the random
            generators untouched.
    """
    if data_splits is None:
        data_splits = prepare_train_test_splits()

    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test,
     h_names_warp_test, h_names_war_test, p_names_warp_test, p_names_war_test) = data_splits

    # Seed once, up front, so the four models built below are initialised and
    # trained from a repeatable random state.
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)  # pyright: ignore[reportAttributeAccessIssue]

    # One scaler object, refitted on each dataset's training features below.
    scaler = StandardScaler()

    # Stop when val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    early_stopping = tf.keras.callbacks.EarlyStopping( # pyright: ignore[reportAttributeAccessIssue]
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=0
    )

    print("=== KERAS NEURAL NETWORK ===")

    # (player type, metric, X_train, X_test, y_train, y_test, test-set names)
    datasets = [
        ('hitter', 'warp', x_warp_train, x_warp_test, y_warp_train, y_warp_test, h_names_warp_test),
        ('hitter', 'war', x_war_train, x_war_test, y_war_train, y_war_test, h_names_war_test),
        ('pitcher', 'warp', a_warp_train, a_warp_test, b_warp_train, b_warp_test, p_names_warp_test),
        ('pitcher', 'war', a_war_train, a_war_test, b_war_train, b_war_test, p_names_war_test)
    ]

    for player_type, metric, X_train, X_test, y_train, y_test, names_test in datasets:
        print(f"Training Neural Network for {player_type} {metric}...")

        # Fit the scaler on the training features only, then apply it to the test features.
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = create_keras_model(input_dim=len(X_train[0]), name=f"{player_type}_{metric}")

        # Hold out 20% of the training rows for validation / early stopping.
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
            X_train_scaled, y_train, test_size=0.2, random_state=1
        )

        history = model.fit(
            X_train_split, y_train_split,
            validation_data=(X_val_split, y_val_split),
            epochs=50,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=0
        )

        # predict() output is flattened to a 1-D array before scoring.
        y_pred = model.predict(X_test_scaled, verbose=0).flatten()
        print_metrics(f"Keras {player_type} {metric}", y_test, y_pred)
        plot_results(f"{player_type} {metric} (Keras Neural Network)", y_test, y_pred)
        plot_training_history(history, f"{player_type} {metric}")
        model_results.store_results("keras", player_type, metric, y_test, y_pred, names_test)

In [None]:
# Run All Models - OPTIMIZED
def main():
    """Prepare the train/test splits once and run every model family on them."""
    print("=== RUNNING ALL MODELS (OPTIMIZED) ===\n")

    # Splits are built a single time and handed to each runner.
    print("Preparing data once for all models...")
    splits = prepare_train_test_splits()

    print("1. Basic Regression Models")
    run_basic_regressions(splits)

    print("\n2. Advanced Models")
    run_advanced_models(splits)

    print("\n3. Neural Network")
    run_neural_network(splits)

    closing_lines = (
        "\n=== ALL MODELS COMPLETED ===",
        "\nTo lookup a player's results, use:",
        "model_results.display_player_results('Player Name')",
    )
    for line in closing_lines:
        print(line)

# Runs the full pipeline when this cell executes; comment out to skip.
main()

=== RUNNING ALL MODELS (OPTIMIZED) ===

Preparing data once for all models...
Loading and preparing data...
Loaded data - Hitters: 361331, WARP hitters: 463, WAR: 1508
Baserunning values: 0, Defensive values: 0
Creating name mappings...


In [None]:
# Player Lookup Examples
# After running models, use these commands to look up specific players:

# Example usage:
# model_results.display_player_results("Mike Trout")
# model_results.display_player_results("Gerrit Cole") 
# model_results.display_player_results("Mookie Betts", "linear")  # Only linear model results

# You can also get raw results:
# trout_results = model_results.lookup_player("Mike Trout")
# print(trout_results)