In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
from cleanedDataParser import *

# -----------------------------
# Data Preparation
# -----------------------------
def data_preparation():
    # Load data from parser
    hitter_data = clean_sorted_hitter()
    hitter_pred_data = clean_warp_hitter()
    pitcher_data = clean_sorted_pitcher()
    pitcher_pred_data = clean_warp_pitcher()
    war_values = clean_war()

    # Hitter features & targets
    x_warp, y_warp, x_war, y_war = [], [], [], []

    for index, row in hitter_pred_data.iterrows():
        name = row['Name'].split()[1]  # use last name
        hitter_row = hitter_data[hitter_data['Hitters'] == name]
        if not hitter_row.empty:
            stats = hitter_row[['K','BB','AVG','OBP','SLG']].values.flatten().tolist()
            x_warp.append(stats)
            x_war.append(stats)
            y_warp.append(row['WARP'])
            y_war.append(war_values.loc[index, 'Total War'])

    # Train/test split (hitters)
    x_warp_train, x_warp_test, y_warp_train, y_warp_test = train_test_split(
        x_warp, y_warp, test_size=0.25, random_state=1
    )
    x_war_train, x_war_test, y_war_train, y_war_test = train_test_split(
        x_war, y_war, test_size=0.25, random_state=1
    )

    # Pitcher features & targets
    a_warp, b_warp, a_war, b_war = [], [], [], []

    for index, row in pitcher_pred_data.iterrows():
        name = row['Name']
        pitcher_row = pitcher_data[pitcher_data['Pitchers'] == name]
        if not pitcher_row.empty:
            stats = pitcher_row[['IP','BB','K','HR','ERA']].values.flatten().tolist()
            a_warp.append(stats)
            b_warp.append(row['WARP'])
            b_war.append(war_values.loc[index, 'Primary WAR'])

    # Train/test split (pitchers)
    a_warp_train, a_warp_test, b_warp_train, b_warp_test = train_test_split(
        a_warp, b_warp, test_size=0.25, random_state=1
    )
    a_war_train, a_war_test, b_war_train, b_war_test = train_test_split(
        a_war, b_war, test_size=0.25, random_state=1
    )

    return (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
            x_war_train, x_war_test, y_war_train, y_war_test,
            a_warp_train, a_warp_test, b_warp_train, b_warp_test,
            a_war_train, a_war_test, b_war_train, b_war_test)


# -----------------------------
# Helper: Print metrics
# -----------------------------
def print_metrics(name, y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} - R2: {r2:.4f}, RMSE: {rmse:.4f}")


# -----------------------------
# Helper: Plot results
# -----------------------------
def plot_results(title, y_true, y_pred):
    df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_true})
    px.scatter(df, x='Predicted', y='Actual', title=title).show()


# -----------------------------
# Linear Regression
# -----------------------------
def performancelinear():
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test) = data_preparation()

    model = LinearRegression()

    # Hitter WARP
    model.fit(x_warp_train, y_warp_train)
    y_warp_pred = model.predict(x_warp_test)
    print_metrics("Linear Hitter WARP", y_warp_test, y_warp_pred)
    plot_results("Hitter WARP (Linear)", y_warp_test, y_warp_pred)

    # Hitter WAR
    model.fit(x_war_train, y_war_train)
    y_war_pred = model.predict(x_war_test)
    print_metrics("Linear Hitter WAR", y_war_test, y_war_pred)
    plot_results("Hitter WAR (Linear)", y_war_test, y_war_pred)

    # Pitcher WARP
    model.fit(a_warp_train, b_warp_train)
    b_warp_pred = model.predict(a_warp_test)
    print_metrics("Linear Pitcher WARP", b_warp_test, b_warp_pred)
    plot_results("Pitcher WARP (Linear)", b_warp_test, b_warp_pred)

    # Pitcher WAR
    model.fit(a_war_train, b_war_train)
    b_war_pred = model.predict(a_war_test)
    print_metrics("Linear Pitcher WAR", b_war_test, b_war_pred)
    plot_results("Pitcher WAR (Linear)", b_war_test, b_war_pred)


# -----------------------------
# Lasso Regression
# -----------------------------
def performancelasso():
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test) = data_preparation()

    model = Lasso()

    model.fit(x_warp_train, y_warp_train)
    y_warp_pred = model.predict(x_warp_test)
    print_metrics("Lasso Hitter WARP", y_warp_test, y_warp_pred)
    plot_results("Hitter WARP (Lasso)", y_warp_test, y_warp_pred)

    model.fit(x_war_train, y_war_train)
    y_war_pred = model.predict(x_war_test)
    print_metrics("Lasso Hitter WAR", y_war_test, y_war_pred)
    plot_results("Hitter WAR (Lasso)", y_war_test, y_war_pred)

    model.fit(a_warp_train, b_warp_train)
    b_warp_pred = model.predict(a_warp_test)
    print_metrics("Lasso Pitcher WARP", b_warp_test, b_warp_pred)
    plot_results("Pitcher WARP (Lasso)", b_warp_test, b_warp_pred)

    model.fit(a_war_train, b_war_train)
    b_war_pred = model.predict(a_war_test)
    print_metrics("Lasso Pitcher WAR", b_war_test, b_war_pred)
    plot_results("Pitcher WAR (Lasso)", b_war_test, b_war_pred)


# -----------------------------
# Elastic Net Regression
# -----------------------------
def performanceelasticnet():
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test) = data_preparation()

    model = ElasticNet()

    model.fit(x_warp_train, y_warp_train)
    y_warp_pred = model.predict(x_warp_test)
    print_metrics("ElasticNet Hitter WARP", y_warp_test, y_warp_pred)
    plot_results("Hitter WARP (ElasticNet)", y_warp_test, y_warp_pred)

    model.fit(x_war_train, y_war_train)
    y_war_pred = model.predict(x_war_test)
    print_metrics("ElasticNet Hitter WAR", y_war_test, y_war_pred)
    plot_results("Hitter WAR (ElasticNet)", y_war_test, y_war_pred)

    model.fit(a_warp_train, b_warp_train)
    b_warp_pred = model.predict(a_warp_test)
    print_metrics("ElasticNet Pitcher WARP", b_warp_test, b_warp_pred)
    plot_results("Pitcher WARP (ElasticNet)", b_warp_test, b_warp_pred)

    model.fit(a_war_train, b_war_train)
    b_war_pred = model.predict(a_war_test)
    print_metrics("ElasticNet Pitcher WAR", b_war_test, b_war_pred)
    plot_results("Pitcher WAR (ElasticNet)", b_war_test, b_war_pred)


# -----------------------------
# KNN Regression
# -----------------------------
def performanceknnvisualization():
    (x_warp_train, x_warp_test, y_warp_train, y_warp_test,
     x_war_train, x_war_test, y_war_train, y_war_test,
     a_warp_train, a_warp_test, b_warp_train, b_warp_test,
     a_war_train, a_war_test, b_war_train, b_war_test) = data_preparation()

    model = KNeighborsRegressor(n_neighbors=3)

    model.fit(x_warp_train, y_warp_train)
    y_warp_pred = model.predict(x_warp_test)
    print_metrics("KNN Hitter WARP", y_warp_test, y_warp_pred)
    plot_results("Hitter WARP (KNN)", y_warp_test, y_warp_pred)

    model.fit(x_war_train, y_war_train)
    y_war_pred = model.predict(x_war_test)
    print_metrics("KNN Hitter WAR", y_war_test, y_war_pred)
    plot_results("Hitter WAR (KNN)", y_war_test, y_war_pred)

    model.fit(a_warp_train, b_warp_train)
    b_warp_pred = model.predict(a_warp_test)
    print_metrics("KNN Pitcher WARP", b_warp_test, b_warp_pred)
    plot_results("Pitcher WARP (KNN)", b_warp_test, b_warp_pred)

    model.fit(a_war_train, b_war_train)
    b_war_pred = model.predict(a_war_test)
    print_metrics("KNN Pitcher WAR", b_war_test, b_war_pred)
    plot_results("Pitcher WAR (KNN)", b_war_test, b_war_pred)


# -----------------------------
# Main
# -----------------------------
def main():
    print("----- Linear Regression -----")
    performancelinear()
    print("\n----- Lasso Regression -----")
    performancelasso()
    print("\n----- ElasticNet Regression -----")
    performanceelasticnet()
    print("\n----- KNN Regression -----")
    performanceknnvisualization()


if __name__ == "__main__":
    main()


----- Linear Regression -----


ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.