In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from itertools import product
from scipy.ndimage import gaussian_filter
from itertools import product
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
from fpdf import FPDF
import joblib
from datetime import datetime
import helper_functions as hf
from constants import(
    platoon_state_mapping,
    side_buckets,
    height_buckets,
    count_values,
    num_clusters,
    numerical_features,
    pseudo_sample_size,
    median_features
)
import warnings
warnings.filterwarnings('ignore')

rv_model = joblib.load('rv_model.pkl')
gmm_models = hf.load_gmm_models()

pitches_df = pd.read_csv('all_pitches.csv')
global_means = pd.read_csv('global_means.csv')

pitches_df = hf.prepare_data(pitches_df, game_only=True)

In [2]:
result_class_mapping = {
    "Out": 0,
    "Error": 0,
    "Fielderschoice": 0,
    "Single": 1,
    "Double": 2,
    "Triple": 3,
    "HomeRun": 4
}

pitches_df["ResultClass"] = pitches_df["PlayResult"].map(result_class_mapping)

In [4]:
features = ['ExitSpeed', 'Angle', 'DirectionBucket']
target = 'ResultClass'

model_df = pitches_df.dropna(subset=features + [target])

X = model_df[features]
y = model_df[target]

In [6]:
import pandas as pd
import joblib
import helper_functions as hf
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import log_loss, make_scorer
import numpy as np
from constants import rv_features, rv_target
import warnings
warnings.filterwarnings('ignore')

log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

def objective(trial):
    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'num_class': 5
    }

    model = xgb.XGBClassifier(**params, random_state=100)

    kf = KFold(n_splits=5, shuffle=True, random_state=100)
    cv_scores = cross_val_score(model, X, y, scoring=log_loss_scorer, cv=kf)

    return np.mean(cv_scores)

multiclass_study = optuna.create_study(direction='minimize')
multiclass_study.optimize(objective, n_trials=25)

multiclass_best_params = multiclass_study.best_params
multiclass_model = xgb.XGBClassifier(**multiclass_best_params, random_state=100, num_class=5)
multiclass_model.fit(X, y)

print(f"Best Parameters: {multiclass_best_params}")

joblib.dump(multiclass_model, 'multiclass_model.pkl')

print("Multiclass Model saved!")


[I 2025-03-11 11:39:15,225] A new study created in memory with name: no-name-4e6ec393-ea05-4ed2-bd4c-49d419d7bc9b
[I 2025-03-11 11:39:16,237] Trial 0 finished with value: -0.6308115107255305 and parameters: {'learning_rate': 0.18562485556268069, 'max_depth': 6, 'min_child_weight': 10, 'subsample': 0.5218859042265322, 'colsample_bytree': 0.7035227314705224, 'lambda': 0.2590617887113503, 'alpha': 0.001242082957942944, 'gamma': 4.755736576810502, 'n_estimators': 50}. Best is trial 0 with value: -0.6308115107255305.
[I 2025-03-11 11:39:19,359] Trial 1 finished with value: -0.6128629360002783 and parameters: {'learning_rate': 0.15763320986093884, 'max_depth': 5, 'min_child_weight': 6, 'subsample': 0.7125846218326882, 'colsample_bytree': 0.9439599993456766, 'lambda': 0.01142718718542299, 'alpha': 0.05475907126389026, 'gamma': 0.20498626924271457, 'n_estimators': 140}. Best is trial 0 with value: -0.6308115107255305.
[I 2025-03-11 11:39:22,844] Trial 2 finished with value: -0.6135983118207502