# Pitcher P4 vs Non-P4 D1 (Baseline)

Train a logistic regression model on D1-only pitchers to predict Power 4 vs Non-P4 D1.


In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score


In [14]:
data_path = r"/Users/ryankolodziejczyk/Documents/AI Baseball Recruitment/code/backend/data/pitchers/pitchers_data_clean.csv"
df = pd.read_csv(data_path)
print('shape', df.shape)
df.head()


shape (18343, 28)


Unnamed: 0,primary_position,height,weight,throwing_hand,FastballVelocity (max),FastballVelo Range,FastballSpin Rate (avg),Changeup Velo Range,Changeup Spin Rate (avg),Curveball Velo Range,...,fb_sl_velo_diff,FastballVelocity (max)_missing,FastballVelo Range_missing,FastballSpin Rate (avg)_missing,Changeup Velo Range_missing,Changeup Spin Rate (avg)_missing,Curveball Velo Range_missing,Curveball Spin Rate (avg)_missing,Slider Velo Range_missing,Slider Spin Rate (avg)_missing
0,RHP,74.0,225.0,R,91.0,89.5,2325.0,81.5,1787.0,77.0,...,,0,0,0,0,0,0,0,1,1
1,RHP,76.0,180.0,R,87.0,84.5,2067.0,78.5,1661.0,76.5,...,,0,0,0,0,0,0,0,1,1
2,LHP,70.0,160.0,L,81.0,80.0,2121.0,69.5,1670.0,65.4,...,,0,0,0,0,0,0,0,1,1
3,RHP,75.0,195.0,R,85.8,83.0,2151.0,72.2,1719.0,66.8,...,16.4,0,0,0,0,0,0,0,0,0
4,RHP,71.0,170.0,R,90.0,87.3,2216.0,81.3,1952.0,75.0,...,16.8,0,0,0,0,0,0,0,0,1


## Filter to D1 only
Use D1 groups only (Power 4, Mid Major, Low Major). Target is Power 4 vs Non-P4 D1.

In [15]:
d1_groups = {'Power 4', 'Mid Major', 'Low Major'}
df_d1 = df[df['group'].isin(d1_groups)].copy()
df_d1['p4_or_not'] = (df_d1['group'] == 'Power 4').astype(int)
print(df_d1['p4_or_not'].value_counts())


p4_or_not
0    4150
1    2253
Name: count, dtype: int64


## Feature set
Drop group/target columns; keep all pitcher attributes and missingness flags.

In [16]:
X = df_d1.drop(columns=['group', 'p4_or_not', 'throwing_hand'])
y = df_d1['p4_or_not']
print('features', X.shape)
X.head()


features (6403, 26)


Unnamed: 0,primary_position,height,weight,FastballVelocity (max),FastballVelo Range,FastballSpin Rate (avg),Changeup Velo Range,Changeup Spin Rate (avg),Curveball Velo Range,Curveball Spin Rate (avg),...,fb_sl_velo_diff,FastballVelocity (max)_missing,FastballVelo Range_missing,FastballSpin Rate (avg)_missing,Changeup Velo Range_missing,Changeup Spin Rate (avg)_missing,Curveball Velo Range_missing,Curveball Spin Rate (avg)_missing,Slider Velo Range_missing,Slider Spin Rate (avg)_missing
2,LHP,70.0,160.0,81.0,80.0,2121.0,69.5,1670.0,65.4,1942.0,...,,0,0,0,0,0,0,0,1,1
4,RHP,71.0,170.0,90.0,87.3,2216.0,81.3,1952.0,75.0,2360.0,...,16.8,0,0,0,0,0,0,0,0,1
9,LHP,77.0,190.0,,,,,,,,...,,1,1,1,1,1,1,1,1,1
10,LHP,74.0,207.0,,,,,,,,...,,1,1,1,1,1,1,1,1,1
11,LHP,70.0,170.0,,84.0,,75.5,,70.5,,...,16.5,1,0,1,0,1,0,1,0,1


## Train/test split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('train', X_train.shape, 'test', X_test.shape)


train (5122, 26) test (1281, 26)


## Baseline model (LogReg + KNN imputer)
Numeric pipeline uses scaling before KNN imputation (per instruction).

In [18]:
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer(n_neighbors=10))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
])

clf = LogisticRegression(max_iter=5000, class_weight='balanced')

model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', clf)
])

model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

print('Accuracy:', accuracy_score(y_test, pred))
print('ROC-AUC:', roc_auc_score(y_test, proba))
print(classification_report(y_test, pred, digits=3))
print(confusion_matrix(y_test, pred))


Accuracy: 0.6494925839188135
ROC-AUC: 0.7104560147463468
              precision    recall  f1-score   support

           0      0.770     0.654     0.707       830
           1      0.502     0.641     0.563       451

    accuracy                          0.649      1281
   macro avg      0.636     0.648     0.635      1281
weighted avg      0.676     0.649     0.657      1281

[[543 287]
 [162 289]]


## LightGBM baseline (native missing values)
No numeric imputation; LightGBM handles NaNs directly.


In [21]:
# LightGBM with native missing values (only categorical imputation + one-hot)
cat_cols_lgb = X.select_dtypes(include='object').columns.tolist()
num_cols_lgb = [c for c in X.columns if c not in cat_cols_lgb]

numeric_transformer_lgb = Pipeline(steps=[
    ('identity', 'passthrough')
])

categorical_transformer_lgb = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocess_lgb = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols_lgb),
        ('cat', categorical_transformer_lgb, cat_cols_lgb),
])

lgbm = lgb.LGBMClassifier(
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42
)

lgb_model = Pipeline(steps=[
    ('preprocess', preprocess_lgb),
    ('clf', lgbm)
])

lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
lgb_proba = lgb_model.predict_proba(X_test)[:, 1]

print('Accuracy:', accuracy_score(y_test, lgb_pred))
print('ROC-AUC:', roc_auc_score(y_test, lgb_proba))
print(classification_report(y_test, lgb_pred, digits=3))
print(confusion_matrix(y_test, lgb_pred))


[LightGBM] [Info] Number of positive: 1802, number of negative: 3320
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2555
[LightGBM] [Info] Number of data points in the train set: 5122, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Accuracy: 0.6799375487900078
ROC-AUC: 0.6997341917559372
              precision    recall  f1-score   support

           0      0.745     0.770     0.757       830
           1      0.548     0.514     0.531       451

    accuracy                          0.680      1281
   macro avg      0.647     0.642     0.644      1281
weighted avg      0.676     0.680     0.677      1281

[[639 191]
 [219 232]]




The LGBM is clearly worse (lower ROC‑AUC, precision, recall). With our current data, I think that we are near the ceiling for our data. We will now do CV work to find optimal cutoff with our LogReg + KNN model.

## CV threshold tuning (LogReg + KNN)
Maximize precision with guardrails on accuracy/recall.


In [24]:
# K-fold CV to pick threshold: maximize precision with guardrails
clf_cv = LogisticRegression(max_iter=5000, class_weight='balanced')
model_cv = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', clf_cv)
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
proba_cv = cross_val_predict(model_cv, X, y, cv=skf, method='predict_proba')[:, 1]

thresholds = np.linspace(0.3, 0.8, 101)
rows = []
for t in thresholds:
    preds = (proba_cv >= t).astype(int)
    rows.append({
        'threshold': t,
        'accuracy': accuracy_score(y, preds),
        'precision': precision_score(y, preds, zero_division=0),
        'recall': recall_score(y, preds),
        'f1': f1_score(y, preds)
    })

results = pd.DataFrame(rows)
# Guardrails: adjust as needed for P4 precision vs recall trade-off
guarded = results[(results['accuracy'] >= 0.62) & (results['recall'] >= 0.50)]
if len(guarded) == 0:
    print('No thresholds met guardrails. Showing top precision overall.')
    best = results.sort_values(['precision','f1'], ascending=False).head(10)
else:
    best = guarded.sort_values(['precision','f1'], ascending=False).head(10)

print('Top thresholds by precision (with guardrails if available):')
display(best)

best_row = best.iloc[0]
print('Selected threshold:', best_row['threshold'])
print('Accuracy:', best_row['accuracy'])
print('Precision:', best_row['precision'])
print('Recall:', best_row['recall'])
print('F1:', best_row['f1'])


Top thresholds by precision (with guardrails if available):


Unnamed: 0,threshold,accuracy,precision,recall,f1
55,0.575,0.697954,0.581253,0.506436,0.541271
54,0.57,0.695455,0.574963,0.515757,0.543753
53,0.565,0.694206,0.571084,0.525965,0.547597
52,0.56,0.694206,0.569608,0.53573,0.55215
51,0.555,0.695143,0.569132,0.549933,0.559368
50,0.55,0.694987,0.567325,0.56103,0.56416
49,0.545,0.693425,0.563098,0.574345,0.568666
48,0.54,0.691551,0.558898,0.585442,0.571862
47,0.535,0.690145,0.555418,0.598313,0.576068
46,0.53,0.689052,0.55261,0.610741,0.580223


Selected threshold: 0.575
Accuracy: 0.6979540840231142
Precision: 0.5812531839021905
Recall: 0.5064358632933866
F1: 0.5412713472485768


In [29]:
# using same prior LogReg model, but with the CV-selected threshold
best_thr = best_row['threshold']

proba_thr = model.predict_proba(X_test)[:, 1]
pred_thr = (proba_thr >= 0.555).astype(int)

print('Accuracy:', accuracy_score(y_test, pred_thr))
print('ROC-AUC:', roc_auc_score(y_test, proba_thr))
print(classification_report(y_test, pred_thr, digits=3))
print(confusion_matrix(y_test, pred_thr))


Accuracy: 0.6775956284153005
ROC-AUC: 0.7104560147463468
              precision    recall  f1-score   support

           0      0.750     0.754     0.752       830
           1      0.543     0.537     0.540       451

    accuracy                          0.678      1281
   macro avg      0.646     0.645     0.646      1281
weighted avg      0.677     0.678     0.677      1281

[[626 204]
 [209 242]]


## Save model (P4 vs Non-P4 D1)
We use cutoff = 0.555 based on CV precision/recall balance.


In [30]:
import os
import json as _json
import joblib
from datetime import datetime

# Ensure `model` is trained before saving
version_tag = datetime.now().strftime('version_%m%d%Y')
model_dir = f'/Users/ryankolodziejczyk/Documents/AI Baseball Recruitment/code/backend/ml/models/models_p/models_p4_or_not_p/{version_tag}'
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, 'logreg_knn_model.pkl')
metadata_path = os.path.join(model_dir, 'model_metadata.json')

joblib.dump(model, model_path)

metadata = {
    'model_type': 'LogisticRegression',
    'imputer': 'KNNImputer(n_neighbors=10) with StandardScaler before imputation',
    'threshold': 0.555,
    'features': list(X.columns),
    'notes': 'P4 vs Non-P4 D1. Selected for simplicity and stable CV precision/recall balance.'
}

with open(metadata_path, 'w') as f:
    _json.dump(metadata, f, indent=2)

print('Saved model to:', model_path)
print('Saved metadata to:', metadata_path)


Saved model to: /Users/ryankolodziejczyk/Documents/AI Baseball Recruitment/code/backend/ml/models/models_p/models_p4_or_not_p/version_02042026/logreg_knn_model.pkl
Saved metadata to: /Users/ryankolodziejczyk/Documents/AI Baseball Recruitment/code/backend/ml/models/models_p/models_p4_or_not_p/version_02042026/model_metadata.json
