In [29]:
import pandas as pd
import numpy as np
from micrograd.engine import Value
from micrograd.nn import Neuron, Layer, MLP 
from pybaseball import statcast, batting_stats
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')


For this example I will be training a binary classifer to predict if a pitch will be a home run or not based on a few pitch features. Because of the massive amount of data generated in baseball I am limiting the focus of this model to just 1 month of pitch data 

In [30]:
def get_data(start_date, end_date):
    pitch_data = statcast(start_date, end_date)
    features = []
    labels = []
    
    for _, pitch in pitch_data.iterrows():
        if pd.isna(pitch.release_speed) or pd.isna(pitch.release_spin_rate) or pd.isna(pitch.pfx_z):
            continue
        feature_vector = [
            float(pitch.release_speed), #pitch velocity
            float(pitch.release_spin_rate), #spin rate
            float(pitch.pfx_z) #vertical movement
        ]
        label = 1 if pitch.events == 'home_run' else 0 
        
        features.append(feature_vector)
        labels.append(label)
    return np.array(features),np.array(labels)
def normalize_data(X):
    mean = np.mean(X,axis=0)
    std = np.std(X,axis=0)
    return (X - mean) / (std + 1e-8), (mean,std)

In [31]:
if __name__ == "__main__":
    #get the data
    start_date = '2023-03-30'
    end_date = '2023-04-30'
    #fetch and prepare data
    X, y = get_data(start_date,end_date)
    X_normalized, (mean,std) = normalize_data(X)
    
    model = nn.MLP(3,[4,1])
    learning_rate = 0.01
    
    print("\nTraining Model")
    for epoch in range(100):
        total_loss = 0 
        
        for xi,yi in zip(X_normalized, y):
            x = [Value(xij) for xij in xi]
            
            pred = model(x)
            
            loss = (pred - yi) * (pred - yi)
            total_loss += loss.data
            
            model.zero_grad()
            loss.backward()
            
            for p in model.parameters():
                p.data += -learning_rate * p.grad
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {total_loss/len(X):.4f}')
    #print some stats
    print('\nDataset Statistics:')
    print(f"Total Pitches Analyzed: {len(y)}")
    print(f"Number of Home Runs: {sum(y)}")
    print(f"Home Run Rate: {sum(y)/len(y):.4%}")

This is a large query, it may take a moment to complete


100%|██████████| 32/32 [00:39<00:00,  1.24s/it]



Training Model
Epoch 0, Loss: 0.0080
Epoch 10, Loss: 0.0076
Epoch 20, Loss: 0.0076
Epoch 30, Loss: 0.0076
Epoch 40, Loss: 0.0076
Epoch 50, Loss: 0.0076
Epoch 60, Loss: 0.0076
Epoch 70, Loss: 0.0076
Epoch 80, Loss: 0.0076
Epoch 90, Loss: 0.0076

Dataset Statistics:
Total Pitches Analyzed: 124880
Number of Home Runs: 953
Home Run Rate: 0.7631%


In [35]:
example_pitch = [
        95.0,    # velocity (mph)
        2200.0,  # spin rate
        0.2      # vertical movement
    ]

In [36]:
example_normalized = (example_pitch - mean) / (std + 1e-8)

In [39]:
pred = model([Value(x) for x in example_normalized])
print(f"Pitch characteristics:")
print(f"- Velocity: {example_pitch[0]} mph")
print(f"- Spin Rate: {example_pitch[1]} rpm")
print(f"- Vertical Movement: {example_pitch[2]} inches")
print(f"Home run probability: {pred.data:.4%}")
pred


Pitch characteristics:
- Velocity: 95.0 mph
- Spin Rate: 2200.0 rpm
- Vertical Movement: 0.2 inches
Home run probability: -0.1517%


Value(data=-0.0015174030700386642, grad=0)