In [1]:
import pandas as pd
import numpy as np
import time
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from matplotlib import pyplot as plt

In [2]:
# Load data
# file_path = '../data/statcast-short.csv'
file_path = '../data/statcast_data_2016_2023.csv'
data = pd.read_csv(file_path)

  data = pd.read_csv(file_path)


In [3]:
data["game_date"] = pd.to_datetime(data["game_date"])

# How many pitches per year?
data["game_date"].dt.year.value_counts().sort_index()

game_date
2016    716073
2017    721244
2018    721190
2019    749399
2020    263584
2021    709852
2022    712392
2023    717945
Name: count, dtype: int64

In [4]:
# Preprocess data
data['events'].fillna(data["description"], inplace=True)

# Encode position within the event. Shown to be useful in the {batter, pitcher}2vec paper. There's probably a better way, but this works for now.
# data["events"] = data["events"] + " " + data["hit_location"].astype(str)

# Bin the hc_x and hc_y columns
data["hc_x_bin"] = pd.cut(data["hc_x"], bins=10, labels=False)
data["hc_y_bin"] = pd.cut(data["hc_y"], bins=10, labels=False)

location_string = data["hc_x_bin"].astype(str) + " " + data["hc_y_bin"].astype(str)

data["events"] = (data["events"] + " " + location_string).str.replace("nan", "").str.strip()

data["events"].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['events'].fillna(data["description"], inplace=True)


events
ball                   1677532
foul                    932675
called_strike           810557
swinging_strike         356935
strikeout               304841
                        ...   
field_out 3.0 9.0            1
double 9.0 6.0               1
double_play 9.0 5.0          1
field_out 8.0 0.0            1
triple 4.0 5.0               1
Name: count, Length: 757, dtype: int64

In [5]:
# Encode events

event_to_idx = {event: idx for idx, event in enumerate(data['events'].unique())}
idx_to_event = {idx: event for event, idx in event_to_idx.items()}
data['event_idx'] = data['events'].map(event_to_idx)

In [6]:
data["events"].value_counts()

events
ball                   1677532
foul                    932675
called_strike           810557
swinging_strike         356935
strikeout               304841
                        ...   
field_out 3.0 9.0            1
double 9.0 6.0               1
double_play 9.0 5.0          1
field_out 8.0 0.0            1
triple 4.0 5.0               1
Name: count, Length: 757, dtype: int64

In [7]:
# Combine batters and pitchers to get all players
all_players = pd.concat([data['batter'], data['pitcher']]).unique()

# Create a LabelEncoder for all players
le_players = LabelEncoder()
le_players.fit(all_players)

# Transform batter and pitcher columns
data['batter'] = le_players.transform(data['batter'])
data['pitcher'] = le_players.transform(data['pitcher'])

# Get the number of unique players
num_players = len(le_players.classes_)

In [8]:
# Create a LabelEncoder for pitch types
le_pitch_types = LabelEncoder()
le_pitch_types.fit(data['pitch_type'])

# Transform pitch_type column
data['pitch_type'] = le_pitch_types.transform(data['pitch_type'])

# Get the number of unique pitch types
num_pitch_types = len(le_pitch_types.classes_)
print(f"Number of unique pitch types: {num_pitch_types}")

Number of unique pitch types: 20


In [9]:
val_data = data.iloc[int(0.9*len(data)):]
data = data.iloc[:int(0.9*len(data))]

In [10]:
# Define dataset class
class BaseballDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        batter = row['batter']
        pitcher = row['pitcher']
        pitch_type = row['pitch_type']
        release_speed = row['release_speed']
        release_pos_x = row['release_pos_x']
        release_pos_z = row['release_pos_z']
        game_state = row[['home_score', 'away_score', 'balls', 'strikes', 'outs_when_up', 'inning']].values.astype(np.float32)
        outcome = row['event_idx']

        return torch.tensor([batter, pitcher], dtype=torch.long), \
               torch.tensor([pitch_type, release_speed, release_pos_x, release_pos_z], dtype=torch.float32), \
               torch.tensor(game_state, dtype=torch.float32), \
               torch.tensor(outcome, dtype=torch.long)

In [11]:
# class BaseballModel(nn.Module):
#     def __init__(self, num_players, embedding_dim, game_state_dim, hidden_dim, output_dim):
#         super(BaseballModel, self).__init__()
#         self.batter_embedding = nn.Embedding(num_players, embedding_dim)
#         self.pitcher_embedding = nn.Embedding(num_players, embedding_dim)
#         self.pitch_model = nn.Sequential(
#             nn.Linear(embedding_dim * 2 + game_state_dim, hidden_dim),  # 50 + 50 + 6 = 106
#             nn.ReLU(),
#             nn.Linear(hidden_dim, 4)  # Predict pitch_type, release_speed, release_pos_x, release_pos_z
#         )
#         self.outcome_model = nn.Sequential(
#             nn.Linear(embedding_dim * 2 + 4 + game_state_dim, hidden_dim),  # 50 + 50 + 4 + 6 = 110
#             nn.ReLU(),
#             nn.Linear(hidden_dim, output_dim)  # Predict outcome
#         )

#     def forward(self, players, game_state):
#         batter_emb = self.batter_embedding(players[:, 0])
#         pitcher_emb = self.pitcher_embedding(players[:, 1])
#         x = torch.cat([batter_emb, pitcher_emb, game_state], dim=1)

#         pitch_pred = self.pitch_model(x)

#         outcome_input = torch.cat([batter_emb, pitcher_emb, pitch_pred, game_state], dim=1)
#         outcome_pred = self.outcome_model(outcome_input)

#         return pitch_pred, outcome_pred

In [12]:
class BaseballModel(nn.Module):
    def __init__(self, num_players, embedding_dim, game_state_dim, hidden_dim, output_dim):
        super(BaseballModel, self).__init__()
        self.batter_embedding = nn.Embedding(num_players, embedding_dim)
        self.pitcher_embedding = nn.Embedding(num_players, embedding_dim)
        
        self.pitch_model = nn.Sequential(
            nn.Linear(embedding_dim * 2 + game_state_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, 4)  # Predict pitch_type, release_speed, release_pos_x, release_pos_z
        )
        
        self.outcome_model = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 4 + game_state_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, output_dim)  # Predict outcome
        )

    def forward(self, players, game_state):
        batter_emb = self.batter_embedding(players[:, 0])
        pitcher_emb = self.pitcher_embedding(players[:, 1])
        x = torch.cat([batter_emb, pitcher_emb, game_state], dim=1)

        pitch_pred = self.pitch_model(x)

        outcome_input = torch.cat([batter_emb, pitcher_emb, pitch_pred, game_state], dim=1)
        outcome_pred = self.outcome_model(outcome_input)

        return pitch_pred, outcome_pred

In [13]:
# Initialize model, loss function, and optimizer
num_players = data['batter'].nunique() + data['pitcher'].nunique()
embedding_dim = 128
game_state_dim = 6
hidden_dim = 256
output_dim = len(event_to_idx)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

model = BaseballModel(num_players, embedding_dim, game_state_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
def save_checkpoint(model, optimizer, epoch, avg_loss, avg_val_loss, filename):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'avg_loss': avg_loss,
        'avg_val_loss': avg_val_loss,
    }
    torch.save(checkpoint, filename)

def load_checkpoint(filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    avg_loss = checkpoint['avg_loss']
    avg_val_loss = checkpoint['avg_val_loss']
    return epoch, avg_loss, avg_val_loss

In [19]:
# Use this when you've found the right batch size

train_dataset = BaseballDataset(data)
val_dataset = BaseballDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=1048576, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1048576, shuffle=False)

num_epochs = 100
checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

avg_losses = []
avg_val_losses = []

load_checkpoint_filename = "../notebooks/checkpoints/checkpoint_epoch_50_1722738319.pth"
if load_checkpoint_filename:
    epoch, avg_loss, avg_val_loss = load_checkpoint(load_checkpoint_filename)
    print(f"Loaded checkpoint from {load_checkpoint_filename}, starting at epoch {epoch}, loss: {avg_loss}, val loss: {avg_val_loss}")


for epoch in tqdm(range(epoch + 1, epoch + num_epochs + 1)):
    epoch_start_time = time.time()
    total_loss = 0
    total_val_loss = 0

    model.train()
    for players, pitch_characteristics, game_state, outcome in train_loader:
        # Move data to GPU
        players = players.to(device)
        pitch_characteristics = pitch_characteristics.to(device)
        game_state = game_state.to(device)
        outcome = outcome.to(device)

        optimizer.zero_grad()

        pitch_pred, outcome_pred = model(players, game_state)

        loss = criterion(outcome_pred, outcome)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for players, pitch_characteristics, game_state, outcome in val_loader:
            players = players.to(device)
            pitch_characteristics = pitch_characteristics.to(device)
            game_state = game_state.to(device)
            outcome = outcome.to(device)

            pitch_pred, outcome_pred = model(players, game_state)

            val_loss = criterion(outcome_pred, outcome)
            total_val_loss += val_loss.item()
        
    avg_loss = total_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)

    avg_losses.append(avg_loss)
    avg_val_losses.append(avg_val_loss)

    epoch_end_time = time.time()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    print(f"Data Per Second: {len(data) / (epoch_end_time - epoch_start_time):.2f}")

    # Save checkpoint
    checkpoint_filename = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}_{int(time.time())}.pth')
    save_checkpoint(model, optimizer, epoch+1, avg_loss, avg_val_loss, checkpoint_filename)

print("Training complete")

# After training, if you want to use the model on CPU again
model = model.to("cpu")


  checkpoint = torch.load(filename)


Loaded checkpoint from ../notebooks/checkpoints/checkpoint_epoch_50_1722738319.pth, starting at epoch 50, loss: 2.5178525924682615, val loss: 2.4092178344726562


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# # Use this to find the right batch size for your machine/params for SPEED

# import time
# import torch
# from torch.utils.data import DataLoader
# from tqdm import tqdm

# def train_and_evaluate_sample(model, train_dataset, val_dataset, batch_size, num_batches, device, criterion, optimizer):
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#     model.train()
#     total_loss = 0
#     total_val_loss = 0
#     start_time = time.time()

#     # Train on a sample of batches
#     for i, (players, pitch_characteristics, game_state, outcome) in enumerate(train_loader):
#         if i >= num_batches:
#             break

#         players = players.to(device)
#         pitch_characteristics = pitch_characteristics.to(device)
#         game_state = game_state.to(device)
#         outcome = outcome.to(device)

#         optimizer.zero_grad()
#         pitch_pred, outcome_pred = model(players, game_state)
#         loss = criterion(outcome_pred, outcome)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     # Validate on a sample of batches
#     model.eval()
#     with torch.no_grad():
#         for i, (players, pitch_characteristics, game_state, outcome) in enumerate(val_loader):
#             if i >= num_batches:
#                 break

#             players = players.to(device)
#             pitch_characteristics = pitch_characteristics.to(device)
#             game_state = game_state.to(device)
#             outcome = outcome.to(device)

#             pitch_pred, outcome_pred = model(players, game_state)
#             val_loss = criterion(outcome_pred, outcome)
#             total_val_loss += val_loss.item()

#     avg_loss = total_loss / num_batches
#     avg_val_loss = total_val_loss / num_batches
#     sample_runtime = time.time() - start_time

#     # Estimate full epoch time
#     estimated_epoch_time = sample_runtime * (len(train_dataset) / (batch_size * num_batches))

#     return avg_loss, avg_val_loss, estimated_epoch_time

# def find_optimal_batch_size(model, train_dataset, val_dataset, device, criterion, optimizer):
#     batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152]
#     num_batches = 5  # Number of batches to sample for each batch size
#     results = []

#     for batch_size in tqdm(batch_sizes, desc="Testing batch sizes"):
#         model.to(device)
#         avg_loss, avg_val_loss, estimated_epoch_time = train_and_evaluate_sample(
#             model, train_dataset, val_dataset, batch_size, num_batches, device, criterion, optimizer
#         )
#         data_per_second = len(train_dataset) / estimated_epoch_time
#         results.append({
#             'batch_size': batch_size,
#             'avg_loss': avg_loss,
#             'avg_val_loss': avg_val_loss,
#             'data_per_second': data_per_second,
#             'estimated_epoch_time': estimated_epoch_time
#         })
#         print(f"Batch size: {batch_size}, Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Data/s: {data_per_second:.2f}, Est. Epoch Time: {estimated_epoch_time:.2f}s")

#     # Find the batch size with the lowest validation loss
#     best_batch_size = min(results, key=lambda x: x['avg_val_loss'])['batch_size']
    
#     print("\nResults:")
#     for result in results:
#         print(f"Batch size: {result['batch_size']}, Loss: {result['avg_loss']:.4f}, Val Loss: {result['avg_val_loss']:.4f}, Data/s: {result['data_per_second']:.2f}, Est. Epoch Time: {result['estimated_epoch_time']:.2f}s")
    
#     print(f"\nBest batch size based on validation loss: {best_batch_size}")

#     # Save results to CSV
#     df = pd.DataFrame(results)
#     df.to_csv('batch_size_results.csv', index=False)
#     print("Results saved to batch_size_results.csv")

#     return results, best_batch_size

# train_dataset = BaseballDataset(data)
# val_dataset = BaseballDataset(val_data)

# train_loader = DataLoader(train_dataset, batch_size=262144, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=262144, shuffle=False)

# # Usage
# results, best_batch_size = find_optimal_batch_size(model, train_dataset, val_dataset, device, criterion, optimizer)

# # Train the model with the best batch size
# best_train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
# best_val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)

# # ... (rest of your training code using best_train_loader and best_val_loader)

# print("Training complete")

# # After training, if you want to use the model on CPU again
# model = model.to("cpu")

Testing batch sizes:   0%|          | 0/17 [00:00<?, ?it/s]

Testing batch sizes:   6%|▌         | 1/17 [00:00<00:12,  1.24it/s]

Batch size: 32, Loss: 6.8497, Val Loss: 6.5881, Data/s: 199.42, Est. Epoch Time: 23972.46s


Testing batch sizes:  12%|█▏        | 2/17 [00:01<00:11,  1.31it/s]

Batch size: 64, Loss: 6.8335, Val Loss: 6.5891, Data/s: 436.45, Est. Epoch Time: 10953.18s


Testing batch sizes:  18%|█▊        | 3/17 [00:02<00:12,  1.09it/s]

Batch size: 128, Loss: 6.7736, Val Loss: 6.5848, Data/s: 583.93, Est. Epoch Time: 8186.81s


Testing batch sizes:  24%|██▎       | 4/17 [00:04<00:16,  1.30s/it]

Batch size: 256, Loss: 6.7374, Val Loss: 6.5752, Data/s: 675.54, Est. Epoch Time: 7076.55s


Testing batch sizes:  29%|██▉       | 5/17 [00:07<00:23,  1.97s/it]

Batch size: 512, Loss: 6.7014, Val Loss: 6.5642, Data/s: 810.08, Est. Epoch Time: 5901.25s


Testing batch sizes:  35%|███▌      | 6/17 [00:13<00:35,  3.26s/it]

Batch size: 1024, Loss: 6.6296, Val Loss: 6.5304, Data/s: 887.45, Est. Epoch Time: 5386.78s


Testing batch sizes:  41%|████      | 7/17 [00:24<00:58,  5.85s/it]

Batch size: 2048, Loss: 6.6074, Val Loss: 6.5169, Data/s: 917.15, Est. Epoch Time: 5212.33s


Testing batch sizes:  47%|████▋     | 8/17 [00:46<01:38, 10.93s/it]

Batch size: 4096, Loss: 6.5425, Val Loss: 6.4826, Data/s: 939.21, Est. Epoch Time: 5089.91s


Testing batch sizes:  53%|█████▎    | 9/17 [01:30<02:49, 21.18s/it]

Batch size: 8192, Loss: 6.4895, Val Loss: 6.4416, Data/s: 936.72, Est. Epoch Time: 5103.47s


Testing batch sizes:  59%|█████▉    | 10/17 [02:59<04:54, 42.07s/it]

Batch size: 16384, Loss: 6.4386, Val Loss: 6.3941, Data/s: 921.96, Est. Epoch Time: 5185.16s


Testing batch sizes:  65%|██████▍   | 11/17 [05:54<08:17, 82.88s/it]

Batch size: 32768, Loss: 6.3750, Val Loss: 6.3390, Data/s: 934.16, Est. Epoch Time: 5117.47s


Testing batch sizes:  71%|███████   | 12/17 [11:44<13:40, 164.08s/it]

Batch size: 65536, Loss: 6.3007, Val Loss: 6.2733, Data/s: 936.76, Est. Epoch Time: 5103.23s


Testing batch sizes:  76%|███████▋  | 13/17 [21:36<19:35, 293.84s/it]

Batch size: 131072, Loss: 6.2136, Val Loss: 6.1925, Data/s: 1106.21, Est. Epoch Time: 4321.53s


Testing batch sizes:  82%|████████▏ | 14/17 [37:31<24:40, 493.53s/it]

Batch size: 262144, Loss: 6.1090, Val Loss: 3.6515, Data/s: 1372.57, Est. Epoch Time: 3482.88s


Testing batch sizes:  88%|████████▊ | 15/17 [1:05:40<28:27, 853.73s/it]

Batch size: 524288, Loss: 5.9824, Val Loss: 2.3779, Data/s: 1552.52, Est. Epoch Time: 3079.20s


Testing batch sizes:  94%|█████████▍| 16/17 [1:46:14<22:09, 1329.43s/it]

Batch size: 1048576, Loss: 5.8300, Val Loss: 1.1522, Data/s: 2153.92, Est. Epoch Time: 2219.45s


In [None]:
# Plot the losses
plt.figure(figsize=(10, 5))
plt.plot(avg_losses, label='Training Loss')
plt.plot(avg_val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.legend()
plt.show()

NameError: name 'avg_losses' is not defined

<Figure size 1000x500 with 0 Axes>

In [None]:
# Save the model
torch.save(model.state_dict(), 'enhanced_baseball_model.pth')

In [None]:
def predict(model, player_data, game_state_data, sample=False):
    model.eval()
    with torch.no_grad():
        players = torch.tensor(player_data, dtype=torch.long).unsqueeze(0)
        game_state = torch.tensor(game_state_data, dtype=torch.float32).unsqueeze(0)
        
        pitch_pred, outcome_pred = model(players, game_state)
        
        outcome_prob = torch.softmax(outcome_pred, dim=1)
        
        if sample:
            # Sample from the probability distribution
            predicted_outcome = torch.multinomial(outcome_prob, num_samples=1).item()
        else:
            # Choose the most likely outcome (argmax)
            predicted_outcome = torch.argmax(outcome_prob, dim=1).item()
        
        return idx_to_event[predicted_outcome]

In [None]:
# Example prediction
model = model.to("cpu")
player_data = [data.iloc[5]['batter'], data.iloc[5]['pitcher']]
game_state_data = data.iloc[0][['home_score', 'away_score', 'balls', 'strikes', 'outs_when_up', 'inning']].values.astype(np.float32)

print(player_data)

predicted_event = predict(model, player_data, game_state_data, sample=True)
print(f'Predicted event: {predicted_event}')


[1327, 743]
Predicted event: other_out
