# Data Collection and Processing

This notebook is going to cover the data collection from pybaseball, as well as cleaning the data and picking features. The data will be focused on Red Sox hitters in the 2024 season. Pitcher data will be collected based on pitchers that the Red Sox hitters faced throughout the season.

In [1]:
import pandas as pd

In [2]:
from pybaseball import batting_stats

all_qualified_2024 = batting_stats(2024, qual=100)

red_sox_qualified_batters = all_qualified_2024[all_qualified_2024["Team"] == "BOS"]

red_sox_qualified_batters.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
41,24617,2024,Jarren Duran,BOS,27,160,671,735,191,108,...,113.9,225,0.437,515,0.161,0.266,0.271,0.448,0.338,6.4
30,17350,2024,Rafael Devers,BOS,27,138,525,601,143,76,...,114.7,201,0.523,384,0.123,0.266,0.272,0.509,0.364,3.9
84,23772,2024,Wilyer Abreu,BOS,25,132,399,447,101,51,...,114.4,139,0.498,279,0.157,0.282,0.229,0.418,0.317,2.8
35,15711,2024,Tyler O'Neill,BOS,29,113,411,473,99,50,...,113.1,123,0.484,254,0.164,0.308,0.213,0.48,0.339,2.1
214,27531,2024,David Hamilton,BOS,26,98,294,317,73,47,...,108.8,69,0.322,214,0.187,0.29,0.231,0.346,0.281,1.9


In [3]:
from pybaseball import playerid_lookup

red_sox_names = red_sox_qualified_batters["Name"].unique()

player_ids = []
for name in red_sox_names:
    first, last = name.split(" ", 1)
    res = playerid_lookup(last, first)
    if not res.empty:
        player_ids.append(res.iloc[0])

player_ids_df = pd.DataFrame(player_ids)

Gathering player lookup table. This may take a moment.


In [4]:
player_ids_df.head()

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,duran,jarren,680776,duraj001,duranja01,24617,2021.0,2025.0
0,devers,rafael,646240,dever001,deverra01,17350,2017.0,2025.0
0,abreu,wilyer,677800,abrew002,abreuwi02,23772,2023.0,2025.0
0,o'neill,tyler,641933,oneit001,oneilty01,15711,2018.0,2025.0
0,hamilton,david,666152,hamid002,hamilda03,27531,2023.0,2025.0


In [7]:
from pybaseball import statcast
import pybaseball.cache

pybaseball.cache.enable()

data = statcast('2024-04-01', '2024-10-30', team='TEX')

data.head()



This is a large query, it may take a moment to complete


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_cop

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
73,SL,2024-09-29,83.5,-2.8,5.86,"Robertson, David",694359,502085,strikeout,swinging_strike,...,1,3,2,1,,,3.69,-1.45,1.45,43.6
75,FC,2024-09-29,94.9,-2.81,5.79,"Robertson, David",694359,502085,,foul,...,1,3,2,1,,,1.17,-0.08,0.08,40.7
78,FC,2024-09-29,94.3,-2.87,5.77,"Robertson, David",694359,502085,,foul,...,1,3,2,1,,,1.39,-0.2,0.2,40.3
79,KC,2024-09-29,86.3,-2.71,5.92,"Robertson, David",694359,502085,,ball,...,1,3,2,1,,,3.81,-0.62,0.62,44.5
81,FC,2024-09-29,93.9,-2.78,5.86,"Robertson, David",694359,502085,,foul,...,1,3,2,1,,,1.42,-0.4,0.4,41.8


In [10]:
data.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description',
       ...
       'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat',
       'pitcher_days_since_prev_game', 'batter_days_since_prev_game',
       'pitcher_days_until_next_game', 'batter_days_until_next_game',
       'api_break_z_with_gravity', 'api_break_x_arm', 'api_break_x_batter_in',
       'arm_angle'],
      dtype='object', length=113)

In [11]:
from pybaseball import batting_stats_range

df = batting_stats_range("2024-04-01", "2024-05-01")

df.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,mlbID
1,CJ Abrams,23,349,Maj-NL,Washington,24,109,99,18,29,...,0,0,0,4,2,0.293,0.358,0.586,0.944,682928
2,Jos\xc3\xa9 Abreu,37,352,Maj-AL,Houston,18,64,60,5,7,...,0,1,2,0,0,0.117,0.156,0.133,0.29,547989
3,Wilyer Abreu,25,349,Maj-AL,Boston,23,83,72,15,24,...,0,1,0,5,0,0.333,0.41,0.556,0.965,677800
4,Ronald Acu\xc3\xb1a Jr.,26,349,Maj-NL,Atlanta,26,122,103,25,25,...,0,0,2,12,1,0.243,0.361,0.32,0.681,660670
5,Willy Adames,28,349,Maj-NL,Milwaukee,27,120,103,16,28,...,0,1,1,4,1,0.272,0.367,0.505,0.872,642715
