In [1]:
import pandas as pd
import numpy as np

pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')
pd.reset_option('display.width')

# pd.set_option('display.max_rows',None)

In [2]:
# Remove any rows that are not of the "play" type since these are the rows that contain the pitch sequence
# Add header to the csv

team_abbrv = [
    'ANA','ARI','ATL','BAL','BOS','CHA','CHN','CIN','CLE','COL',
    'DET','HOU','KCA','LAN','MIA','MIL','MIN','NYA','NYN','OAK',
    'PHI','PIT','SDN','SEA','SFN','SLN','TBA','TEX','TOR','WAS'
] 

for team in team_abbrv:
    file_path = f"Event_Data/2022eve/2022{team}.csv"
    output_file_path = f"Event_Data/2022filtered/2022{team}.csv"

    headers = ['Type','Inning','Visiting/Home (0/1)','Retro PID','Count','Pitches','Event']

    with open(file_path, 'r') as f:
        with open(output_file_path, 'w') as out:
            out.write(','.join(headers) + '\n')
            for line in f:
                fields = line.strip().split(',')
                if fields[0] == 'play':
                    out.write(line)

In [3]:
matrix = np.zeros((15,15))
states = [
    '[0-0]', '[1-0]', '[0-1]', '[2-0]', '[1-1]', '[0-2]', '[3-0]', 
    '[2-1]', '[1-2]', '[3-1]', '[2-2]', '[3-2]', 'OUT', 'WALK', 'PLAY'
]
transition_df = pd.DataFrame(matrix, index=states, columns=states)
# Add a 1 at the diagonal intersection of OUT, WALK, and PLAY
transition_df.loc['OUT','OUT'] = 1
transition_df.loc['WALK','WALK'] = 1
transition_df.loc['PLAY','PLAY'] = 1

transition_df

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def create_transition_matrix(df):
    strike_list = ['C','K','S','O','T','L']
    foul_list = ['F']
    ball_list = ['B']
    in_play_list = ['X']

    # re-initialize transition matrix
    matrix = np.zeros((15,15))
    states = [
        '[0-0]', '[1-0]', '[0-1]', '[2-0]', '[1-1]', '[0-2]', '[3-0]', 
        '[2-1]', '[1-2]', '[3-1]', '[2-2]', '[3-2]', 'OUT', 'WALK', 'PLAY'
    ]
    transition_df = pd.DataFrame(matrix, index=states, columns=states)
    # Add a 1 at the diagonal intersection of OUT, WALK, and PLAY
    transition_df.loc['OUT','OUT'] = 1
    transition_df.loc['WALK','WALK'] = 1
    transition_df.loc['PLAY','PLAY'] = 1

    for index,row in df.iterrows():
        state = '[0-0]'
        balls = 0
        strikes = 0
        for pitch in row['Pitches']:
            if state == 'OUT' or state == 'WALK' or state == 'PLAY':
                break
            if pitch in foul_list:
                if strikes < 2:
                    strikes += 1
                transition_df.loc[state,f'[{balls}-{strikes}]'] += 1
                state = f'[{balls}-{strikes}]'
            if pitch in strike_list:
                strikes += 1
                if strikes == 3:
                    transition_df.loc[state,'OUT'] += 1
                    state = 'OUT'
                else:
                    transition_df.loc[state,f'[{balls}-{strikes}]'] += 1
                    state = f'[{balls}-{strikes}]'
            if pitch in ball_list:
                balls += 1
                if balls == 4:
                    transition_df.loc[state,'WALK'] += 1
                    state = 'WALK'
                else:
                    transition_df.loc[state,f'[{balls}-{strikes}]'] += 1
                    state = f'[{balls}-{strikes}]'
            if pitch in in_play_list:
                transition_df.loc[state,'PLAY'] += 1
                state = 'PLAY'
    # Make the OUT, WALK, and PLAY rows all 0 except for the diagonal
    transition_df.loc['OUT','OUT'] = 1
    transition_df.loc['WALK','WALK'] = 1
    transition_df.loc['PLAY','PLAY'] = 1
    transition_df.loc['OUT','WALK'] = 0
    transition_df.loc['OUT','PLAY'] = 0
    transition_df.loc['WALK','OUT'] = 0
    transition_df.loc['WALK','PLAY'] = 0
    transition_df.loc['PLAY','OUT'] = 0
    transition_df.loc['PLAY','WALK'] = 0
    return transition_df

In [5]:
master_df = transition_df.copy()
master_df

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
for team in team_abbrv:
    print(team)
    file_path = f"Event_Data/2022filtered/2022{team}.csv"
    df = pd.read_csv(file_path)

    # Since sequential rows with the same player ID represent repeated information, remove the previous row from the dataframe
    mask = df['Retro PID'] != df['Retro PID'].shift(-1)
    df = df[mask]
    df.reset_index(drop=True, inplace=True) 

    # Remove the following characters from each element in the Pitches column: +, *, ., 1, 2, 3, >
    chars_to_remove = '[+*\.123>]'
    df['Pitches'] = df['Pitches'].replace(chars_to_remove, '', regex=True)

    # Keep Rows that only contain: B, C, F, K, L, M, O, S, T, X
    chars_to_find = '[BCFKLMOSTX]'
    df = df[df['Pitches'].str.contains(chars_to_find, na=False)]
    df = df.reset_index(drop=True)  

    transition_df = create_transition_matrix(df)
    master_df = master_df.add(transition_df, fill_value=0)

ANA
ARI
ATL
BAL
BOS
CHA
CHN
CIN
CLE
COL
DET
HOU
KCA
LAN
MIA
MIL
MIN
NYA
NYN
OAK
PHI
PIT
SDN
SEA
SFN
SLN
TBA
TEX
TOR
WAS


In [7]:
master_df

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,69861.0,91708.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20010.0
[1-0],0.0,0.0,0.0,23620.0,35117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10919.0
[0-1],0.0,0.0,0.0,0.0,36510.0,39148.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15716.0
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,7236.0,12357.0,0.0,0.0,0.0,0.0,0.0,0.0,3907.0
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,24406.0,32622.0,0.0,0.0,0.0,0.0,0.0,14320.0
[0-2],0.0,0.0,0.0,0.0,0.0,9238.0,0.0,0.0,21450.0,0.0,0.0,0.0,8652.0,0.0,8788.0
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4473.0,0.0,0.0,0.0,2363.0,359.0
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10756.0,17807.0,0.0,0.0,0.0,8089.0
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15087.0,0.0,25876.0,0.0,13338.0,0.0,14493.0
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7731.0,0.0,4160.0,3289.0


In [8]:
master_df.to_csv('avg_counts_df.csv')

In [9]:
# normalize every row in master_df so that the sum of each row is 1
master_df = master_df.div(master_df.sum(axis=1), axis=0)
master_df.round(3)

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,0.385,0.505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11
[1-0],0.0,0.0,0.0,0.339,0.504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157
[0-1],0.0,0.0,0.0,0.0,0.4,0.428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.308,0.526,0.0,0.0,0.0,0.0,0.0,0.0,0.166
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342,0.457,0.0,0.0,0.0,0.0,0.0,0.201
[0-2],0.0,0.0,0.0,0.0,0.0,0.192,0.0,0.0,0.446,0.0,0.0,0.0,0.18,0.0,0.183
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.622,0.0,0.0,0.0,0.328,0.05
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293,0.486,0.0,0.0,0.0,0.221
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.219,0.0,0.376,0.0,0.194,0.0,0.211
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.509,0.0,0.274,0.217


In [10]:
master_df.to_csv('avg_rates_df.csv')

# KEY
      +  following pickoff throw by the catcher
      *  indicates the following pitch was blocked by the catcher
      .  marker for play not involving the batter
      1  pickoff throw to first
      2  pickoff throw to second
      3  pickoff throw to third
      >  Indicates a runner going on the pitch

      A  automatic strike, usually for pitch timer violation
      B  ball
      C  called strike
      F  foul
      H  hit batter
      I  intentional ball
      K  strike (unknown type)
      L  foul bunt
      M  missed bunt attempt
      N  no pitch (on balks and interference calls)
      O  foul tip on bunt
      P  pitchout
      Q  swinging on pitchout
      R  foul ball on pitchout
      S  swinging strike
      T  foul tip
      U  unknown or missed pitch
      V  called ball because pitcher went to his mouth or automatic ball on intentional walk or
         pitch timer violation
      X  ball put into play by batter
      Y  ball put into play on pitchout