In [1]:
import pandas as pd
import numpy as np

pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')
pd.reset_option('display.width')

# pd.set_option('display.max_rows',None)

In [2]:
def create_transition_matrix(df):
    strike_list = ['C','K','S','O','T','L']
    foul_list = ['F']
    ball_list = ['B']
    in_play_list = ['X']

    # re-initialize transition matrix
    matrix = np.zeros((15,15))
    states = [
        '[0-0]', '[1-0]', '[0-1]', '[2-0]', '[1-1]', '[0-2]', '[3-0]', 
        '[2-1]', '[1-2]', '[3-1]', '[2-2]', '[3-2]', 'OUT', 'WALK', 'PLAY'
    ]
    transition_df = pd.DataFrame(matrix, index=states, columns=states)
    # Add a 1 at the diagonal intersection of OUT, WALK, and PLAY
    transition_df.loc['OUT','OUT'] = 1
    transition_df.loc['WALK','WALK'] = 1
    transition_df.loc['PLAY','PLAY'] = 1

    for index,row in df.iterrows():
        state = '[0-0]'
        balls = 0
        strikes = 0
        for pitch in row['Pitches']:
            if state == 'OUT' or state == 'WALK' or state == 'PLAY':
                break
            if pitch in foul_list:
                if strikes < 2:
                    strikes += 1
                transition_df.loc[state,f'[{balls}-{strikes}]'] += 1
                state = f'[{balls}-{strikes}]'
            if pitch in strike_list:
                strikes += 1
                if strikes == 3:
                    transition_df.loc[state,'OUT'] += 1
                    state = 'OUT'
                else:
                    transition_df.loc[state,f'[{balls}-{strikes}]'] += 1
                    state = f'[{balls}-{strikes}]'
            if pitch in ball_list:
                balls += 1
                if balls == 4:
                    transition_df.loc[state,'WALK'] += 1
                    state = 'WALK'
                else:
                    transition_df.loc[state,f'[{balls}-{strikes}]'] += 1
                    state = f'[{balls}-{strikes}]'
            if pitch in in_play_list:
                transition_df.loc[state,'PLAY'] += 1
                state = 'PLAY'
    # Make the OUT, WALK, and PLAY rows all 0 except for the diagonal
    transition_df.loc['OUT','OUT'] = 1
    transition_df.loc['WALK','WALK'] = 1
    transition_df.loc['PLAY','PLAY'] = 1
    transition_df.loc['OUT','WALK'] = 0
    transition_df.loc['OUT','PLAY'] = 0
    transition_df.loc['WALK','OUT'] = 0
    transition_df.loc['WALK','PLAY'] = 0
    transition_df.loc['PLAY','OUT'] = 0
    transition_df.loc['PLAY','WALK'] = 0
    return transition_df

In [3]:
team_abbrv = [
    'ANA','ARI','ATL','BAL','BOS','CHA','CHN','CIN','CLE','COL',
    'DET','HOU','KCA','LAN','MIA','MIL','MIN','NYA','NYN','OAK',
    'PHI','PIT','SDN','SEA','SFN','SLN','TBA','TEX','TOR','WAS'
]

In [4]:
# Remove any rows that are not of the "play" type since these are the rows that contain the pitch sequence
# Add header to the csv
for team in team_abbrv:
    file_path = f"Event_Data/2022eve/2022{team}.csv"
    output_file_path = f"Event_Data/2022filtered/2022{team}.csv"

    headers = ['Type','Inning','Visiting/Home (0/1)','Retro PID','Count','Pitches','Event']

    with open(file_path, 'r') as f:
        with open(output_file_path, 'w') as out:
            out.write(','.join(headers) + '\n')
            for line in f:
                fields = line.strip().split(',')
                if fields[0] == 'play':
                    out.write(line)

In [5]:
def initialize_transition_df():
    matrix = np.zeros((15,15))
    states = [
        '[0-0]', '[1-0]', '[0-1]', '[2-0]', '[1-1]', '[0-2]', '[3-0]', 
        '[2-1]', '[1-2]', '[3-1]', '[2-2]', '[3-2]', 'OUT', 'WALK', 'PLAY'
    ]
    transition_df = pd.DataFrame(matrix, index=states, columns=states)
    # Add a 1 at the diagonal intersection of OUT, WALK, and PLAY
    transition_df.loc['OUT','OUT'] = 1
    transition_df.loc['WALK','WALK'] = 1
    transition_df.loc['PLAY','PLAY'] = 1
    return transition_df

transition_df = initialize_transition_df()

In [6]:
master_df = transition_df.copy()
master_df

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
for team in team_abbrv:
    print(team)
    file_path = f"Event_Data/2022filtered/2022{team}.csv"
    df = pd.read_csv(file_path)

    # Since sequential rows with the same player ID represent repeated information, remove the previous row from the dataframe
    mask = df['Retro PID'] != df['Retro PID'].shift(-1)
    df = df[mask]
    df.reset_index(drop=True, inplace=True) 

    # Remove the following characters from each element in the Pitches column: +, *, ., 1, 2, 3, >
    chars_to_remove = '[+*\.123>]'
    df['Pitches'] = df['Pitches'].replace(chars_to_remove, '', regex=True)

    # Keep Rows that only contain: B, C, F, K, L, M, O, S, T, X
    chars_to_find = '[BCFKLMOSTX]'
    df = df[df['Pitches'].str.contains(chars_to_find, na=False)]
    df = df.reset_index(drop=True)  

    transition_df = create_transition_matrix(df)
    master_df = master_df.add(transition_df, fill_value=0)

ANA
ARI
ATL
BAL
BOS
CHA
CHN
CIN
CLE
COL
DET
HOU
KCA
LAN
MIA
MIL
MIN
NYA
NYN
OAK
PHI
PIT
SDN
SEA
SFN
SLN
TBA
TEX
TOR
WAS


# Player Specific

In [80]:
players = ['hendk001','kersc001','waina001','wheez001','greiz001','montf001','bumgm001']
player_id = players[6]
valid_teams = []
for team in team_abbrv:
    file_path = f"Event_Data/2022eve/2022{team}.csv"
    output_file_path = f"Event_Data/2022{player_id}/2022{team}.csv"

    game_lines = []
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.strip().split(',')
            if fields[0] == 'id':
                start_row_exists = any(row[0] == 'start' and row[1] == player_id for row in game_lines)
                if start_row_exists:
                    game_lines.insert(0,prior_id)
                    with open(output_file_path, 'a') as out:
                        for game_line in game_lines:
                            out.write(','.join(game_line) + '\n')
                    if team not in valid_teams:
                        valid_teams.append(team)
                game_lines = []
                prior_id = fields
            else:
                game_lines.append(fields)

In [81]:
valid_teams

['ARI',
 'ATL',
 'CHN',
 'CIN',
 'COL',
 'LAN',
 'MIA',
 'PHI',
 'SDN',
 'SFN',
 'SLN',
 'WAS']

In [82]:
# Remove lines from games after pitcher is removed
for team in valid_teams:
    file_path = f"Event_Data/2022{player_id}/2022{team}.csv"
    output_file_path = f"Event_Data/2022{player_id}subs/2022{team}.csv"

    game_lines = []
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.strip().split(',')
            if fields[0] == 'id':
                current_game = fields
                subs = 0
                starting = True

            if fields[0] == 'start' and fields[1] == player_id:
                start_row = fields

            if fields[0] == 'sub' and fields[3] == start_row[3] and fields[4] == start_row[4] and fields[5] == start_row[5] and subs < 1:
                with open(output_file_path, 'a') as out:
                    for game_line in game_lines:
                        out.write(','.join(game_line) + '\n')
                game_lines = []
                starting = False
                subs += 1

            elif starting:
                game_lines.append(fields)

In [83]:
# Need to not count rows where player isn't the one pitching
for team in valid_teams:
    file_path = f"Event_Data/2022{player_id}subs/2022{team}.csv"
    output_file_path = f"Event_Data/2022{player_id}filtered/2022{team}.csv"

    headers = ['Type','Inning','Visiting/Home (0/1)','Retro PID','Count','Pitches','Event']

    with open(file_path, 'r') as f:
        with open(output_file_path, 'w') as out:
            out.write(','.join(headers) + '\n')
            for line in f:
                if fields[0] == 'start' and fields[1] == player_id:
                    home = fields[3]
                fields = line.strip().split(',')
                if fields[0] == 'play' and fields[2] != home:
                    out.write(line)

In [84]:
transition_df = initialize_transition_df()
player_df = transition_df.copy()
player_df

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
for team in valid_teams:
    print(team)
    file_path = f"Event_Data/2022{player_id}filtered/2022{team}.csv"
    df = pd.read_csv(file_path)

    # Since sequential rows with the same player ID represent repeated information, remove the previous row from the dataframe
    mask = df['Retro PID'] != df['Retro PID'].shift(-1)
    df = df[mask]
    df.reset_index(drop=True, inplace=True) 

    # Remove the following characters from each element in the Pitches column: +, *, ., 1, 2, 3, >
    chars_to_remove = '[+*\.123>]'
    df['Pitches'] = df['Pitches'].replace(chars_to_remove, '', regex=True)

    # Keep Rows that only contain: B, C, F, K, L, M, O, S, T, X
    chars_to_find = '[BCFKLMOSTX]'
    df = df[df['Pitches'].str.contains(chars_to_find, na=False)]
    df = df.reset_index(drop=True)  

    transition_df = create_transition_matrix(df)
    player_df = player_df.add(transition_df, fill_value=0)

ARI
ATL
CHN
CIN
COL
LAN
MIA
PHI
SDN
SFN
SLN
WAS


In [86]:
master_df.loc['OUT', 'OUT'] = 1
master_df.loc['WALK', 'WALK'] = 1
master_df.loc['PLAY', 'PLAY'] = 1


In [87]:
player_df.loc['OUT', 'OUT'] = 1
player_df.loc['WALK', 'WALK'] = 1
player_df.loc['PLAY', 'PLAY'] = 1

In [88]:
df_sum = master_df.sum().sum()
df_sum

705982.0

In [89]:
master_df.to_csv('avg_counts_df.csv')
player_df.to_csv(f'{player_id}_counts_df.csv')

In [90]:
# normalize every row in df so that the sum of each row is 1
master_rates_df = master_df.div(master_df.sum(axis=1), axis=0)
player_rates_df = player_df.div(player_df.sum(axis=1), axis=0)
master_rates_df.round(3)
player_rates_df.round(3)

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK,PLAY
[0-0],0.0,0.386,0.499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115
[1-0],0.0,0.0,0.0,0.347,0.481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172
[0-1],0.0,0.0,0.0,0.0,0.388,0.414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.344,0.505,0.0,0.0,0.0,0.0,0.0,0.0,0.151
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.355,0.412,0.0,0.0,0.0,0.0,0.0,0.233
[0-2],0.0,0.0,0.0,0.0,0.0,0.219,0.0,0.0,0.47,0.0,0.0,0.0,0.082,0.0,0.23
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.733,0.0,0.0,0.0,0.233,0.033
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264,0.45,0.0,0.0,0.0,0.286
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224,0.0,0.404,0.0,0.164,0.0,0.208
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.508,0.0,0.288,0.203


In [91]:
master_rates_df.to_csv('Dataframes/average_rates_df.csv')
player_rates_df.to_csv(f'Dataframes/{player_id}_rates_df.csv')

# KEY
      +  following pickoff throw by the catcher
      *  indicates the following pitch was blocked by the catcher
      .  marker for play not involving the batter
      1  pickoff throw to first
      2  pickoff throw to second
      3  pickoff throw to third
      >  Indicates a runner going on the pitch

      A  automatic strike, usually for pitch timer violation
      B  ball
      C  called strike
      F  foul
      H  hit batter
      I  intentional ball
      K  strike (unknown type)
      L  foul bunt
      M  missed bunt attempt
      N  no pitch (on balks and interference calls)
      O  foul tip on bunt
      P  pitchout
      Q  swinging on pitchout
      R  foul ball on pitchout
      S  swinging strike
      T  foul tip
      U  unknown or missed pitch
      V  called ball because pitcher went to his mouth or automatic ball on intentional walk or
         pitch timer violation
      X  ball put into play by batter
      Y  ball put into play on pitchout