In [39]:
import pandas as pd
import numpy as np
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')
pd.reset_option('display.width')

# pd.set_option('display.max_rows',None)

In [40]:
matrix = np.zeros((14,14))
states = [
    '[0-0]', '[1-0]', '[0-1]', '[2-0]', '[1-1]', '[0-2]', '[3-0]', 
    '[2-1]', '[1-2]', '[3-1]', '[2-2]', '[3-2]', 'OUT', 'WALK'
]

transition_df = pd.DataFrame(matrix, index=states, columns=states)
transition_df

Unnamed: 0,[0-0],[1-0],[0-1],[2-0],[1-1],[0-2],[3-0],[2-1],[1-2],[3-1],[2-2],[3-2],OUT,WALK
[0-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[0-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-0],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[2-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[1-2],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[3-1],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Remove any rows that are not of the "play" type since these are the rows that contain the pitch sequence
# Add header to the csv

team_abbrv = [
    'ANA','ARI','ATL','BAL','BOS','CHA','CHN','CIN','CLE','COL',
    'DET','HOU','KCA','LAN','MIA','MIL','MIN','NYA','NYN','OAK',
    'PHI','PIT','SDN','SEA','SFN','SLN','TBA','TEX','TOR','WAS'
]

for team in team_abbrv:
    file_path = f"Event_Data/2022eve/2022{team}.csv"
    output_file_path = f"Event_Data/2022filtered/2022{team}.csv"

    headers = ['Type','Inning','Visiting/Home (0/1)','Retro PID','Count','Pitches','Event']

    with open(file_path, 'r') as f:
        with open(output_file_path, 'w') as out:
            out.write(','.join(headers) + '\n')
            for line in f:
                fields = line.strip().split(',')
                if fields[0] == 'play':
                    out.write(line)

In [42]:
df = pd.read_csv(output_file_path)

In [43]:
df

Unnamed: 0,Type,Inning,Visiting/Home (0/1),Retro PID,Count,Pitches,Event
0,play,1,0,marts002,22,CBCBX,S9/L89S-
1,play,1,0,davij006,11,BS11>B,CS2(24)
2,play,1,0,davij006,31,BS11>B.BB,W
3,play,1,0,lindf001,2,*SFFC,K
4,play,1,0,alonp001,32,FCB*B*B>X,6/F56D
...,...,...,...,...,...,...,...
7152,play,6,1,adamr004,2,.SC*S,K
7153,play,6,1,thoml002,32,CBBBFB,W
7154,play,6,1,abrac001,11,BFX,D9/G3.1-3
7155,play,6,1,menej001,11,BSX,63/G6.3-H


In [44]:
# Since sequential rows with the same player ID represent repeated information, remove the previous row from the dataframe

# First, create a mask to identify where the "Retro PID" changes. 
# This mask will be `True` at every position where "Retro PID" is different from the next one.
mask = df['Retro PID'] != df['Retro PID'].shift(-1)

# Now, filter the DataFrame to keep only the rows where "Retro PID" changes. 
# These are the last occurrences in each group of consecutive identical "Retro PID".
df = df[mask]

df.reset_index(drop=True, inplace=True)

# Now, `df_filtered` contains only the desired rows.



In [46]:
df

Unnamed: 0,Type,Inning,Visiting/Home (0/1),Retro PID,Count,Pitches,Event
0,play,1,0,marts002,22,CBCBX,S9/L89S-
1,play,1,0,davij006,31,BS11>B.BB,W
2,play,1,0,lindf001,2,*SFFC,K
3,play,1,0,alonp001,32,FCB*B*B>X,6/F56D
4,play,1,1,hernc005,22,CCBFFBC,K
...,...,...,...,...,...,...,...
6236,play,6,1,adamr004,2,.SC*S,K
6237,play,6,1,thoml002,32,CBBBFB,W
6238,play,6,1,abrac001,11,BFX,D9/G3.1-3
6239,play,6,1,menej001,11,BSX,63/G6.3-H


# KEY
      +  following pickoff throw by the catcher
      *  indicates the following pitch was blocked by the catcher
      .  marker for play not involving the batter
      1  pickoff throw to first
      2  pickoff throw to second
      3  pickoff throw to third
      >  Indicates a runner going on the pitch

      A  automatic strike, usually for pitch timer violation
      B  ball
      C  called strike
      F  foul
      H  hit batter
      I  intentional ball
      K  strike (unknown type)
      L  foul bunt
      M  missed bunt attempt
      N  no pitch (on balks and interference calls)
      O  foul tip on bunt
      P  pitchout
      Q  swinging on pitchout
      R  foul ball on pitchout
      S  swinging strike
      T  foul tip
      U  unknown or missed pitch
      V  called ball because pitcher went to his mouth or automatic ball on intentional walk or
         pitch timer violation
      X  ball put into play by batter
      Y  ball put into play on pitchout

In [47]:
# Remove the following characters from each element in the Pitches column: +, *, ., 1, 2, 3, >

# Define the characters to remove
chars_to_remove = '[+*\.123>]'

# Use a regular expression (regex) to replace those characters with an empty string in the 'Pitches' column
df['Pitches'] = df['Pitches'].replace(chars_to_remove, '', regex=True)

# Display the DataFrame to verify the changes
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Pitches'] = df['Pitches'].replace(chars_to_remove, '', regex=True)


Unnamed: 0,Type,Inning,Visiting/Home (0/1),Retro PID,Count,Pitches,Event
0,play,1,0,marts002,22,CBCBX,S9/L89S-
1,play,1,0,davij006,31,BSBBB,W
2,play,1,0,lindf001,2,SFFC,K
3,play,1,0,alonp001,32,FCBBBX,6/F56D
4,play,1,1,hernc005,22,CCBFFBC,K
...,...,...,...,...,...,...,...
6236,play,6,1,adamr004,2,SCS,K
6237,play,6,1,thoml002,32,CBBBFB,W
6238,play,6,1,abrac001,11,BFX,D9/G3.1-3
6239,play,6,1,menej001,11,BSX,63/G6.3-H


In [50]:
# Remove Rows that contain: A, H, I, N, P, Q, R, U, V, Y


# Define the characters that you're interested in as a regular expression
# This regex will match any string that contains any of the characters
chars_to_find = '[AHINPQRUVY]'

# Filter the DataFrame. '~' is the logical NOT operator, and df['Pitches'].str.contains(chars_to_find)
# will return a boolean Series. When used with '~', we select rows that don't match the pattern.
df = df[~df['Pitches'].str.contains(chars_to_find, na=False)]


df

Unnamed: 0,Type,Inning,Visiting/Home (0/1),Retro PID,Count,Pitches,Event
0,play,1,0,marts002,22,CBCBX,S9/L89S-
1,play,1,0,davij006,31,BSBBB,W
2,play,1,0,lindf001,2,SFFC,K
3,play,1,0,alonp001,32,FCBBBX,6/F56D
4,play,1,1,hernc005,22,CCBFFBC,K
...,...,...,...,...,...,...,...
6236,play,6,1,adamr004,2,SCS,K
6237,play,6,1,thoml002,32,CBBBFB,W
6238,play,6,1,abrac001,11,BFX,D9/G3.1-3
6239,play,6,1,menej001,11,BSX,63/G6.3-H
