In [4]:

# import stuff
import pandas as pd
from matplotlib import pyplot
from pybaseball import statcast_batter, statcast_pitcher, playerid_lookup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
import ipywidgets as widgets
from IPython.display import clear_output
import textwrap


def setup():
    """Create the input form, show it, and hook up the two buttons.

    Every widget is stored in a module-level global so that the button
    callbacks can read the current selections. The widgets are displayed in
    the order: pitcher/batter names, count, pitch type, strike-zone image,
    prior location, prior pitch, graph selector, then the two buttons.
    The function finishes by calling ``button_function()`` to register the
    click handlers on the freshly created buttons.
    """
    global pfn, pln, bfn, bln
    global numberofballs, numberofstrikes
    global p_type, zonenum, pripitch, graph_of
    global gen_graph, clear_button

    # --- player names (free text) ---
    pfn = widgets.Text(placeholder="Pitcher's First Name")
    pln = widgets.Text(placeholder="Pitcher's Last Name")
    bfn = widgets.Text(placeholder="Batter's First Name")
    bln = widgets.Text(placeholder="Batter's Last Name")
    for name_box in (pfn, pln, bfn, bln):
        display(name_box)

    # --- count and pitch selection ---
    numberofballs = widgets.Dropdown(
        options=['0', '1', '2', '3'],
        value='0',
        description='Balls:',
    )
    numberofstrikes = widgets.Dropdown(
        options=['0', '1', '2'],
        value='0',
        description='Strikes:',
    )
    p_type = widgets.Dropdown(
        options=[
            'Slider', 'Curveball', '4-seam', '2-seam', 'Sinker',
            'Cutter', 'Changeup', 'Knuckleball', 'Knuckle-curve', 'Splitter',
        ],
        value='Slider',
        description='Pitch Type:',
    )
    for count_widget in (numberofballs, numberofstrikes, p_type):
        display(count_widget)

    # --- reference picture of the numbered zones, shown above the zone picker ---
    print("Numbered strike zone from catcher's view:")
    wimage = widgets.HTML(value='<img src="https://lh3.googleusercontent.com/proxy/0DIkB_7nRLjXgDc255JH0v3qDNGZ8iKP8Dfvb_1VHgFja3IuQIBPoUyK5hSvg7MdCmhYzuBXSjdHRAGRt8m0d0jBtUafdBTwRiu3UcekmDuLV4FraA">')
    display(wimage)

    # --- previous pitch context and which graph to draw ---
    zonenum = widgets.Dropdown(
        options=[
            'none', '1', '2', '3', '4', '5', '6', '7',
            '8', '9', '10', '11', '12', '13', '14',
        ],
        description='Prior Location:',
    )
    pripitch = widgets.Dropdown(
        options=['none', 'Fastball', 'Breakingball', 'Offspeed'],
        description='Prior Pitch:',
    )
    graph_of = widgets.Dropdown(
        options=['Swing %', 'Contact %', 'Hot Zones'],
        value='Hot Zones',
        description='Graph of:',
    )
    for context_widget in (zonenum, pripitch, graph_of):
        display(context_widget)

    # --- action buttons ---
    gen_graph = widgets.Button(description='Generate Graph')
    clear_button = widgets.Button(description='Clear Output')
    display(gen_graph)
    display(clear_button)

    # attach the click handlers to the buttons that were just created
    button_function()




# "Pitch Type" dropdown label -> Statcast pitch_type code.
# 'Knucklecurve' is accepted as well as the dropdown's 'Knuckle-curve' spelling.
_PITCH_CODES = {
    'Slider': 'SL',
    'Curveball': 'CU',
    '4-seam': 'FF',
    '2-seam': 'FT',
    'Sinker': 'SI',
    'Cutter': 'FC',
    'Changeup': 'CH',
    'Knuckleball': 'KN',
    'Knuckle-curve': 'KC',
    'Knucklecurve': 'KC',
    'Splitter': 'FS',
}

# Numeric model inputs, in the column order used for training AND prediction.
_NUMERIC_FEATURES = [
    'plate_x', 'plate_z', 'release_speed', 'pfx_x', 'pfx_z', 'vx0', 'vy0',
    'vz0', 'strikes', 'balls', 'spin_axis', 'release_spin_rate',
]

# Per-pitch-type averages taken from the pitcher's data to describe "his" pitch.
_PITCHER_FEATURES = [
    'release_speed', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0',
    'spin_axis', 'release_spin_rate',
]

# Fixed category lists so the one-hot columns never depend on which values
# happen to occur in a particular batter's data.
_PRIOR_PITCH_CATEGORIES = ['Breakingball', 'Fastball', 'Offspeed', 'none']
_HAND_CATEGORIES = ['L', 'R']
_PRIOR_ZONE_CATEGORIES = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '11', '12', '13', '14', 'none',
]

# Pitch codes that are NOT grouped as breaking balls; every other code
# (including rare ones) is treated as 'Breakingball'.
_PRIOR_PITCH_GROUPS = {
    'FF': 'Fastball', 'FT': 'Fastball', 'SI': 'Fastball', 'FC': 'Fastball',
    'CH': 'Offspeed', 'FO': 'Offspeed',
    'none': 'none',
}

# Pitch descriptions that mean the batter did not swing.
_NO_SWING_DESCRIPTIONS = ['ball', 'called_strike', 'blocked_ball', 'hit_by_pitch', 'pitchout']

# Columns that must be present (non-null) for a batter row to be usable.
_BATTER_REQUIRED = [
    'plate_x', 'plate_z', 'vx0', 'vy0', 'release_speed', 'vz0', 'pfx_x',
    'pfx_z', 'strikes', 'balls', 'spin_axis', 'release_spin_rate',
    'description', 'pitch_type', 'pitch_number', 'zone', 'p_throws',
]

# Columns that must be present (non-null) for a pitcher row to be usable.
_PITCHER_REQUIRED = _PITCHER_FEATURES + ['pitch_type', 'p_throws']


def _lookup_player_id(last_name, first_name):
    """Return the id of the first player matching the name, or None if no match."""
    matches = playerid_lookup(last_name, first_name)
    if len(matches) == 0:
        return None
    # third column of the lookup result; presumably the MLBAM id that the
    # statcast_* functions expect -- confirm against the pybaseball docs
    return matches.values[0][2]


def _one_hot(series, prefix, categories):
    """Return a 0/1 frame with one '<prefix>_<category>' column per category."""
    return pd.DataFrame(
        {prefix + '_' + cat: (series == cat).astype(int) for cat in categories},
        index=series.index,
    )


def _feature_frame(rows):
    """Build the model input columns (numeric features + one-hot flags) for ``rows``.

    ``rows`` needs the numeric feature columns plus 'prior_pitch', 'p_throws'
    and 'prior_zone'. The same function is used for training data and for the
    prediction grid, so both always share one column layout.
    """
    return pd.concat(
        [
            rows[_NUMERIC_FEATURES].astype(float),
            _one_hot(rows['prior_pitch'], 'prior_pitch', _PRIOR_PITCH_CATEGORIES),
            _one_hot(rows['p_throws'], 'p_throws', _HAND_CATEGORIES),
            _one_hot(rows['prior_zone'], 'prior_zone', _PRIOR_ZONE_CATEGORIES),
        ],
        axis=1,
    )


def _prepare_batter_pitches(raw):
    """Clean the batter's pitch log and add swing / in-play / prior-pitch columns.

    Returns None when no usable rows remain.
    """
    if raw is None or raw.empty:
        return None

    df = raw.dropna(subset=_BATTER_REQUIRED)
    df = df.loc[(df['description'] != 'intent_ball') & (df['pitch_type'] != 'PO')]
    df = df.reset_index(drop=True)
    if df.empty:
        return None

    # swing is 1 unless the description says the batter took the pitch
    no_swing = df['description'].isin(_NO_SWING_DESCRIPTIONS)
    df['swing'] = (~no_swing).astype(int)

    # anything that was not hit gets a launch speed of 0
    df['launch_speed'] = df['launch_speed'].fillna(0)
    df.loc[no_swing, 'launch_speed'] = 0

    # 1 when the ball was put in play (type 'X'), otherwise 0
    df['in_play'] = (df['type'] == 'X').astype(int)

    # The prior pitch of row i is taken from row i + 1, i.e. this assumes the
    # log is ordered newest-first -- TODO confirm against the data source.
    # The first pitch of an at-bat, and the final row (which has no
    # successor row), get 'none'.
    first_pitch = (df['pitch_number'] == 1).astype(bool)
    first_pitch.iloc[-1] = True
    zone_labels = df['zone'].astype(int).astype(str)
    prior_codes = df['pitch_type'].shift(-1).mask(first_pitch, 'none')
    df['prior_pitch'] = prior_codes.map(
        lambda code: _PRIOR_PITCH_GROUPS.get(code, 'Breakingball')
    )
    df['prior_zone'] = zone_labels.shift(-1).mask(first_pitch, 'none')
    return df


def _pitcher_profile(raw, pitch_code):
    """Return (throwing hand, mean pitch characteristics) for ``pitch_code``.

    Either element is None when it cannot be determined: both are None when
    the pitcher has no usable rows, and the profile alone is None when the
    pitcher has no rows of the requested pitch type.
    """
    if raw is None or raw.empty:
        return None, None
    df = raw.dropna(subset=_PITCHER_REQUIRED)
    if df.empty:
        return None, None

    # positional access: row labels are not contiguous after filtering
    hand = df['p_throws'].iloc[0]

    # average only the numeric columns; p_throws is a string column
    means = df.groupby('pitch_type')[_PITCHER_FEATURES].mean()
    if pitch_code not in means.index:
        return hand, None
    return hand, means.loc[pitch_code]


def _build_pitch_grid(profile, hand, strikes, balls, prior_pitch, prior_zone):
    """Return a 40x40 grid of pitch locations that all share one pitch profile.

    plate_x runs from -2.0 to 1.9 and plate_z from 0.5 to 4.4 in steps of 0.1;
    row ``40 * i + k`` holds plate_x = k/10 - 2 and plate_z = i/10 + 0.5.
    """
    grid = pd.DataFrame(
        [(k / 10 - 2, i / 10 + 0.5) for i in range(40) for k in range(40)],
        columns=['plate_x', 'plate_z'],
    )
    for column in _PITCHER_FEATURES:
        grid[column] = float(profile[column])
    grid['strikes'] = strikes
    grid['balls'] = balls
    grid['prior_pitch'] = prior_pitch
    grid['p_throws'] = hand
    grid['prior_zone'] = prior_zone
    return grid


def _training_arrays(rows, target_column):
    """Return the training part (X, y) of a fixed 80/20 split of ``rows``."""
    features = _feature_frame(rows).to_numpy(dtype=float)
    # the int cast keeps 0/1 labels as ints and truncates launch speeds to
    # whole numbers
    target = rows[target_column].to_numpy().astype('int')
    x_train, _, y_train, _ = train_test_split(features, target, test_size=0.20, random_state=1)
    return x_train, y_train


def generate_graph(b):
    """Draw the selected strike-zone graph for the chosen batter/pitcher matchup.

    Click handler for the 'Generate Graph' button. Reads the current widget
    selections, downloads the batter's and pitcher's data, trains three
    gradient-boosting models on the batter's pitches (swing probability,
    in-play probability given a swing, launch speed given a ball in play),
    and evaluates them on a 40x40 grid of locations using the pitcher's
    average characteristics for the selected pitch type. The result is shown
    as a hexbin plot with a rectangle drawn over it.

    Args:
        b: The button instance passed by ``on_click``; unused.

    If a player cannot be found, or there is not enough data, a message is
    printed and no graph is drawn.
    """
    strike_number = int(numberofstrikes.value)
    ball_number = int(numberofballs.value)

    type_of_pitch = _PITCH_CODES.get(p_type.value)
    if type_of_pitch is None:
        print('Unsupported pitch type: ' + str(p_type.value))
        return

    previous_pitch_type = pripitch.value  # 'Fastball', 'Breakingball', 'Offspeed' or 'none'
    previous_pitch_zone = zonenum.value
    # batter and pitcher names as well as the time frame
    pitcher_first_name = pfn.value
    pitcher_last_name = pln.value
    firstname = bfn.value
    lastname = bln.value
    startdate = '2015-03-15'
    enddate = '2025-07-22'

    # resolve both players before downloading anything large
    pitcheridlookupnumber = _lookup_player_id(pitcher_last_name, pitcher_first_name)
    if pitcheridlookupnumber is None:
        print('Could not find pitcher: ' + pitcher_first_name + ' ' + pitcher_last_name)
        return
    playeridlookupnumber = _lookup_player_id(lastname, firstname)
    if playeridlookupnumber is None:
        print('Could not find batter: ' + firstname + ' ' + lastname)
        return

    # batter data -> three training sets
    pitches = _prepare_batter_pitches(statcast_batter(startdate, enddate, playeridlookupnumber))
    if pitches is None:
        print('No usable pitch data found for ' + firstname + ' ' + lastname)
        return
    swings = pitches.loc[pitches['swing'] == 1]    # contact model: swings only
    batted = pitches.loc[pitches['in_play'] == 1]  # heat-map model: balls in play only
    # the classifiers need both outcomes present to be trainable
    if pitches['swing'].nunique() < 2 or swings['in_play'].nunique() < 2:
        print('Not enough pitch data to model ' + firstname + ' ' + lastname)
        return

    X_train, Y_train = _training_arrays(pitches, 'swing')
    X_train2, Y_train2 = _training_arrays(swings, 'in_play')
    X_train3, Y_train3 = _training_arrays(batted, 'launch_speed')

    # pitcher data -> average characteristics of the selected pitch
    p_throw, profile = _pitcher_profile(
        statcast_pitcher(startdate, enddate, pitcheridlookupnumber), type_of_pitch
    )
    if p_throw is None:
        print('No usable pitch data found for ' + pitcher_first_name + ' ' + pitcher_last_name)
        return
    if profile is None:
        print('No ' + p_type.value + ' data found for ' + pitcher_first_name + ' ' + pitcher_last_name)
        return

    # one row per plate location, carrying the selected count, prior pitch,
    # prior zone and pitcher handedness
    pitcher_array_df = _build_pitch_grid(
        profile, p_throw, strike_number, ball_number,
        previous_pitch_type, previous_pitch_zone,
    )
    # predictions must run on the encoded grid so the one-hot flags are used
    grid_features = _feature_frame(pitcher_array_df).to_numpy(dtype=float)

    # probability of a swing at each location (column 1 is class 1; the
    # classifier's classes are sorted, so the columns are [0, 1])
    swing_model = GradientBoostingClassifier()
    swing_model.fit(X_train, Y_train)
    total_swing = swing_model.predict_proba(grid_features)[:, 1]

    # probability of putting the ball in play, given a swing
    contact_model = GradientBoostingClassifier()
    contact_model.fit(X_train2, Y_train2)
    contact_on_swing = contact_model.predict_proba(grid_features)[:, 1]

    # overall contact probability = swing% * contact% on swing
    total_contact = total_swing * contact_on_swing

    # predicted launch speed on contact
    model2 = GradientBoostingRegressor()
    model2.fit(X_train3, Y_train3)
    predicted_speed = model2.predict(grid_features)

    # hot-zone score = contact% * (predicted launch speed - 40); the offset of
    # 40 is kept from the original formula (presumably a baseline speed)
    total_velo = total_contact * (predicted_speed - 40)

    shading = []
    hzvalue = '1) A good Hot Zone number is about 25'
    swingvalue = "2) A player's hot zone graph may seem questionably low. This is because swing percentages are part of the algorithim, and batters may not swing often in counts like 0-0"
    swingvalue2 = "Note: A player's hot zone graph may seem questionably low. This is because swing percentages are part of the algorithim, and batters may not swing often in counts like 0-0"

    # wrap the notes so they fit next to the plot
    twrapper = textwrap.TextWrapper(width=50)
    hzwords = twrapper.wrap(text=hzvalue)
    swingwords = twrapper.wrap(text=swingvalue)
    swingwords2 = twrapper.wrap(text=swingvalue2)

    if graph_of.value == 'Hot Zones':
        shading = total_velo
        print('Notes:')
        for element in hzwords:
            print(element)
        for element in swingwords:
            print(element)
    elif graph_of.value == 'Contact %':
        for element in swingwords2:
            print(element)
        shading = total_contact
    elif graph_of.value == 'Swing %':
        shading = total_swing

    # create the plot; C is total_velo, total_contact, or total_swing
    fig, ax = pyplot.subplots()
    graph = ax.hexbin(pitcher_array_df['plate_x'], pitcher_array_df['plate_z'], C=shading, vmin=0, gridsize=15, cmap='BuPu')
    key = pyplot.colorbar(graph)
    key.set_label(graph_of.value)
    # rectangle from (-0.7, 1.75), 1.4 wide and 1.66 tall (presumably the strike zone)
    rectangle = pyplot.Rectangle((-0.7, 1.75), 1.4, 1.66, fc='none', ec="black")
    ax.add_patch(rectangle)

    ball_number = str(ball_number)
    strike_number = str(strike_number)
    previous_pitch_zone = str(previous_pitch_zone)

    if ball_number == '0' and strike_number == '0':
        pyplot.title(firstname + " " + lastname + " vs a " + p_type.value + ' from ' + pitcher_first_name + ' ' + pitcher_last_name + ' on the first pitch')
    else:
        pyplot.title(firstname + " " + lastname + " vs a " + ball_number + "-" + strike_number + ' ' + p_type.value + ' from ' + pitcher_first_name + ' ' + pitcher_last_name + ' after a ' + previous_pitch_type + ' in zone ' + previous_pitch_zone)

    pyplot.show()

def clear_function(a):
    """Wipe the cell output and rebuild the input form.

    Click handler for the 'Clear Output' button. ``a`` is the argument the
    button passes to its click callback; it is not used.
    """
    # wait=True holds the old output until the rebuilt form is ready to replace it
    clear_output(wait=True)
    # re-create and re-display every widget (this also re-registers the handlers)
    setup()

def button_function():
    """Register the click handlers on the two module-level buttons."""
    handlers = (
        (gen_graph, generate_graph),      # 'Generate Graph' -> draw the plot
        (clear_button, clear_function),   # 'Clear Output'   -> reset the form
    )
    for button, handler in handlers:
        button.on_click(handler)

# Build and show the form. setup() ends by calling button_function(), which
# attaches generate_graph and clear_function to the two buttons, so no further
# on_click registration is needed here.
setup()




Text(value='', placeholder="Pitcher's First Name")

Text(value='', placeholder="Pitcher's Last Name")

Text(value='', placeholder="Batter's First Name")

Text(value='', placeholder="Batter's Last Name")

Dropdown(description='Balls:', options=('0', '1', '2', '3'), value='0')

Dropdown(description='Strikes:', options=('0', '1', '2'), value='0')

Dropdown(description='Pitch Type:', options=('Slider', 'Curveball', '4-seam', '2-seam', 'Sinker', 'Cutter', 'C…

Numbered strike zone from catcher's view:


HTML(value='<img src="https://lh3.googleusercontent.com/proxy/0DIkB_7nRLjXgDc255JH0v3qDNGZ8iKP8Dfvb_1VHgFja3Iu…

Dropdown(description='Prior Location:', options=('none', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '1…

Dropdown(description='Prior Pitch:', options=('none', 'Fastball', 'Breakingball', 'Offspeed'), value='none')

Dropdown(description='Graph of:', index=2, options=('Swing %', 'Contact %', 'Hot Zones'), value='Hot Zones')

Button(description='Generate Graph', style=ButtonStyle())

Button(description='Clear Output', style=ButtonStyle())