In [1]:
import pandas as pd
import numpy as np

## Max Wenzel
# BPC: Batter Performance Correlation

For this stat I will combine several existing stats into a single number and then examine how that number correlates with time (the season year) to decide whether a batter is getting better or worse.


In [342]:
# Load the batter statistics that every later cell works from.
BATTER_DATA_CSV = "data/batter_data.csv"
b_d = pd.read_csv(BATTER_DATA_CSV)

In [343]:
# Keep only the players whose rows add up to at least 1200 at bats.
at_bats = b_d[["playerID", "AB"]].copy()
total_ab = at_bats.groupby("playerID")["AB"].sum()
low_volume_ids = total_ab.index[total_ab < 1200]
# One vectorized filter instead of re-slicing the whole frame once per dropped
# player. The .copy() makes the result an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
b_d = b_d[~b_d["playerID"].isin(low_volume_ids)].copy()

# Slugging percentage: total bases per at bat (singles are H - 2B - 3B - HR).
b_d["SLG"] = ((b_d["H"] - (b_d["2B"] + b_d["3B"] + b_d["HR"])) + 2 * b_d["2B"] + 3 * b_d["3B"] + 4 * b_d["HR"])/b_d["AB"]
# On-base percentage.
b_d["OBP"] = (b_d['H'] + b_d["BB"] + b_d["HBP"])/(b_d["AB"] + b_d["BB"] + b_d["SF"] + b_d["HBP"])

In [344]:
# Batting average: hits per at bat.
b_d["BA"] = b_d["H"].div(b_d["AB"])

# Batting average on balls in play.
balls_in_play = b_d["AB"] - b_d["SO"] - b_d["HR"] + b_d["SF"]
b_d["BABIP"] = (b_d["H"] - b_d["HR"]).div(balls_in_play)

# On-base plus slugging.
b_d["OPS"] = b_d["OBP"].add(b_d["SLG"])

In [345]:
# Guard the ratio stats against division by zero: wherever a denominator
# column ("...t") is 0, set the matching numerator column ("...h") to 0 and the
# denominator to 1, so the ratio computed later is 0 rather than NaN/inf.
for stat in ("DPH", "FBH", "BCA"):
    numer_col, denom_col = stat + "h", stat + "t"
    zero_denom = b_d[denom_col] == 0
    b_d.loc[zero_denom, numer_col] = 0
    b_d.loc[zero_denom, denom_col] = 1

In [346]:
# Turn the raw counts into rates: RBI per at bat, and numerator/denominator
# for each of the three "h"/"t" column pairs.
b_d["RBI/AB"] = b_d["RBI"].div(b_d["AB"])
for stat in ("DPH", "FBH", "BCA"):
    b_d[stat] = b_d[stat + "h"].div(b_d[stat + "t"])

In [347]:
# Keep only the identifying columns and the stats used from here on. The
# explicit .copy() yields an independent frame, so adding new columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
b_d = b_d[["playerID", "yearID", "nameLast", "nameFirst", "OPS", "RBI/AB", "DPH", "FBH", "BCA", "R", "BA", "BABIP"]].copy()

In [348]:
# The combined stat is the sum of OPS, RBI/AB, FBH and BCA. DPH is left out on
# purpose: its values often came out wrong, so it had to be excluded from the
# megastat.
b_d["bpcsum"] = (
    b_d["OPS"]
    .add(b_d["RBI/AB"])
    .add(b_d["FBH"])
    .add(b_d["BCA"])
)

In [349]:
# "First,Last" lookup key; calc_BPC splits it back apart on the comma.
b_d["name"] = b_d["nameFirst"].str.cat(b_d["nameLast"], sep=",")

In [350]:
def calc_BPC(df, name, train_before=2018, test_after=2015):
    """Return the direction of a batter's ``bpcsum`` trend in two year windows.

    For each window the Pearson correlation between ``bpcsum`` and ``yearID``
    is computed over the batter's rows, and only its sign is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "nameFirst", "nameLast", "yearID" and
        "bpcsum".
    name : str
        Player key in the form "First,Last" (exactly one comma).
    train_before : int, optional
        Rows with ``yearID < train_before`` form the "predicted" window.
        Defaults to 2018.
    test_after : int, optional
        Rows with ``yearID > test_after`` form the "actual" window.
        Defaults to 2015.

    Returns
    -------
    tuple of (bool, bool)
        ``(pred, actu)``; each is False when the correlation in its window is
        negative and True otherwise.

    Raises
    ------
    ValueError
        If ``name`` does not split into exactly two parts on ",".

    Notes
    -----
    * A NaN correlation (for example when the window holds fewer than two
      rows for the player) is not negative and is therefore reported as True.
    * With the default bounds the two windows overlap: 2016 and 2017 satisfy
      both ``< 2018`` and ``> 2015``. Pass e.g. ``train_before=2016`` for
      disjoint windows.
    * Rows are matched on first and last name only, so distinct players who
      share a name are pooled together.
    """
    first, last = name.split(",")
    is_player = (df["nameLast"] == last) & (df["nameFirst"] == first)

    def _trend_not_negative(in_window):
        # Sign of the bpcsum-vs-year correlation for this player in one window.
        seasons = df.loc[is_player & in_window]
        r = seasons["bpcsum"].corr(seasons["yearID"])
        # `not (r < 0)` instead of `r >= 0` so that NaN keeps mapping to True.
        return not (r < 0)

    pred = _trend_not_negative(df["yearID"] < train_before)
    actu = _trend_not_negative(df["yearID"] > test_after)
    return (pred, actu)


# Evaluate the predictor on every distinct "First,Last" key.
names = list(set(b_d["name"]))
tot = [list(calc_BPC(b_d, nam)) for nam in names]

pred = [predicted for predicted, _ in tot]
actu = [actual for _, actual in tot]
res = [predicted == actual for predicted, actual in tot]

# Share of players flagged True in each window, then how often the two agree.
print("Actual", sum(actu) / len(actu))

print("Pred:", sum(pred) / len(pred))

print("Accur", sum(res) / len(res))

Actual 0.4186046511627907
Pred: 0.8465116279069768
Accur 0.4697674418604651


Oh no! It looks like the predictor stat I spent all this time on is actually a terrible predictor; you'd be better off flipping a coin to decide. All hope is not lost, though: while my original plan of building a good predictor failed, in the process I created a rather good indicator of skill. To show this, I can start by looking at the top players ranked by this statistic.

In [356]:
# Rank the player-seasons from highest to lowest combined stat.
b_d = b_d.sort_values(by="bpcsum", ascending=False)

In [357]:
# Show the first five rows of the frame.
b_d.head(n=5)

Unnamed: 0,playerID,yearID,nameLast,nameFirst,OPS,RBI/AB,DPH,FBH,BCA,R,BA,BABIP,bpcsum,name
828,martijd02,2017,Martinez,J. D.,1.066093,0.240741,1.0,0.238095,0.679803,85,0.303241,0.326996,2.224732,"J. D.,Martinez"
1393,vottojo01,2017,Votto,Joey,1.031849,0.178891,1.08,0.206897,0.79902,106,0.320215,0.320628,2.216656,"Joey,Votto"
1354,troutmi01,2018,Trout,Mike,1.088088,0.167728,0.540541,0.20339,0.75,101,0.312102,0.346154,2.209206,"Mike,Trout"
596,harpebr03,2015,Harper,Bryce,1.108997,0.190019,0.283951,0.161972,0.697704,118,0.330134,0.369318,2.158692,"Bryce,Harper"
109,bettsmo01,2018,Betts,Mookie,1.078495,0.153846,0.457831,0.184211,0.742038,129,0.346154,0.368159,2.15859,"Mookie,Betts"


The top-ranked players here are indeed players who are considered to be some of the best.

Furthermore, I can show how this stat correlates with other popular stats

In [358]:
# Pearson correlation between the combined stat and batting average.
b_d["bpcsum"].corr(b_d["BA"], method="pearson")

0.4861411727247345

In [359]:
# Pearson correlation between the combined stat and BABIP.
b_d["bpcsum"].corr(b_d["BABIP"], method="pearson")

0.30677316211762534

In [361]:
# Pearson correlation between the combined stat and the R column.
b_d["bpcsum"].corr(b_d["R"], method="pearson")

0.5344645187759932

You can see that it is somewhat correlated with the stats above, but not perfectly, which I believe is because BPCsum incorporates more skill-based abilities, such as hitting fastballs in particular and knowing when to swing at a ball and when not to.

In [151]:
def get_players(b_d):
    """Print up to 20 randomly chosen distinct values of ``b_d["name"]``.

    Parameters
    ----------
    b_d : pandas.DataFrame
        Frame with a "name" column.

    Returns
    -------
    None
        The names are printed one per line. The selection uses NumPy's global
        random state, so it differs between runs unless a seed is set.
    """
    names = list(set(b_d["name"]))
    np.random.shuffle(names)
    # Slice rather than index 0..19 so that a frame with fewer than 20 distinct
    # names prints all of them instead of raising IndexError.
    for player in names[:20]:
        print(player)