### Batting Analysis

As always, import the necessary statements. Additionally, we change our pandas settings to show all the columns for all of our dataframes, allowing us to view all of the data.

In [1]:
import pandas as pd
import pickle
pd.set_option('display.max_columns', None)

We import our dataframes for batting, pitching, and fielding statistics as well as our list of teams from the pickle file titled `Stats.pkl`.

In [2]:
with open('Stats.pkl', 'rb') as f:
    dfb = pickle.load(f)
    dfp = pickle.load(f)
    dff = pickle.load(f)
    teams = pickle.load(f)

Here, we create a function to calculate some common baseball statistics as well as a few of the more advanced metrics we will be using in our analysis. This function will be useful later on when we gather our league totals so that we do not have to write up all of these same calculations for the new dataframe.

In [3]:
def calcBatStats(df):
    # Batting Average (useful later for League totals)
    df["AVG"] = round(df["H"] / df["AB"], 3)
    # Caught Stealing
    df["CS"] = round(df["SBA"] - df["SB"], 3)
    # Plate Appearances    
    df["PA"] = round(df["AB"] + df["SF"] + df["SH"] + df["BB"] + df["HBP"])
    # OPS
    df["OPS"] = round(df["OBP"] + df["SLG"], 3)
    # Singles
    df["1B"] = df["H"] - df["2B"] - df["3B"] - df["HR"]
    # Runs Created
    df["RC"] = round((df["H"] + df["BB"]) * df["TB"] / (df["AB"] + df["BB"]), 3)
    # POP
    df["POP"] = round(df["OPS"] + df["AVG"], 3)
    # Total Average
    df["TA"] = round((df["TB"] + df["BB"] + df["HBP"] + df["SB"]) / (df["AB"] - df["H"] + df["SH"] + df["SF"] + df["CS"] + df["GDP"]), 3)
    # Batting Average on Balls in Play
    df["BABIP"] = round((df["H"] - df["HR"]) / (df["AB"] - df["SO"] - df["HR"] + df["SF"]), 3)
    # Hoban Efficiency Quotient - Offense
    df["HEQO"] = df["TB"] + df["R"] + df["RBI"] + df["SB"] + (0.5 * df["BB"])
    # Base Stealing Runs
    df["BSR"] = round(((df["H"] + df["BB"] - df["CS"]) * (df["TB"] + (0.7 * df["SB"]))) / (df["AB"] + df["BB"] + df["CS"]), 3)

We run this function on our batting dataframes for each team. Note the newly created columns on the right side of the table. 

In [4]:
for df in dfb:
    calcBatStats(df)
dfb[2][:5]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,RBI,TB,SLG,BB,HBP,SO,GDP,OBP,SF,SH,SB,SBA,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
1,"Griffin, Chris",16,12.0,0.423,52,12,22,5,0,2,8,33,0.635,12,1,5,0,0.53,1,0,2,2,0,66,1.165,15,17.531,1.588,1.548,0.435,61.0,18.275
2,"Carr, Austin",16,12.0,0.367,49,6,18,3,0,1,16,24,0.49,8,3,18,0,0.483,0,1,0,0,0,61,0.973,14,10.947,1.34,1.094,0.567,50.0,10.947
3,"Danforth, Patrick",16,12.0,0.333,48,6,16,4,2,0,8,24,0.5,2,3,17,1,0.389,1,3,4,4,0,57,0.889,10,8.64,1.222,0.892,0.5,43.0,9.648
4,"Rich, Alex",16,12.0,0.217,60,7,13,4,0,0,4,17,0.283,2,4,9,0,0.288,0,0,5,6,1,66,0.571,9,4.113,0.788,0.583,0.255,34.0,4.556
5,"Pawlik, Jeff",16,12.0,0.212,52,6,11,3,0,0,4,14,0.269,5,2,18,0,0.3,1,0,1,2,1,60,0.569,8,3.93,0.781,0.512,0.314,27.5,3.802


#### Crossroads League Statistics

Now that we have a few more advanced individual batting statistics, we need to gather the Crossroads League totals in order to incorporate them into our statistics and eventually rank the various hitters based on different stats.

To do this, we first subset each of the teams' batting table to extract only the Total row at the bottom. 

In [5]:
bat_totals = [df[df.Batting.str.contains("Total:", regex = False)] for df in dfb]
bat_totals[2]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,RBI,TB,SLG,BB,HBP,SO,GDP,OBP,SF,SH,SB,SBA,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
30,Total:,16,,0.235,485,72,114,29,4,4,59,163,0.336,62,18,144,5,0.34,6,7,20,25,5,578,0.676,77,52.446,0.911,0.668,0.321,345.0,54.832


Next, we combine each of these rows into a new dataframe named `merged_bat_totals` using the code `pd.concat`. Below we show the first four rows of this new table.

In [6]:
merged_bat_totals = pd.concat(bat_totals)
merged_bat_totals[:4]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,RBI,TB,SLG,BB,HBP,SO,GDP,OBP,SF,SH,SB,SBA,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
25,Total:,16,,0.222,446,42,99,16,4,2,36,129,0.289,54,11,120,14,0.318,4,6,11,12,1,521,0.607,77,39.474,0.829,0.551,0.296,245.0,41.474
32,Total:,16,,0.185,453,31,84,7,2,0,28,95,0.21,58,12,170,3,0.293,2,3,10,14,4,528,0.503,75,26.399,0.688,0.459,0.295,193.0,27.332
30,Total:,16,,0.235,485,72,114,29,4,4,59,163,0.336,62,18,144,5,0.34,6,7,20,25,5,578,0.676,77,52.446,0.911,0.668,0.321,345.0,54.832
22,Total:,16,,0.265,490,86,130,40,4,7,71,199,0.406,48,16,101,3,0.345,9,10,39,47,8,573,0.751,79,65.84,1.016,0.774,0.315,419.0,70.46


Now that we have our Totals table, we add a total row at the bottom using the code `.sum()`. Since the column labeled "Batting" does not provide any valuable information, we remove it from the table. Then, we isolate the totals row at the bottom with the code `.iloc[-1:]` to put the batting totals into a series.

In [7]:
merged_bat_totals.loc["CL_Total"] = merged_bat_totals.sum()
del merged_bat_totals["Batting"]
CL_bat_totals = merged_bat_totals.iloc[-1,:]

We make a copy to avoid any errors from overwriting the values, and then we calculate a few more statistics and run the `calcBatStats` function on the series. We printed out a few of the league totals.

In [8]:
CL_tot_b = CL_bat_totals.copy()
# CL On-Base Percentage
CL_tot_b["OBP"] = round((CL_tot_b["H"] + CL_tot_b["BB"] + CL_tot_b["HBP"]) / (CL_tot_b["AB"] + CL_tot_b["BB"] + CL_tot_b["HBP"] + CL_tot_b["SF"]), 3)
# CL Slugging Percentage
CL_tot_b["SLG"] = round(CL_tot_b["TB"] / CL_tot_b["AB"], 3)
# CL weighted On-Base Average (excluding IBB, formula from https://library.fangraphs.com/offense/woba/)
CL_tot_b["wOBA"] = round((0.69*CL_tot_b["BB"] + 0.72*CL_tot_b["HBP"] + 0.89*CL_tot_b["1B"] + 1.27*CL_tot_b["2B"] + 1.62*CL_tot_b["3B"] + 2.10*CL_tot_b["HR"]) / (CL_tot_b["AB"] + CL_tot_b["BB"] + CL_tot_b["SF"] + CL_tot_b["HBP"]), 3)
# CL Runs per Plate Appearance
CL_tot_b["R_per_PA"] = round(CL_tot_b["R"] / CL_tot_b["PA"], 3)
CL_tot_b["wRC_per_PA"] = CL_tot_b["R_per_PA"]
# ABF (statistic that incorporates the value of various methods of reaching base with regard to their assistance towards scoring runs)
CL_tot_b["ABF"] = round((0.47*CL_tot_b["1B"] + 0.38*CL_tot_b["2B"] + 0.55*CL_tot_b["3B"] + 0.93*CL_tot_b["HR"] + 0.33*CL_tot_b["BB"] + CL_tot_b["HBP"]) / (CL_tot_b["AB"] - CL_tot_b["H"]), 3)
# Calculate other totals such as AVG, OPS, POP, etc
calcBatStats(CL_tot_b)
# deleted Games Started column because it doesn't apply to league totals
del CL_tot_b["GS"]

In [9]:
CL_tot_b[:5]

GP      160.000
AVG       0.258
AB     4846.000
R       864.000
H      1250.000
Name: CL_Total, dtype: float64

#### Advanced Statistics/Ranking Hitters

We would like to be able to rank hitters across the Crossroads League, and we will do this with a statistic called Weighted Runs Created Plus (wRC+). Below we calculate a few other statistics that will get us closer to our calculation of wRC+ for each hitter. 

In [10]:
for df in dfb:
    #OPS+
    df["OPS_plus"] = round(100 * ((df["OBP"]/CL_tot_b["OBP"]) + (df["SLG"]/CL_tot_b["SLG"]) - 1), 3)
    #Weighted On-Base Average
    df["wOBA"] = round(((0.69*df["BB"]) + (0.72*df["HBP"]) + (0.89*df["1B"]) + (1.27*df["2B"]) + (1.62*df["3B"]) + (2.1*df["HR"])) / (df["AB"] + df["BB"] + df["SF"] + df["HBP"]), 3)
    #Batting Runs
    df["Bat_Runs"] = round(((0.69*df["BB"]) + (0.72*df["HBP"]) + (0.89*df["1B"]) + (1.27*df["2B"]) + (1.62*df["3B"]) + (2.1*df["HR"])) / (df["AB"] + df["BB"] + df["SF"] + df["HBP"]), 3)
    #Weighted Runs Above Average (verify constant)
    df["wRAA"] = round(((df["wOBA"] - CL_tot_b["wOBA"]) / (1.157)) * df["PA"], 3)
    #Weighted Runs Created
    df["wRC"] = round(df["wRAA"] + (df["PA"] * (CL_tot_b["R"])/(CL_tot_b["PA"])), 3)

Since wRC+ incorporates the park factor for each hitter's home park, we have to load our various park factors for each team.

In [11]:
with open('Park_Factor.pkl', 'rb') as f:
    park_factor = pickle.load(f)

Now that we have our park factors, we can calculate our wRC+ for each hitter.

In [12]:
#Weighted Runs Created+
for i in range(len(teams)):
    dfb[i]["wRC_plus"] = round(100 * ((((dfb[i]["wRAA"] / dfb[i]["PA"]) + CL_tot_b["R_per_PA"]) + (CL_tot_b["R_per_PA"] - (park_factor[i]) * (CL_tot_b["R_per_PA"]))) / (CL_tot_b["wRC_per_PA"])), 3)

Before we begin ranking the hitters, we subset the data to only include hitters that have at least 27 at bats, which equates to one per conference game. 

In [13]:
#make sure everyone has at least 16 at bats (1 per game)
for i in range(len(teams)):
    dfb[i] = dfb[i][dfb[i]['AB'] >= 16]

We make copies of our dataframes to avoid errors and then we pretty the dataframes by adding a column for each player's team, removing the totals and opponents rows, rearranging the table's columns to show the important information first, and removing the unneccesary decimal in the `GS` column.

In [14]:
temp_dfb = [] 
for df in dfb: #make copy to avoid errors
    temp_dfb.append(df.copy())
for i in range(len(teams)): #add column for team
    temp_dfb[i]["Team"] = teams[i]
for df in temp_dfb: 
    #df.drop(df.tail(2).index,inplace=True) # drop last 2 rows (only run this line once or data will be lost)
    team = df.pop("Team")
    df.insert(1, team.name, team) #move team column to second
    stat = df.pop("wRC_plus")
    df.insert(4, stat.name, stat) #move wRC+ column to front of statistical columns
    df['GS'] = df['GS'].astype(int) #remove decimal place on GS column
temp_dfb[2][-2:] #verify totals and opponents rows are gone

ValueError: Cannot convert non-finite values (NA or inf) to integer

With our more readable data, we combine each team's table into one dataframe wiht the code `pd.concat`. Then, we sort the table by wRC+ to show the hitters with the highest wRC+ first. Finally, we display our top ten hitters.

In [15]:
all_hitters = pd.concat(temp_dfb) #collect all in one dataframe
top_hitters = all_hitters.sort_values(by=['wRC_plus'], ascending=False) #sort by wRC+ in descending order
top_hitters[:10] #top 10 hitters

Unnamed: 0,Batting,Team,GP,GS,wRC_plus,AVG,AB,R,H,2B,3B,HR,RBI,TB,SLG,BB,HBP,SO,GDP,OBP,SF,SH,SB,SBA,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR,OPS_plus,wOBA,Bat_Runs,wRAA,wRC
1,"Porcellato, Matteo",Marian,16,16.0,219.138,0.418,55,19,23,5,1,1,16,33,0.6,9,7,8,0,0.549,0,1,6,7,1,72,1.149,16,16.5,1.567,1.618,0.478,78.5,17.742,214.225,0.501,0.501,10.641,21.419
3,"Blinn, Denver",IWU,16,16.0,209.36,0.4,60,24,24,9,2,3,16,46,0.767,9,4,7,0,0.5,1,0,7,8,1,74,1.267,10,22.0,1.667,1.737,0.412,97.5,23.269,245.627,0.526,0.526,12.536,23.613
1,"Griffin, Chris",Grace,16,12.0,208.088,0.423,52,12,22,5,0,2,8,33,0.635,12,1,5,0,0.53,1,0,2,2,0,66,1.165,15,17.531,1.588,1.548,0.435,61.0,18.275,218.381,0.498,0.498,9.583,19.462
3,"Glover, Jake",MVNU,16,16.0,190.922,0.321,53,10,17,4,0,4,16,33,0.623,10,3,7,1,0.455,0,0,0,1,1,66,1.078,9,14.143,1.399,1.211,0.31,64.0,13.406,194.313,0.463,0.463,7.587,17.466
2,"Jones, Caden",Marian,16,16.0,187.449,0.345,58,16,20,3,0,4,14,35,0.603,3,5,18,0,0.424,0,0,0,0,0,66,1.027,13,13.197,1.372,1.132,0.444,66.5,13.197,180.311,0.446,0.446,6.617,16.496
1,"Young, Owen",HU,10,10.0,186.725,0.412,34,2,14,4,0,0,5,18,0.529,3,0,6,1,0.459,0,0,1,1,0,37,0.988,10,8.27,1.4,1.048,0.5,27.5,8.592,170.088,0.434,0.434,3.326,8.864
2,"Dice, Camden",HU,16,16.0,173.464,0.354,48,8,17,7,0,1,9,27,0.563,3,1,11,0,0.396,1,1,0,0,0,54,0.959,9,10.588,1.313,0.939,0.432,45.5,10.588,161.752,0.411,0.411,3.78,11.863
6,"Goodin, Lucas",IWU,16,16.0,172.482,0.368,57,14,21,2,1,3,14,34,0.596,10,1,7,0,0.471,0,0,3,5,2,68,1.067,15,15.731,1.435,1.263,0.383,70.0,15.172,191.48,0.462,0.462,7.758,17.937
2,"Carr, Austin",Grace,16,12.0,171.214,0.367,49,6,18,3,0,1,16,24,0.49,8,3,18,0,0.483,0,1,0,0,0,61,0.973,14,10.947,1.34,1.094,0.567,50.0,10.947,166.242,0.434,0.434,5.483,14.614
5,"Thixton, Tye",IWU,16,16.0,167.871,0.379,66,24,25,5,0,3,11,39,0.591,7,2,5,2,0.453,0,1,8,8,0,76,1.044,17,17.096,1.423,1.273,0.379,85.5,19.551,185.133,0.454,0.454,8.145,19.521
