### Analysis

As always, we begin by importing the necessary libraries.

In [1]:
import pandas as pd
import pickle

We import our dataframes for batting, pitching, and fielding statistics as well as our list of teams from the pickle file titled `Stats.pkl`.

In [2]:
# Stats.pkl holds four objects pickled back to back; they are read in the
# order they were written: batting frames, pitching frames, fielding frames,
# and the list of teams.
# NOTE: pickle can execute arbitrary code while loading, so only open
# pickle files that come from a trusted source.
with open('Stats.pkl', 'rb') as f:
    dfb, dfp, dff, teams = [pickle.load(f) for _ in range(4)]

Here, we create a function to calculate some common baseball statistics as well as a few of the more advanced metrics we will be using in our analysis. This function will be useful later on when we gather our league totals so that we do not have to write up all of these same calculations for the new dataframe.

In [3]:
def calcBatStats(df):
    """Add derived batting statistics to ``df`` in place.

    ``df`` only needs to support ``df["COL"]`` reads and writes; the notebook
    passes both per-team DataFrames and a league-total Series.

    Columns read: H, AB, SBA, SB, SF, SH, BB, HBP, OBP, SLG, 2B, 3B, HR, TB,
    SO, GDP, R, RBI.

    Columns written (in this order): AVG, CS, PA, OPS, 1B, RC, POP, TA,
    BABIP, HEQO, BSR. Rate statistics are rounded to three decimals.

    Returns None; the argument itself is modified.
    """
    def r3(value):
        # Every rate statistic below is reported to three decimal places.
        return round(value, 3)

    # Raw counting stats that appear in several formulas.
    hits = df["H"]
    at_bats = df["AB"]
    walks = df["BB"]
    hit_by_pitch = df["HBP"]
    sac_flies = df["SF"]
    sac_hits = df["SH"]
    home_runs = df["HR"]
    total_bases = df["TB"]
    steals = df["SB"]

    # Batting Average (recomputed so it is also available for league totals)
    df["AVG"] = r3(hits / at_bats)

    # Caught Stealing = stolen-base attempts minus successful steals
    df["CS"] = r3(df["SBA"] - steals)
    caught = df["CS"]

    # Plate Appearances
    df["PA"] = round(at_bats + sac_flies + sac_hits + walks + hit_by_pitch)

    # OPS = on-base percentage plus slugging percentage
    df["OPS"] = r3(df["OBP"] + df["SLG"])

    # Singles = hits that were not doubles, triples or home runs
    df["1B"] = hits - df["2B"] - df["3B"] - home_runs

    # Runs Created
    times_on_base = hits + walks
    df["RC"] = r3(times_on_base * total_bases / (at_bats + walks))

    # POP = OPS plus batting average (uses the rounded values stored above)
    df["POP"] = r3(df["OPS"] + df["AVG"])

    # Total Average = bases gained divided by outs made
    bases_gained = total_bases + walks + hit_by_pitch + steals
    outs_made = at_bats - hits + sac_hits + sac_flies + caught + df["GDP"]
    df["TA"] = r3(bases_gained / outs_made)

    # Batting Average on Balls in Play
    balls_in_play = at_bats - df["SO"] - home_runs + sac_flies
    df["BABIP"] = r3((hits - home_runs) / balls_in_play)

    # Hoban Efficiency Quotient - Offense (left unrounded)
    df["HEQO"] = total_bases + df["R"] + df["RBI"] + steals + (0.5 * walks)

    # Base Stealing Runs
    advancement = total_bases + (0.7 * steals)
    opportunities = at_bats + walks + caught
    df["BSR"] = r3(((times_on_base - caught) * advancement) / opportunities)

We run this function on our batting dataframes for each team. Note the newly created columns on the right side of the table. 

In [4]:
# calcBatStats writes its new columns straight into each team's frame,
# so nothing needs to be collected from the loop.
for df in dfb:
    calcBatStats(df)

# Preview the first five rows of one team's frame.
dfb[2].head(5)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
1,"Griffin, Chris",26,25.0,0.39,82,18,32,4,4,4,...,2,101,1.188,20,27.134,1.578,1.442,0.491,99.5,25.455
2,"Enyart, Mitchell",23,22.0,0.352,71,11,25,4,0,1,...,0,86,0.885,20,14.049,1.237,0.863,0.4,65.5,14.356
3,"Harris, Xavier",27,27.0,0.337,86,19,29,3,0,0,...,1,106,0.82,26,13.196,1.157,0.855,0.408,73.5,13.57
4,"Elford, Sid",18,8.0,0.333,36,5,12,2,0,1,...,0,41,0.887,9,6.8,1.22,0.885,0.44,31.0,7.08
5,"Haney, Houston",24,23.0,0.321,78,8,25,7,0,2,...,0,88,0.851,16,13.735,1.172,0.821,0.39,72.5,13.988


#### Crossroads League Statistics

Now that we have a few more advanced individual batting statistics, we need to gather the Crossroads League totals in order to incorporate them into our statistics and eventually rank the various hitters based on different stats.

To do this, we first subset each of the teams' batting table to extract only the Total row at the bottom. 

In [5]:
# For every team frame keep only the row(s) whose Batting label contains the
# literal text "Total:" (regex=False, so the text is matched as-is).
bat_totals = [
    team_batting.loc[team_batting["Batting"].str.contains("Total:", regex=False)]
    for team_batting in dfb
]
bat_totals[2]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
26,Total:,27,,0.292,872,160,255,38,10,14,...,13,1005,0.777,193,127.076,1.069,0.754,0.359,722.5,127.302


Next, we combine each of these rows into a new dataframe named `merged_bat_totals` using the code `pd.concat`. Below we show the first four rows of this new table.

In [6]:
# Stack the per-team total rows vertically; each row keeps the index label it
# had in its own team frame, so labels can repeat.
merged_bat_totals = pd.concat(bat_totals, axis=0)
merged_bat_totals.head(4)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
22,Total:,27,,0.26,861,115,224,32,3,7,...,7,992,0.667,182,92.841,0.927,0.615,0.33,574.5,97.216
28,Total:,27,,0.254,859,132,218,39,1,15,...,4,1001,0.697,163,101.859,0.951,0.643,0.314,615.5,102.027
26,Total:,27,,0.292,872,160,255,38,10,14,...,13,1005,0.777,193,127.076,1.069,0.754,0.359,722.5,127.302
26,Total:,27,,0.294,843,154,248,60,9,8,...,20,1002,0.798,171,131.48,1.092,0.836,0.364,768.0,139.334


Now that we have our Totals table, we add a league-total row at the bottom using `.sum()`. Since the column labeled "Batting" does not provide any valuable information, we remove it from the table. Then, we isolate the totals row at the bottom with `.iloc[-1, :]`, which returns the league batting totals as a Series.

In [7]:
# Build the league-wide "CL_Total" row from the team rows only.
#
# The previous version of this cell could not be re-run: a second execution
# summed the existing "CL_Total" row into the new total (doubling every
# value) and then failed on the already-deleted "Batting" column. It also
# "summed" the text labels in "Batting". Dropping both with errors="ignore"
# makes the cell safe to run any number of times and keeps the sum numeric.
team_totals = merged_bat_totals.drop(index="CL_Total", errors="ignore")
team_totals = team_totals.drop(columns="Batting", errors="ignore")

# Same end state as before: merged_bat_totals has no "Batting" column and
# ends with a "CL_Total" row holding the column sums.
merged_bat_totals = team_totals.copy()
merged_bat_totals.loc["CL_Total"] = team_totals.sum()

# Selecting a single row with .iloc[-1, :] returns it as a Series.
CL_bat_totals = merged_bat_totals.iloc[-1, :]

We make a copy to avoid any errors from overwriting the values, and then we calculate a few more statistics and run the `calcBatStats` function on the series. Finally, we print out the first few league totals.

In [32]:
# Work on a copy so the summed totals in CL_bat_totals are left untouched.
CL_tot_b = CL_bat_totals.copy()

# League counting stats used by several of the formulas below.
lg_ab, lg_h, lg_bb = CL_tot_b["AB"], CL_tot_b["H"], CL_tot_b["BB"]
lg_hbp, lg_sf = CL_tot_b["HBP"], CL_tot_b["SF"]
lg_1b, lg_2b, lg_3b, lg_hr = (CL_tot_b[col] for col in ("1B", "2B", "3B", "HR"))

# CL On-Base Percentage
CL_tot_b["OBP"] = round((lg_h + lg_bb + lg_hbp) / (lg_ab + lg_bb + lg_hbp + lg_sf), 3)

# CL Slugging Percentage
CL_tot_b["SLG"] = round(CL_tot_b["TB"] / lg_ab, 3)

# CL weighted On-Base Average
# (excluding IBB, formula from https://library.fangraphs.com/offense/woba/)
lg_weighted_events = (
    0.69*lg_bb + 0.72*lg_hbp + 0.89*lg_1b + 1.27*lg_2b + 1.62*lg_3b + 2.10*lg_hr
)
CL_tot_b["wOBA"] = round(lg_weighted_events / (lg_ab + lg_bb + lg_sf + lg_hbp), 3)

# CL Runs per Plate Appearance; the same rounded value is stored under
# wRC_per_PA as well.
lg_runs_per_pa = round(CL_tot_b["R"] / CL_tot_b["PA"], 3)
CL_tot_b["R_per_PA"] = lg_runs_per_pa
CL_tot_b["wRC_per_PA"] = lg_runs_per_pa

# ABF (statistic that incorporates the value of various methods of reaching
# base with regard to their assistance towards scoring runs)
lg_abf_numerator = (
    0.47*lg_1b + 0.38*lg_2b + 0.55*lg_3b + 0.93*lg_hr + 0.33*lg_bb + lg_hbp
)
CL_tot_b["ABF"] = round(lg_abf_numerator / (lg_ab - lg_h), 3)

# Recompute AVG, OPS, POP, etc. from the league sums (run after OBP and SLG
# have been replaced above, since OPS is built from them).
calcBatStats(CL_tot_b)

# Games Started does not apply to league totals, so drop that entry.
CL_tot_b = CL_tot_b.drop(labels="GS")

In [33]:
# Show the first five league totals.
CL_tot_b.head(5)

GP      270.000
AVG       0.285
AB     8608.000
R      1548.000
H      2452.000
Name: CL_Total, dtype: float64

#### Advanced Statistics

In [10]:
# League baselines do not change inside the loop, so read them once.
league_obp = CL_tot_b["OBP"]
league_slg = CL_tot_b["SLG"]
league_woba = CL_tot_b["wOBA"]
league_runs = CL_tot_b["R"]
league_pa = CL_tot_b["PA"]
WOBA_SCALE = 1.157  # divisor used in wRAA (verify constant)

for df in dfb:
    # Weighted offensive events and their denominator.
    weighted_events = (
        (0.69*df["BB"]) + (0.72*df["HBP"]) + (0.89*df["1B"])
        + (1.27*df["2B"]) + (1.62*df["3B"]) + (2.1*df["HR"])
    )
    woba_denominator = df["AB"] + df["BB"] + df["SF"] + df["HBP"]

    #OPS+
    df["OPS_plus"] = round(100 * ((df["OBP"]/league_obp) + (df["SLG"]/league_slg) - 1), 3)

    #Weighted On-Base Average
    df["wOBA"] = round(weighted_events / woba_denominator, 3)

    #Batting Runs
    # NOTE(review): this is exactly the wOBA expression, so Bat_Runs always
    # equals wOBA. It looks like a copy-paste of the line above; confirm the
    # intended Batting Runs formula. Values are left as they were.
    df["Bat_Runs"] = round(weighted_events / woba_denominator, 3)

    #Weighted Runs Above Average
    df["wRAA"] = round(((df["wOBA"] - league_woba) / (WOBA_SCALE)) * df["PA"], 3)

    #Weighted Runs Created
    df["wRC"] = round(df["wRAA"] + (df["PA"] * league_runs / league_pa), 3)

In [11]:
# Preview the first three rows of one team's frame, including the new columns.
dfb[2].head(3)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,POP,TA,BABIP,HEQO,BSR,OPS_plus,wOBA,Bat_Runs,wRAA,wRC
1,"Griffin, Chris",26,25.0,0.39,82,18,32,4,4,4,...,1.578,1.442,0.491,99.5,25.455,200.847,0.505,0.505,13.443,28.85
2,"Enyart, Mitchell",23,22.0,0.352,71,11,25,4,0,1,...,1.237,0.863,0.4,65.5,14.356,125.466,0.392,0.392,3.048,16.167
3,"Harris, Xavier",27,27.0,0.337,86,19,29,3,0,0,...,1.157,0.855,0.408,73.5,13.57,109.978,0.377,0.377,2.382,18.551


In [12]:
# Load the park factors; the wRC+ cell indexes them by team position
# (park_factor[i] alongside dfb[i]).
# NOTE: pickle can execute arbitrary code while loading, so only open
# pickle files that come from a trusted source.
with open('Park_Factor.pkl', mode='rb') as f:
    park_factor = pickle.load(f)

In [13]:
#Weighted Runs Created+ (check formula)
# League rates are the same for every team, so read them once.
league_r_per_pa = CL_tot_b["R_per_PA"]
league_wrc_per_pa = CL_tot_b["wRC_per_PA"]

for i in range(len(teams)):
    team_batting = dfb[i]
    # Player's runs above average per PA, added to the league run rate.
    player_run_rate = (team_batting["wRAA"] / team_batting["PA"]) + league_r_per_pa
    # Park adjustment: league run rate minus the park-scaled league run rate.
    park_adjustment = league_r_per_pa - (park_factor[i]) * league_r_per_pa
    # Expressed relative to the league wRC per PA and scaled by 100.
    team_batting["wRC_plus"] = round(100 * ((player_run_rate + park_adjustment) / league_wrc_per_pa), 3)

In [16]:
# Keep only rows with at least 27 at bats (1 per game); each entry of dfb is
# replaced by its filtered version.
for i in range(len(teams)):
    team_batting = dfb[i]
    has_enough_at_bats = team_batting['AB'] >= 27
    dfb[i] = team_batting.loc[has_enough_at_bats]

In [29]:
# Collect every team's rows in one dataframe.
all_hitters = pd.concat(dfb)

# Drop the summary rows (team total and opponents) so only players remain.
is_summary_row = all_hitters["Batting"].isin(["Total:", "Opponents:"])
all_hitters = all_hitters.loc[~is_summary_row]

# Verify the totals and opponents rows are gone.
all_hitters.tail(2)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,TA,BABIP,HEQO,BSR,OPS_plus,wOBA,Bat_Runs,wRAA,wRC,wRC_plus
12,"Mcgill, Christian",21,13.0,0.217,46,15,10,2,0,0,...,0.659,0.256,43.0,3.837,58.437,0.295,0.295,-2.856,6.144,34.723
13,"Kennedy, Andrew",24,16.0,0.204,54,6,11,2,0,0,...,0.545,0.289,33.0,3.984,48.771,0.28,0.28,-3.989,5.926,26.251


In [28]:
# Rank hitters by wRC+ from highest to lowest.
top_hitters = all_hitters.sort_values("wRC_plus", ascending=False)

# Top 10 hitters.
top_hitters.head(10)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,TA,BABIP,HEQO,BSR,OPS_plus,wOBA,Bat_Runs,wRAA,wRC,wRC_plus
5,"Eder, Caleb",27,27.0,0.348,92,27,32,5,1,13,...,1.803,0.333,149.5,39.023,239.659,0.549,0.549,20.878,39.488,208.274
1,"Griffin, Chris",26,25.0,0.39,82,18,32,4,4,4,...,1.442,0.491,99.5,25.455,200.847,0.505,0.505,13.443,28.85,198.283
3,"English, Ethan",27,27.0,0.352,105,26,37,5,0,14,...,1.493,0.324,154.5,35.338,211.714,0.518,0.518,17.754,36.517,190.764
4,"Myers, Ashton",27,27.0,0.373,102,26,38,7,2,7,...,1.265,0.425,135.0,29.115,187.977,0.484,0.484,13.679,31.832,189.416
2,"Glover, Jake",27,27.0,0.408,98,18,40,9,2,2,...,1.328,0.463,110.0,30.55,178.739,0.471,0.471,12.342,30.495,182.073
4,"Hansen, Jacob",15,12.0,0.372,43,6,16,5,0,1,...,1.103,0.484,46.0,11.404,155.5,0.439,0.439,3.803,11.43,176.51
6,"Lichty, Daniel",27,27.0,0.344,96,28,33,5,4,1,...,1.121,0.386,109.5,22.867,144.621,0.422,0.422,7.18,25.027,166.907
4,"Jones, Caden",25,20.0,0.308,65,14,20,5,0,4,...,1.0,0.291,66.0,14.192,146.443,0.417,0.417,4.506,16.557,157.89
11,"Natividad, Andrew",21,12.0,0.293,41,8,12,5,0,0,...,0.912,0.364,35.0,6.8,127.107,0.405,0.405,2.66,11.355,157.298
3,"Clark, Donovan",27,26.0,0.382,89,24,34,5,0,0,...,1.194,0.531,97.5,22.068,129.503,0.403,0.403,4.719,20.736,156.172
