### Analysis

As always, we begin by importing the necessary libraries.

In [67]:
import pandas as pd
import pickle

We import our dataframes for batting, pitching, and fielding statistics as well as our list of teams from the pickle file titled `Stats.pkl`.

In [68]:
# NOTE: unpickling can execute arbitrary code, so only load a Stats.pkl you created yourself.
with open('Stats.pkl', 'rb') as stats_file:
    # The file holds four objects pickled back-to-back; they are read in the order
    # they were written: batting, pitching and fielding tables, then the team list.
    dfb, dfp, dff, teams = (pickle.load(stats_file) for _ in range(4))

Here, we create a function to calculate some common baseball statistics as well as a few of the more advanced metrics we will be using in our analysis. This function will be useful later on when we gather our league totals so that we do not have to write up all of these same calculations for the new dataframe.

In [69]:
def calcBatStats(df):
    """Add derived batting statistics to ``df`` in place.

    ``df`` is anything indexed by stat label: in this notebook it is either a
    team batting DataFrame (one row per batter) or a Series of league totals.

    Reads:  H, AB, SBA, SB, SF, SH, BB, HBP, OBP, SLG, 2B, 3B, HR, TB, GDP,
            SO, R, RBI
    Writes: AVG, CS, PA, OPS, 1B, RC, POP, TA, BABIP, HEQO, BSR (in that order)

    Returns None; the caller's object is modified directly.
    """
    hits, at_bats = df["H"], df["AB"]
    # Batting average (recomputed so it is also available for league totals).
    df["AVG"] = round(hits / at_bats, 3)

    # Caught stealing = stolen-base attempts minus successful steals.
    attempts, steals = df["SBA"], df["SB"]
    df["CS"] = round(attempts - steals, 3)

    # Plate appearances.
    sac_flies, sac_hits = df["SF"], df["SH"]
    walks, plunked = df["BB"], df["HBP"]
    df["PA"] = round(at_bats + sac_flies + sac_hits + walks + plunked)

    # On-base plus slugging.
    df["OPS"] = round(df["OBP"] + df["SLG"], 3)

    # Singles = hits that were not doubles, triples or home runs.
    doubles, triples, homers = df["2B"], df["3B"], df["HR"]
    df["1B"] = hits - doubles - triples - homers

    # Runs created.
    total_bases = df["TB"]
    df["RC"] = round((hits + walks) * total_bases / (at_bats + walks), 3)

    # POP: uses the already-rounded OPS and AVG stored above.
    df["POP"] = round(df["OPS"] + df["AVG"], 3)

    # Total average: bases gained per out made.
    caught = df["CS"]
    bases_gained = total_bases + walks + plunked + steals
    outs_made = at_bats - hits + sac_hits + sac_flies + caught + df["GDP"]
    df["TA"] = round(bases_gained / outs_made, 3)

    # Batting average on balls in play.
    balls_in_play = at_bats - df["SO"] - homers + sac_flies
    df["BABIP"] = round((hits - homers) / balls_in_play, 3)

    # Hoban Efficiency Quotient - Offense (left unrounded).
    df["HEQO"] = total_bases + df["R"] + df["RBI"] + steals + (0.5 * walks)

    # Base stealing runs.
    on_base = hits + walks - caught
    advancement = total_bases + (0.7 * steals)
    opportunities = at_bats + walks + caught
    df["BSR"] = round((on_base * advancement) / opportunities, 3)

We run this function on our batting dataframes for each team. Note the newly created columns on the right side of the table. 

In [70]:
# calcBatStats writes its columns into the frame it is given, so every
# team table in dfb is updated in place.
for df in dfb:
    calcBatStats(df)
# Preview the first five rows of the team at index 2.
dfb[2].head()

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
1,"Griffin, Chris",26,25.0,0.39,82,18,32,4,4,4,...,2,101,1.188,20,27.134,1.578,1.442,0.491,99.5,25.455
2,"Enyart, Mitchell",23,22.0,0.352,71,11,25,4,0,1,...,0,86,0.885,20,14.049,1.237,0.863,0.4,65.5,14.356
3,"Harris, Xavier",27,27.0,0.337,86,19,29,3,0,0,...,1,106,0.82,26,13.196,1.157,0.855,0.408,73.5,13.57
4,"Elford, Sid",18,8.0,0.333,36,5,12,2,0,1,...,0,41,0.887,9,6.8,1.22,0.885,0.44,31.0,7.08
5,"Haney, Houston",24,23.0,0.321,78,8,25,7,0,2,...,0,88,0.851,16,13.735,1.172,0.821,0.39,72.5,13.988


#### Crossroads League Statistics

Now that we have a few more advanced individual batting statistics, we need to gather the Crossroads League totals in order to incorporate them into our statistics and eventually rank the various hitters based on different stats.

To do this, we first subset each of the teams' batting table to extract only the Total row at the bottom. 

In [71]:
# From each team table keep only the rows whose "Batting" label contains the
# literal text "Total:" (regex=False, so the text is not treated as a pattern).
bat_totals = []
for team_df in dfb:
    is_total_row = team_df["Batting"].str.contains("Total:", regex=False)
    bat_totals.append(team_df[is_total_row])
bat_totals[2]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
26,Total:,27,,0.292,872,160,255,38,10,14,...,13,1005,0.777,193,127.076,1.069,0.754,0.359,722.5,127.302


Next, we combine each of these rows into a new dataframe named `merged_bat_totals` using `pd.concat`. The first four rows of the new table are shown below.

In [72]:
# Stack the per-team total rows on top of each other; each row keeps the index
# label it had in its own team table.
merged_bat_totals = pd.concat(bat_totals, axis=0)
merged_bat_totals.head(4)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
22,Total:,27,,0.26,861,115,224,32,3,7,...,7,992,0.667,182,92.841,0.927,0.615,0.33,574.5,97.216
28,Total:,27,,0.254,859,132,218,39,1,15,...,4,1001,0.697,163,101.859,0.951,0.643,0.314,615.5,102.027
26,Total:,27,,0.292,872,160,255,38,10,14,...,13,1005,0.777,193,127.076,1.069,0.754,0.359,722.5,127.302
26,Total:,27,,0.294,843,154,248,60,9,8,...,20,1002,0.798,171,131.48,1.092,0.836,0.364,768.0,139.334


Now that we have our Totals table, we add a league-total row at the bottom using `.sum()`, and, since the column labeled "Batting" does not provide any valuable information, we remove that column from the table. Finally, we isolate the totals row at the bottom with `.iloc[-1, :]`, which returns the league batting totals as a Series.

In [73]:
# Build the league ("CL_Total") row from the per-team total rows.
#
# Any existing "CL_Total" row and the "Batting" label column are excluded
# before summing. That makes the cell safe to re-run: a second execution no
# longer folds the previous league row into the new total, no longer fails
# when "Batting" has already been removed, and the sum never includes the
# "Batting" label strings.
team_totals = (
    merged_bat_totals
    .drop(index="CL_Total", errors="ignore")
    .drop(columns="Batting", errors="ignore")
)

# Rebuild the table as "team rows + league row", the same final shape as before.
merged_bat_totals = team_totals.copy()
merged_bat_totals.loc["CL_Total"] = team_totals.sum()

# The league row is the last one; pull it out as a Series indexed by stat name.
CL_bat_totals = merged_bat_totals.iloc[-1, :]

We make a copy to avoid any errors from overwriting the values, and then we calculate a few more statistics and run the `calcBatStats` function on the series. Finally, we print out all of the league totals.

In [102]:
def _league_batting_totals(summed_row):
    """Return a copy of ``summed_row`` with league-wide rate stats filled in.

    The work is done on a copy so the summed totals row that is passed in is
    left untouched.
    """
    league = summed_row.copy()

    # Counting totals that are only read below.
    hits, at_bats = league["H"], league["AB"]
    walks, plunked, sac_flies = league["BB"], league["HBP"], league["SF"]
    singles, doubles = league["1B"], league["2B"]
    triples, homers = league["3B"], league["HR"]

    # CL On-Base Percentage
    times_on_base = hits + walks + plunked
    league["OBP"] = round(times_on_base / (at_bats + walks + plunked + sac_flies), 3)

    # CL Slugging Percentage
    league["SLG"] = round(league["TB"] / at_bats, 3)

    # CL weighted On-Base Average (excluding IBB, formula from https://library.fangraphs.com/offense/woba/)
    woba_numerator = (0.69*walks + 0.72*plunked + 0.89*singles
                      + 1.27*doubles + 1.62*triples + 2.10*homers)
    league["wOBA"] = round(woba_numerator / (at_bats + walks + sac_flies + plunked), 3)

    # CL Runs per Plate Appearance; wRC per PA is set to the same value.
    league["R_per_PA"] = round(league["R"] / league["PA"], 3)
    league["wRC_per_PA"] = league["R_per_PA"]

    # ABF (statistic that incorporates the value of various methods of reaching
    # base with regard to their assistance towards scoring runs)
    # NOTE(review): HBP enters the numerator unweighted while BB is multiplied
    # by 0.33 -- confirm this is intended.
    abf_numerator = (0.47*singles + 0.38*doubles + 0.55*triples
                     + 0.93*homers + 0.33*walks + plunked)
    league["ABF"] = round(abf_numerator / (at_bats - hits), 3)

    # Calculate other totals such as AVG, OPS, POP, etc
    calcBatStats(league)
    return league


CL_tot_b = _league_batting_totals(CL_bat_totals)

In [103]:
# Display every league-wide batting total and rate stat held in CL_tot_b.
CL_tot_b

GP              270.000
GS                0.000
AVG               0.285
AB             8608.000
R              1548.000
H              2452.000
2B              442.000
3B               53.000
HR              180.000
RBI            1379.000
TB             3540.000
SLG               0.411
BB             1062.000
HBP             241.000
SO             1857.000
GDP             122.000
OBP               0.375
SF               96.000
SH              141.000
SB              339.000
SBA             444.000
CS              105.000
PA            10148.000
OPS               0.786
1B             1777.000
RC             1286.407
POP               1.071
TA                0.783
BABIP             0.341
HEQO           7337.000
BSR            1317.321
wOBA              0.351
R_per_PA          0.153
wRC_per_PA        0.153
ABF               0.291
Name: CL_Total, dtype: float64

#### Advanced Statistics

In [98]:
for df in dfb:
    # OPS+: OBP and SLG each divided by the league value; a batter who matches
    # the league in both scores 100.
    obp_ratio = df["OBP"] / CL_tot_b["OBP"]
    slg_ratio = df["SLG"] / CL_tot_b["SLG"]
    df["OPS_plus"] = round(100 * (obp_ratio + slg_ratio - 1), 3)

    # Linear-weights numerator and denominator used by the two columns below.
    weighted_events = (
        (0.69*df["BB"]) + (0.72*df["HBP"]) + (0.89*df["1B"])
        + (1.27*df["2B"]) + (1.62*df["3B"]) + (2.1*df["HR"])
    )
    opportunities = df["AB"] + df["BB"] + df["SF"] + df["HBP"]

    # Weighted On-Base Average
    df["wOBA"] = round(weighted_events / opportunities, 3)

    # Batting Runs
    # NOTE(review): this is exactly the same expression as wOBA, so the two
    # columns are always equal -- looks like a copy/paste placeholder; confirm
    # the intended Batting Runs formula.
    df["Bat_Runs"] = round(weighted_events / opportunities, 3)

    # Weighted Runs Above Average (verify constant)
    woba_above_league = df["wOBA"] - CL_tot_b["wOBA"]
    df["wRAA"] = round((woba_above_league / (1.157)) * df["PA"], 3)

    # Weighted Runs Created: wRAA plus the batter's PA times league runs per PA
    # (taken from the raw R and PA totals, not the rounded R_per_PA).
    league_share = df["PA"] * (CL_tot_b["R"]) / (CL_tot_b["PA"])
    df["wRC"] = round(df["wRAA"] + league_share, 3)

In [99]:
# Preview the first three rows of the team at index 2, including the new columns.
dfb[2].head(3)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,POP,TA,BABIP,HEQO,BSR,OPS_plus,wOBA,wRAA,Bat_Runs,wRC
1,"Griffin, Chris",26,25.0,0.39,82,18,32,4,4,4,...,1.578,1.442,0.491,99.5,25.455,200.847,0.505,13.443,0.505,28.85
2,"Enyart, Mitchell",23,22.0,0.352,71,11,25,4,0,1,...,1.237,0.863,0.4,65.5,14.356,125.466,0.392,3.048,0.392,16.167
3,"Harris, Xavier",27,27.0,0.337,86,19,29,3,0,0,...,1.157,0.855,0.408,73.5,13.57,109.978,0.377,2.382,0.377,18.551


In [100]:
# NOTE: unpickling can execute arbitrary code, so only load a Park_Factor.pkl you created yourself.
# park_factor is indexed by team position below (park_factor[i] pairs with dfb[i]).
with open('Park_Factor.pkl', 'rb') as park_file:
    park_factor = pickle.load(park_file)

In [104]:
# Weighted Runs Created+ (check formula)
for i in range(len(teams)):
    team = dfb[i]
    # Batter's runs above average per PA, plus league runs per PA.
    batter_rate = (team["wRAA"] / team["PA"]) + CL_tot_b["R_per_PA"]
    # Park adjustment: league runs per PA minus the park-scaled league runs per PA.
    park_adjustment = CL_tot_b["R_per_PA"] - (park_factor[i]) * (CL_tot_b["R_per_PA"])
    # Express the park-adjusted rate relative to league wRC per PA, times 100.
    relative_rate = (batter_rate + park_adjustment) / (CL_tot_b["wRC_per_PA"])
    team["wRC_plus"] = round(100 * relative_rate, 3)

In [115]:
# Keep only the rows with at least 10 at-bats in every team table.
for i in range(len(teams)):
    team = dfb[i]
    has_enough_ab = team['AB'] >= 10
    dfb[i] = team[has_enough_ab]

In [117]:
# Display the full batting table for the team at index 2.
dfb[2]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,TA,BABIP,HEQO,BSR,OPS_plus,wOBA,wRAA,Bat_Runs,wRC,wRC_plus
1,"Griffin, Chris",26,25.0,0.39,82,18,32,4,4,4,...,1.442,0.491,99.5,25.455,200.847,0.505,13.443,0.505,28.85,198.283
2,"Enyart, Mitchell",23,22.0,0.352,71,11,25,4,0,1,...,0.863,0.4,65.5,14.356,125.466,0.392,3.048,0.392,16.167,134.455
3,"Harris, Xavier",27,27.0,0.337,86,19,29,3,0,0,...,0.855,0.408,73.5,13.57,109.978,0.377,2.382,0.377,18.551,125.978
4,"Elford, Sid",18,8.0,0.333,36,5,12,2,0,1,...,0.885,0.44,31.0,7.08,125.509,0.393,1.488,0.393,7.742,135.011
5,"Haney, Houston",24,23.0,0.321,78,8,25,7,0,2,...,0.821,0.39,72.5,13.988,115.558,0.366,1.141,0.366,14.565,119.765
6,"Anderson, David",25,25.0,0.312,109,14,34,6,2,0,...,0.628,0.343,74.5,14.998,85.497,0.319,-3.153,0.319,14.237,93.213
7,"Brooks, Ethan",17,10.0,0.294,34,7,10,0,1,3,...,0.96,0.412,37.0,6.451,139.165,0.407,1.742,0.407,7.234,142.917
8,"Snyder, RJ",25,15.0,0.288,59,21,17,2,0,1,...,0.714,0.314,63.5,7.03,75.288,0.304,-2.559,0.304,7.051,84.742
9,"Sapp, Jacob",25,23.0,0.277,65,2,18,4,0,1,...,0.792,0.34,39.5,9.539,99.541,0.356,0.337,0.356,12.235,114.114
10,"Newkirk, Sam",26,25.0,0.236,89,15,21,2,2,1,...,0.583,0.294,58.5,8.263,63.862,0.289,-5.466,0.289,10.093,76.265
