# Baseball Analytics 

#### Individual Statistics

In [167]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [168]:
# Each team's short label paired with the numeric id used in the `team=` URL
# parameter; the two lists below stay index-aligned.
_TEAM_IDS = [
    ("Bethel", 1629),
    ("Goshen", 1678),
    ("Grace", 1679),
    ("HU", 1688),
    ("IWU", 1694),
    ("Marian", 1717),
    ("MVNU", 1736),
    ("SAU", 1780),
    ("SFU", 1805),
    ("Taylor", 1784),
]
teams = [label for label, _ in _TEAM_IDS]
t_nums = [team_id for _, team_id in _TEAM_IDS]

In [169]:
# One IndividualStats URL per team id in t_nums.
urls = ['http://www.dakstats.com/WebSync/Pages/Team/IndividualStats.aspx?association=10&sg=MBA&conference=NAIMBA1_CROSS&team=' + str(num) + '&sea=NAIMBA_2019' for num in t_nums]
# Download every page. The timeout keeps a single unresponsive request from
# hanging the notebook, and raise_for_status() stops here on an HTTP error
# instead of letting an error page reach the table-parsing cells.
pages = []
for url in urls:
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    pages.append(page)
# Parse each response body into a BeautifulSoup tree.
# NOTE(review): no parser is named, so BeautifulSoup chooses one itself —
# consider pinning it once the parser used for the original run is confirmed.
soups = [BeautifulSoup(page.content) for page in pages]

In [170]:
def _stat_table_rows(table):
    """Return the table's <tr> rows as lists of stripped <td> cell texts."""
    return [
        [cell.get_text(strip=True) for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]

# stat_tables[page][table][row][cell]: every table carrying the
# "gridViewReportBuilderWide" class on each downloaded page, as plain text.
stat_tables = [
    [_stat_table_rows(table) for table in soup.find_all('table',{"class":"gridViewReportBuilderWide"})]
    for soup in soups
]

In [171]:
def nice_data(d):
    """Convert ``d`` to a number when possible, otherwise return it unchanged.

    ``int(d)`` is tried first, then ``float(d)``. A value that neither
    conversion accepts (a player name, an empty string, ``None``, ...) is
    returned as is.
    """
    # Swallow only genuine conversion failures; a bare ``except`` would also
    # trap KeyboardInterrupt/SystemExit and hide unrelated errors.
    conversion_errors = (ValueError, TypeError, OverflowError)
    try:
        return int(d)
    except conversion_errors:
        pass
    # Not an integer, so it might be a float.
    try:
        return float(d)
    except conversion_errors:
        return d

In [172]:
# Column labels applied to the batting table.
bat_headers = [
    "Batting", "GP", "GS", "AVG", "AB", "R", "H", "2B", "3B", "HR", "RBI",
    "TB", "SLG", "BB", "HBP", "SO", "GDP", "OBP", "SF", "SH", "SB", "SBA",
]
# Table 0 of each team's scraped stat tables is taken as the batting table.
bat_rows = [stat_tables[team_idx][0] for team_idx, _ in enumerate(teams)]

In [173]:
# One batting DataFrame per team, built from the scraped rows.
dfb = [pd.DataFrame(columns = bat_headers, data = bat_rows[i]) for i in range(len(teams))]
# Remove the first (empty) row. The explicit .copy() makes every frame an
# independent object, so the in-place column assignments done by later cells
# do not write into a slice of another frame (SettingWithCopyWarning).
dfb = [df.iloc[1:].copy() for df in dfb]
dfb[2][:5]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,SLG,BB,HBP,SO,GDP,OBP,SF,SH,SB,SBA
1,"Harris, Xavier",44,44,0.336,137,25,46,8,0,0,...,0.394,21,8,31,3,0.446,2,1,4,6
2,"Griffin, Chris",43,41,0.333,132,28,44,8,5,5,...,0.583,28,4,32,0,0.463,0,1,0,2
3,"Anderson, David",42,42,0.322,174,21,56,8,2,0,...,0.391,10,0,15,5,0.355,2,1,4,7
4,"Sapp, Jacob",39,37,0.302,106,3,32,4,0,1,...,0.368,14,5,20,1,0.405,1,2,0,0
5,"Enyart, Mitchell",35,31,0.299,107,15,32,4,0,1,...,0.364,12,1,20,1,0.372,1,4,1,1


In [174]:
# Every column except the leading "Batting" name column holds numbers scraped
# as text. Deriving the list from bat_headers keeps it in sync with the frames'
# columns instead of repeating a 21-item literal twice.
bat_numeric_cols = bat_headers[1:]
# Work on independent copies so the column assignment below never writes into
# a slice of another frame (avoids SettingWithCopyWarning).
dfb = [df.copy() for df in dfb]
for df in dfb:
    df[bat_numeric_cols] = df[bat_numeric_cols].apply(pd.to_numeric)
dfb[2][:5]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,SLG,BB,HBP,SO,GDP,OBP,SF,SH,SB,SBA
1,"Harris, Xavier",44,44.0,0.336,137,25,46,8,0,0,...,0.394,21,8,31,3,0.446,2,1,4,6
2,"Griffin, Chris",43,41.0,0.333,132,28,44,8,5,5,...,0.583,28,4,32,0,0.463,0,1,0,2
3,"Anderson, David",42,42.0,0.322,174,21,56,8,2,0,...,0.391,10,0,15,5,0.355,2,1,4,7
4,"Sapp, Jacob",39,37.0,0.302,106,3,32,4,0,1,...,0.368,14,5,20,1,0.405,1,2,0,0
5,"Enyart, Mitchell",35,31.0,0.299,107,15,32,4,0,1,...,0.364,12,1,20,1,0.372,1,4,1,1


In [175]:
# Column labels applied to the pitching table.
pitch_headers = [
    "Pitching", "ERA", "W", "L", "GP", "GS", "CG", "SHO", "CBO", "SV", "IP",
    "H", "R", "ER", "BB", "SO", "2B", "3B", "HR", "TBF", "B_AVG", "WP", "HBP",
    "BK", "SFA", "SHA",
]
# Table 1 of each team's scraped stat tables is taken as the pitching table.
pitch_rows = [stat_tables[team_idx][1] for team_idx, _ in enumerate(teams)]

In [176]:
# Build each team's pitching frame and drop its first row in a single pass.
dfp = [
    pd.DataFrame(data=pitch_rows[team_idx], columns=pitch_headers).iloc[1:]
    for team_idx in range(len(teams))
]
dfp[2].head(5)

Unnamed: 0,Pitching,ERA,W,L,GP,GS,CG,SHO,CBO,SV,...,2B,3B,HR,TBF,B_AVG,WP,HBP,BK,SFA,SHA
1,"Haney, Houston",3.64,3,3,11,8,3,0,0,0,...,17,2,3,248,0.322,1,2,0,2,1
2,"Noska, Jordan",3.71,2,0,14,0,0,0,0,1,...,6,1,2,125,0.355,2,4,1,3,3
3,"Peterson, Ike",5.4,0,1,13,0,0,0,0,1,...,3,2,0,93,0.277,3,4,2,1,2
4,"Anderson, David",5.64,3,4,11,7,1,0,0,1,...,15,5,3,215,0.314,3,3,0,1,2
5,"Hammel, Jacob",6.43,2,0,10,0,0,0,0,1,...,5,0,0,62,0.212,4,2,0,0,1


In [177]:
# Column labels applied to the fielding table.
field_headers = [
    "Fielding", "GP", "GS", "C", "PO", "A", "E", "FLD_pct", "DP", "TP",
    "SBA", "RCS", "SB_pct", "PB", "CI", "OBS",
]
# Table 2 of each team's scraped stat tables is taken as the fielding table.
field_rows = [stat_tables[team_idx][2] for team_idx, _ in enumerate(teams)]

In [178]:
# Build each team's fielding frame and drop its first row in a single pass.
dff = [
    pd.DataFrame(data=field_rows[team_idx], columns=field_headers).iloc[1:]
    for team_idx in range(len(teams))
]
dff[2].head(5)

Unnamed: 0,Fielding,GP,GS,C,PO,A,E,FLD_pct,DP,TP,SBA,RCS,SB_pct,PB,CI,OBS
1,"Peebles, Joey",8,3,1,0,1,0,1.0,0,0,0,0,0.0,0,0,0
2,"Clark, Scottie",10,5,8,0,8,0,1.0,0,0,0,0,0.0,0,0,0
3,"Swartzentruber, Logan",13,9,5,1,4,0,1.0,0,0,0,0,0.0,0,0,0
4,"Hammel, Jacob",10,0,4,0,4,0,1.0,0,0,0,0,0.0,0,0,0
5,"Noska, Jordan",14,0,5,1,4,0,1.0,0,0,0,0,0.0,0,0,0


In [225]:
def calcBatStats(df):
    """Add derived batting statistics to ``df`` in place and return None.

    ``df`` is indexed by stat name (``df["AB"]``), so it can be a DataFrame
    with one row per player or a Series of totals.

    Reads:  AB, H, BB, TB, SB, SBA, HR, 2B, 3B, SF, SH, HBP, SO, GDP, R, RBI,
            OBP, SLG.
    Writes: AVG (overwritten), CS, PA, OPS, 1B, RC, POP, TA, BABIP, HEQO, BSR.
    """
    # Counting stats that appear in several formulas.
    at_bats, hits, walks = df["AB"], df["H"], df["BB"]
    total_bases, steals, homers = df["TB"], df["SB"], df["HR"]
    sac_flies, sac_hits, hit_by_pitch = df["SF"], df["SH"], df["HBP"]

    # Batting average, recomputed from H and AB (useful later for league totals).
    df["AVG"] = round(hits / at_bats, 3)
    # Caught stealing = steal attempts - successful steals.
    df["CS"] = round(df["SBA"] - steals, 3)
    # Plate appearances.
    df["PA"] = round(at_bats + sac_flies + sac_hits + walks + hit_by_pitch)
    # On-base plus slugging; uses the OBP and SLG already stored in df.
    df["OPS"] = round(df["OBP"] + df["SLG"], 3)
    # Singles = hits that were not extra-base hits.
    df["1B"] = hits - df["2B"] - df["3B"] - homers
    # Runs created.
    df["RC"] = round((hits + walks) * total_bases / (at_bats + walks), 3)
    # POP = OPS + AVG (both computed above).
    df["POP"] = round(df["OPS"] + df["AVG"], 3)

    caught = df["CS"]
    # Total average: bases gained over the outs-related terms below.
    ta_denominator = at_bats - hits + sac_hits + sac_flies + caught + df["GDP"]
    df["TA"] = round((total_bases + walks + hit_by_pitch + steals) / ta_denominator, 3)
    # Batting average on balls in play.
    df["BABIP"] = round((hits - homers) / (at_bats - df["SO"] - homers + sac_flies), 3)
    # Hoban Efficiency Quotient - Offense.
    df["HEQO"] = total_bases + df["R"] + df["RBI"] + steals + (0.5 * walks)
    # Base Stealing Runs.
    df["BSR"] = round(((hits + walks - caught) * (total_bases + (0.7 * steals))) / (at_bats + walks + caught), 3)

In [226]:
# calcBatStats writes its derived columns into each team's frame in place.
for df in dfb:
    calcBatStats(df)
dfb[2].head(5)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
1,"Harris, Xavier",44,44.0,0.336,137,25,46,8,0,0,...,2,169,0.84,38,22.899,1.176,0.879,0.426,116.5,23.075
2,"Griffin, Chris",43,41.0,0.333,132,28,44,8,5,5,...,2,165,1.046,26,34.65,1.379,1.198,0.411,146.0,33.272
3,"Anderson, David",42,42.0,0.322,174,21,56,8,2,0,...,3,187,0.746,46,24.391,1.068,0.636,0.348,120.0,23.852
4,"Sapp, Jacob",39,37.0,0.302,106,3,32,4,0,1,...,0,128,0.773,27,14.95,1.075,0.744,0.36,61.0,14.95
5,"Enyart, Mitchell",35,31.0,0.299,107,15,32,4,0,1,...,0,125,0.736,27,14.42,1.035,0.654,0.356,85.0,14.679


#### Crossroads League Statistics

In [180]:
# From every team's batting frame keep only the rows whose "Batting" label
# contains the literal text "Total:".
bat_totals = [
    team_batting.loc[team_batting["Batting"].str.contains("Total:", regex = False)]
    for team_batting in dfb
]
bat_totals[2]

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
26,Total:,44,,0.284,1418,258,403,60,14,19,...,19,1640,0.751,310,192.815,1.035,0.722,0.348,1151.0,194.507


In [198]:
# Stack the per-team total rows into one frame (original row labels are kept).
merged_bat_totals = pd.concat(bat_totals, axis=0)
merged_bat_totals.head(4)

Unnamed: 0,Batting,GP,GS,AVG,AB,R,H,2B,3B,HR,...,CS,PA,OPS,1B,RC,POP,TA,BABIP,HEQO,BSR
27,Total:,40,,0.263,1235,177,325,47,6,10,...,12,1445,0.688,262,140.604,0.951,0.654,0.333,864.5,147.068
28,Total:,49,,0.263,1538,230,404,69,1,23,...,9,1790,0.704,311,185.131,0.967,0.65,0.325,1091.5,186.233
26,Total:,44,,0.284,1418,258,403,60,14,19,...,19,1640,0.751,310,192.815,1.035,0.722,0.348,1151.0,194.507
30,Total:,42,,0.273,1266,231,345,75,10,14,...,33,1534,0.759,246,175.636,1.032,0.805,0.336,1111.5,186.944


In [199]:
# Start from the per-team rows only, so that re-running this cell neither
# double-counts a previously appended "CL_Total" row nor fails on the
# already-removed "Batting" column.
team_bat_totals = (
    merged_bat_totals
    .drop(index="CL_Total", errors="ignore")
    .drop(columns="Batting", errors="ignore")
)
# Append the column-wise sum as the "CL_Total" row. Summed rate columns
# (AVG, SLG, OBP, ...) are not meaningful as sums; they are recomputed from
# the counting totals in a later cell.
merged_bat_totals = team_bat_totals.copy()
merged_bat_totals.loc["CL_Total"] = team_bat_totals.sum()
CL_bat_totals = merged_bat_totals.iloc[-1,:]
CL_bat_totals

GP         492.000
GS           0.000
AVG          2.785
AB       15138.000
R         2685.000
H         4225.000
2B         767.000
3B          79.000
HR         262.000
RBI       2365.000
TB        5936.000
SLG          3.903
BB        1941.000
HBP        463.000
SO        3277.000
GDP        202.000
OBP          3.733
SF         182.000
SH         239.000
SB         605.000
SBA        809.000
CS         204.000
PA       17963.000
OPS          7.636
1B        3117.000
RC        2152.233
POP         10.421
TA           7.606
BABIP        3.360
HEQO     12561.500
BSR       2201.824
Name: CL_Total, dtype: float64

In [233]:
CL_tot_b = CL_bat_totals.copy()

# League-wide counting totals reused by the rate formulas below.
_cl_ab, _cl_h, _cl_bb = CL_tot_b["AB"], CL_tot_b["H"], CL_tot_b["BB"]
_cl_hbp, _cl_sf = CL_tot_b["HBP"], CL_tot_b["SF"]
_cl_1b, _cl_2b, _cl_3b, _cl_hr = (CL_tot_b[hit_type] for hit_type in ("1B", "2B", "3B", "HR"))

# CL On-Base Percentage
_times_on_base = _cl_h + _cl_bb + _cl_hbp
CL_tot_b["OBP"] = round(_times_on_base / (_cl_ab + _cl_bb + _cl_hbp + _cl_sf), 3)

# CL Slugging Percentage
CL_tot_b["SLG"] = round(CL_tot_b["TB"] / _cl_ab, 3)

# CL weighted On-Base Average (excluding IBB, formula from https://library.fangraphs.com/offense/woba/)
_woba_numerator = 0.69*_cl_bb + 0.72*_cl_hbp + 0.89*_cl_1b + 1.27*_cl_2b + 1.62*_cl_3b + 2.10*_cl_hr
CL_tot_b["wOBA"] = round(_woba_numerator / (_cl_ab + _cl_bb + _cl_sf + _cl_hbp), 3)

# CL Runs per Plate Appearance
CL_tot_b["R_per_PA"] = round(CL_tot_b["R"] / CL_tot_b["PA"], 3)

# ABF
# NOTE(review): 2B is weighted below 1B and HBP carries no weight here —
# confirm these coefficients against the intended ABF definition.
_abf_numerator = 0.47*_cl_1b + 0.38*_cl_2b + 0.55*_cl_3b + 0.93*_cl_hr + 0.33*_cl_bb + _cl_hbp
CL_tot_b["ABF"] = round(_abf_numerator / (_cl_ab - _cl_h), 3)

# Recompute AVG, OPS, POP, etc. from the league totals; this runs after OBP and
# SLG are fixed above because calcBatStats reads them.
calcBatStats(CL_tot_b)
CL_tot_b

GP            492.000
GS              0.000
AVG             0.279
AB          15138.000
R            2685.000
H            4225.000
2B            767.000
3B             79.000
HR            262.000
RBI          2365.000
TB           5936.000
SLG             0.392
BB           1941.000
HBP           463.000
SO           3277.000
GDP           202.000
OBP             0.374
SF            182.000
SH            239.000
SB            605.000
SBA           809.000
CS            204.000
PA          17963.000
OPS             0.766
1B           3117.000
RC           2143.063
POP             1.045
TA              0.762
BABIP           0.336
HEQO        12561.500
BSR          2193.794
wOBA            0.344
R_per_PA        0.149
ABF             0.288
Name: CL_Total, dtype: float64

#### Park Factor

In [49]:
# One TeamSchedule URL per team id in t_nums.
urls = ['http://www.dakstats.com/WebSync/Pages/Team/TeamSchedule.aspx?association=10&sg=MBA&sea=NAIMBA_2019&team=' +
        str(num) for num in t_nums]
# Download every page. The timeout keeps a single unresponsive request from
# hanging the notebook, and raise_for_status() stops here on an HTTP error
# instead of letting an error page reach the table-parsing cells.
pages = []
for url in urls:
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    pages.append(page)
# Parse each response body into a BeautifulSoup tree.
# NOTE(review): no parser is named, so BeautifulSoup chooses one itself; the
# fixed table index used further down may depend on that choice — confirm
# before pinning a parser.
soups = [BeautifulSoup(page.content) for page in pages]

The code below collects all of the html tables from the different teams' webpages on DakStats.

In [50]:
def _schedule_table_rows(table):
    """Return the table's <tr> rows as lists of stripped <td> cell texts."""
    return [
        [cell.get_text(strip=True) for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]

# team_tables[page][table][row][cell]: every <table> on each team's page.
team_tables = [
    [_schedule_table_rows(table) for table in soup.find_all('table')]
    for soup in soups
]

The for loop below lets us locate the tables on the webpage that contain the data we are interested in. Counting from zero, the headers are in table 33 and the actual data is in table 35 (the index used in the code further down). We assume that these positions are the same for all teams.

In [51]:
# Exploratory loop over the tables of the page at index 2; with the print
# commented out it does nothing.
for i in range(len(team_tables[2])):
  #print(i, team_tables[2][i])
  # The line above is commented out because we only needed to run it once to find the location of the data on the webpage.
  pass

Next, we define the column names for our dataframe.

In [53]:
# Every team gets its own (separate) copy of the same five column names.
_SCHEDULE_COLUMNS = ('Date', 'Opponent', 'Location', 'Score', 'Outcome')
headers = [list(_SCHEDULE_COLUMNS) for _ in team_tables]
headers[2]

['Date', 'Opponent', 'Location', 'Score', 'Outcome']

Here, we collected the data into the list `team_rows`. We used the code `[:5]` to take only the first 5 columns of data and we used the code `[1::2]` to collect the data from every other row, since between each list of data there is an empty list.

In [54]:
_SCHEDULE_TABLE_INDEX = 35            # position of the schedule table on each page
_EVERY_OTHER_ROW = slice(1, None, 2)  # rows 1, 3, 5, ... (skips the empty rows in between)
_FIRST_FIVE_CELLS = slice(None, 5)

team_rows = [
    [row[_FIRST_FIVE_CELLS] for row in tables[_SCHEDULE_TABLE_INDEX][_EVERY_OTHER_ROW]]
    for tables in team_tables
]
team_rows[2][:9]

[['2/27/2019', 'Lourdes (Ohio)', 'N', '3-4', 'L'],
 ['2/27/2019', 'Lourdes (Ohio)', 'N', '4-8', 'L'],
 ['3/2/2019', 'Cornerstone (Mich.)', 'N', '3-4', 'L'],
 ['3/2/2019', 'Trinity Baptist', 'N', '5-1', 'W'],
 ['3/4/2019', 'Michigan-Dearborn', 'N', '13-1', 'W'],
 ['3/5/2019', 'Rochester (Mich.)', 'N', '24-4', 'W'],
 ['3/6/2019', 'Robert Morris (Ill.)', 'N', '10-9', 'W'],
 ['3/8/2019', 'Bethel (Ind.) *', 'N', '13-6', 'W'],
 ['3/9/2019', 'Bethel (Ind.) *', 'N', '14-2', 'W']]

Now, we put the data into a dataframe.

In [55]:
# One schedule DataFrame per team, pairing each header list with the rows at
# the same position in team_rows.
dfc = [
    pd.DataFrame(data=team_rows[team_idx], columns=column_names)
    for team_idx, column_names in enumerate(headers)
]
dfc[2].head(5)

Unnamed: 0,Date,Opponent,Location,Score,Outcome
0,2/27/2019,Lourdes (Ohio),N,3-4,L
1,2/27/2019,Lourdes (Ohio),N,4-8,L
2,3/2/2019,Cornerstone (Mich.),N,3-4,L
3,3/2/2019,Trinity Baptist,N,5-1,W
4,3/4/2019,Michigan-Dearborn,N,13-1,W


We subset the data to only include opponents with an asterisk which denotes conference games.

In [56]:
# Keep only the games whose opponent name contains a literal "*".
conf_df = [
    schedule.loc[schedule["Opponent"].str.contains("*", regex = False)]
    for schedule in dfc
]
conf_df[2].head(5)

Unnamed: 0,Date,Opponent,Location,Score,Outcome
7,3/8/2019,Bethel (Ind.) *,N,13-6,W
8,3/9/2019,Bethel (Ind.) *,N,14-2,W
9,3/9/2019,Bethel (Ind.) *,N,3-1,W
10,3/14/2019,Taylor (Ind.) *,A,5-15,L
11,3/16/2019,Taylor (Ind.) *,A,2-10,L


The below code copies the dataframe with `.copy()` to avoid errors, splits the "Score" column into two columns, one for the selected team and one for the opponent. Then, the code `str.replace(' \*', '', regex= True)` eliminates the parentheses and the number between them for extra-inning games.

In [58]:
# Shallow copy of the list; every entry is replaced by a tidied frame below.
tidy_conf = conf_df.copy()
for i, df in enumerate(conf_df):
  # Strip a parenthesised suffix (and any whitespace before it) from the score
  # text, then split "team-opponent" on the dash. regex=True is explicit because
  # pandas >= 2.0 treats the pattern as a literal string by default, which would
  # leave the suffix in place and break the numeric conversion below.
  split_scores = df['Score'].str.replace(r"\s*\(.*\)", "", regex=True).str.split('-', expand = True)
  # The raw string r' \*' avoids the invalid "\*" escape of a plain string
  # literal; the pattern removes the " *" marker from the opponent name.
  tidy_conf[i] = df.assign(Score = pd.to_numeric(split_scores[0]),
                           Opp_score = pd.to_numeric(split_scores[1]),
                           Opponent = df.Opponent.str.replace(r' \*', '', regex= True),
                           Date = pd.to_datetime(df.Date)
                           )
tidy_conf[2][:5]

Unnamed: 0,Date,Opponent,Location,Score,Outcome,Opp_score
7,2019-03-08,Bethel (Ind.),N,13,W,6
8,2019-03-09,Bethel (Ind.),N,14,W,2
9,2019-03-09,Bethel (Ind.),N,3,W,1
10,2019-03-14,Taylor (Ind.),A,5,L,15
11,2019-03-16,Taylor (Ind.),A,2,L,10


Finally, we subset the home games in one dataframe and the away games in another, and then use the sum of the two score columns in each to calculate our park factor for each team.

In [59]:
def _games_at(games, location_code):
    """Rows of ``games`` whose Location contains ``location_code`` literally."""
    return games[games.Location.str.contains(location_code, regex = False)]

conf_h = [_games_at(games, "H") for games in tidy_conf]
conf_a = [_games_at(games, "A") for games in tidy_conf]

In [61]:
def _runs_per_game(games):
    """Combined runs (Score plus Opp_score) divided by the number of rows."""
    combined_runs = games.Score.sum() + games.Opp_score.sum()
    return combined_runs / len(games.index)

h_runs_per_game = [_runs_per_game(games) for games in conf_h]
a_runs_per_game = [_runs_per_game(games) for games in conf_a]
# Park factor = home runs-per-game divided by away runs-per-game, per team.
park_factor = [
    h_runs_per_game[team_idx] / a_runs_per_game[team_idx]
    for team_idx in range(len(headers))
]
park_factor

[0.963768115942029,
 1.1224268689057422,
 0.8870967741935484,
 0.7320261437908496,
 1.0357675111773472,
 0.7938931297709924,
 0.8571428571428571,
 1.15,
 1.3483365949119375,
 1.3363844393592679]