In [1]:
'''
Paris Dinh
Data Mining
Baseball Slappers
11/10/19

Goal: Can we set a min_support and min_confidence to determine who the best players are, both individually and as a team?

Special packages - downloadable through pip:
pybaseball
efficient_apriori
'''
# Init: Get statcast data for particular year -- for testing
from pybaseball import bwar_bat
from pybaseball import batting_stats
from efficient_apriori import apriori
import numpy as np
import pandas as pd
import matplotlib as plt

# Batting table for the 2017 season, fetched through pybaseball's
# `batting_stats`. Its shape is printed in the next cell.
BSC = batting_stats(2017)
# WAR table fetched through pybaseball's `bwar_bat`. It carries a `year_ID`
# column that is filtered down to 2017 further below, so it presumably covers
# many seasons -- confirm against the pybaseball documentation.
WAR = bwar_bat()

In [2]:
# Report the dimensions of both raw tables before any cleaning.
print('Prior Shape:')
for label, frame in (('BSC shape: ', BSC), ('WAR shape: ', WAR)):
    print(label, frame.shape)

Prior Shape:
BSC shape:  (957, 287)
WAR shape:  (107049, 17)


In [3]:
# Keep only the 2017 rows (the same season requested for the BSC table) and
# discard every row with a missing value in ANY column. Note that this runs
# before the column selection below, so a gap in a column that is later
# dropped still removes the row.
WAR_2017 = WAR.loc[WAR.year_ID == 2017].dropna()

# Keep only the attributes used downstream. Selecting by name rather than by
# position ([0,1,2,4,7,10,11,14,16]) keeps the selection correct if the
# upstream column order changes, and fails loudly (KeyError) if a column
# disappears instead of silently picking a different one.
WAR_KEEP_COLS = ['name_common', 'mlb_ID', 'player_ID', 'team_ID', 'pitcher',
                 'salary', 'runs_above_avg', 'WAR_rep', 'WAR']

# reset_index() without drop=True keeps the original row label as an
# 'index' column, exactly as before.
WAR_clean = WAR_2017.loc[:, WAR_KEEP_COLS].reset_index()
print('Clean WAR shape: ',WAR_clean.shape)

Clean WAR shape:  (794, 10)


In [4]:
def mapVal(x,s):
    '''
    HELPER FUNCTION
    Bucket a value as Very Low / Low / High / Very High [VL,L,H,VH] relative
    to the quartiles of the series it comes from.

    Args:
      x : value to be mapped
      s : series the value originates from; its `name` is appended to the label

    Returns:
      'VH <name>' when x is above the 3rd quartile,
      'H <name>'  when x is above the median, up to and including the 3rd quartile,
      'L <name>'  when x is above the 1st quartile, up to and including the median,
      'VL <name>' when x is at or below the 1st quartile,
      None        when none of the comparisons hold (e.g. x is NaN).
    '''
    _, first_q, median, third_q, _ = np.quantile(s, [0,.25,.50,.75,1])
    suffix = s.name

    # Walk the thresholds from the top down; the first one x exceeds decides
    # the bucket, so each later test implicitly means "not above the previous".
    if x > third_q:
        return 'VH ' + suffix
    if x > median:
        return 'H ' + suffix
    if x > first_q:
        return 'L ' + suffix
    # Kept as an explicit test (not a bare fall-through) so that values that
    # compare False against everything still yield None.
    if x <= first_q:
        return 'VL ' + suffix

In [5]:
# Add a categorical (VL/L/H/VH) companion column for each numeric attribute.
# Each pair is (source column, new label column).
for src_col, label_col in (('WAR', 'WAR_c'),
                           ('salary', 'salary_c'),
                           ('runs_above_avg', 'RRA_c')):
    # Every value is bucketed against the quartiles of its own column.
    m = WAR_clean.loc[:, src_col].apply(lambda v: mapVal(v, WAR_clean[src_col]))
    WAR_clean[label_col] = m

WAR_clean.head()

Unnamed: 0,index,name_common,mlb_ID,player_ID,team_ID,pitcher,salary,runs_above_avg,WAR_rep,WAR,WAR_c,salary_c,RRA_c
0,63,Fernando Abad,472551.0,abadfe01,BOS,Y,2000000.0,0.0,0.0,0.0,L WAR,H salary,L runs_above_avg
1,236,Jose Abreu,547989.0,abreujo02,CHW,N,10825000.0,24.6,2.3,4.81,VH WAR,VH salary,VH runs_above_avg
2,360,Cristhian Adames,542436.0,adamecr01,COL,N,540000.0,-4.4,0.05,-0.41,VL WAR,VL salary,VL runs_above_avg
3,467,Matt Adams,571431.0,adamsma01,ATL,N,2800000.0,-2.9,0.98,0.64,H WAR,H salary,VL runs_above_avg
4,705,Jesus Aguilar,542583.0,aguilje01,MIL,N,536000.0,3.3,0.96,1.24,VH WAR,VL salary,VH runs_above_avg


In [6]:
# Players grouped by WAR category
# (labels produced by mapVal include the attribute name, e.g. 'VH WAR')
# WAR_VH = WAR_clean.loc[WAR_clean.WAR_c == 'VH WAR']
# WAR_H = WAR_clean.loc[WAR_clean.WAR_c == 'H WAR']
# WAR_L = WAR_clean.loc[WAR_clean.WAR_c == 'L WAR']
# WAR_VL = WAR_clean.loc[WAR_clean.WAR_c == 'VL WAR']

In [7]:
# Clean BSC data and map every stat column onto the VL/L/H/VH scale.

# 1) Drop every column that contains a missing value, then any row that still
#    has one (none can remain after the column drop; kept as a safeguard),
#    renumber the rows and discard the attributes that are not analysed.
#    Built as one chain, without inplace=True, so re-running the cell always
#    starts from the untouched raw BSC frame.
BSC_clean = (
    BSC.dropna(axis=1)
       .dropna()
       .reset_index(drop=True)
       .drop(['Season','Dol','Age Rng'], axis=1)
)

# 2) The first three columns are carried over unchanged; every remaining
#    column is replaced by its quartile label.
#    .copy() makes BSC_categ an independent frame: assigning new columns to a
#    bare iloc slice triggers SettingWithCopyWarning and relies on
#    view-vs-copy behaviour.
BSC_categ = BSC_clean.iloc[:, :3].copy()
for col in BSC_clean.columns[3:]:
    # Look the column up once per column rather than once per row.
    stat = BSC_clean.loc[:, col]
    BSC_categ[col] = stat.apply(lambda y: mapVal(y, stat))
print('Clean BSC shape: ',BSC_clean.shape)

Clean BSC shape:  (957, 77)


In [8]:
# TO DO:
# Perform Apriori on BSC_categ to identify frequent itemsets and association rules.

# Build one transaction ("basket") per player: the set of that player's
# labels taken from every column after the first three.
# An existing 'basket' column is excluded first so the cell can be re-run:
# previously a second run swept the old 'basket' column (whose values are
# unhashable sets) into the row and set() raised TypeError.
label_frame = BSC_categ.iloc[:, 3:].drop(columns='basket', errors='ignore')
# set(row) alone is sufficient; the former set(x).union(x) added nothing.
BSC_categ['basket'] = label_frame.apply(lambda row: set(row), axis=1)
BSC_categ.head()

Unnamed: 0,Name,Team,Age,G,AB,PA,H,1B,2B,3B,...,SwStr%,BsR,Def,wSB,Off,Lg,TTO%,Swing% (pi),Zone% (pi),basket
0,Aaron Judge,Yankees,25.0,VH G,VH AB,VH PA,VH H,VH 1B,VH 2B,VH 3B,...,H SwStr%,L BsR,VL Def,VL wSB,VH Off,VH Lg,VH TTO%,VL Swing% (pi),VL Zone% (pi),"{VH +WPA, VL BUH%, H IFH, VH BB, H pLI, VH SLG..."
1,Jose Altuve,Astros,27.0,VH G,VH AB,VH PA,VH H,VH 1B,VH 2B,VH 3B,...,VL SwStr%,VH BsR,H Def,VH wSB,VH Off,VH Lg,VL TTO%,H Swing% (pi),L Zone% (pi),"{VH +WPA, H SH, VH BU, VH BB, VH SLG, VH WPA/L..."
2,Giancarlo Stanton,Marlins,27.0,VH G,VH AB,VH PA,VH H,VH 1B,VH 2B,VL 3B,...,H SwStr%,VL BsR,H Def,VL wSB,VH Off,VH Lg,H TTO%,L Swing% (pi),VL Zone% (pi),"{VH +WPA, VL BUH%, VH BB, H SB, H pLI, VH SLG,..."
3,Mike Trout,Angels,25.0,VH G,VH AB,VH PA,VH H,VH 1B,VH 2B,VH 3B,...,VL SwStr%,VH BsR,VL Def,VH wSB,VH Off,VH Lg,H TTO%,VL Swing% (pi),VL Zone% (pi),"{VH +WPA, VL BUH%, VH BB, VH SLG, VH WPA/LI, V..."
4,Kris Bryant,Cubs,25.0,VH G,VH AB,VH PA,VH H,VH 1B,VH 2B,VH 3B,...,L SwStr%,VH BsR,H Def,VL wSB,VH Off,VH Lg,H TTO%,L Swing% (pi),VL Zone% (pi),"{VH +WPA, VL BUH%, VH BB, H pLI, VH SLG, VH WP..."


In [9]:
# Mine frequent itemsets and association rules in a single call.
# `itemsets` is keyed by itemset size; each value maps an itemset tuple to
# its count.
itemsets, rules = apriori(BSC_categ.basket, min_support=.4, min_confidence=.5)

In [16]:
# Frequent 6-itemsets (key 6 = number of items per set), itemset : count
itemsets[6]

{('L wSB', 'VL 3B', 'VL CS', 'VL IBB', 'VL SB', 'VL SF'): 389}

**Prefix Scale**

- VL : Very Low
- L : Low
- H : High
- VH : Very High

**Acronym Translations**

Complete list found here: https://library.fangraphs.com/offense/offensive-statistics-list/


- wSB : Weighted Stolen Base
- SB : Stolen Base
- 3B : Triple
- BUH/BUH% : Bunt Hit (percentage)
- CS : Caught Stealing
- IBB : Intentional Walk
- HBP : Hit-by-Pitch
- SF : Sacrifice Fly
- HR : Home Run
- HR/FB : Home Run/Fly Ball

**Itemset Interpretation**

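Each frequent itemset is a combination of attribute levels (VL to VH) that occur together for many players, i.e. levels of statistics that are common on a per-player basis. These shared attributes may indicate which statistics we can or cannot use to distinguish good players from bad ones. Most, if not all, of the frequent items are at the VL or L level. This may partly be a result of the chosen min_supp and min_conf, but it is probably also an artifact of the quartile mapping: when many players share the lowest value of a statistic (for example, zero stolen bases), every one of them is labelled VL, so the VL level can cover far more than a quarter of the players. Other insights pending...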


In [11]:
# Second mining run with a lower support threshold (0.3 instead of 0.4).
items,ruls = apriori(BSC_categ.basket, min_support=.3, min_confidence=.5)

In [12]:
# frequent k=8 sets from the min_supp=.3 run, printed as {itemset tuple: count}
print(items[8])

{('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HBP', 'VL IBB'): 294, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HBP', 'VL SB'): 299, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HBP', 'VL SF'): 290, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HR', 'VL HR/FB'): 291, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HR', 'VL IBB'): 289, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HR', 'VL SB'): 291, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HR/FB', 'VL IBB'): 289, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL HR/FB', 'VL SB'): 291, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL IBB', 'VL SB'): 310, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL IBB', 'VL SF'): 300, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL CS', 'VL SB', 'VL SF'): 306, ('L BsR', 'L wSB', 'VL 3B', 'VL BUH', 'VL BUH%', 'VL HBP', 'VL IBB', 'VL SB'): 2

In [13]:
# Show the last 10 association rules from the min_supp=.3 run. A negative
# slice replaces the hard-coded rule count (1022164), which was only valid
# for this exact data set and thresholds.
ruls[-10:]

[{VL HBP, VL IBB} -> {VL HR, VL HR/FB, VL IFH, VL IFH%, VL SB, VL SF},
 {VL HBP, VL HR/FB} -> {VL HR, VL IBB, VL IFH, VL IFH%, VL SB, VL SF},
 {VL HBP, VL HR} -> {VL HR/FB, VL IBB, VL IFH, VL IFH%, VL SB, VL SF},
 {VL SF} -> {VL HBP, VL HR, VL HR/FB, VL IBB, VL IFH, VL IFH%, VL SB},
 {VL SB} -> {VL HBP, VL HR, VL HR/FB, VL IBB, VL IFH, VL IFH%, VL SF},
 {VL IFH%} -> {VL HBP, VL HR, VL HR/FB, VL IBB, VL IFH, VL SB, VL SF},
 {VL IFH} -> {VL HBP, VL HR, VL HR/FB, VL IBB, VL IFH%, VL SB, VL SF},
 {VL HR/FB} -> {VL HBP, VL HR, VL IBB, VL IFH, VL IFH%, VL SB, VL SF},
 {VL HR} -> {VL HBP, VL HR/FB, VL IBB, VL IFH, VL IFH%, VL SB, VL SF},
 {VL HBP} -> {VL HR, VL HR/FB, VL IBB, VL IFH, VL IFH%, VL SB, VL SF}]