In [1]:
'''
Paris Dinh
Data Mining
Baseball Slappers
11/10/19

Goal: Can we set a min_support and min_confidence to determine who the best players are, both individually and as a team?
'''
# Init: load the data used for testing -- batting stats for a single season
# (via batting_stats) and the batting WAR table (via bwar_bat).
from pybaseball import bwar_bat
from pybaseball import batting_stats
import numpy as np
import pandas as pd
# `plt` must refer to the pyplot interface: the top-level `matplotlib` package
# does not provide the plotting functions (plot, hist, show, ...) that the
# `plt` alias is conventionally used for.
import matplotlib.pyplot as plt

# Season-level batting statistics for 2017.
BSC = batting_stats(2017)
# WAR table; it is filtered down to a single season in a later cell.
WAR = bwar_bat()

In [2]:
# Report the dimensions of both raw tables before any cleaning is applied.
print('Prior Shape:')
for label, frame in (('BSC shape: ', BSC), ('WAR shape: ', WAR)):
    print(label, frame.shape)

Prior Shape:
BSC shape:  (957, 287)
WAR shape:  (107049, 17)


In [3]:
# Keep only the 2017 rows of the WAR table and discard any row with a missing value.
WAR_2017 = WAR[WAR['year_ID'] == 2017].dropna()

# Keep a positional subset of the attributes. reset_index() (without drop)
# retains the previous row labels as an extra 'index' column.
kept_positions = [0,1,2,4,7,10,11,14,16]
WAR_clean = WAR_2017.iloc[:, kept_positions].reset_index()
print('Clean WAR shape: ',WAR_clean.shape)

Clean WAR shape:  (794, 10)


In [4]:
def mapVal(x,s):
    '''
    HELPER FUNCTION
    Buckets a value into one of four labels, Very Low to Very High
    [VL,L,H,VH], by comparing it with the 1st quartile, median and
    3rd quartile of a reference series.

    Args:
      x : value to be mapped
      s : series from which the value originates; the cut points are
          computed from it on every call

    Returns:
      'VH' if x is above the 3rd quartile,
      'H'  if x is above the median (up to and including the 3rd quartile),
      'L'  if x is above the 1st quartile (up to and including the median),
      'VL' if x is at or below the 1st quartile,
      None if none of the comparisons hold (e.g. x is NaN).
    '''
    lower, median, upper = np.quantile(s, [.25, .50, .75])

    # Walk the cut points from highest to lowest; the first one exceeded
    # decides the label.
    for cutoff, label in ((upper, 'VH'), (median, 'H'), (lower, 'L')):
        if x > cutoff:
            return label

    # Explicit comparison (rather than a bare default) so that values which
    # cannot be ordered, such as NaN, still yield None.
    return 'VL' if x <= lower else None

In [5]:
# Attach a quartile-based label column for each metric of interest.
# Keys are the numeric source columns; values are the names of the new
# label columns (WAR, salary, runs above average).
category_columns = {
    'WAR': 'WAR_c',
    'salary': 'salary_c',
    'runs_above_avg': 'RRA_c',
}
for source, target in category_columns.items():
    reference = WAR_clean[source]
    # Each value is labelled relative to the distribution of its own column.
    WAR_clean[target] = reference.apply(lambda value: mapVal(value, reference))

WAR_clean.head()

Unnamed: 0,index,name_common,mlb_ID,player_ID,team_ID,pitcher,salary,runs_above_avg,WAR_rep,WAR,WAR_c,salary_c,RRA_c
0,63,Fernando Abad,472551.0,abadfe01,BOS,Y,2000000.0,0.0,0.0,0.0,L,H,L
1,236,Jose Abreu,547989.0,abreujo02,CHW,N,10825000.0,24.6,2.3,4.81,VH,VH,VH
2,360,Cristhian Adames,542436.0,adamecr01,COL,N,540000.0,-4.4,0.05,-0.41,VL,VL,VL
3,467,Matt Adams,571431.0,adamsma01,ATL,N,2800000.0,-2.9,0.98,0.64,H,H,VL
4,705,Jesus Aguilar,542583.0,aguilje01,MIL,N,536000.0,3.3,0.96,1.24,VH,VL,VH


In [6]:
# Players grouped by WAR category
WAR_VH = WAR_clean.loc[WAR_clean.WAR_c == 'VH'] 
WAR_H = WAR_clean.loc[WAR_clean.WAR_c == 'H'] 
WAR_L = WAR_clean.loc[WAR_clean.WAR_c == 'L'] 
WAR_VL = WAR_clean.loc[WAR_clean.WAR_c == 'VL'] 

In [7]:
'''
Clean BSC data and map to scale
'''
# Drop every column containing a missing value, renumber the rows, and remove
# the columns that are excluded from the categorisation. Written as a single
# chain so no frame is mutated in place.
# The row-wise dropna() is kept from the original flow; after the column-wise
# drop no missing values remain, so it does not remove anything.
BSC_clean = (
    BSC.dropna(axis=1)
       .dropna()
       .reset_index(drop=True)
       .drop(columns=['Season','Dol','Age Rng'])
)

# The first three columns are carried over unchanged; every remaining column
# is replaced by the VL/L/H/VH label of each value relative to that column.
id_columns = BSC_clean.iloc[:, :3]
mapped_columns = {
    col: BSC_clean[col].apply(lambda y, ref=BSC_clean[col]: mapVal(y, ref))
    for col in BSC_clean.columns[3:]
}

# Build the result with one concat instead of assigning columns one by one to
# an iloc slice: that pattern raises SettingWithCopyWarning and fragments the
# frame when many columns are inserted.
BSC_categ = pd.concat(
    [id_columns, pd.DataFrame(mapped_columns, index=BSC_clean.index)],
    axis=1,
)

In [8]:
# Display the categorised batting table; the rendered output elides the
# middle rows and columns with '...'.
BSC_categ

Unnamed: 0,Name,Team,Age,G,AB,PA,H,1B,2B,3B,...,Swing%,SwStr%,BsR,Def,wSB,Off,Lg,TTO%,Swing% (pi),Zone% (pi)
0,Aaron Judge,Yankees,25.0,VH,VH,VH,VH,VH,VH,VH,...,VL,H,L,VL,VL,VH,VH,VH,VL,VL
1,Jose Altuve,Astros,27.0,VH,VH,VH,VH,VH,VH,VH,...,H,VL,VH,H,VH,VH,VH,VL,H,L
2,Giancarlo Stanton,Marlins,27.0,VH,VH,VH,VH,VH,VH,VL,...,L,H,VL,H,VL,VH,VH,H,L,VL
3,Mike Trout,Angels,25.0,VH,VH,VH,VH,VH,VH,VH,...,VL,VL,VH,VL,VH,VH,VH,H,VL,VL
4,Kris Bryant,Cubs,25.0,VH,VH,VH,VH,VH,VH,VH,...,L,L,VH,H,VL,VH,VH,H,L,VL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,Elias Diaz,Pirates,26.0,H,H,H,H,H,H,VL,...,VH,L,VL,VL,VH,VL,H,VL,H,H
953,Mark Trumbo,Orioles,31.0,VH,VH,VH,VH,VH,VH,VL,...,H,H,VL,VL,VL,VL,VH,H,H,VL
954,Luis Torrens,Padres,21.0,H,H,H,H,H,H,H,...,H,L,H,VL,VL,VL,H,L,H,H
955,Rougned Odor,Rangers,23.0,VH,VH,VH,VH,VH,VH,VH,...,VH,H,VH,VL,VH,VL,VH,L,VH,VL
