In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from streak_counter import hit_checker
import random
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

d = pd.read_csv('../retrosheet_data/2005-2018_games.csv')
d.rename(columns={'unknown':'double_header_flag'}, inplace=True)
d['ab_flag'] = d.ab_flag.map({'F':0,'T':1})
d.sh_flag = d.sh_flag.map({'F':0,'T':1})
d.sf_flag = d.sf_flag.map({'F':0,'T':1})

## Calculating Batting Average and H/PA

H / AB

Simplify it like Ralph said: just pick a year then go with it

<br>Way Algorithm works: it picks all the events for a PA (at-bat, walk, intentional walk, hit by pitch, interference and hits)

<br>It appends everything to 1 main data frame based on the chosen date

<br> So far, it returns H/PA, H, PA, AB, Gm_played (games played), and BA


In [2]:
def calculate_best_batter(df, dates):
    
    #find me all the Plate Apperances as described by MLB rules
    AB_only = df[df.ab_flag == 1]
    event_flags = [14,15,16,17] #14 is BB 15 is IBB 16 is HBP 17 is interference
    BB_IBB_HBP_INT = df[df.event_type.isin(event_flags)]
    sac_hits = df[df.sh_flag ==1]
    sac_fly  = df[df.sf_flag ==1]
    new_df = AB_only.append(BB_IBB_HBP_INT)
    new_df = new_df.append(sac_hits)
    new_df = new_df.append(sac_fly)

    month = dates[0]
    day   = dates[1]
    
    #filter out the data you don't need, IE only pick the previous months and day beforehand
    if month <= 3:
        new_df = new_df[new_df.month <= month]
        new_df = new_df[new_df.day_ <= day]
    else:
        prev_months_df   = new_df[new_df.month < month]
        current_month_df = new_df[new_df.month == month]
        current_month_days_df = current_month_df[current_month_df.day_ < day]
        new_df = prev_months_df.append(current_month_days_df) 


    hit_flags = [20,21,22,23]
    new_df.loc[new_df.event_type.isin(hit_flags),'hit_flag'] = 1
    new_df.loc[new_df.event_type.isin(hit_flags)==False,'hit_flag'] = 0

    batters_records = new_df.groupby(['res_batter'])
    batting_summary = batters_records.agg({'hit_flag':[np.mean,np.sum, np.size],'ab_flag':[np.sum],'game_id':pd.Series.nunique})

    batting_hits_pa = batting_summary.hit_flag
    batting_hits_pa.rename(columns={'mean':'H/PA','sum':'H','size':'PA'}, inplace=True)

    batting_ab = batting_summary.ab_flag
    batting_ab.rename(columns={'sum':'AB'}, inplace=True)

    batting_games_played = batting_summary.game_id
    batting_games_played.rename(columns={'nunique':'Gm_played'}, inplace=True)

    batting_summary = pd.concat([batting_hits_pa,batting_ab,batting_games_played], axis =1)
    batting_summary.loc[:,'BA'] = batting_summary.loc[:,'H'] / batting_summary.loc[:,'AB']
    batting_summary['H/Gm_played'] = batting_summary['H'] / batting_summary['Gm_played']
    
    batting_summary.sort_values(by=['Gm_played'], inplace=True)
    gm_filter = 0
    gm_filter = batting_summary.Gm_played.max() * 0.7
    #print(gm_filter)
    batting_summary = batting_summary[batting_summary.Gm_played >= gm_filter]   #filter for players who play mostly every day
    #FILTER CRITERIA. TRYING TO MAKE IT BASED on something like # games player started, probably  / #games his team played
    return batting_summary
    

In [3]:
df_test = d.copy()
df_test = df_test.query('year == 2016')

In [4]:
groups = df_test.groupby(['month','day_'])
game_dates = groups.nunique().index.tolist()

## Strategy 1:

Pick the batter with the highest batting average, start from 16th game of season and on-wards
<br><br> If the batter didn't play that day, it will try with the second best batting average player etc until it finds someone who has played

<br> The batter with the highest batting average doesn't play on a specific day for a few reasons.<br>A) His team didn't play at all <br>B) He got a rest day from his coach <br>C) He might have had 1 AB as a sub so this will need to be revised in a future strategy <br> D) He might have only gotten walks or intentional walks in a game but that is EXTREMELY rare so I will ignore this case. In fact, this may only happen once per season for all the batters in the league. This is equivalent to starting and finishing a game but not registering an AB because you were walked or you only hit sacrifice flies/bunts the whole game.

In [5]:
i = 15
hit_counter = 0
total_counter = 0

while i < len(game_dates) -1 :
    batter_rank = 0
    df_test2 =calculate_best_batter(df_test,game_dates[i])
    batter = df_test2.sort_values(by=['BA'], ascending=False).iloc[batter_rank].name.split()
    daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)
    i = i + 1
    
    while (daily_hit_result == 'continue_streak'):
        batter_rank = batter_rank + 1 
        batter = df_test2.sort_values(by=['BA'], ascending=False).iloc[batter_rank].name.split()
        daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)

    if daily_hit_result =='lose_streak':
        pass
    else:
        hit_counter = hit_counter + 1
        
    total_counter = total_counter + 1
    
print('Complete\n')
print('Hit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

Complete

Hit counter 118  total_games_played 163  
Ratio: 0.7239263803680982


## Strategy 2:

Pick the batter with the highest H/PA

In [6]:
i = 15
hit_counter = 0
total_counter = 0

while i < len(game_dates) -1 :
    batter_rank = 0
    df_test2 =calculate_best_batter(df_test,game_dates[i])
    batter = df_test2.sort_values(by=['H/PA'], ascending=False).iloc[batter_rank].name.split()
    daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)
    i = i + 1
    
    while (daily_hit_result == 'continue_streak'):
        batter_rank = batter_rank + 1 
        batter = df_test2.sort_values(by=['H/PA'], ascending=False).iloc[batter_rank].name.split()
        daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)

    if daily_hit_result =='lose_streak':
        pass
    else:
        hit_counter = hit_counter + 1
        
    total_counter = total_counter + 1
    
print('Complete\n')
print('Hit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

Complete

Hit counter 115  total_games_played 163  
Ratio: 0.7055214723926381


## Strategy 3:

Pick the batter with the highest H/Gm_played

In [7]:
i = 15
hit_counter = 0
total_counter = 0

while i < len(game_dates) -1 :
    batter_rank = 0
    df_test2 =calculate_best_batter(df_test,game_dates[i])
    batter = df_test2.sort_values(by=['H/Gm_played'], ascending=False).iloc[batter_rank].name.split()
    daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)
    i = i + 1
    
    while (daily_hit_result == 'continue_streak'):
        batter_rank = batter_rank + 1 
        batter = df_test2.sort_values(by=['H/Gm_played'], ascending=False).iloc[batter_rank].name.split()
        daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)

    if daily_hit_result =='lose_streak':
        pass
    else:
        hit_counter = hit_counter + 1
        
    total_counter = total_counter + 1
    
print('Complete\n')
print('Hit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

Complete

Hit counter 117  total_games_played 163  
Ratio: 0.7177914110429447


## Strategy 4:

Pick the batter with the highest at least 1 hit per game ratio this season

## Strategy 5:

Pick the batter with the highest number of hits

In [8]:
i = 15
hit_counter = 0
total_counter = 0

while i < len(game_dates) -1 :
    batter_rank = 0
    df_test2 =calculate_best_batter(df_test,game_dates[i])
    batter = df_test2.sort_values(by=['H'], ascending=False).iloc[batter_rank].name.split()
    daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)
    i = i + 1
    
    while (daily_hit_result == 'continue_streak'):
        batter_rank = batter_rank + 1 
        batter = df_test2.sort_values(by=['H'], ascending=False).iloc[batter_rank].name.split()
        daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)

    if daily_hit_result =='lose_streak':
        pass
    else:
        hit_counter = hit_counter + 1
        
    total_counter = total_counter + 1
    
print('Complete\n')
print('Hit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

Complete

Hit counter 118  total_games_played 163  
Ratio: 0.7239263803680982


## Strategy 1. run from 2009 to 2018

In [12]:
years = np.arange(2009, 2019, 1)

for year2 in tqdm(years):
    df_test = d.copy()
    df_test = df_test[df_test.year== year2]

    groups = df_test.groupby(['month','day_'])
    game_dates = groups.nunique().index.tolist()

    i = 15
    hit_counter = 0
    total_counter = 0

    while i < len(game_dates) -1 :
        batter_rank = 0
        df_test2 =calculate_best_batter(df_test,game_dates[i])
        batter = df_test2.sort_values(by=['BA'], ascending=False).iloc[batter_rank].name.split()
        daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)
        i = i + 1

        while (daily_hit_result == 'continue_streak'):
            batter_rank = batter_rank + 1 
            batter = df_test2.sort_values(by=['BA'], ascending=False).iloc[batter_rank].name.split()
            daily_hit_result = hit_checker(groups.get_group(game_dates[i]), batter)

        if daily_hit_result =='lose_streak':
            pass
        else:
            hit_counter = hit_counter + 1

        total_counter = total_counter + 1

    print('Complete\n')
    print('Year:', year2)
    print('\nHit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Complete

Year: 2009

Hit counter 124  total_games_played 165  
Ratio: 0.7515151515151515
Complete

Year: 2010

Hit counter 113  total_games_played 164  
Ratio: 0.6890243902439024
Complete

Year: 2011

Hit counter 112  total_games_played 163  
Ratio: 0.6871165644171779
Complete

Year: 2012

Hit counter 115  total_games_played 165  
Ratio: 0.696969696969697
Complete

Year: 2013

Hit counter 121  total_games_played 164  
Ratio: 0.7378048780487805
Complete

Year: 2014

Hit counter 122  total_games_played 165  
Ratio: 0.7393939393939394
Complete

Year: 2015

Hit counter 112  total_games_played 163  
Ratio: 0.6871165644171779
Complete

Year: 2016

Hit counter 118  total_games_played 163  
Ratio: 0.7239263803680982
Complete

Year: 2017

Hit counter 111  total_games_played 163  
Ratio: 0.6809815950920245
Complete

Year: 2018

Hit counter 122  total_games_played 168  
Ratio: 0.7261904761904762


In [None]:
def team_game_table(df2):
    team_games_away = df2.groupby(['away_team'])
    team_games_home = df2.groupby(['home_team'])
    away_games = team_games_away.agg({'game_id':pd.Series.nunique}).reset_index().rename(columns={'game_id':'away_games'})
    home_games = team_games_home.agg({'game_id':pd.Series.nunique}).reset_index().rename(columns={'game_id':'home_games'})
    team_games = pd.merge(away_games, home_games, left_on=['away_team'], right_on=['home_team'],how='inner')
    team_games.drop('home_team',inplace=True, axis=1)
    team_games['total'] = team_games.away_games + team_games.home_games
    return team_games