# Notebook for first 5 strategies

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')
from streak_counter import hit_checker_logs

import random
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings("ignore")
import time

d = pd.read_csv('../retrosheet_data/2005-2016_game_logs.csv')
game_info = pd.read_csv('../retrosheet_data/2005-2016_game_info.csv')
d['double_header_flag']   = d.game_id.apply(lambda x: x[11:]).astype(int)


In [2]:
years = np.arange(2005,2016+1,1)
n_years = len(years)
seasonal_perf_df = pd.DataFrame()
results = pd.DataFrame()
strategies = {'strategy_1':'BA','strategy_2':'H/PA_seasonal','strategy_3':'H_cum_season','strategy_4':'H/seasonal_game_played','strategy_5':'hit_1_0_SRatio'}

d.loc[d['H'] >= 1, 'hit_1_0'] = 1
d.loc[d['H'] == 0, 'hit_1_0'] = 0

d.sort_values(by=['res_batter','year','month','day','game_id'],inplace=True)

d['hit_1_0_SRatio'] = d.groupby(['res_batter','year'])['hit_1_0'].transform(pd.Series.cumsum) / d.seasonal_game_played

In [3]:
d.sample(3)[['game_id','year','month','day','res_batter','BA','H/PA_seasonal','H_cum_season','seasonal_game_played','H/seasonal_game_played','hit_1_0_SRatio']]

Unnamed: 0,game_id,year,month,day,res_batter,BA,H/PA_seasonal,H_cum_season,seasonal_game_played,H/seasonal_game_played,hit_1_0_SRatio
176989,CHN200609030,2006,9,3,finls001,0.25,0.22093,95.0,112,0.848214,0.571429
469929,ANA201407070,2014,7,7,reimn001,0.0,0.0,0.0,1,0.0,0.0
466703,TOR201304210,2013,4,21,rasmc001,0.25,0.225806,14.0,17,0.823529,0.647059


# Strategy 1-5 given n years

<br>Strategy 1: pick the season leader in terms of BA (batting average) batter must have played at least 1 game past 5 days
<br>Strategy 2: pick the season leader in trms of H/PA (hits per plate apperances) batter must have played at leats 1 game past 5 days
<br>Strategy 3: pick the season leader in terms of Hits, batter must have played at least 1 game past 5 days
<br>Strategy 4: pick the season leader in terms of Hits per game played this season, batter must have played at least 1 game past 5 days
<br>Strategy 5: pick the batter with the highest ratio of 'at least 1 hit games' this season.

## Run the cell below

<br>It will run every strategy from 2005-2016 (takes around 10 mins to process)
<br>The outputs are as follows:<br>
<br>1. seasonal_perf_df (seasonal performance of the strategy). Rows: strategies: Columns: year +  number of hits (success), year + number of attemps. 
<br>2. results : this is a data frame containing the aggregated results. rows: strategies. columns: number of years, total number of hits, total number of attempts, percentage (also known as success rate or 'P'), CI - confidence interval

In [4]:
start_time = time.time()

for strategy in strategies:
    hit_counter = 0
    total_counter = 0

    for year_ in years:
        df=d.copy()
        df = df[df['year']==year_]
        game_info_yearly = game_info.copy()
        game_info_yearly = game_info_yearly[game_info_yearly['year']==year_]
        
        seasonal_hit_counter = 0
        seasonal_tot_counter = 0
        i = 30
        
        daily_games = df.groupby(['month','day'])
        daily_game_info = game_info_yearly.groupby(['month','day'])
        
        game_dates = list(daily_games.groups.keys())

        while i < len(game_dates) -1 :
            batter_rank = 1

            daily_df = daily_games.get_group(game_dates[i])

            prev_day_1 = daily_games.get_group(game_dates[i-1])
            prev_day_2 = daily_games.get_group(game_dates[i-2])
            prev_day_3 = daily_games.get_group(game_dates[i-3])
            prev_day_4 = daily_games.get_group(game_dates[i-4])
            prev_day_5 = daily_games.get_group(game_dates[i-5])
            prev_days  = pd.concat([prev_day_1,prev_day_2,prev_day_3,prev_day_4,prev_day_5]).sort_values(by=['res_batter','month','day'])
            prev_days.drop_duplicates(subset=['res_batter'],keep='last',inplace=True)
            
            max_games_played = prev_days.seasonal_game_played.max()
            leaders = prev_days[prev_days.seasonal_game_played >= 0.7 * max_games_played]
            leaders.sort_values(by=[strategies.get(strategy)], ascending=False, inplace=True)

            batter    = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_df_info = daily_game_info.get_group(game_dates[i])
            starting_lineup = set(np.unique(daily_df_info[['visitor_batter_1','visitor_batter_2','visitor_batter_3','visitor_batter_4','visitor_batter_5','visitor_batter_6','visitor_batter_7','visitor_batter_8','visitor_batter_9','home_batter_1','home_batter_2','home_batter_3','home_batter_4','home_batter_5','home_batter_6','home_batter_7','home_batter_8','home_batter_9']]))
            
            available_batters = set(starting_lineup) - set(np.unique(daily_df_info[['home_starting_pitcher','vis._starting_pitcher']]))
            
            while batter[0] not in available_batters:
                batter_rank = batter_rank +1
                batter = leaders.iloc[batter_rank - 1].res_batter.split()
         
            daily_hit_result = hit_checker_logs(daily_df, batter)

            i=i+1
            if daily_hit_result =='continue_streak':
                pass
            elif daily_hit_result =='lose_streak':
                pass
            else:
                hit_counter = hit_counter + 1
                seasonal_hit_counter = seasonal_hit_counter +1

            total_counter = total_counter + 1
            seasonal_tot_counter = seasonal_tot_counter + 1

        print('Complete: ',strategy, ' Year:', year_)
                
        column_hit   = str(year_) +'_hit_counter'
        column_total = str(year_) +'_hit_total'
        percentage = str(year_) +'_percentage'
        seasonal_perf_df.loc[strategy, column_hit] = seasonal_hit_counter
        seasonal_perf_df.loc[strategy, column_total] = seasonal_tot_counter
        seasonal_perf_df.loc[strategy, percentage] = seasonal_hit_counter / seasonal_tot_counter
    
    
    results.loc[strategy, 'n_years'] = n_years
    results.loc[strategy, 'hit_counter'] = hit_counter #theres already a counter
    results.loc[strategy, 'total_counter'] = total_counter # theres already a counter
    results.loc[strategy, 'percentage'] = hit_counter / total_counter  
    results.loc[strategy, 'CI'] = 95

end_time = time.time()    
print("COMPLETE : took", round((end_time - start_time)/60),'minutes and', round((end_time - start_time),1) % 60,'seconds')

Complete:  strategy_1  Year: 2005
Complete:  strategy_1  Year: 2006
Complete:  strategy_1  Year: 2007
Complete:  strategy_1  Year: 2008
Complete:  strategy_1  Year: 2009
Complete:  strategy_1  Year: 2010
Complete:  strategy_1  Year: 2011
Complete:  strategy_1  Year: 2012
Complete:  strategy_1  Year: 2013
Complete:  strategy_1  Year: 2014
Complete:  strategy_1  Year: 2015
Complete:  strategy_1  Year: 2016
Complete:  strategy_2  Year: 2005
Complete:  strategy_2  Year: 2006
Complete:  strategy_2  Year: 2007
Complete:  strategy_2  Year: 2008
Complete:  strategy_2  Year: 2009
Complete:  strategy_2  Year: 2010
Complete:  strategy_2  Year: 2011
Complete:  strategy_2  Year: 2012
Complete:  strategy_2  Year: 2013
Complete:  strategy_2  Year: 2014
Complete:  strategy_2  Year: 2015
Complete:  strategy_2  Year: 2016
Complete:  strategy_3  Year: 2005
Complete:  strategy_3  Year: 2006
Complete:  strategy_3  Year: 2007
Complete:  strategy_3  Year: 2008
Complete:  strategy_3  Year: 2009
Complete:  str

In [5]:
results_copy = results.copy()

In [6]:
results

Unnamed: 0,n_years,hit_counter,total_counter,percentage,CI
strategy_1,12.0,1316.0,1793.0,0.733965,95.0
strategy_2,12.0,1342.0,1793.0,0.748466,95.0
strategy_3,12.0,1372.0,1793.0,0.765198,95.0
strategy_4,12.0,1387.0,1793.0,0.773564,95.0
strategy_5,12.0,1326.0,1793.0,0.739543,95.0


In [14]:
seasonal_perf_df.head()

Unnamed: 0,2005_hit_counter,2005_hit_total,2005_percentage,2006_hit_counter,2006_hit_total,2006_percentage,2007_hit_counter,2007_hit_total,2007_percentage,2008_hit_counter,...,2013_percentage,2014_hit_counter,2014_hit_total,2014_percentage,2015_hit_counter,2015_hit_total,2015_percentage,2016_hit_counter,2016_hit_total,2016_percentage
strategy_1,118.0,149.0,0.791946,109.0,149.0,0.731544,119.0,150.0,0.793333,114.0,...,0.751678,109.0,150.0,0.726667,95.0,148.0,0.641892,114.0,148.0,0.77027
strategy_2,114.0,149.0,0.765101,120.0,149.0,0.805369,115.0,150.0,0.766667,116.0,...,0.677852,110.0,150.0,0.733333,111.0,148.0,0.75,112.0,148.0,0.756757
strategy_3,114.0,149.0,0.765101,111.0,149.0,0.744966,122.0,150.0,0.813333,119.0,...,0.697987,118.0,150.0,0.786667,114.0,148.0,0.77027,111.0,148.0,0.75
strategy_4,123.0,149.0,0.825503,118.0,149.0,0.791946,118.0,150.0,0.786667,119.0,...,0.744966,114.0,150.0,0.76,116.0,148.0,0.783784,108.0,148.0,0.72973
strategy_5,114.0,149.0,0.765101,115.0,149.0,0.771812,114.0,150.0,0.76,119.0,...,0.711409,105.0,150.0,0.7,97.0,148.0,0.655405,107.0,148.0,0.722973


In [12]:
results.to_csv('strat_1_5.csv',index=None)
seasonal_perf_df.to_csv('strat_1_5_seasons.csv',index=None)

In [13]:
import statsmodels.api as sm

p1 = results.loc['strategy_1','hit_counter']
p2 = results.loc['strategy_2','hit_counter']
p3 = results.loc['strategy_3','hit_counter']
p4 = results.loc['strategy_4','hit_counter']
p5 = results.loc['strategy_5','hit_counter']

n1 = results.loc['strategy_1','total_counter']
n2 = results.loc['strategy_2','total_counter']
n3 = results.loc['strategy_3','total_counter']
n4 = results.loc['strategy_4','total_counter']
n5 = results.loc['strategy_5','total_counter']


zscore1, pvalue1 = sm.stats.proportions_ztest([p1, p4], [n1, n4], alternative = 'two-sided', prop_var = False)
zscore2, pvalue2 = sm.stats.proportions_ztest([p2, p4], [n2, n4], alternative = 'two-sided', prop_var = False)
zscore3, pvalue3 = sm.stats.proportions_ztest([p3, p4], [n3, n4], alternative = 'two-sided', prop_var = False)
zscore5, pvalue5 = sm.stats.proportions_ztest([p5, p4], [n5, n4], alternative = 'two-sided', prop_var = False)

(zscore1, pvalue1), (zscore2, pvalue2), (zscore3, pvalue3), (zscore5, pvalue5)
#null: p1 = p4
#alt: p1 != p4

# p2,p3,p4 reject null since p-value <=alpha of 0.05 or 0.01
# p1 vs p5 fail to reject null so basically there is no difference between both of them or p1 = p5

((-2.7520740590022288, 0.0059219126968768406),
 (-1.7620794145759797, 0.07805588080244806),
 (-0.59465872677467035, 0.55207161590946585),
 (-2.3735749990316095, 0.017616814907058916))

In [1]:
''' 
I compared everything to Strategy 4 since it had the highest success rate or 'percentage'
-null hypothesis are the following: p1 = p4, p2 = p4, p3 = p4, p5 = p4   (p1 represents percentage or success rate for strategy 1 etc...)
-alt  hypothesis are the following: p1 != p4, p2 != p4 p3 != p4 p5 != p4 or in other words the success rates are different
using a standard error rate of 5% we see that p1, and p5 all have p-values smaller then 0.05
this means we reject the null hypothesis in favor of the alt hypothesis. in our case this means that p4 is larger
which means that strategy 4 is better then strategy 1 and 5. For strategy 2 and 3, the p-value was higher then 0.05 which means we
fail to reject the null. Essentially strategy 2, 3 and strategy 4 have the same odds and are equally
as good.
'''

" \nI compared everything to Strategy 4 since it had the highest success rate or 'percentage'\n-null hypothesis are the following: p1 = p4, p2 = p4, p3 = p4, p5 = p4   (p1 represents percentage or success rate for strategy 1 etc...)\n-alt  hypothesis are the following: p1 != p4, p2 != p4 p3 != p4 p5 != p4 or in other words the success rates are different\nusing a standard error rate of 5% we see that p1, and p5 all have p-values smaller then 0.05\nthis means we reject the null hypothesis in favor of the alt hypothesis. in our case this means that p4 is larger\nwhich means that strategy 4 is better then strategy 1 and 5. For strategy 2 and 3, the p-value was higher then 0.05 which means we\nfail to reject the null. Essentially strategy 2, 3 and strategy 4 have the same odds and are equally\nas good.\n"

In [16]:
#Even using the Confidence intervall, we see that the z_scores from Strategy 1-2-5 do not lie within the 95% CI boundaries zscore
# in strategy 3, the Zscore was -0.6728 which is within the -1.9599 to 1.9599 boundary so its possible they came from the same distribution
# so the odds between strategy 2, 3 and 4 aren't statistically significant, but for the other strategies(1/5)
#Strategy 4 is better

#Assume a 95% CI
from scipy.stats import norm
left_boundary  = norm.ppf(0.05/2)
right_boundary = norm.ppf(1-(0.05/2))

#these are the standard z scores for a 95% CI
left_boundary, right_boundary

(-1.9599639845400545, 1.959963984540054)

## How the loop works:

It loops through every strategy contained in a dictionary named 'strategies'
<br><br> Once it picks a strategy, it will start looping throughout the years 1 by 1
<br><br> Once it picks a year, it will go through every game starting from the 15th day of the season, it will calculate who's the current leader given the specific metric (metric in strategies dictionary)
<br><br>  Once it calculates the current leader (only considers results from previous days) it will load the current day's results and check if the chosen player got a hit or not. If this player did not play or did not get an AB it will verify with the next 'leader'

<br> It loops through every day of the regular season for a given year trying out a specific strategy. Once a season is done, it will go to the next season/year, and re-try the strategy. once all the seasons have been exhausted it will restart at the first year with the next strategy.

### Script for 1 season

I've included the code here so one can check to see if the code works, which it does.

 <br> 
The Print Format is as follows :<br><br>(MONTH,DAY), ['PLAYER CHOSEN'], 1/'lose_streak'/'continue streak (1 = hit), Batting Average <br><br> You can check the batting average on baseball reference, just make sure you look at this statistics for the previous game day! 

In [10]:
df = d.copy()
year_ = 2016
df = df[df.year==year_]
daily_games = df.groupby(['month','day'])
game_dates = list(daily_games.groups.keys())

In [17]:
# script for 1 season
hit_counter = 0
total_counter = 0
i = 30

while i < len(game_dates) -1 :
        batter_rank = 1

        daily_df = daily_games.get_group(game_dates[i])

        prev_day_1 = daily_games.get_group(game_dates[i-1])
        prev_day_2 = daily_games.get_group(game_dates[i-2])
        prev_day_3 = daily_games.get_group(game_dates[i-3])
        prev_day_4 = daily_games.get_group(game_dates[i-4])
        prev_day_5 = daily_games.get_group(game_dates[i-5])
        prev_days  = pd.concat([prev_day_1,prev_day_2,prev_day_3,prev_day_4,prev_day_5]).sort_values(by=['res_batter','month','day'])
        prev_days.drop_duplicates(subset=['res_batter'],keep='last',inplace=True)
            
        max_games_played = prev_days.seasonal_game_played.max()
        leaders = prev_days[prev_days.seasonal_game_played >= 0.7 * max_games_played]
        leaders.sort_values(by=['BA'], ascending=False, inplace=True)

        batter    = leaders.iloc[batter_rank - 1].res_batter.split()
            
        daily_df_info = daily_game_info.get_group(game_dates[i])
        starting_lineup = set(np.unique(daily_df_info[['visitor_batter_1','visitor_batter_2','visitor_batter_3','visitor_batter_4','visitor_batter_5','visitor_batter_6','visitor_batter_7','visitor_batter_8','visitor_batter_9','home_batter_1','home_batter_2','home_batter_3','home_batter_4','home_batter_5','home_batter_6','home_batter_7','home_batter_8','home_batter_9']]))
            
        available_batters = set(starting_lineup) - set(np.unique(daily_df_info[['home_starting_pitcher','vis._starting_pitcher']]))
            
        while batter[0] not in available_batters:
            batter_rank = batter_rank +1
            batter = leaders.iloc[batter_rank - 1].res_batter.split()
         
        daily_hit_result = hit_checker_logs(daily_df, batter)

        i=i+1
        if daily_hit_result =='continue_streak':
            pass
        elif daily_hit_result =='lose_streak':
            pass
        else:
            hit_counter = hit_counter + 1
            seasonal_hit_counter = seasonal_hit_counter +1

        total_counter = total_counter + 1
        seasonal_tot_counter = seasonal_tot_counter + 1

                        
print('Complete\n')
print('Hit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

Complete

Hit counter 114  total_games_played 148  
Ratio: 0.7702702702702703


### Script for 1 strategy, every season between 2005-2016

### Strategy 1: pick the batter with the highest BA

Output results

In [18]:
hit_counter = 0
total_counter = 0
years = np.arange(2005,2016+1,1)
for year_ in years:
    df=d.copy()
    df = df[df['year']==year_]

    i = 15
    daily_games = df.groupby(['month','day'])
    game_dates = list(daily_games.groups.keys())
    while i < len(game_dates) -1 :
        batter_rank = 30

        daily_df = daily_games.get_group(game_dates[i])

        # prev_day_1 to 5 help me get a list of players who have played the past 5 days. 
        prev_day_1 = daily_games.get_group(game_dates[i-1])
        prev_day_2 = daily_games.get_group(game_dates[i-2])
        prev_day_3 = daily_games.get_group(game_dates[i-3])
        prev_day_4 = daily_games.get_group(game_dates[i-4])
        prev_day_5 = daily_games.get_group(game_dates[i-5])
        prev_days  = pd.concat([prev_day_1,prev_day_2,prev_day_3,prev_day_4,prev_day_5]).sort_values(by=['res_batter','month','day'])
        prev_days.drop_duplicates(subset=['res_batter'],keep='last',inplace=True)
    
        max_games_played = prev_days.seasonal_game_played.max()
        leaders = prev_days[prev_days.seasonal_game_played >= 0.7 * max_games_played]
        leaders.sort_values(by=['BA'], ascending=False, inplace=True)
    
        batter    = leaders.iloc[batter_rank - 1].res_batter.split()

        daily_hit_result = hit_checker_logs(daily_df, batter)

        while (daily_hit_result == 'continue_streak'):
            batter_rank = batter_rank + 1 
            batter = leaders.iloc[batter_rank - 1].res_batter.split()
            daily_hit_result = hit_checker_logs(daily_df, batter)

        i=i+1
        if daily_hit_result =='lose_streak':
            pass
        else:
            hit_counter = hit_counter + 1

        total_counter = total_counter + 1

    print('Complete\n')
    print('Year ',year_)
    print('\nHit counter', hit_counter, ' total_games_played', total_counter, ' \nRatio:',hit_counter/total_counter)

Complete

Year  2005

Hit counter 114  total_games_played 164  
Ratio: 0.6951219512195121
Complete

Year  2006

Hit counter 240  total_games_played 328  
Ratio: 0.7317073170731707
Complete

Year  2007

Hit counter 358  total_games_played 493  
Ratio: 0.7261663286004056
Complete

Year  2008

Hit counter 472  total_games_played 661  
Ratio: 0.7140695915279879
Complete

Year  2009

Hit counter 596  total_games_played 826  
Ratio: 0.7215496368038741
Complete

Year  2010

Hit counter 710  total_games_played 990  
Ratio: 0.7171717171717171
Complete

Year  2011

Hit counter 823  total_games_played 1153  
Ratio: 0.7137901127493496
Complete

Year  2012

Hit counter 928  total_games_played 1318  
Ratio: 0.7040971168437026
Complete

Year  2013

Hit counter 1037  total_games_played 1482  
Ratio: 0.6997300944669366
Complete

Year  2014

Hit counter 1154  total_games_played 1647  
Ratio: 0.7006678809957498
Complete

Year  2015

Hit counter 1255  total_games_played 1810  
Ratio: 0.6933701657458563
Co

END

Henry