# Notebook for first 5 strategies

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')
from streak_counter import hit_checker_logs

import random
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

#import warnings
#warnings.filterwarnings("ignore")
import time

d = pd.read_csv('../retrosheet_data/2005-2016_game_logs.csv')

# warning pops out because 'left_field_umpire' can be a string or float dtype
game_info = pd.read_csv('../retrosheet_data/2005-2016_game_info.csv', low_memory=False)

d['double_header_flag']   = d.game_id.apply(lambda x: x[11:]).astype(int)


In [2]:
years = np.arange(2005,2016+1,1)
n_years = len(years)
seasonal_perf_df = pd.DataFrame()
results = pd.DataFrame()

strategies = {'strategy_1':'BA',            #seasonal batting average
              'strategy_2':'H/PA_seasonal', #this is the cumulative hits divided by Plate Appearances, seperated by seasons
              'strategy_3':'H_cum_season',    #H_cum_season stands for Hits_cumulative_season,
              'strategy_4':'H/seasonal_game_played',   #H/seasonal_game_played divides the player's cumulative hits by the number of games played
              'strategy_5':'hit_1_0_SRatio'}    #hit_1_0_SRatio is the seasonal ratio of games where the player got at least 1 hit

d.loc[d['H'] >= 1, 'hit_1_0'] = 1
d.loc[d['H'] == 0, 'hit_1_0'] = 0

d.sort_values(by=['res_batter','year','month','day','game_id'],inplace=True)

d['hit_1_0_SRatio'] = d.groupby(['res_batter','year'])['hit_1_0'].transform(pd.Series.cumsum) / d.seasonal_game_played

In [3]:
d.sample(3)[['game_id','year','month','day','res_batter','BA','H/PA_seasonal','H_cum_season','seasonal_game_played','H/seasonal_game_played','hit_1_0_SRatio']]

Unnamed: 0,game_id,year,month,day,res_batter,BA,H/PA_seasonal,H_cum_season,seasonal_game_played,H/seasonal_game_played,hit_1_0_SRatio
612155,WAS200705110,2007,5,11,zimmr001,0.256944,0.235669,37.0,35,1.057143,0.714286
193809,LAN200605030,2006,5,3,garcn001,0.277778,0.232558,10.0,10,1.0,0.7
480780,BAL200905250,2009,5,25,robeb003,0.295082,0.262136,54.0,45,1.2,0.711111


# Strategy 1-5 given n years

<br>Strategy 1: pick the season leader in terms of BA (batting average) batter must have played at least 1 game past 5 days
<br>Strategy 2: pick the season leader in trms of H/PA (hits per plate apperances) batter must have played at leats 1 game past 5 days
<br>Strategy 3: pick the season leader in terms of Hits, batter must have played at least 1 game past 5 days
<br>Strategy 4: pick the season leader in terms of Hits per game played this season, batter must have played at least 1 game past 5 days
<br>Strategy 5: pick the batter with the highest ratio of 'at least 1 hit games' this season.

## Run the cell below

<br>It will run every strategy from 2005-2016 (takes around 10 mins to process)
<br>The outputs are as follows:<br>
<br>1. seasonal_perf_df (seasonal performance of the strategy). Rows: strategies: Columns: year +  number of hits (success), year + number of attemps. 
<br>2. results : this is a data frame containing the aggregated results. rows: strategies. columns: number of years, total number of hits, total number of attempts, percentage (also known as success rate or 'P'), CI - confidence interval

In [4]:
#players part of the starting lineup are denoted like this: visitor_batter_1, visitor_batter_2 etc
interested_batters = []
for i in range(1,10):
    for team in ['visitor','home']:
        interested_batters.append(f'{team}_batter_{i}')

In [5]:
%%time

for strategy in strategies:
    hit_counter = 0
    total_counter = 0

    for year_ in years:
        df = d[d['year']==year_].copy()
        game_info_yearly = game_info[game_info['year']==year_].copy() 
        
        seasonal_hit_counter = 0
        seasonal_tot_counter = 0

        daily_games = df.groupby(['month','day'])
        daily_game_info = game_info_yearly.groupby(['month','day'])
        
        game_dates = list(daily_games.groups.keys())

        for i in range(30, len(game_dates)) :
            batter_rank = 1

            daily_df = daily_games.get_group(game_dates[i])

            prev_day_list = []
            
            for j in range(1,6):
                prev_day_list.append(daily_games.get_group(game_dates[i-j]))
                        
            prev_days  = pd.concat(prev_day_list).sort_values(by=['res_batter','month','day'])
            prev_days = prev_days.drop_duplicates(subset=['res_batter'],keep='last')
            
            max_games_played = prev_days.seasonal_game_played.max()
            
            leaders = prev_days[prev_days.seasonal_game_played >= 0.7 * max_games_played]
            leaders = leaders.sort_values(by=[strategies.get(strategy)], ascending=False)

            batter    = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_df_info = daily_game_info.get_group(game_dates[i])
            starting_lineup = set(np.unique(daily_df_info[interested_batters]))
            
            available_batters = set(starting_lineup) - set(np.unique(daily_df_info[['home_starting_pitcher','vis._starting_pitcher']]))
            
            while batter[0] not in available_batters:
                batter_rank = batter_rank +1
                batter = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_hit_result = hit_checker_logs(daily_df, batter)

            if daily_hit_result =='continue_streak':
                pass
            elif daily_hit_result =='lose_streak':
                pass
            else:
                hit_counter = hit_counter + 1
                seasonal_hit_counter = seasonal_hit_counter +1

            total_counter = total_counter + 1
            seasonal_tot_counter = seasonal_tot_counter + 1

        print('Complete: ',strategy, ' Year:', year_)
                
        column_hit   = str(year_) +'_hit_counter'
        column_total = str(year_) +'_hit_total'
        percentage = str(year_) +'_percentage'
        seasonal_perf_df.loc[strategy, column_hit] = seasonal_hit_counter
        seasonal_perf_df.loc[strategy, column_total] = seasonal_tot_counter
        seasonal_perf_df.loc[strategy, percentage] = seasonal_hit_counter / seasonal_tot_counter
    
    
    results.loc[strategy, 'n_years'] = n_years
    results.loc[strategy, 'hit_counter'] = hit_counter #theres already a counter
    results.loc[strategy, 'total_counter'] = total_counter # theres already a counter
    results.loc[strategy, 'percentage'] = hit_counter / total_counter

print('COMPLETE')

Complete:  strategy_1  Year: 2005
Complete:  strategy_1  Year: 2006
Complete:  strategy_1  Year: 2007
Complete:  strategy_1  Year: 2008
Complete:  strategy_1  Year: 2009
Complete:  strategy_1  Year: 2010
Complete:  strategy_1  Year: 2011
Complete:  strategy_1  Year: 2012
Complete:  strategy_1  Year: 2013
Complete:  strategy_1  Year: 2014
Complete:  strategy_1  Year: 2015
Complete:  strategy_1  Year: 2016
Complete:  strategy_2  Year: 2005
Complete:  strategy_2  Year: 2006
Complete:  strategy_2  Year: 2007
Complete:  strategy_2  Year: 2008
Complete:  strategy_2  Year: 2009
Complete:  strategy_2  Year: 2010
Complete:  strategy_2  Year: 2011
Complete:  strategy_2  Year: 2012
Complete:  strategy_2  Year: 2013
Complete:  strategy_2  Year: 2014
Complete:  strategy_2  Year: 2015
Complete:  strategy_2  Year: 2016
Complete:  strategy_3  Year: 2005
Complete:  strategy_3  Year: 2006
Complete:  strategy_3  Year: 2007
Complete:  strategy_3  Year: 2008
Complete:  strategy_3  Year: 2009
Complete:  str

In [6]:
results = results.reset_index()
results = results.rename(columns={'index':'strategy'})

In [7]:
results

Unnamed: 0,strategy,n_years,hit_counter,total_counter,percentage
0,strategy_1,12.0,1325.0,1805.0,0.734072
1,strategy_2,12.0,1352.0,1805.0,0.74903
2,strategy_3,12.0,1383.0,1805.0,0.766205
3,strategy_4,12.0,1397.0,1805.0,0.773961
4,strategy_5,12.0,1336.0,1805.0,0.740166


In [8]:
seasonal_perf_df.head()

Unnamed: 0,2005_hit_counter,2005_hit_total,2005_percentage,2006_hit_counter,2006_hit_total,2006_percentage,2007_hit_counter,2007_hit_total,2007_percentage,2008_hit_counter,...,2013_percentage,2014_hit_counter,2014_hit_total,2014_percentage,2015_hit_counter,2015_hit_total,2015_percentage,2016_hit_counter,2016_hit_total,2016_percentage
strategy_1,118.0,150.0,0.786667,110.0,150.0,0.733333,120.0,151.0,0.794702,114.0,...,0.753333,110.0,151.0,0.728477,96.0,149.0,0.644295,115.0,149.0,0.771812
strategy_2,115.0,150.0,0.766667,121.0,150.0,0.806667,116.0,151.0,0.768212,116.0,...,0.68,111.0,151.0,0.735099,112.0,149.0,0.751678,113.0,149.0,0.758389
strategy_3,115.0,150.0,0.766667,112.0,150.0,0.746667,123.0,151.0,0.81457,119.0,...,0.7,119.0,151.0,0.788079,115.0,149.0,0.771812,112.0,149.0,0.751678
strategy_4,124.0,150.0,0.826667,119.0,150.0,0.793333,119.0,151.0,0.788079,119.0,...,0.746667,115.0,151.0,0.761589,117.0,149.0,0.785235,108.0,149.0,0.724832
strategy_5,115.0,150.0,0.766667,116.0,150.0,0.773333,115.0,151.0,0.761589,119.0,...,0.713333,106.0,151.0,0.701987,98.0,149.0,0.657718,107.0,149.0,0.718121


In [9]:
results.to_csv('strat_1_5.csv',index=None)
seasonal_perf_df.to_csv('strat_1_5_seasons.csv')

In [10]:
import statsmodels.api as sm

results = results.sort_values(by=['percentage'],ascending=False)
results = results.reset_index(drop=True)

p_highest = results.iloc[0].loc['hit_counter']
n_highest = results.iloc[0].loc['total_counter']
z_score_list = [0]
p_value_list = [0]

for row in range(1, results.shape[0]):
    p = results.iloc[row].loc['hit_counter']
    n = results.iloc[row].loc['total_counter']
    zscore, pvalue = sm.stats.proportions_ztest([p, p_highest], [n, n_highest], alternative = 'two-sided', prop_var = False)
    z_score_list.append(zscore)
    p_value_list.append(pvalue)


z_score_series = pd.Series(z_score_list)   
p_value_series = pd.Series(p_value_list)

results['zscore'] = z_score_series
results['pvalue'] = p_value_series

results

Unnamed: 0,strategy,n_years,hit_counter,total_counter,percentage,zscore,pvalue
0,strategy_4,12.0,1397.0,1805.0,0.773961,0.0,0.0
1,strategy_3,12.0,1383.0,1805.0,0.766205,-0.553758,0.579744
2,strategy_2,12.0,1352.0,1805.0,0.74903,-1.757426,0.078845
3,strategy_5,12.0,1336.0,1805.0,0.740166,-2.367357,0.017916
4,strategy_1,12.0,1325.0,1805.0,0.734072,-2.782502,0.005394



I compared everything to Strategy 4 since it had the highest success rate or 'percentage' 

- null hypothesis are the following: p1 = p4, p2 = p4, p3 = p4, p5 = p4   (p1 represents percentage or success rate for strategy 1 etc...)
- alt  hypothesis are the following: p1 != p4, p2 != p4 p3 != p4 p5 != p4 or in other words the success rates are different


Results: using a standard error rate of 5% we see that strategy_1, and strategy_5 all have p-values smaller then 0.05
this means we reject the null hypothesis in favor of the alt hypothesis. in our case this means that p4 is larger
which means that strategy 4 is better then strategy 1 and 5. For strategy 2 and 3, the p-value was higher then 0.05 which means we
fail to reject the null. Essentially strategy 2, 3 and strategy 4 have the same odds and are equally
as good. Given the sample size, we can expect, 95% of the time, that the 'True' success rate will fall and include these 3 values.

In [11]:
# Using the confidence interval approach, if the calculated Zscore is beyond the left and right boundary then the results are
# statistically significant.

#Assume a 95% CI
from scipy.stats import norm
left_boundary  = norm.ppf(0.05/2)
right_boundary = norm.ppf(1-(0.05/2))

#these are the standard z scores for a 95% CI
left_boundary, right_boundary

(-1.9599639845400545, 1.959963984540054)

As we can see, the Zscores for strategy 5 and 1 are below -1.96 which means the 'percentage' or success rate for strategy 4 is statistically significant and better. For strategies 2 and 3, the Zscore is only -1.75 and -.5537 respectively which means that the differences in success rates are not statistically significant.

## How the loop works:

It loops through every strategy contained in a dictionary named 'strategies'
<br><br> Once it picks a strategy, it will start looping throughout the years 1 by 1
<br><br> Once it picks a year, it will go through every game starting from the 30th day of the season, it will calculate who's the current leader given the specific metric (metric in strategies dictionary)
<br><br>  Once it calculates the current leader, the program only picks the top batter given that he's in the list of starting players. If this player is not in the starting lineup, it will verify the next batter until the chosen one is in the starting lineups.

<br> It loops through every day of the regular season for a given year trying out a specific strategy. Once a season is done, it will go to the next season/year, and re-try the strategy. once all the seasons have been exhausted it will restart at the first year with the next strategy.

END

Henry