# Notebook for Strategy 6 to 10

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')
from streak_counter import hit_checker_logs

import random
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

#import warnings
#warnings.filterwarnings("ignore")
import time

d = pd.read_csv('../retrosheet_data/2005-2016_game_logs.csv')

# warning pops out because 'left_field_umpire' can be a string or float dtype
game_info = pd.read_csv('../retrosheet_data/2005-2016_game_info.csv', low_memory=False)

d['double_header_flag']   = d.game_id.apply(lambda x: x[11:]).astype(int)


In [2]:
years = np.arange(2005,2016+1,1)
n_years = len(years)
seasonal_perf_df = pd.DataFrame()
results = pd.DataFrame()

strategies = {'strategy_6':'BA_car',
              'strategy_7':'H/PA_car',
              'strategy_8':'H_cum_car',   #dont think this is useful
              'strategy_9':'H/career_game_played', #dont think this is useful
              'strategy_10':'hit_1_0_CRatio_car'}

#strategies = {'strategy_1':'BA'}

d.loc[d['H'] >= 1, 'hit_1_0'] = 1
d.loc[d['H'] == 0, 'hit_1_0'] = 0

d.sort_values(by=['res_batter','year','month','day','game_id'],inplace=True)

d['hit_1_0_CRatio_car'] = d.groupby(['res_batter'])['hit_1_0'].transform(pd.Series.cumsum) / d.career_game_played

In [3]:
d.sample(3)[['game_id','year','month','day','res_batter','BA_car','H/PA_car','H_cum_car','career_game_played','H/career_game_played','hit_1_0_CRatio_car']]

Unnamed: 0,game_id,year,month,day,res_batter,BA_car,H/PA_car,H_cum_car,career_game_played,H/career_game_played,hit_1_0_CRatio_car
356784,TOR201609130,2016,9,13,martr004,0.255372,0.220282,1236.0,1401,0.882227,0.62384
605783,SDN200905240,2009,5,24,younc003,0.145695,0.121547,22.0,85,0.258824,0.247059
24865,SLN200709180,2007,9,18,barar001,0.251142,0.227273,220.0,261,0.842912,0.628352


# Strategy 6-10 given n years

<br>Strategy 6: pick the current Career leader in terms of BA (batting average) batter must have played at least 1 game past 5 days
<br>Strategy 7: pick the current Career leader in terms of H/PA_car (hits per plate apperances) batter must have played at leats 1 game past 5 days
<br>Strategy 8: pick the current Career leader in terms of Hits, batter must have played at least 1 game past 5 days
<br>Strategy 9: pick the current Career leader in terms of Hits per game played throughout his career, batter must have played at least 1 game past 5 days
<br>Strategy 10: pick the current Career leader in terms of highest ratio of 'at least 1 hit games' career wise

## Run the cell below

<br>It will run every strategy from 2005-2016 (takes around 10 mins to process)
<br>The outputs are as follows:<br>
<br>1. seasonal_perf_df (seasonal performance of the strategy). Rows: strategies: Columns: year +  number of hits (success), year + number of attemps. 
<br>2. results : this is a data frame containing the aggregated results. rows: strategies. columns: number of years, total number of hits, total number of attempts, percentage (also known as success rate or 'P'), CI - confidence interval

In [4]:
interested_batters = []
for i in range(1,10):
    for team in ['visitor','home']:
        interested_batters.append(f'{team}_batter_{i}')

In [5]:
%%time

for strategy in strategies:
    hit_counter = 0
    total_counter = 0

    for year_ in years:
        df = d[d['year']==year_].copy()
        game_info_yearly = game_info[game_info['year']==year_].copy() 
        
        seasonal_hit_counter = 0
        seasonal_tot_counter = 0

        daily_games = df.groupby(['month','day'])
        daily_game_info = game_info_yearly.groupby(['month','day'])
        
        game_dates = list(daily_games.groups.keys())

        for i in range(6, len(game_dates)) :
            batter_rank = 1

            daily_df = daily_games.get_group(game_dates[i])

            prev_day_list = []
            
            for j in range(1,6):
                prev_day_list.append(daily_games.get_group(game_dates[i-j]))
                        
            prev_days  = pd.concat(prev_day_list).sort_values(by=['res_batter','month','day'])
            prev_days = prev_days.drop_duplicates(subset=['res_batter'],keep='last')
            
            max_games_played = prev_days.seasonal_game_played.max()
            
            leaders = prev_days[prev_days.seasonal_game_played >= 0.7 * max_games_played]
            leaders = leaders.sort_values(by=[strategies.get(strategy)], ascending=False)

            batter    = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_df_info = daily_game_info.get_group(game_dates[i])
            starting_lineup = set(np.unique(daily_df_info[interested_batters]))
            
            available_batters = set(starting_lineup) - set(np.unique(daily_df_info[['home_starting_pitcher','vis._starting_pitcher']]))
            
            while batter[0] not in available_batters:
                batter_rank = batter_rank +1
                batter = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_hit_result = hit_checker_logs(daily_df, batter)

            if daily_hit_result =='continue_streak':
                pass
            elif daily_hit_result =='lose_streak':
                pass
            else:
                hit_counter = hit_counter + 1
                seasonal_hit_counter = seasonal_hit_counter +1

            total_counter = total_counter + 1
            seasonal_tot_counter = seasonal_tot_counter + 1

        print('Complete: ',strategy, ' Year:', year_)
                
        column_hit   = str(year_) +'_hit_counter'
        column_total = str(year_) +'_hit_total'
        percentage = str(year_) +'_percentage'
        seasonal_perf_df.loc[strategy, column_hit] = seasonal_hit_counter
        seasonal_perf_df.loc[strategy, column_total] = seasonal_tot_counter
        seasonal_perf_df.loc[strategy, percentage] = seasonal_hit_counter / seasonal_tot_counter
    
    
    results.loc[strategy, 'n_years'] = n_years
    results.loc[strategy, 'hit_counter'] = hit_counter #theres already a counter
    results.loc[strategy, 'total_counter'] = total_counter # theres already a counter
    results.loc[strategy, 'percentage'] = hit_counter / total_counter

print('COMPLETE')

Complete:  strategy_6  Year: 2005
Complete:  strategy_6  Year: 2006
Complete:  strategy_6  Year: 2007
Complete:  strategy_6  Year: 2008
Complete:  strategy_6  Year: 2009
Complete:  strategy_6  Year: 2010
Complete:  strategy_6  Year: 2011
Complete:  strategy_6  Year: 2012
Complete:  strategy_6  Year: 2013
Complete:  strategy_6  Year: 2014
Complete:  strategy_6  Year: 2015
Complete:  strategy_6  Year: 2016
Complete:  strategy_7  Year: 2005
Complete:  strategy_7  Year: 2006
Complete:  strategy_7  Year: 2007
Complete:  strategy_7  Year: 2008
Complete:  strategy_7  Year: 2009
Complete:  strategy_7  Year: 2010
Complete:  strategy_7  Year: 2011
Complete:  strategy_7  Year: 2012
Complete:  strategy_7  Year: 2013
Complete:  strategy_7  Year: 2014
Complete:  strategy_7  Year: 2015
Complete:  strategy_7  Year: 2016
Complete:  strategy_8  Year: 2005
Complete:  strategy_8  Year: 2006
Complete:  strategy_8  Year: 2007
Complete:  strategy_8  Year: 2008
Complete:  strategy_8  Year: 2009
Complete:  str

In [6]:
results = results.reset_index()
results = results.rename(columns={'index':'strategy'})

In [7]:
results

Unnamed: 0,strategy,n_years,hit_counter,total_counter,percentage
0,strategy_6,12.0,1482.0,2093.0,0.708075
1,strategy_7,12.0,1506.0,2093.0,0.719541
2,strategy_8,12.0,1574.0,2093.0,0.752031
3,strategy_9,12.0,1594.0,2093.0,0.761586
4,strategy_10,12.0,1561.0,2093.0,0.745819


In [8]:
seasonal_perf_df

Unnamed: 0,2005_hit_counter,2005_hit_total,2005_percentage,2006_hit_counter,2006_hit_total,2006_percentage,2007_hit_counter,2007_hit_total,2007_percentage,2008_hit_counter,...,2013_percentage,2014_hit_counter,2014_hit_total,2014_percentage,2015_hit_counter,2015_hit_total,2015_percentage,2016_hit_counter,2016_hit_total,2016_percentage
strategy_6,131.0,174.0,0.752874,119.0,174.0,0.683908,122.0,175.0,0.697143,120.0,...,0.793103,118.0,175.0,0.674286,122.0,173.0,0.705202,120.0,173.0,0.693642
strategy_7,127.0,174.0,0.729885,126.0,174.0,0.724138,130.0,175.0,0.742857,136.0,...,0.678161,121.0,175.0,0.691429,118.0,173.0,0.682081,120.0,173.0,0.693642
strategy_8,133.0,174.0,0.764368,136.0,174.0,0.781609,142.0,175.0,0.811429,142.0,...,0.683908,127.0,175.0,0.725714,119.0,173.0,0.687861,121.0,173.0,0.699422
strategy_9,141.0,174.0,0.810345,133.0,174.0,0.764368,143.0,175.0,0.817143,143.0,...,0.706897,121.0,175.0,0.691429,128.0,173.0,0.739884,126.0,173.0,0.728324
strategy_10,133.0,174.0,0.764368,135.0,174.0,0.775862,136.0,175.0,0.777143,130.0,...,0.712644,122.0,175.0,0.697143,129.0,173.0,0.745665,119.0,173.0,0.687861


In [9]:
seasonal_perf_df[['2008_hit_counter','2008_hit_total','2008_percentage','2009_hit_counter','2009_hit_total','2009_percentage','2010_hit_counter','2010_hit_total','2010_percentage','2011_hit_counter','2011_hit_total','2011_percentage','2012_hit_counter','2012_hit_total','2012_percentage','2013_hit_counter','2013_hit_total','2013_percentage']]

Unnamed: 0,2008_hit_counter,2008_hit_total,2008_percentage,2009_hit_counter,2009_hit_total,2009_percentage,2010_hit_counter,2010_hit_total,2010_percentage,2011_hit_counter,2011_hit_total,2011_percentage,2012_hit_counter,2012_hit_total,2012_percentage,2013_hit_counter,2013_hit_total,2013_percentage
strategy_6,120.0,178.0,0.674157,124.0,175.0,0.708571,121.0,174.0,0.695402,127.0,173.0,0.734104,120.0,175.0,0.685714,138.0,174.0,0.793103
strategy_7,136.0,178.0,0.764045,132.0,175.0,0.754286,129.0,174.0,0.741379,125.0,173.0,0.722543,124.0,175.0,0.708571,118.0,174.0,0.678161
strategy_8,142.0,178.0,0.797753,148.0,175.0,0.845714,136.0,174.0,0.781609,125.0,173.0,0.722543,126.0,175.0,0.72,119.0,174.0,0.683908
strategy_9,143.0,178.0,0.803371,149.0,175.0,0.851429,134.0,174.0,0.770115,125.0,173.0,0.722543,128.0,175.0,0.731429,123.0,174.0,0.706897
strategy_10,130.0,178.0,0.730337,146.0,175.0,0.834286,136.0,174.0,0.781609,123.0,173.0,0.710983,128.0,175.0,0.731429,124.0,174.0,0.712644


In [10]:
results.to_csv('strat_6_10.csv',index=None)
seasonal_perf_df.to_csv('strat_6_10_seasons.csv')

In [11]:
#create function to track the best players and chosen number of times and based on metric
#create function that takes top 5 guys
# create function that would see streak???

In [12]:
import statsmodels.api as sm

results = results.sort_values(by=['percentage'],ascending=False)
results = results.reset_index(drop=True)

p_highest = results.iloc[0].loc['hit_counter']
n_highest = results.iloc[0].loc['total_counter']
z_score_list = [0]
p_value_list = [0]

for row in range(1, results.shape[0]):
    p = results.iloc[row].loc['hit_counter']
    n = results.iloc[row].loc['total_counter']
    zscore, pvalue = sm.stats.proportions_ztest([p, p_highest], [n, n_highest], alternative = 'two-sided', prop_var = False)
    z_score_list.append(zscore)
    p_value_list.append(pvalue)


z_score_series = pd.Series(z_score_list)   
p_value_series = pd.Series(p_value_list)

results['zscore'] = z_score_series
results['pvalue'] = p_value_series

results

Unnamed: 0,strategy,n_years,hit_counter,total_counter,percentage,zscore,pvalue
0,strategy_9,12.0,1594.0,2093.0,0.761586,0.0,0.0
1,strategy_8,12.0,1574.0,2093.0,0.752031,-0.720548,0.471187
2,strategy_10,12.0,1561.0,2093.0,0.745819,-1.183817,0.236486
3,strategy_7,12.0,1506.0,2093.0,0.719541,-3.103034,0.001915
4,strategy_6,12.0,1482.0,2093.0,0.708075,-3.921597,8.8e-05


For now we are just going to save the results, and then combine and compare them to strategy 1-5. Right off the bat though it looks like strategy 9 did the best. Statistically significant then strat 6 and 7 but thats about it. Strat 8-9-10 performed the same.

I suspect there might be something wrong and I will have to investigate these results more carefully. What I think might be wrong is that just because a batter did well 1 season, he might have the best metric (Eg career hits leader) coming into the next season but if he performs poorly in this new season then after a few weeks I would want the algorithm to NOT pick him... even though the statistic should revert back to mean.

Maybe if I incorporated a rolling average of say 50 days- 100 days- 200 days- 300 days etc then maybe we could improve the results

## How the loop works:

It loops through every strategy contained in a dictionary named 'strategies'
<br><br> Once it picks a strategy, it will start looping throughout the years 1 by 1
<br><br> Once it picks a year, it will go through every game starting from the 15th day of the season, it will calculate who's the current leader given the specific metric (metric in strategies dictionary)
<br><br>  Once it calculates the current leader (only considers results from previous days) it will load the current day's results and check if the chosen player got a hit or not. If this player did not play or did not get an AB it will verify with the next 'leader'

<br> It loops through every day of the regular season for a given year trying out a specific strategy. Once a season is done, it will go to the next season/year, and re-try the strategy. once all the seasons have been exhausted it will restart at the first year with the next strategy.

END

Henry