# Notebook for Strategy 11 to 22

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')
from streak_counter import hit_checker_logs

import random
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

#import warnings
#warnings.filterwarnings("ignore")
import time

d = pd.read_csv('../retrosheet_data/2005-2016_game_logs.csv')

# warning pops out because 'left_field_umpire' can be a string or float dtype
game_info = pd.read_csv('../retrosheet_data/2005-2016_game_info.csv', low_memory=False)

d['double_header_flag']   = d.game_id.apply(lambda x: x[11:]).astype(int)


In [2]:
years = np.arange(2005,2016+1,1)
n_years = len(years)
seasonal_perf_df = pd.DataFrame()
results = pd.DataFrame()

strategies = {'strategy_11':'BA_rolling_5d',  #5d BA
              'strategy_12':'H/PA_rolling_5d', #5d H/PA
              'strategy_13':'H_rolling_5d',   #5d cumulative hits, pass this is going to be too random
              'strategy_14':'hit_1_0_5d_ratio', #5d at least 1 hit ratio
              'strategy_15':'BA_rolling_10d',  #10day rolling BA
              'strategy_16':'H/PA_rolling_10d', #10day H/PA
              'strategy_17':'H_rolling_10d',   #10 day cumulative hits
              'strategy_18':'hit_1_0_10d_ratio', # 10 day at least 1 hits ratio
              'strategy_19':'BA_rolling_30d',  #30day rolling BA
              'strategy_20':'H/PA_rolling_30d', #30day H/PA
              'strategy_21':'H_rolling_30d',   #30day cumulative hits
              'strategy_22':'hit_1_0_30d_ratio'}   # 30days hit ratio

d.loc[d['H'] >= 1, 'hit_1_0'] = 1
d.loc[d['H'] == 0, 'hit_1_0'] = 0

d.sort_values(by=['res_batter','year','month','day','game_id'],inplace=True)

d['hit_1_0_5d_ratio']    = d.groupby(['res_batter','year'])['hit_1_0'].apply(lambda g: g.rolling(5).mean())
d['hit_1_0_10d_ratio']    = d.groupby(['res_batter','year'])['hit_1_0'].apply(lambda g: g.rolling(10).mean())
d['hit_1_0_30d_ratio']    = d.groupby(['res_batter','year'])['hit_1_0'].apply(lambda g: g.rolling(30).mean())

In [3]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 615213 entries, 0 to 615212
Data columns (total 59 columns):
game_id                   615213 non-null object
year                      615213 non-null int64
month                     615213 non-null int64
day                       615213 non-null int64
res_batter                615213 non-null object
H                         615213 non-null float64
PA                        615213 non-null float64
AB                        615213 non-null int64
BB                        615213 non-null float64
IBB                       615213 non-null float64
HBP                       615213 non-null float64
ITF                       615213 non-null float64
1B                        615213 non-null float64
2B                        615213 non-null float64
3B                        615213 non-null float64
HR                        615213 non-null float64
SH                        615213 non-null int64
SF                        615213 non-null int64
BA 

# Strategy 11-22 given n years
NOT UPDATED
<br>Strategy 6: pick the current Career leader in terms of BA (batting average) batter must have played at least 1 game past 5 days
<br>Strategy 7: pick the current Career leader in terms of H/PA_car (hits per plate apperances) batter must have played at leats 1 game past 5 days
<br>Strategy 8: pick the current Career leader in terms of Hits, batter must have played at least 1 game past 5 days
<br>Strategy 9: pick the current Career leader in terms of Hits per game played throughout his career, batter must have played at least 1 game past 5 days
<br>Strategy 10: pick the current Career leader in terms of highest ratio of 'at least 1 hit games' career wise

## Run the cell below

<br>It will run every strategy from 2005-2016 (takes around 10 mins to process)
<br>The outputs are as follows:<br>
<br>1. seasonal_perf_df (seasonal performance of the strategy). Rows: strategies: Columns: year +  number of hits (success), year + number of attemps. 
<br>2. results : this is a data frame containing the aggregated results. rows: strategies. columns: number of years, total number of hits, total number of attempts, percentage (also known as success rate or 'P'), CI - confidence interval

In [4]:
interested_batters = []
for i in range(1,10):
    for team in ['visitor','home']:
        interested_batters.append(f'{team}_batter_{i}')

In [5]:
%%time

for strategy in strategies:
    hit_counter = 0
    total_counter = 0

    for year_ in years:
        df = d[d['year']==year_].copy()
        game_info_yearly = game_info[game_info['year']==year_].copy() 
        
        seasonal_hit_counter = 0
        seasonal_tot_counter = 0

        daily_games = df.groupby(['month','day'])
        daily_game_info = game_info_yearly.groupby(['month','day'])
        
        game_dates = list(daily_games.groups.keys())

        for i in range(30, len(game_dates)) :
            batter_rank = 1

            daily_df = daily_games.get_group(game_dates[i])

            prev_day_list = []
            
            for j in range(1,6):
                prev_day_list.append(daily_games.get_group(game_dates[i-j]))
                        
            prev_days  = pd.concat(prev_day_list).sort_values(by=['res_batter','month','day'])
            prev_days = prev_days.drop_duplicates(subset=['res_batter'],keep='last')
            
            max_games_played = prev_days.seasonal_game_played.max()
            
            leaders = prev_days[prev_days.seasonal_game_played >= 0.7 * max_games_played]
            leaders = leaders.sort_values(by=[strategies.get(strategy)], ascending=False)

            batter    = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_df_info = daily_game_info.get_group(game_dates[i])
            starting_lineup = set(np.unique(daily_df_info[interested_batters]))
            
            available_batters = set(starting_lineup) - set(np.unique(daily_df_info[['home_starting_pitcher','vis._starting_pitcher']]))
            
            while batter[0] not in available_batters:
                batter_rank = batter_rank +1
                batter = leaders.iloc[batter_rank - 1].res_batter.split()
            
            daily_hit_result = hit_checker_logs(daily_df, batter)

            if daily_hit_result =='continue_streak':
                pass
            elif daily_hit_result =='lose_streak':
                pass
            else:
                hit_counter = hit_counter + 1
                seasonal_hit_counter = seasonal_hit_counter +1

            total_counter = total_counter + 1
            seasonal_tot_counter = seasonal_tot_counter + 1

        print('Complete: ',strategy, ' Year:', year_)
                
        column_hit   = str(year_) +'_hit_counter'
        column_total = str(year_) +'_hit_total'
        percentage = str(year_) +'_percentage'
        seasonal_perf_df.loc[strategy, column_hit] = seasonal_hit_counter
        seasonal_perf_df.loc[strategy, column_total] = seasonal_tot_counter
        seasonal_perf_df.loc[strategy, percentage] = seasonal_hit_counter / seasonal_tot_counter
    
    
    results.loc[strategy, 'n_years'] = n_years
    results.loc[strategy, 'hit_counter'] = hit_counter #theres already a counter
    results.loc[strategy, 'total_counter'] = total_counter # theres already a counter
    results.loc[strategy, 'percentage'] = hit_counter / total_counter

print('COMPLETE')

Complete:  strategy_11  Year: 2005
Complete:  strategy_11  Year: 2006
Complete:  strategy_11  Year: 2007
Complete:  strategy_11  Year: 2008
Complete:  strategy_11  Year: 2009
Complete:  strategy_11  Year: 2010
Complete:  strategy_11  Year: 2011
Complete:  strategy_11  Year: 2012
Complete:  strategy_11  Year: 2013
Complete:  strategy_11  Year: 2014
Complete:  strategy_11  Year: 2015
Complete:  strategy_11  Year: 2016
Complete:  strategy_12  Year: 2005
Complete:  strategy_12  Year: 2006
Complete:  strategy_12  Year: 2007
Complete:  strategy_12  Year: 2008
Complete:  strategy_12  Year: 2009
Complete:  strategy_12  Year: 2010
Complete:  strategy_12  Year: 2011
Complete:  strategy_12  Year: 2012
Complete:  strategy_12  Year: 2013
Complete:  strategy_12  Year: 2014
Complete:  strategy_12  Year: 2015
Complete:  strategy_12  Year: 2016
Complete:  strategy_13  Year: 2005
Complete:  strategy_13  Year: 2006
Complete:  strategy_13  Year: 2007
Complete:  strategy_13  Year: 2008
Complete:  strategy_

In [6]:
results = results.reset_index()
results = results.rename(columns={'index':'strategy'})

In [7]:
results

Unnamed: 0,strategy,n_years,hit_counter,total_counter,percentage
0,strategy_11,12.0,1248.0,1805.0,0.691413
1,strategy_12,12.0,1260.0,1805.0,0.698061
2,strategy_13,12.0,1305.0,1805.0,0.722992
3,strategy_14,12.0,1291.0,1805.0,0.715235
4,strategy_15,12.0,1266.0,1805.0,0.701385
5,strategy_16,12.0,1261.0,1805.0,0.698615
6,strategy_17,12.0,1304.0,1805.0,0.722438
7,strategy_18,12.0,1296.0,1805.0,0.718006
8,strategy_19,12.0,1305.0,1805.0,0.722992
9,strategy_20,12.0,1335.0,1805.0,0.739612


In [8]:
seasonal_perf_df

Unnamed: 0,2005_hit_counter,2005_hit_total,2005_percentage,2006_hit_counter,2006_hit_total,2006_percentage,2007_hit_counter,2007_hit_total,2007_percentage,2008_hit_counter,...,2013_percentage,2014_hit_counter,2014_hit_total,2014_percentage,2015_hit_counter,2015_hit_total,2015_percentage,2016_hit_counter,2016_hit_total,2016_percentage
strategy_11,105.0,150.0,0.7,95.0,150.0,0.633333,110.0,151.0,0.728477,113.0,...,0.746667,107.0,151.0,0.708609,91.0,149.0,0.610738,103.0,149.0,0.691275
strategy_12,111.0,150.0,0.74,93.0,150.0,0.62,108.0,151.0,0.715232,111.0,...,0.733333,106.0,151.0,0.701987,95.0,149.0,0.637584,98.0,149.0,0.657718
strategy_13,107.0,150.0,0.713333,104.0,150.0,0.693333,114.0,151.0,0.754967,113.0,...,0.68,105.0,151.0,0.695364,102.0,149.0,0.684564,115.0,149.0,0.771812
strategy_14,107.0,150.0,0.713333,111.0,150.0,0.74,104.0,151.0,0.688742,113.0,...,0.686667,112.0,151.0,0.741722,109.0,149.0,0.731544,99.0,149.0,0.66443
strategy_15,105.0,150.0,0.7,110.0,150.0,0.733333,113.0,151.0,0.748344,108.0,...,0.7,90.0,151.0,0.596026,98.0,149.0,0.657718,106.0,149.0,0.711409
strategy_16,109.0,150.0,0.726667,103.0,150.0,0.686667,107.0,151.0,0.708609,107.0,...,0.666667,92.0,151.0,0.609272,107.0,149.0,0.718121,108.0,149.0,0.724832
strategy_17,113.0,150.0,0.753333,107.0,150.0,0.713333,117.0,151.0,0.774834,112.0,...,0.673333,89.0,151.0,0.589404,109.0,149.0,0.731544,107.0,149.0,0.718121
strategy_18,106.0,150.0,0.706667,101.0,150.0,0.673333,106.0,151.0,0.701987,107.0,...,0.713333,106.0,151.0,0.701987,108.0,149.0,0.724832,107.0,149.0,0.718121
strategy_19,113.0,150.0,0.753333,106.0,150.0,0.706667,111.0,151.0,0.735099,111.0,...,0.68,107.0,151.0,0.708609,103.0,149.0,0.691275,107.0,149.0,0.718121
strategy_20,111.0,150.0,0.74,116.0,150.0,0.773333,111.0,151.0,0.735099,119.0,...,0.713333,103.0,151.0,0.682119,110.0,149.0,0.738255,106.0,149.0,0.711409


In [9]:
seasonal_perf_df[['2008_hit_counter','2008_hit_total','2008_percentage','2009_hit_counter','2009_hit_total','2009_percentage','2010_hit_counter','2010_hit_total','2010_percentage','2011_hit_counter','2011_hit_total','2011_percentage','2012_hit_counter','2012_hit_total','2012_percentage','2013_hit_counter','2013_hit_total','2013_percentage']]

Unnamed: 0,2008_hit_counter,2008_hit_total,2008_percentage,2009_hit_counter,2009_hit_total,2009_percentage,2010_hit_counter,2010_hit_total,2010_percentage,2011_hit_counter,2011_hit_total,2011_percentage,2012_hit_counter,2012_hit_total,2012_percentage,2013_hit_counter,2013_hit_total,2013_percentage
strategy_11,113.0,154.0,0.733766,107.0,151.0,0.708609,98.0,150.0,0.653333,107.0,149.0,0.718121,100.0,151.0,0.662252,112.0,150.0,0.746667
strategy_12,111.0,154.0,0.720779,113.0,151.0,0.748344,100.0,150.0,0.666667,112.0,149.0,0.751678,103.0,151.0,0.682119,110.0,150.0,0.733333
strategy_13,113.0,154.0,0.733766,116.0,151.0,0.768212,107.0,150.0,0.713333,108.0,149.0,0.724832,112.0,151.0,0.741722,102.0,150.0,0.68
strategy_14,113.0,154.0,0.733766,115.0,151.0,0.761589,113.0,150.0,0.753333,103.0,149.0,0.691275,102.0,151.0,0.675497,103.0,150.0,0.686667
strategy_15,108.0,154.0,0.701299,108.0,151.0,0.715232,98.0,150.0,0.653333,117.0,149.0,0.785235,108.0,151.0,0.715232,105.0,150.0,0.7
strategy_16,107.0,154.0,0.694805,105.0,151.0,0.695364,101.0,150.0,0.673333,109.0,149.0,0.731544,113.0,151.0,0.748344,100.0,150.0,0.666667
strategy_17,112.0,154.0,0.727273,113.0,151.0,0.748344,106.0,150.0,0.706667,110.0,149.0,0.738255,120.0,151.0,0.794702,101.0,150.0,0.673333
strategy_18,107.0,154.0,0.694805,122.0,151.0,0.807947,107.0,150.0,0.713333,114.0,149.0,0.765101,105.0,151.0,0.695364,107.0,150.0,0.713333
strategy_19,111.0,154.0,0.720779,109.0,151.0,0.721854,116.0,150.0,0.773333,107.0,149.0,0.718121,113.0,151.0,0.748344,102.0,150.0,0.68
strategy_20,119.0,154.0,0.772727,113.0,151.0,0.748344,117.0,150.0,0.78,110.0,149.0,0.738255,112.0,151.0,0.741722,107.0,150.0,0.713333


In [10]:
results.to_csv('strat_11_22.csv',index=None)
seasonal_perf_df.to_csv('strat_11_22_seasons.csv')

In [11]:
#create function to track the best players and chosen number of times and based on metric
#create function that takes top 5 guys
# create function that would see streak???

In [12]:
import statsmodels.api as sm

results = results.sort_values(by=['percentage'],ascending=False)
results = results.reset_index(drop=True)

p_highest = results.iloc[0].loc['hit_counter']
n_highest = results.iloc[0].loc['total_counter']
z_score_list = [0]
p_value_list = [0]

for row in range(1, results.shape[0]):
    p = results.iloc[row].loc['hit_counter']
    n = results.iloc[row].loc['total_counter']
    zscore, pvalue = sm.stats.proportions_ztest([p, p_highest], [n, n_highest], alternative = 'two-sided', prop_var = False)
    z_score_list.append(zscore)
    p_value_list.append(pvalue)


z_score_series = pd.Series(z_score_list)   
p_value_series = pd.Series(p_value_list)

results['zscore'] = z_score_series
results['pvalue'] = p_value_series

results

Unnamed: 0,strategy,n_years,hit_counter,total_counter,percentage,zscore,pvalue
0,strategy_21,12.0,1378.0,1805.0,0.763435,0.0,0.0
1,strategy_22,12.0,1349.0,1805.0,0.747368,-1.122868,0.261494
2,strategy_20,12.0,1335.0,1805.0,0.739612,-1.656155,0.09769
3,strategy_13,12.0,1305.0,1805.0,0.722992,-2.781162,0.005416
4,strategy_19,12.0,1305.0,1805.0,0.722992,-2.781162,0.005416
5,strategy_17,12.0,1304.0,1805.0,0.722438,-2.818266,0.004828
6,strategy_18,12.0,1296.0,1805.0,0.718006,-3.114217,0.001844
7,strategy_14,12.0,1291.0,1805.0,0.715235,-3.298403,0.000972
8,strategy_15,12.0,1266.0,1805.0,0.701385,-4.210681,2.5e-05
9,strategy_16,12.0,1261.0,1805.0,0.698615,-4.391472,1.1e-05


For now we are just going to save the results, and then combine and compare them to the other strategies. 

## How the loop works:

It loops through every strategy contained in a dictionary named 'strategies'
<br><br> Once it picks a strategy, it will start looping throughout the years 1 by 1
<br><br> Once it picks a year, it will go through every game starting from the 15th day of the season, it will calculate who's the current leader given the specific metric (metric in strategies dictionary)
<br><br>  Once it calculates the current leader (only considers results from previous days) it will load the current day's results and check if the chosen player got a hit or not. If this player did not play or did not get an AB it will verify with the next 'leader'

<br> It loops through every day of the regular season for a given year trying out a specific strategy. Once a season is done, it will go to the next season/year, and re-try the strategy. once all the seasons have been exhausted it will restart at the first year with the next strategy.

END

Henry