In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**Local:**

In [2]:
import constants

In [3]:
# Run the pandas configuration helper from the local `constants` module
# (presumably display options such as float format / max rows — confirm in constants.py).
constants.pandas_settings()

In [4]:
def load_fanduel_data():
    """Load the FanDuel contest export for the current tournament.

    Reads ``../data/contest-files/<constants.tournament_fname()>.csv`` keeping
    only ``constants.keep_cols``, lower-cases the column names and renames
    ``nickname`` to ``name``.

    Rows are kept only when the injury indicator is not ``'O'`` and the salary
    is strictly above 7000. The injury column is then dropped, rows with any
    remaining missing value are removed and the index is renumbered from 0.

    Returns:
        pandas.DataFrame with ``played`` and ``salary`` cast to ``int``.
    """
    csv_path = f'../data/contest-files/{constants.tournament_fname()}.csv'
    raw = pd.read_csv(csv_path, usecols=constants.keep_cols)
    raw.columns = raw.columns.str.lower()

    # Build the row mask up front; a missing indicator compares unequal to 'O',
    # so players without an injury flag are kept.
    not_flagged_out = raw['injury indicator'] != 'O'
    above_salary_floor = raw['salary'] > 7000

    players = raw.rename({'nickname': 'name'}, axis=1)
    players = players.loc[not_flagged_out & above_salary_floor]
    # Drop the indicator *before* dropna so its NaNs do not remove healthy players.
    players = players.drop('injury indicator', axis=1)
    players = players.dropna().reset_index(drop=True)

    players = players.astype({'played': 'int', 'salary': 'int'})

    return players

In [5]:
# Preview the first `constants.display_num` rows of the filtered FanDuel player pool.
load_fanduel_data().head(constants.display_num)

Unnamed: 0,name,fppg,played,salary
0,Scottie Scheffler,95.18,19,12000
1,Jon Rahm,82.55,13,11900
2,Justin Thomas,99.59,16,11800
3,Xander Schauffele,84.79,15,11700
4,Matt Fitzpatrick,71.89,15,11600
5,Collin Morikawa,80.39,13,11400
6,Jordan Spieth,70.1,17,11300
7,Will Zalatoris,73.02,16,11200
8,Patrick Cantlay,86.95,12,11100
9,Cameron Smith,82.49,13,11000


In [6]:
# Metadata for each strokes-gained stat page on pgatour.com, keyed by the
# component name used throughout this notebook:
#   url_id    - numeric id embedded in the stat page URL
#   url       - full stat page URL
#   shortened - short prefix used when naming the resulting columns
strokes_gained_components = {
    component: {
        'url_id': stat_id,
        'url': f'https://www.pgatour.com/stats/stat.0{stat_id}.html',
        'shortened': short_name,
    }
    for component, stat_id, short_name in (
        ('tee', 2567, 'ott'),
        ('approach', 2568, 'app'),
        ('around', 2569, 'arg'),
        ('green', 2564, 'putt'),
        ('tee-to-green', 2674, 'ttg'),
    )
}

# Component name -> stat page URL (same keys and order as above).
html_dict = {
    component: details['url']
    for component, details in strokes_gained_components.items()
}

# Scraped (lower-cased) column name -> notebook column name. The leading space
# is a placeholder that load_shots_gained later replaces with "<shortened>-".
new_col_names = dict([
    ('player name', 'name'),
    ('rank this week', ' cur-rank'),
    ('rank last week', ' prev-rank'),
    ('average', ' sg'),
    ('rounds', ' num-rounds'),
    ('measured rounds', ' num-measured'),
])

# Columns retained when an abbreviated strokes-gained table is requested.
abbrev_col_names = ['name', ' sg']

In [7]:
def load_shots_gained(golf_shot, abbreviate):
    """Scrape one strokes-gained table from pgatour.com.

    Args:
        golf_shot: Component name, matched case-insensitively against the keys
            of ``strokes_gained_components`` (e.g. ``'tee'``, ``'Approach'``).
        abbreviate: When truthy, keep only the columns in ``abbrev_col_names``
            (player name and the strokes-gained average).

    Returns:
        pandas.DataFrame whose stat columns are prefixed with the component's
        ``shortened`` name (e.g. ``'ott-sg'``), or ``None`` when ``golf_shot``
        is not a known component.
    """
    # Normalise once and reuse it for every lookup; previously the final
    # column-prefix lookup used the raw argument and raised KeyError for
    # inputs such as 'Tee' that had passed the lower-cased membership check.
    shot_key = golf_shot.lower()
    info = strokes_gained_components.get(shot_key)
    if info is None:
        return None

    url = f'https://www.pgatour.com/stats/stat.0{ info["url_id"] }.html'

    # The stats table is taken from position 1 of the tables found on the page.
    ret = pd.read_html(url)[1].reset_index(drop=True)

    # Literal (non-regex) replacements: shorten the "total sg:" header and turn
    # non-breaking spaces into plain spaces so the rename keys below match.
    ret.columns = (ret.columns
                   .str.lower()
                   .str.replace('total sg:', ' sg', regex=False)
                   .str.replace('\xa0', ' ', regex=False)
                  )

    ret = ret.rename(new_col_names, axis=1)

    if abbreviate:
        ret = ret.loc[:, abbrev_col_names]

    # Every space in a column name becomes the component prefix,
    # e.g. ' sg' -> 'ott-sg'.
    ret.columns = ret.columns.str.replace(' ', f'{info["shortened"]}-', regex=False)

    return ret

In [8]:
# def load_shots_gained(golf_shot, abbreviate=False):
    
#     if golf_shot.lower() not in html_dict:
#         return None
    
#     else:
#         url=html_dict.get(golf_shot.lower(), None)
#         if url is None:
#             return None
#         else:
#             ret = pd.read_html(url)[1].reset_index(drop=True)
            
#             ret.columns = ret.columns.str.lower().str.replace('total sg:', ' sg').str.replace('\xa0', ' ')
            
#             ret = ret.rename(new_col_names, axis=1)
#             # ret.index = ret['name']
#             # ret = ret.drop('name', axis=1)
#             if abbreviate:
#                 ret = ret.loc[:, abbrev_col_names]
                
            
#             ret.columns = ret.columns.str.replace(' ', f'{golf_shot}-')
            
            
            
#             return(ret)

In [9]:
def compile_strokes_gained_data(abbreviate=True):
    """Combine every strokes-gained table into one frame indexed by player name.

    One table is loaded per key of ``html_dict`` via ``load_shots_gained``.
    Starting from the ``'tee'`` table, the remaining tables are merged in one
    after another using pandas' default merge (inner join on shared columns).

    Args:
        abbreviate: Forwarded to ``load_shots_gained``.

    Returns:
        pandas.DataFrame indexed by ``name`` and sorted by ``ott-sg`` then
        ``putt-sg``, both descending.
    """
    frames_by_shot = {}
    for shot in html_dict:
        frames_by_shot[shot] = load_shots_gained(shot, abbreviate=abbreviate)

    merged = frames_by_shot['tee']
    # The first key of html_dict is skipped because 'tee' seeds the merge.
    for shot in list(html_dict)[1:]:
        merged = merged.merge(frames_by_shot[shot])

    ranked = (merged
              .set_index('name')
              .sort_values(by=['ott-sg', 'putt-sg'], ascending=False)
             )

    return ranked

In [10]:
def combine_with_fanduel():
    """Attach the focus strokes-gained stats to the FanDuel player pool.

    For ``constants.focus_stat`` (and ``constants.focus_stat_2`` when it is not
    ``None``) two columns are added: the stat itself, using 0.0 for players
    absent from the strokes-gained data, and ``<stat>-per-10k``, the stat per
    10,000 of salary.

    Returns:
        pandas.DataFrame sorted by the first focus stat's per-10k value
        (descending), with ``name`` as str, ``salary`` as int, every other
        column as float, and rows containing missing values removed.
    """
    players = load_fanduel_data()
    strokes_gained = compile_strokes_gained_data()

    if constants.focus_stat_2 is None:
        focus_stats = (constants.focus_stat,)
    else:
        focus_stats = (constants.focus_stat, constants.focus_stat_2)

    for stat in focus_stats:
        players[stat] = players['name'].apply(
            lambda player: strokes_gained.loc[player, stat] if player in strokes_gained.index else 0.0
        )
        per_10k = 10000 * players[stat] / players['salary']
        players[f'{stat}-per-10k'] = np.array(per_10k)

    players = players.sort_values(by=[f'{focus_stats[0]}-per-10k'], ascending=False)

    # Everything except these two columns is treated as a float.
    special_dtypes = {'name': 'str', 'salary': 'int'}
    players = players.astype(
        {column: special_dtypes.get(column, 'float') for column in players.columns}
    )

    return players.dropna()

In [11]:
# Show up to the first 50 players, ordered by the primary focus stat per 10k of salary.
combine_with_fanduel().head(50)

Unnamed: 0,name,fppg,played,salary,ttg-sg,ttg-sg-per-10k,putt-sg,putt-sg-per-10k
59,Luke List,57.72,22.0,8400,1.56,1.86,-0.78,-0.93
7,Will Zalatoris,73.02,16.0,11200,1.83,1.63,-0.04,-0.04
4,Matt Fitzpatrick,71.89,15.0,11600,1.87,1.61,0.39,0.34
35,Chris Kirk,66.39,19.0,9100,1.33,1.46,-0.02,-0.02
2,Justin Thomas,99.59,16.0,11800,1.72,1.45,0.22,0.19
1,Jon Rahm,82.55,13.0,11900,1.6,1.35,0.2,0.17
26,Mito Pereira,66.45,22.0,9600,1.23,1.28,0.06,0.07
16,Joaquin Niemann,72.85,17.0,10300,1.31,1.27,-0.01,-0.01
14,Sungjae Im,78.49,17.0,10500,1.31,1.25,0.16,0.15
20,Keegan Bradley,69.26,18.0,9900,1.23,1.25,0.1,0.11


In [12]:
import itertools
from itertools import combinations

from functools import cache
from tqdm.notebook import tqdm

from pandarallel import pandarallel
pandarallel.initialize(use_memory_fs=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [13]:
# Minimum salary a player needs to remain in the lineup-building pool (applied by `filters`).
MIN_SALARY = 9000

In [14]:
def filters(df, min_salary=None):
    """Return the rows of ``df`` whose salary meets the minimum threshold.

    Args:
        df: DataFrame with a numeric ``salary`` column. It is not modified.
        min_salary: Inclusive lower bound on ``salary``. Defaults to the
            module-level ``MIN_SALARY`` when omitted, which matches the
            previous behaviour of this function.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    threshold = MIN_SALARY if min_salary is None else min_salary

    # Boolean-mask selection followed by reset_index already yields an
    # independent frame, so no explicit deep copy of the input is required.
    return df.loc[df['salary'] >= threshold].reset_index(drop=True)

In [20]:
# Player pool: FanDuel data joined with the focus stats, then salary-filtered.
data = filters(combine_with_fanduel())

# Player names in pool order; create_lineups slices this list into slates.
pnames = data['name'].tolist()

# Work in units of 100 salary so lineup totals can be tested against cost_range.
data['salary'] /= 100

# Index by player name so get_value can look values up with data.loc[name, column].
data = data.set_index('name')

# Columns summed over the six players of each lineup.
cols_to_sum = [
    'salary',
    constants.focus_stat,
    constants.focus_stat_2,
    f'{constants.focus_stat}-per-10k',
    f'{constants.focus_stat_2}-per-10k',
]

# Accepted lineup totals in scaled units: 595..600, i.e. 59,500..60,000 of raw salary.
cost_range = range(595, 601)

@cache
def get_value(name, column):
    """Return the ``column`` value for player ``name`` from the module-level ``data`` frame (memoised)."""
    return data.loc[name, column]

@cache
def sum_values(names, column):
    """Return the total of ``column`` across the players in ``names`` (memoised).

    ``names`` has to be hashable (e.g. a tuple) because the result is cached.
    """
    return sum(get_value(player, column) for player in names)

@cache
def is_valid_lineup(lineup):
    """Return True when ``lineup`` has an accepted total salary and six distinct players (memoised).

    The salary total is checked against the module-level ``cost_range``.
    """
    if sum_values(lineup, 'salary') not in cost_range:
        return False
    return len(set(lineup)) == 6

@cache
def lineup_analysis(lineup):
    """Return a tuple with the lineup total of every column in ``cols_to_sum`` (memoised).

    Players are de-duplicated before summing.
    """
    unique_players = tuple(set(lineup))
    return tuple(sum_values(unique_players, column) for column in cols_to_sum)

def lineup_analysis_wrapper(lineup):
    """Row-wise adapter around ``lineup_analysis`` for DataFrame ``apply``.

    Args:
        lineup: A row (pandas Series) holding the player names of one lineup.

    Returns:
        The ``lineup_analysis`` totals for a valid lineup, otherwise a tuple of
        ``0.0`` with one entry per column in ``cols_to_sum``.
    """
    # The cached helpers need a hashable argument, hence the tuple.
    players = tuple(set(lineup.to_numpy()))
    if is_valid_lineup(players):
        return lineup_analysis(players)
    return (0.0,) * len(cols_to_sum)

def create_lineup_2_slices(slate_dict):
    """Build six-player lineups by pairing one trio from each of two slates.

    Args:
        slate_dict: Mapping whose values are collections of three-player
            combinations; the first two values are paired with each other.

    Returns:
        Tuple of ``(g1, ..., g6)`` lineups that pass ``is_valid_lineup``. Each
        half of a lineup is sorted alphabetically.
    """
    # The full product is materialised so tqdm knows the total for its progress bar.
    candidate_pairs = list(itertools.product(*slate_dict.values()))

    valid_lineups = []
    for pair in tqdm(candidate_pairs):
        g1, g2, g3 = sorted(pair[0])
        g4, g5, g6 = sorted(pair[1])

        lineup = (g1, g2, g3, g4, g5, g6)
        if is_valid_lineup(lineup):
            valid_lineups.append(lineup)

    return tuple(valid_lineups)

def create_lineup_3_slices(slate_dict):
    """Placeholder for the three-slate variant (three pairs of two players).

    Not implemented yet: ``slate_dict`` is ignored and ``None`` is returned.
    """
    return None

# Trying to get better about only passing tuples or other completely immutable for default and for cache
def create_lineups(slices = (2,)):
    """Generate candidate lineups, score them and save them to CSV.

    ``pnames`` is split into ``slices[0]`` consecutive slates. Every slate
    contributes ``6 / slices[0]`` players to a lineup, and the matching builder
    combines the slates into six-player lineups. The column totals from
    ``cols_to_sum`` are then computed per lineup in parallel, the mean of the
    two focus-stat per-10k totals is stored as ``avg_value``, and the result is
    written, best first and de-duplicated, to
    ``../data/lineups-created/<constants.tournament_fname()>.csv``.

    Args:
        slices: Tuple whose first element is the number of slates (2 or 3).

    Returns:
        None. The result is the CSV file; ``'Done...'`` is printed at the end.

    Raises:
        ValueError: If the slice count has no registered builder.
        NotImplementedError: If the selected builder returns ``None`` (the
            three-slice builder is currently a stub).
    """
    num_players = 6  # (n)
    num_slices = slices[0]

    # Store the builder *functions* and call only the selected one. Building
    # the mapping with already-evaluated calls ran both builders on every
    # invocation and made slices=(3,) fail inside the two-slice builder.
    builders = {2: create_lineup_2_slices, 3: create_lineup_3_slices}
    if num_slices not in builders:
        raise ValueError(
            f'Unsupported number of slices: {num_slices!r} (expected one of {sorted(builders)})'
        )

    step = len(pnames) // num_slices
    r = num_players // num_slices  # (nCr)

    # Slate i covers pnames[i*step:(i+1)*step]; the last slate also absorbs
    # any remainder at the end of the list.
    bounds = [i * step for i in range(num_slices)] + [len(pnames)]
    slates = {
        f'slate{i + 1}': tuple(itertools.combinations(pnames[bounds[i]:bounds[i + 1]], r))
        for i in range(num_slices)
    }

    lineups = builders[num_slices](slates)
    if lineups is None:
        raise NotImplementedError(
            f'Lineup generation for {num_slices} slices is not implemented yet'
        )

    ret = pd.DataFrame(lineups, columns=['g1','g2','g3','g4','g5','g6'])

    ret[cols_to_sum] = ret.parallel_apply( lineup_analysis_wrapper, axis=1, result_type='expand' )

    # NOTE(review): this needs both focus stats; if constants.focus_stat_2 is
    # None the second per-10k column will not exist — confirm that this is
    # never the case when lineups are created.
    ret['avg_value'] = (
        ret[f'{constants.focus_stat}-per-10k'] + ret[f'{constants.focus_stat_2}-per-10k']
    ) / 2

    ret = (ret
           .sort_values(by='avg_value', ascending=False)
           .drop_duplicates()
           .reset_index(drop=True)
          )

    ret.to_csv(f'../data/lineups-created/{constants.tournament_fname()}.csv', index=False)

    print('Done...')

    return None

def output_lineups(top_num=100):
    """Read the saved lineups CSV for the current tournament and return its first ``top_num`` rows."""
    saved_path = f'../data/lineups-created/{constants.tournament_fname()}.csv'
    saved_lineups = pd.read_csv(saved_path)
    return saved_lineups.head(top_num)
    

In [21]:
# if constants.create:
#     create_lineups()

# output_lineups()

In [22]:
def create_func():
    """Regenerate the lineups when ``constants.create`` is truthy, then return the saved top lineups."""
    should_rebuild = constants.create
    if should_rebuild:
        create_lineups()

    return output_lineups()

In [23]:
# Rebuild the lineups if `constants.create` is set, then display the top saved lineups.
create_func()

  0%|          | 0/1104660 [00:00<?, ?it/s]

Done...


Unnamed: 0,g1,g2,g3,g4,g5,g6,salary,ttg-sg,putt-sg,ttg-sg-per-10k,putt-sg-per-10k,avg_value
0,Chris Kirk,Matt Fitzpatrick,Max Homa,Billy Horschel,Keith Mitchell,Matt Kuchar,595.0,6.36,2.25,6.29,2.26,4.28
1,Cameron Young,Chris Kirk,Matt Fitzpatrick,Keith Mitchell,Matt Kuchar,Patrick Cantlay,600.0,6.56,2.11,6.44,2.1,4.27
2,Chris Kirk,Matt Fitzpatrick,Mito Pereira,Keith Mitchell,Matt Kuchar,Patrick Cantlay,598.0,6.68,1.95,6.6,1.94,4.27
3,Chris Kirk,Matt Fitzpatrick,Sungjae Im,Billy Horschel,Keith Mitchell,Matt Kuchar,600.0,6.53,2.08,6.41,2.09,4.25
4,Chris Kirk,Matt Fitzpatrick,Max Homa,Billy Horschel,Keith Mitchell,Marc Leishman,598.0,6.41,2.15,6.34,2.14,4.24
5,Matt Fitzpatrick,Max Homa,Mito Pereira,Billy Horschel,Keith Mitchell,Matt Kuchar,600.0,6.25,2.33,6.11,2.35,4.23
6,Chris Kirk,Matt Fitzpatrick,Max Homa,Billy Horschel,Gary Woodland,Keith Mitchell,597.0,6.78,1.74,6.74,1.7,4.22
7,Chris Kirk,Matt Fitzpatrick,Max Homa,Billy Horschel,Christiaan Bezuidenhout,Keith Mitchell,595.0,6.45,2.05,6.4,2.04,4.22
8,Cameron Young,Chris Kirk,Matt Fitzpatrick,Christiaan Bezuidenhout,Keith Mitchell,Patrick Cantlay,600.0,6.65,1.91,6.55,1.88,4.21
9,Chris Kirk,Matt Fitzpatrick,Mito Pereira,Gary Woodland,Keith Mitchell,Patrick Cantlay,600.0,7.1,1.44,7.05,1.37,4.21
