In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**Local:**

In [2]:
import constants
import constraints

from datagolf import datagolf

**Optimizer: (bottom)**

In [3]:
import itertools
from itertools import combinations

from functools import cache
from tqdm.notebook import tqdm

from pandarallel import pandarallel
pandarallel.initialize(use_memory_fs=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
/dev/shm is 40B   =   4e-08KiB     


In [4]:
# Apply the pandas settings defined in the local `constants` module.
# NOTE(review): presumably display options only -- confirm in constants.py.
constants.pandas_settings()

In [5]:
def edit_fanduel():
    """Clean the FanDuel contest export and cache it as a pickle.

    Reads ``../data/contest-files/<constants.tournament>.csv`` (restricted to
    ``constants.keep_cols``), lower-cases the headers, renames ``nickname`` to
    ``name``, drops the ``injury indicator`` column and every row containing a
    missing value, then casts ``name`` to str, ``fppg`` to float and all other
    columns to int.

    The cleaned frame is written to ``../data/pickle-buffer/fanduel-data.pkl``.
    Nothing is returned.
    """
    csv_path = f'../data/contest-files/{constants.tournament}.csv'
    raw = pd.read_csv(csv_path, usecols=constants.keep_cols)
    raw.columns = raw.columns.str.lower()

    cleaned = raw.rename({'nickname': 'name'}, axis=1)
    cleaned = cleaned.drop('injury indicator', axis=1)
    cleaned = cleaned.dropna().reset_index(drop=True)

    # Columns without an explicit entry here are cast to int.
    explicit_types = {'name': 'str', 'fppg': 'float'}
    for column in cleaned.columns:
        target_type = explicit_types.get(column, 'int')
        cleaned[column] = cleaned[column].astype(target_type)

    cleaned.to_pickle('../data/pickle-buffer/fanduel-data.pkl')

    return None


def load_fanduel():
    """Regenerate the FanDuel pickle with ``edit_fanduel`` and return it as a DataFrame."""
    edit_fanduel()
    fanduel_frame = pd.read_pickle('../data/pickle-buffer/fanduel-data.pkl')
    return fanduel_frame

In [6]:
# One row per strokes-gained component:
#   (component key, pgatour.com stat id, abbreviation used as column/file prefix)
# The stat id is spliced into https://www.pgatour.com/stats/stat.0<id>.html.
# 'tee' must stay first: aggregate_strokes_gained() seeds its merge with the
# 'tee' frame and then merges every key after the first one.
_SG_COMPONENT_ROWS = (
    ('tee', 2567, 'ott'),
    ('approach', 2568, 'app'),
    ('around', 2569, 'arg'),
    ('green', 2564, 'putt'),
    ('tee-to-green', 2674, 'ttg'),
)

strokes_gained_components = {
    shot: {'url_id': stat_id, 'shortened': abbreviation}
    for shot, stat_id, abbreviation in _SG_COMPONENT_ROWS
}

# Scraped column header -> working name. Every stat column deliberately starts
# with a space: strokes_gained_per() later replaces spaces with
# '<abbreviation>-', so ' sg' becomes e.g. 'ott-sg' while 'name' is untouched.
new_col_names = {
    'player name': 'name',
    'rank this week': ' cur-rank',
    'rank last week': ' prev-rank',
    'average': ' sg',
    'rounds': ' num-rounds',
    'measured rounds': ' num-measured',
}

# Columns kept when strokes_gained_per() is called with abbreviate=True.
abbrev_col_names = ['name', ' sg']

In [7]:
def strokes_gained_per(golf_shot, abbreviate=True):
    """Scrape one strokes-gained table from pgatour.com and pickle it.

    Parameters
    ----------
    golf_shot : str
        Key of ``strokes_gained_components``; matched case-insensitively.
    abbreviate : bool, default True
        When true, keep only the columns listed in ``abbrev_col_names``.

    Returns
    -------
    None
        The frame is written to ``../data/pickle-buffer/<shortened>-sg.pkl``.
        An unknown ``golf_shot`` is ignored: nothing is fetched or written.
    """
    # Normalise the key once and reuse the looked-up entry everywhere below.
    # Previously the guard used the lower-cased key but later lookups used the
    # raw argument, so e.g. 'Tee' passed the guard and then raised KeyError.
    info = strokes_gained_components.get(golf_shot.lower())
    if info is None:
        return None

    prefix = info["shortened"]
    url = f'https://www.pgatour.com/stats/stat.0{ info["url_id"] }.html'

    # The table at index 1 of the parsed page is the one used.
    ret = pd.read_html(url)[1].reset_index(drop=True)

    # Normalise headers: lower-case, shorten 'total sg:' and turn
    # non-breaking spaces into plain ones so the rename map can match.
    ret.columns = (ret.columns
                   .str.lower()
                   .str.replace('total sg:', ' sg')
                   .str.replace('\xa0', ' ')
                  )

    ret = ret.rename(new_col_names, axis=1)

    if abbreviate:
        ret = ret.loc[:, abbrev_col_names]

    # Every space in a header becomes '<prefix>-'; for the renamed columns
    # (which start with a single space) this yields e.g. 'ott-sg'.
    ret.columns = ret.columns.str.replace(' ', f'{prefix}-')

    ret.to_pickle(f'../data/pickle-buffer/{prefix}-sg.pkl')

    return None
        
def load_strokes_gained_per(golf_shot):
    """Refresh and return the strokes-gained frame for one component.

    Parameters
    ----------
    golf_shot : str
        Key of ``strokes_gained_components``; matched case-insensitively, in
        line with ``strokes_gained_per``.

    Returns
    -------
    pandas.DataFrame
        The frame that ``strokes_gained_per`` just wrote to
        ``../data/pickle-buffer/<shortened>-sg.pkl``.

    Raises
    ------
    KeyError
        If ``golf_shot`` is not a known component. This is raised before any
        scraping, rather than after the scraper has silently returned.
    """
    shot_key = golf_shot.lower()
    if shot_key not in strokes_gained_components:
        raise KeyError(
            f'unknown strokes-gained component {golf_shot!r}; '
            f'expected one of {list(strokes_gained_components)}'
        )

    # Re-scrape so the pickle read below is current.
    strokes_gained_per(shot_key)

    shortened = strokes_gained_components[shot_key]["shortened"]
    return pd.read_pickle(f'../data/pickle-buffer/{shortened}-sg.pkl')

In [8]:
def aggregate_strokes_gained():
    """Merge all strokes-gained component frames into one and pickle it.

    Each component in ``strokes_gained_components`` is loaded through
    ``load_strokes_gained_per``. The 'tee' frame is the starting point and
    every component after the first key is merged onto it. The merge passes no
    explicit key, so pandas joins on the columns the two frames share.

    The merged frame is written to
    ``../data/pickle-buffer/strokes-gained.pkl``. Nothing is returned.
    """
    # One frame per component, keyed exactly like strokes_gained_components.
    frames_by_shot = {}
    for shot in strokes_gained_components:
        frames_by_shot[shot] = load_strokes_gained_per(shot)

    # Seed with the tee frame, then fold in everything after the first key.
    remaining_shots = list(strokes_gained_components.keys())[1:]
    combined = frames_by_shot['tee']
    for shot in remaining_shots:
        combined = combined.merge(frames_by_shot[shot])

    combined = combined.reset_index(drop=True)
    combined.to_pickle(f'../data/pickle-buffer/strokes-gained.pkl')

    return None

def load_strokes_gained():
    """Rebuild the merged strokes-gained pickle and return it as a DataFrame."""
    aggregate_strokes_gained()
    merged_frame = pd.read_pickle('../data/pickle-buffer/strokes-gained.pkl')
    return merged_frame

In [9]:
def combine_pga_fanduel():
    """Attach the configured strokes-gained stats to the FanDuel player pool.

    For each focus stat (``constants.focus_stat`` plus the optional
    ``focus_stat_2`` / ``focus_stat_3``) two columns are added to the FanDuel
    frame: the stat itself (0.0 for players missing from the strokes-gained
    table) and ``<stat>-per-10k`` (the stat per 10,000 of salary).

    The result is sorted by ``constants.focus_stat`` descending, rows with
    missing values are dropped, and the frame is written to
    ``../data/pickle-buffer/<constants.tournament>.pkl``. Nothing is returned.
    """
    fd = load_fanduel()

    # Load the strokes-gained table exactly once. Every call to
    # load_strokes_gained() re-scrapes and re-pickles all components, and the
    # previous code called it twice while never using the first result.
    sg_lookup = load_strokes_gained().set_index('name')

    focus_stats = [constants.focus_stat]
    # A third stat is only considered when a second one is configured.
    if constants.focus_stat_2 is not None:
        focus_stats.append(constants.focus_stat_2)
        if constants.focus_stat_3 is not None:
            focus_stats.append(constants.focus_stat_3)

    for sg_col in focus_stats:
        # Players absent from the strokes-gained table get 0.0.
        fd[sg_col] = fd['name'].apply(
            lambda x: sg_lookup.loc[x, sg_col] if x in sg_lookup.index else 0.0
        )
        fd[f'{sg_col}-per-10k'] = np.array(10000 * fd[sg_col] / fd['salary'])

    fd = (fd
          .sort_values(by=[constants.focus_stat], ascending=False)
          .dropna()
         )

    fd.to_pickle(f'../data/pickle-buffer/{constants.tournament}.pkl')

    return None

In [10]:
def add_constraints():
    """Apply the salary floor to the combined player pool and pickle it.

    Rebuilds ``../data/pickle-buffer/<constants.tournament>.pkl`` through
    ``combine_pga_fanduel``, keeps only players whose salary is at least
    ``constraints.min_salary`` (skipped when that is None), and writes the
    result to ``../data/pickle-buffer/optimizer-data.pkl``. Nothing is
    returned.
    """
    combine_pga_fanduel()
    pool = pd.read_pickle(f'../data/pickle-buffer/{constants.tournament}.pkl')

    salary_floor = constraints.min_salary
    if salary_floor is not None:
        print(f'Excluding players less than ${constraints.min_salary}...')
        meets_floor = pool['salary'] >= salary_floor
        pool = pool.loc[meets_floor].reset_index(drop=True)

    pool.to_pickle(f'../data/pickle-buffer/optimizer-data.pkl')

    return None

In [11]:
def prepare_input():
    """Build the optimizer's lookup table and return the player names.

    Rebuilds ``optimizer-data.pkl`` through ``add_constraints``, adds a
    ``proj-pts`` column by applying ``datagolf.proj_pts`` to each name,
    divides ``salary`` by 100, moves ``name`` into the index and writes the
    frame to ``../data/pickle-buffer/optimizer-data-clean.pkl``.

    Returns
    -------
    list
        Player names in the row order of the constrained pool.
    """
    add_constraints()
    pool = pd.read_pickle(f'../data/pickle-buffer/optimizer-data.pkl')

    # Capture the names before the column is moved into the index.
    player_names = pool['name'].values.tolist()

    pool['proj-pts'] = pool['name'].apply(datagolf.proj_pts)

    pool['salary'] /= 100
    pool = pool.set_index('name')

    pool.to_pickle(f'../data/pickle-buffer/optimizer-data-clean.pkl')

    return player_names
    

```python
from numba import vectorize

@vectorize(['float32(float32, float32)'], target='cuda')
def Multi(a, b):
    return a*b
```

In [12]:
# Module-level state shared by the helpers below.
# prepare_input() must run first: it writes the pickle that is read on the
# next line and returns the list of player names.
pnames = prepare_input()
# Lookup table indexed by player name; read by get_value().
data = pd.read_pickle(f'../data/pickle-buffer/optimizer-data-clean.pkl')

@cache
def get_value(name, column):
    """Return the ``column`` entry for player ``name`` from the module-level ``data`` frame.

    Results are memoised per ``(name, column)`` pair.
    """
    cell = data.loc[name, column]
    return cell

@cache
def sum_values(names, column):
    """Add up ``column`` over every player in ``names``.

    ``names`` has to be hashable (e.g. a tuple) because the call is memoised.
    """
    total = 0
    for player in names:
        total = total + get_value(player, column)
    return total

@cache
def is_valid_lineup(lineup):
    """Tell whether ``lineup`` is affordable and made of six distinct players.

    A lineup is valid when its summed ``salary`` is contained in
    ``constraints.cost_range`` and it holds exactly six different names.
    ``lineup`` has to be hashable because the call is memoised.
    """
    total_salary = sum_values(lineup, 'salary')
    if total_salary not in constraints.cost_range:
        return False
    return len(set(lineup)) == 6

@cache
def lineup_analysis(lineup):
    """Return one total per column in ``constraints.cols_to_sum`` for ``lineup``.

    Duplicate names are collapsed before summing. The result is a tuple in the
    same order as ``constraints.cols_to_sum``.
    """
    distinct_players = tuple(set(lineup))
    totals = []
    for column in constraints.cols_to_sum:
        totals.append(sum_values(distinct_players, column))
    return tuple(totals)

def lineup_analysis_wrapper(lineup):
    """Row-wise adapter around ``lineup_analysis`` for DataFrame ``apply``.

    ``lineup`` is one row of player names. The names are de-duplicated into a
    hashable tuple. Valid lineups yield their column totals, and anything else
    yields a tuple of 0.0 with one entry per column in
    ``constraints.cols_to_sum``.
    """
    distinct_players = tuple(set(lineup.to_numpy()))
    if is_valid_lineup(distinct_players):
        return lineup_analysis(distinct_players)
    return (0.0,) * len(constraints.cols_to_sum)

def create_lineup_2_slices(slate_dict):
    """Build every valid 6-player lineup from two slates of 3-player groups.

    Parameters
    ----------
    slate_dict : dict
        Maps slate labels to sequences of player groups. The groups of the
        first two slates are paired up, and each group must contain exactly
        three names.

    Returns
    -------
    tuple of tuple
        Every ``(g1, ..., g6)`` combination accepted by ``is_valid_lineup``.
        Names are sorted within each half.
    """
    slates = tuple(tuple(slate) for slate in slate_dict.values())

    # Number of candidates, so tqdm can show progress while the cartesian
    # product is streamed. Previously the whole product was first copied into
    # a list, which holds tens of millions of tuples in memory at once.
    n_candidates = 1
    for slate in slates:
        n_candidates *= len(slate)

    ret_list = list()

    for half_slates in tqdm(itertools.product(*slates), total=n_candidates):

        g1, g2, g3 = sorted(half_slates[0])
        g4, g5, g6 = sorted(half_slates[1])

        lu = (g1, g2, g3, g4, g5, g6)
        if is_valid_lineup(lu):
            ret_list.append(lu)

    return tuple(ret_list)

def create_lineup_3_slices(slate_dict):
    """Build every valid 6-player lineup from three slates of 2-player groups.

    Parameters
    ----------
    slate_dict : dict
        Maps slate labels to sequences of player groups. The groups of the
        first three slates are combined, and each group must contain exactly
        two names.

    Returns
    -------
    tuple of tuple
        Every ``(g1, ..., g6)`` combination accepted by ``is_valid_lineup``.
        Names are sorted within each pair.
    """
    slates = tuple(tuple(slate) for slate in slate_dict.values())

    # Number of candidates, so tqdm can show progress while the cartesian
    # product is streamed. Previously the whole product was first copied into
    # a list, which holds tens of millions of tuples in memory at once.
    n_candidates = 1
    for slate in slates:
        n_candidates *= len(slate)

    ret_list = list()

    for third_slates in tqdm(itertools.product(*slates), total=n_candidates):

        g1, g2 = sorted(third_slates[0])
        g3, g4 = sorted(third_slates[1])
        g5, g6 = sorted(third_slates[2])

        lu = (g1, g2, g3, g4, g5, g6)
        if is_valid_lineup(lu):
            ret_list.append(lu)

    return tuple(ret_list)

# Trying to get better about only passing tuples (or other fully immutable
# values) around, both for defaults and so that @cache can hash them.
def create_lineups():
    """Enumerate valid lineups, total their stats and pickle the ranked table.

    The module-level ``pnames`` list is cut into ``constraints.slices``
    contiguous partitions (the last one absorbs any remainder). From each
    partition every combination of ``6 // slices`` players is generated, and
    one combination per partition is joined into a 6-player lineup. Lineups
    accepted by ``is_valid_lineup`` get one total per column in
    ``constraints.cols_to_sum``, are sorted by ``proj-pts`` descending,
    de-duplicated and written to
    ``../data/lineups-created/<constants.tournament>.pkl``.

    Raises
    ------
    ValueError
        If ``constraints.slices`` is not 2 or 3, the only partitionings that
        have a lineup builder.
    """
    num_players = 6  # lineup size (n)
    num_slices = constraints.slices

    # Fail fast with a clear message. Other values used to fall through with
    # an empty slate dict and die later with an unrelated error.
    if num_slices not in (2, 3):
        raise ValueError(f'constraints.slices must be 2 or 3, got {num_slices!r}')

    step = len(pnames) // num_slices  # players per partition
    r = num_players // num_slices     # players drawn from each partition (nCr)

    # Partitions are disjoint, so a lineup never draws the same player twice.
    slates = dict()
    for i in range(num_slices):
        start = i * step
        # The final partition runs to the end of the list to pick up leftovers.
        stop = (i + 1) * step if i < num_slices - 1 else None
        slates[f'slate{i + 1}'] = tuple(itertools.combinations(pnames[start:stop], r))

    lineups = create_lineup_2_slices(slates) if num_slices == 2 else create_lineup_3_slices(slates)

    ret = pd.DataFrame(lineups, columns=['g1', 'g2', 'g3', 'g4', 'g5', 'g6'])

    # One total per summed column, computed row-wise across worker processes.
    ret[constraints.cols_to_sum] = ret.parallel_apply(lineup_analysis_wrapper, axis=1, result_type='expand')

    ret = (ret
           .sort_values(by='proj-pts', ascending=False)
           .drop_duplicates()
           .reset_index(drop=True)
          )

    ret.to_pickle(f'../data/lineups-created/{constants.tournament}.pkl')

    print('Done...')

    return None

def output_lineups(top_num=100):
    """Return the first ``top_num`` rows of the saved lineup table for the current tournament."""
    saved_lineups = pd.read_pickle(f'../data/lineups-created/{constants.tournament}.pkl')
    return saved_lineups.head(top_num)
    

Excluding players less than $8000...


In [13]:
def output_lineups_by(sort_by=('proj-pts',), top_num=100):
    """Return the best saved lineups ranked by one or more columns.

    Parameters
    ----------
    sort_by : str or sequence of str, default ('proj-pts',)
        Column(s) to rank by, all descending. Every listed column is used,
        with later ones breaking ties; previously only the first was honoured.
        A bare column name is accepted as well.
    top_num : int, default 100
        Number of rows to return, mirroring ``output_lineups``.

    Raises
    ------
    IndexError
        If ``sort_by`` is empty.
    """
    sort_keys = [sort_by] if isinstance(sort_by, str) else list(sort_by)
    if not sort_keys:
        raise IndexError('sort_by must name at least one column')

    saved_lineups = pd.read_pickle(f'../data/lineups-created/{constants.tournament}.pkl')
    return saved_lineups.sort_values(by=sort_keys, ascending=False).head(top_num)

In [14]:
def create_func():
    """Optionally rebuild the lineup table, then return the ranked lineups.

    Lineups are regenerated only when ``constants.create`` is truthy. In
    either case the result of ``output_lineups_by()`` is returned.
    """
    if not constants.create:
        return output_lineups_by()

    print(f'Creating lineups...\n{constraints.slices} partitions of names\n')
    create_lineups()
    return output_lineups_by()

In [15]:
# Rebuild the lineups when constants.create is set, then display the ranked table.
create_func()

Creating lineups...
3 partitions of names



  0%|          | 0/29250000 [00:00<?, ?it/s]

Done...


Unnamed: 0,g1,g2,g3,g4,g5,g6,salary,proj-pts
0,Nate Lashley,Sungjae Im,Greyson Sigg,Hideki Matsuyama,Brendan Steele,Matthew NeSmith,600.0,484.77
1,Nate Lashley,Sungjae Im,Hideki Matsuyama,Stephan Jaeger,Brendan Steele,Matthew NeSmith,600.0,484.42
2,Andrew Putnam,Sungjae Im,Greyson Sigg,Hideki Matsuyama,Brendan Steele,Matthew NeSmith,599.0,484.13
3,Andrew Putnam,Sungjae Im,Hideki Matsuyama,Stephan Jaeger,Brendan Steele,Matthew NeSmith,599.0,483.78
4,Adam Long,Sungjae Im,Hideki Matsuyama,Michael Gligic,Doug Ghim,Matthew NeSmith,599.0,483.68
5,Nate Lashley,Sungjae Im,Hideki Matsuyama,Tom Hoge,Brendan Steele,Doug Ghim,600.0,483.63
6,Andrew Putnam,Sungjae Im,Hideki Matsuyama,Tom Hoge,Brendan Steele,Doug Ghim,599.0,482.99
7,Adam Long,Sungjae Im,Adam Svensson,Tom Hoge,Brendan Steele,Matthew NeSmith,597.0,482.88
8,Nate Lashley,Sungjae Im,Brice Garnett,Hideki Matsuyama,Brendan Steele,Matthew NeSmith,597.0,482.42
9,Andrew Putnam,Sungjae Im,Hideki Matsuyama,Michael Thompson,Brendan Steele,Matthew NeSmith,600.0,482.3
