### Lineup Optimisation with Genetic Algorithms

In [1]:
import os
import glob
import random
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import mean_squared_error

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Concatenate all csv files under a directory
def csv_concatenate(folder_path):
    """Read every ``*.csv`` file directly inside *folder_path* into one DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible across machines (``glob`` itself returns files in arbitrary
    order). Missing values are replaced with 0 and the index is reset.

    Args:
        folder_path: Directory that contains the csv files.

    Returns:
        A single DataFrame with the rows of all files stacked together.

    Raises:
        ValueError: If the directory contains no csv files.
    """
    files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not files:
        # pd.concat([]) would also raise ValueError, but without saying which folder was empty
        raise ValueError("No csv files found in " + repr(folder_path))

    df_list = []
    for file in tqdm(files):
        df_list.append(pd.read_csv(file, parse_dates=True))

    #Fill nan with 0s as some values are empty for percentage points
    df = pd.concat(df_list).fillna(0).reset_index(drop=True)
    return df

In [None]:
def calculate_MAE(pred, true):
    """Return the mean absolute error between two equally shaped sequences.

    Both inputs are converted to NumPy arrays first, so values are always
    compared position by position (a pandas Series with a non-default index
    is not looked up by label).

    Args:
        pred: Sequence of predicted values.
        true: Sequence of reference values.

    Returns:
        The mean of ``|pred - true|`` as a float.

    Raises:
        ValueError: If the two inputs do not have the same shape.
        ZeroDivisionError: If the inputs are empty.
    """
    pred_arr = np.asarray(pred)
    true_arr = np.asarray(true)
    if pred_arr.shape != true_arr.shape:
        raise ValueError(
            "pred and true must have the same shape, got "
            + str(pred_arr.shape) + " and " + str(true_arr.shape)
        )

    n = len(pred_arr)
    abs_error = np.abs(pred_arr - true_arr).sum()
    # Plain Python division keeps the ZeroDivisionError on empty input
    return float(abs_error) / n

In [None]:
def calculate_RMSE(pred, true):
    """Return the square root of ``mean_squared_error(pred, true)``."""
    mse = mean_squared_error(pred, true)
    return np.sqrt(mse)

In [None]:
def calculate_FPTS(df):
    """Compute fantasy points for every row of a box-score DataFrame.

    Each stat is multiplied by its weight and summed. A row with at least two
    of PTS/TRB/AST/STL/BLK at 10 or more gets +1.5, and a row with at least
    three of them additionally gets +3.

    The computation is column-wise, so it does not depend on the DataFrame
    having a default 0..n-1 index.

    Args:
        df: DataFrame with the numeric columns PTS, 3P, TRB, AST, STL, BLK, TOV.

    Returns:
        A list with one fantasy-point value per row, in row order.
    """
    #Scoring rules based on https://www.draftkings.co.uk/help/rules/4
    multipliers = {'PTS':1, '3P': 0.5, 'TRB':1.25, 'AST':1.5, 'STL':2, 'BLK':2, 'TOV':-0.5}
    double_stats = ['PTS', 'TRB', 'AST', 'STL', 'BLK']

    # Weighted sum, accumulated in the same stat order as the multipliers dict
    fpts = 0
    for stat, multiplier in multipliers.items():
        fpts = fpts + df[stat] * multiplier

    # Number of categories with a double-digit value, per row
    doubles = (df[double_stats] >= 10).sum(axis=1)

    # The two bonuses stack: three or more double-digit categories earn 1.5 + 3
    fpts = fpts + 1.5 * (doubles >= 2) + 3 * (doubles >= 3)

    return fpts.tolist()

In [None]:
def add_positions(df):
    """Add one 0/1 eligibility column per roster slot to *df*, in place.

    A slot column is 1 when the slot name occurs as a substring of the row's
    'Pos' string. Because of the substring test, 'G' is set for both 'PG' and
    'SG', and 'F' for both 'SF' and 'PF'. Rows whose 'Pos' is not a string
    (e.g. a missing value) get 0 in every slot column.

    Args:
        df: DataFrame with a 'Pos' column; modified in place.

    Returns:
        None.
    """
    slots = ['PG', 'SG', 'SF', 'PF', 'C', 'G', 'F']

    # Iterate over the column values directly so the result does not depend on
    # the DataFrame having a default 0..n-1 index
    pos_values = list(df['Pos'])

    for slot in slots:
        df[slot] = [int(isinstance(pos, str) and slot in pos) for pos in pos_values]

In [None]:
#return a DataFrame of lineups whose cells are index labels of df
def create_random_lineups(df, num_lineups):
    """Build *num_lineups* random lineups from the players in *df*.

    For each of the slots PG, SG, SF, PF, C, G and F one player is sampled
    from the rows whose column of that name equals 1; the Util slot is sampled
    from all rows. Nothing prevents the same player from being drawn for more
    than one slot.

    Args:
        df: Player DataFrame with 0/1 columns PG, SG, SF, PF, C, G, F.
        num_lineups: Number of lineups to generate.

    Returns:
        DataFrame with columns PG, SG, SF, PF, C, G, F, Util and one row per
        lineup; each cell holds an index label of *df*.
    """
    slots = ['PG', 'SG', 'SF', 'PF', 'C', 'G', 'F']

    # The eligible pool of each slot does not change between lineups, so filter once
    pools = {slot: df[df[slot] == 1] for slot in slots}

    lineups = {slot: [] for slot in slots + ['Util']}

    for _ in range(num_lineups):
        # Sampling order (slots in order, then Util) is kept so a seeded run draws the same players
        for slot in slots:
            lineups[slot].append(pools[slot].sample(1).index[0])

        lineups['Util'].append(df.sample(1).index[0])

    return pd.DataFrame(lineups, columns=slots + ['Util'])

In [None]:
def calculate_fitness(df_lineups, df_players=None, cap=None):
    """Score every lineup by its total predicted fantasy points.

    A lineup scores 0 when it uses the same player more than once or when its
    total salary is above the cap; otherwise it scores the sum of the 'Pred'
    values of its players.

    Args:
        df_lineups: DataFrame whose every column is a roster slot and whose
            cells are index labels of the player table (no 'Fitness' column).
        df_players: Player table with 'Salary' and 'Pred' columns. Defaults to
            the module-level ``df_target``.
        cap: Maximum total salary. Defaults to the module-level ``salary_cap``.

    Returns:
        A list with one fitness value per lineup, in row order.
    """
    if df_players is None:
        df_players = df_target
    if cap is None:
        cap = salary_cap

    fitness = []

    # Iterate over the raw rows so a non-default index on df_lineups is not a problem
    for lineup in df_lineups.values:

        #Check for duplicates
        if len(set(lineup)) < len(lineup):
            fitness.append(0)
            continue

        picked = df_players.loc[lineup, ['Salary', 'Pred']]

        #Check for Salary Caps
        # NOTE(review): a lineup spending exactly the cap is accepted here (the
        # previous `>=` rejected it) - assumes the cap is inclusive; confirm
        if picked['Salary'].sum() > cap:
            fitness.append(0)

        #Calculate the cumulative predicted FPTS
        else:
            fitness.append(picked['Pred'].sum())

    return fitness

In [None]:
def breed(df_lineups):
    """Cross the two fittest distinct lineups into a new population.

    The fittest lineup is always the first parent. The second parent is the
    next-fittest lineup that differs from it in at least one position; if
    every lineup is identical to the first, the second-ranked row is used.

    For every position two children are produced: a copy of each parent with
    the value at that position swapped with the other parent's. The two
    unchanged parents are appended at the end.

    Args:
        df_lineups: Lineup DataFrame whose last column is 'Fitness' and whose
            other columns are roster positions. Needs at least two rows.

    Returns:
        DataFrame with the position columns only (no 'Fitness') and
        ``2 * number_of_positions + 2`` rows, indexed 0..n-1.

    Raises:
        ValueError: If *df_lineups* has fewer than two rows.
    """
    if df_lineups.shape[0] < 2:
        raise ValueError("breed needs at least two lineups")

    positions = list(df_lineups.columns)[:-1]

    df_fit = df_lineups.sort_values(by='Fitness', ascending=False).reset_index(drop=True)
    ranked = df_fit.loc[:, positions]

    #Prevent breeding between two same lineups: compare positions only, not Fitness
    best = ranked.iloc[0].values
    mate_index = 1
    for candidate in range(1, ranked.shape[0]):
        if not (ranked.iloc[candidate].values == best).all():
            mate_index = candidate
            break

    parents = ranked.iloc[[0, mate_index]].reset_index(drop=True)

    families = []
    for pos in positions:
        #Copy both parents and swap their values at this position
        pair = parents.copy()
        pair.loc[0, pos] = parents.loc[1, pos]
        pair.loc[1, pos] = parents.loc[0, pos]
        families.append(pair)

    #Add the unchanged parents after the swapped children
    families.append(parents)

    return pd.concat(families, ignore_index=True)

In [None]:
def mutate(df_lineups, df_original, num_mutations):
    """Create mutated copies of the fittest lineup.

    Row 0 of the result is the fittest lineup unchanged. Each further row is a
    copy of it in which one randomly chosen position is overwritten with the
    value that a randomly sampled row of *df_original* holds at that same
    position.

    Args:
        df_lineups: Lineup DataFrame whose last column is 'Fitness'.
        df_original: Lineup DataFrame used as the source of replacement values;
            must contain the same position columns.
        num_mutations: Number of mutated copies to create.

    Returns:
        DataFrame with the position columns only (no 'Fitness') and
        ``num_mutations + 1`` rows, always indexed 0..n-1.
    """
    positions = list(df_lineups.columns)[:-1]
    num_mutations = max(num_mutations, 0)

    # Reset the index so the result is 0-based even when no mutation is requested
    df_parent = (
        df_lineups.sort_values(by='Fitness', ascending=False)
        .head(1)
        .loc[:, positions]
        .reset_index(drop=True)
    )

    #Start from the original followed by one copy per mutation
    df_mutants = pd.concat([df_parent] * (num_mutations + 1), ignore_index=True)

    #Mutate each copy by replacing a random position with the value of a random sample
    for current_index in range(1, num_mutations + 1):
        pos_to_swap = random.choice(positions)
        mutant_index = df_original.sample(1).index[0]

        df_mutants.loc[current_index, pos_to_swap] = df_original.loc[mutant_index, pos_to_swap]

    return df_mutants

In [None]:
def evolution(df_random_lineups, num_mutations, num_generations):
    """Run the breed -> mutate cycle for *num_generations* rounds.

    Every round replaces the population with the output of ``mutate`` applied
    to the scored output of ``breed``; ``mutate`` always draws its replacement
    values from the initial *df_random_lineups*.

    Returns:
        The final population with a 'Fitness' column, sorted from fittest to
        least fit and indexed 0..n-1.
    """
    population = df_random_lineups

    for _ in range(num_generations):
        # Crossover step, then score the offspring
        offspring = breed(population)
        offspring['Fitness'] = calculate_fitness(offspring)

        # Mutation step, then score the mutants, which become the next population
        population = mutate(offspring, df_random_lineups, num_mutations)
        population['Fitness'] = calculate_fitness(population)

    # Re-score on the position columns only before the final ranking
    position_columns = population.drop('Fitness', axis=1)
    population['Fitness'] = calculate_fitness(position_columns)

    ranked = population.sort_values(by='Fitness', ascending=False)
    return ranked.reset_index(drop=True)

In [None]:
def compounding_evolution(population_size=200, num_mutations=50, num_generations=20, num_compounding=5):
    """Run ``evolution`` *num_compounding* times from fresh random populations.

    Each run samples *population_size* random lineups from the module-level
    ``df_target``, evolves them, and keeps the top-ranked lineup.

    Returns:
        A list with one entry per run; each entry is the list of integer
        player indices of that run's best lineup.
    """
    optimal_lineups = []

    for _ in tqdm(range(num_compounding)):
        seed_population = create_random_lineups(df_target, population_size)
        seed_population['Fitness'] = calculate_fitness(seed_population)

        ranked = evolution(seed_population, num_mutations, num_generations)

        # Row 0 is the fittest lineup; drop its trailing Fitness value
        champion = ranked.loc[0, :].values[:-1]
        optimal_lineups.append(list(champion.astype(int)))

    return optimal_lineups

In [None]:
# Resolve the project root: when the notebook runs from the 'notebooks' folder,
# step up one level (only the final path component is stripped)
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    cwd = os.path.dirname(cwd)
data_dir = os.path.join(cwd, 'data')

# Seed both random sources used by the genetic algorithm: DataFrame.sample draws
# from NumPy's global state, while mutate() picks positions with the
# standard-library `random` module
np.random.seed(8)
random.seed(8)
salary_cap = 50000

df_pred = pd.read_csv(os.path.join(data_dir, 'Prediction', '20180514.csv'))

# Adds the 0/1 slot-eligibility columns to df_pred in place
add_positions(df_pred)

In [None]:
# Load the contest cash lines from the 'Contests' folder under data_dir and preview the first five rows
df_cashline = pd.read_csv(os.path.join(data_dir, 'Contests', 'cashline.csv'))
df_cashline.head(5)

In [None]:
# Per-contest results, appended in the same order as the rows of df_cashline
date = []
pred = []
actual = []
cashline = []
win = []
earnings = []


df_cashline = pd.read_csv(os.path.join(data_dir, 'cashline2.csv'))

for i in tqdm(range(df_cashline.shape[0])):
    contest_date = df_cashline.loc[i, 'Date']
    contest_cashline = df_cashline.loc[i, 'Cashline']
    contest_fee = df_cashline.loc[i, 'Fee']

    # df_target is read as a global by calculate_fitness and compounding_evolution
    df_target = df_pred.loc[(df_pred['Salary']!=0)&(df_pred['Date']==contest_date)].reset_index(drop=True)

    optimal_lineups = compounding_evolution(population_size=200, num_mutations=50,
                                            num_generations=100, num_compounding=5)

    top_lineups = {'Lineup':[], 'Pred':[], 'Actual':[]}

    for j, indices in enumerate(optimal_lineups):
        top_lineups['Lineup'].append(j)
        top_lineups['Pred'].append(df_target.loc[indices, 'Pred'].sum())
        top_lineups['Actual'].append(df_target.loc[indices, 'FPTS'].sum())

    # Rank the candidate lineups by predicted points; row 0 is the one that gets entered
    df_lineups = pd.DataFrame(top_lineups).sort_values(by='Pred', ascending=False).reset_index(drop=True)
    df_lineups = df_lineups.loc[:, ['Lineup', 'Pred', 'Actual']]

    rows = optimal_lineups[df_lineups.loc[0, 'Lineup']]
    df_best = df_target.loc[rows,:]

    # Totals of the chosen lineup, computed once and reused below
    best_salary = df_best['Salary'].sum()
    best_pred = df_best['Pred'].sum()
    best_actual = df_best['FPTS'].sum()

    display(df_best.loc[:,['Date', 'Name', 'Team', 'Pos', 'FPTS', 'Pred', 'Salary']])
    print('Salary:', best_salary)
    print('Predicted:', best_pred)
    print('Actual:', best_actual)
    print('Cashline:', contest_cashline)

    date.append(contest_date)
    pred.append(best_pred)
    actual.append(best_actual)
    cashline.append(contest_cashline)

    # The entry wins its fee when the actual score reaches the cash line, otherwise loses it
    if best_actual >= contest_cashline:
        print('Win:', contest_fee)
        win.append(1)
        earnings.append(contest_fee)
    else:
        print('Lose:', contest_fee)
        win.append(0)
        earnings.append(-contest_fee)
    

In [None]:
# Fraction of contests won (win holds 1 per win, 0 per loss) ...
print(sum(win)/len(win))
# ... and net earnings (fee gained on a win, fee lost otherwise)
print(sum(earnings))

In [None]:
# Keep the dates in df_cashline row order so they line up with the per-contest
# lists filled by the loop above (a set would reorder them and drop duplicates)
date = df_cashline['Date'].tolist()
df_result = pd.DataFrame({'Date': date,
                          'Predicted':pred,
                          'Actual': actual,
                          'Cashline': cashline,
                          'Win':win,
                          'Earnings': earnings
                         })

df_result.loc[:,['Date', 'Predicted', 'Actual', 'Cashline', 'Win', 'Earnings']]

### Baseline - random 10,000

In [None]:
# Show the per-contest results table again for comparison with the baseline below
df_result.loc[:,['Date', 'Predicted', 'Actual', 'Cashline', 'Win', 'Earnings']]

In [None]:
# Baseline: score purely random lineups (no evolution) for the current df_target and list the ten fittest
# NOTE(review): the section heading mentions 10,000 random lineups but only 200 are sampled here - confirm which is intended
df_random_lineups = create_random_lineups(df_target, 200)
df_random_lineups['Fitness'] = calculate_fitness(df_random_lineups)
df_random_lineups.sort_values(by='Fitness', ascending=False).head(10)

### Visualisation

In [None]:
# Load the baseline box scores and score them with the fantasy-point rules
# (data_dir was previously passed to os.path.join twice)
df_baseline = csv_concatenate(os.path.join(data_dir, 'Dataframes', 'modelling', 'baseline'))
df_baseline['Baseline'] = calculate_FPTS(df_baseline)

In [None]:
# Date to visualise, compared against the 'Date' columns of df_pred and df_cashline below
# Alternative date used previously:
#target_date = 20180310
target_date = 20180326

In [None]:
# Players with a salary on the target date; df_target is read as a global by
# calculate_fitness and compounding_evolution
df_target = df_pred.loc[(df_pred['Salary']!=0)&(df_pred['Date']==target_date)].reset_index(drop=True)

optimal_lineups = compounding_evolution(population_size=200, num_mutations=50,
                                        num_generations=100, num_compounding=5)

top_lineups = {'Lineup':[], 'Pred':[], 'Actual':[]}

for j, indices in enumerate(optimal_lineups):
    top_lineups['Lineup'].append(j)
    top_lineups['Pred'].append(df_target.loc[indices, 'Pred'].sum())
    top_lineups['Actual'].append(df_target.loc[indices, 'FPTS'].sum())


# Rank the candidate lineups by predicted points; row 0 is the one that is shown
df_lineups = pd.DataFrame(top_lineups).sort_values(by='Pred', ascending=False).reset_index(drop=True)
df_lineups = df_lineups.loc[:, ['Lineup', 'Pred', 'Actual']]

rows = optimal_lineups[df_lineups.loc[0, 'Lineup']]
df_best = df_target.loc[rows,:]
display(df_best.loc[:,['Date', 'Name', 'Team', 'Pos', 'FPTS', 'Pred', 'Salary']])
print('Salary:', df_target.loc[rows,'Salary'].sum())
print('Predicted:', df_target.loc[rows,'Pred'].sum())
print('Actual:', df_target.loc[rows,'FPTS'].sum())

# Pick the first contest row for the date explicitly: calling int() on the whole
# array is deprecated in NumPy and fails unless exactly one row matches
target_cashline = df_cashline.loc[df_cashline['Date']==target_date, 'Cashline']
print('Cashline:', int(target_cashline.iloc[0]))

In [None]:
# Prediction error over the players of the best lineup.
# Actual FPTS is passed in the `pred` slot and the prediction in the `true` slot;
# both metrics are symmetric in their arguments, so the results are unaffected.
print(calculate_MAE(df_best['FPTS'].values, df_best['Pred'].values))
print(calculate_RMSE(df_best['FPTS'], df_best['Pred']))

In [None]:
# Display the full player rows of the best lineup
df_best

In [None]:
Baseline = []

# Reset the index once so the rows of df_best can be addressed as 0..n-1
df_best = df_best.reset_index(drop=True)

for i in range(df_best.shape[0]):
    baseline_match = df_baseline.loc[(df_baseline['Name']==df_best.loc[i, 'Name'])&(df_baseline['Date']==df_best.loc[i,'Date']), 'Baseline']
    # Store one number per player (not the whole matching Series) so the list can be
    # plotted directly; NaN keeps the list aligned when a player has no baseline row
    Baseline.append(baseline_match.iloc[0] if len(baseline_match) else np.nan)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go


def _short_name(full_name):
    """Abbreviate 'First Last ...' to 'F. Last'; single-word names are returned unchanged."""
    parts = full_name.split(' ')
    if len(parts) < 2:
        return full_name
    return parts[0][:1] + '. ' + parts[1]


positions = ['(PG)', '(SG)', '(SF)','(PF)','(C)','(G)','(F)','(Utility)']

# One x-axis label per lineup slot: abbreviated player name plus the slot it fills
names = [_short_name(name)+'\n'+position for name, position in zip(df_best['Name'], positions)]

trace1 = go.Bar(
    x=names,
    y=df_best['FPTS'],
    name='Actual FPTS'
)

trace2 = go.Bar(
    x=names,
    y=df_best['Pred'],
    name='Prediction'
)

trace3 = go.Bar(
    x=names,
    y=Baseline,
    name='Baseline'
)


data = [trace1, trace2, trace3]

layout = go.Layout(
        title = 'FPTS: Actual and Predicted',
        barmode='group',
        legend = {"x":0.85, 'y':0.95, 'borderwidth': 1},
        yaxis = {"title":"FPTS"},
    )

config={'showLink': False}

fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='prediction0326')
py.iplot(fig, filename='prediction0326')