In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy

In [23]:
# Read in Data
data538 = pd.read_csv('nba_elo.csv')

In [24]:
def get_post_elo(home_team, home_team_elo, away_team, away_team_elo, mov, decay, home_adv):
    
    home_post_elo = get_k(mov, home_team_elo, away_team_elo, decay)*(int(mov>0) - get_win_prob(home_team_elo, away_team_elo, home_adv)) + home_team_elo
    away_post_elo = away_team_elo + (home_team_elo - home_post_elo)
    
    return home_post_elo, away_post_elo
    
def get_win_prob(home_team_elo, away_team_elo, hfa):
    home_team_elo += hfa
    return 1.0/(1.0+(10.0**((away_team_elo-home_team_elo)/400)))

def get_k(mov, team_elo, opp_elo, decay):
    
    if mov > 0: elo_dif = team_elo-opp_elo
    else: elo_dif = opp_elo-team_elo
    
    K = 20.0*((np.abs(mov)+3.0)**.8)
    K = K/(7.5 + .006*elo_dif)
    return K*decay

In [72]:
def train_k_decay(data, elos={}, season_start_epsilon=1.0, epsilon_decay=1.0, hfa=100):
    
    epsilon = season_start_epsilon
    elo_dict = {}

    val_dict_list = []
    test_dict_list = []

    # For Each Game
    for i in range(data.shape[0]):

        # Get Game Info
        row = data.iloc[i]
        team = row.team1
        opp = row.team2
        mov = row.score1 - row.score2

        # Seasonal ELO Adjustment for Every Team
        if i > 0 and row.season != data.iloc[i-1].season:
            for k in elo_dict.keys():
                elo_dict[k] = .75*elo_dict[k] + .25*1505

            # Reset Epsilon
            epsilon = season_start_epsilon

        # If team's first game, use 538's elo to start
        if team not in elo_dict.keys():
            elo_dict[team] = row['elo1_pre']
        if opp not in elo_dict.keys():
            elo_dict[opp] = row['elo2_pre']

        # Get Pre-Game ELO Estimates
        team_pre = elo_dict[team]
        opp_pre = elo_dict[opp]

        # Adjust Epsilon Every 100 Games
        if i % 100 == 0:
            epsilon *= epsilon_decay

        # Update ELOs based on game results    
        elo_dict[team], elo_dict[opp] = get_post_elo(team, team_pre, opp, opp_pre, mov, epsilon, hfa)
        
    return elo_dict

def k_decay_validation_dict(data, elos={}, season_start_epsilon=1.0, epsilon_decay=1.0, hfa=100):
    
    '''
    Inputs:
     - data: Set of games to validate the ELO model on
     - elos: set of trained elos
     - val_dfs: List of DataFrames that contain ELOs for every game
     - season_start_epsilon
     - epsilon_decay
     - hfa: Home Field Advantage adjustment
     
     Outputs:
      - Row for DataFrame of Total Loss and overall Prediction Accuracy, for every combination of hyperparameters
    
    '''
    
    elo_dict = elos
    val_dict_list = []
    epsilon = season_start_epsilon

    for i in range(data.shape[0]):
        
        row = data.iloc[i]
        team = row.team1
        opp = row.team2
        mov = row.score1 - row.score2

        # Seasonal ELO Adjustment for Every Team
        if i > 0 and row.season != data.iloc[i-1].season:
            for k in elo_dict.keys():
                elo_dict[k] = .75*elo_dict[k] + .25*1505

            # Reset Epsilon
            epsilon = season_start_epsilon

        # If team's first game, use 538's elo to start
        if team not in elo_dict.keys():
            elo_dict[team] = row['elo1_pre']
        if opp not in elo_dict.keys():
            elo_dict[opp] = row['elo2_pre']

        # Get Pre-Game ELO Estimates
        team_pre = elo_dict[team]
        opp_pre = elo_dict[opp]

        # Adjust Epsilon Every 100 Games
        if i % 100 == 0:
            epsilon *= epsilon_decay

        # Update ELOs based on game results    
        elo_dict[team], elo_dict[opp] = get_post_elo(team, team_pre, opp, opp_pre, mov, epsilon, hfa)
        
        # Build DataFrame with Post-Game ELOs
        team_post = elo_dict[team]
        opp_post = elo_dict[opp]
        val_dict_list.append({'team1': team, 'team2': opp, 'team1_pre': team_pre, 'team2_pre': opp_pre, 'team1_post': team_post, 'team2_post': opp_post, 'MOV': mov})

    # New DataFrame of Elos Over Games in Validation Set
    val_df = pd.DataFrame(val_dict_list)
    
    preds = []
    losses = []
    outcomes = []

    # For Each Game in Validation Set
    for i in range(val_df.shape[0]):

        # Get Game Info
        row = val_df.iloc[i]
        mov = row.MOV 
        team_elo = row['team1_pre']
        opp_elo = row['team2_pre']

        # Get Predictions, Loss, and Results
        preds.append(get_win_prob(team_elo, opp_elo, hfa) > .5)
        losses.append(np.log(get_win_prob(team_elo, opp_elo, hfa) if mov > 0 else 1 - get_win_prob(team_elo, opp_elo, hfa)))
        outcomes.append(mov > 0)

    # True if Success, Otherwise False
    success_list = [preds[j] == outcomes[j] for j in range(len(preds))]

    # Make Row for DataFrame of Validation Results w/ Loss as accuracy
    return {'total_loss': np.sum(losses), 'accuracy': np.mean(success_list), 'season_start_ep': season_start_epsilon, 'epsilon_decay': epsilon_decay, 'HFA': hfa}

def test_model_dict(data, elos, season_start_epsilon=1.0, epsilon_decay=1.0, hfa=100):
    '''
    Inputs: All the model stuff
    Outputs: Dictionary with model accuracy and total loss
    '''
    return k_decay_validation_dict(data, elos, season_start_epsilon=1.0, epsilon_decay=1.0, hfa=100)

def fivethirtyeight_results_dict(data):
    
    # Get Predictions
    elo_preds = [get_win_prob(team_elo, opp_elo, 100) > .5 for team_elo, opp_elo in zip(data['elo1_pre'], data['elo2_pre'])]
    
    # Get Outcomes
    outcomes = [s1 > s2 for s1, s2 in zip(data['score1'], data['score2'])]
    
    # Get Success Rates
    elo_acc = np.mean([elo_preds[j] == outcomes[j] for j in range(len(elo_preds))])
                
    # Get Losses
    data['win_prob'] = [get_win_prob(elo1, elo2, 100) for elo1, elo2 in zip(data['elo1_pre'], data['elo2_pre'])]            
    elo_loss = np.sum([np.log(prob if outcome == 1 else 1 - prob) for prob, outcome in zip(data['win_prob'], outcomes)])
                
    return {'total_loss_538': elo_loss, 'accuracy_538': elo_acc, 'season_start_ep': 1, 'epsilon_decay': 1, 'HFA': 100}
    

In [73]:
# Training, Validation, and Test Sets

VALIDATION_START_YEAR = 2015
TEST_START_YEAR = 2017

train_games = data538[data538['season'] >= 2010]
train_games = train_games[train_games['season'] < VALIDATION_START_YEAR]

validation_games = data538[data538['season'] >= VALIDATION_START_YEAR]
validation_games = validation_games[validation_games['season'] < TEST_START_YEAR]

test_games = data538[data538['season'] >= TEST_START_YEAR]
test_games = test_games[test_games['season'] < 2019]

In [80]:
# Hyperparameters to Tune
from hyperopt import hp

season_start_epsilons = [.95, 1.0, 1.1, 1.2]
epsilon_decays = [.9, .95, .98, 1.0, 1.05]
hfas = [30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]

ModuleNotFoundError: No module named 'hyperopt'

In [78]:
val_results_list = []

for ep in season_start_epsilons:
    for dec in epsilon_decays:

        new_elos = train_k_decay(train_games, elos={}, season_start_epsilon=ep, epsilon_decay=dec, hfa=100)
        val_results_list.append(k_decay_validation_dict(validation_games, elos=new_elos, season_start_epsilon=ep, epsilon_decay=dec, hfa=100))
        
val_results_df = pd.DataFrame(val_results_list)
print('Best Validation Loss: ', np.max(val_results_df['total_loss']))
print('Accompanying Accuracy: ', val_results_df.loc[np.argmax(val_results_df['total_loss'])].accuracy)
print('Accompanying start_epsilon: ', val_results_df.loc[np.argmax(val_results_df['total_loss'])].season_start_ep)
print('Accompanying epsilon_decay: ', val_results_df.loc[np.argmax(val_results_df['total_loss'])].epsilon_decay)
print('Accompanying HFA: ', val_results_df.loc[np.argmax(val_results_df['total_loss'])].HFA)

  # Remove the CWD from sys.path while we load stuff.


Best Validation Loss:  -1560.83666557
Accompanying Accuracy:  0.682146935668
Accompanying start_epsilon:  1.3
Accompanying epsilon_decay:  0.95
Accompanying HFA:  100.0


will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


In [74]:
fivethirtyeight_results_dict(validation_games)

{'HFA': 100,
 'accuracy_538': 0.68062428625808913,
 'epsilon_decay': 1,
 'season_start_ep': 1,
 'total_loss_538': -1559.0776770450607}

In [None]:
print('Best Validation Accuracy: ', np.max(val_results_df['accuracy']))
print('Best start_epsilon: ', val_results_df.loc[np.argmax(val_results_df['accuracy'])].season_start_ep)
print('Best epsilon_decay: ', val_results_df.loc[np.argmax(val_results_df['accuracy'])].epsilon_decay)
print('Best epsilon_decay: ', val_results_df.loc[np.argmax(val_results_df['accuracy'])].HFA)