In [2]:
import pandas as pd
from IPython.core.display import display, HTML
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib


display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', 50)

In [3]:
nba_1984_2018_initials = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Charlotte Hornets': 'CHH',
    'Charlotte Bobcats': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Kansas City Kings': 'KCK',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Hornets' : 'NOP',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'New Orleans Pelicans': 'NOP',
    'New Jersey Nets': 'NJN',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Diego Clippers': 'SDC',
    'San Antonio Spurs': 'SAS',
    'Seattle SuperSonics': 'SEA',
    'Toronto Raptors': 'TOR',
    'Vancouver Grizzlies': 'VAN',
    'Utah Jazz': 'UTA',
    'Washington Bullets': 'WSB',
    'Washington Wizards': 'WAS'
}
nba_1984_2018_initials_reversed = {v:k for k, v in nba_1984_2018_initials.items()}

In [4]:
boxscore_season_range_mask = lambda df, start_year, end_year:  (df["season"] >= start_year) & (df["season"] <= end_year)
boxscore_date_range_mask = lambda df, start_date, end_date: (df["date"] >= start_date) & (df["date"] <= end_date)
boxscore_team_mask = lambda df, team_initials: (df["team1"] == team_initials) | (df["team2"] == team_initials)
boxscore_regular_season_mask = lambda df: pd.isnull(df["playoff"])

summary_season_range = lambda df, start_year, end_year: df.loc[start_year:end_year]
summary_season_query = lambda df, years, teams, col_names: df.loc[(years, teams), col_names]
summary_season_remove_league_average = lambda: df

# Features for model

Visualizations in the data exploration notebook show that the most important indicators of wins on a season level are net rating, SRS, and ELO rating. It's reasonable to hypothesize that those same metrics are indicators of win probability on a game by game basis. First model will attempt to predict single game home team win probability based solely on pre-game net rating, SRS, and ELO rating of home and away team. Initial net rating and SRS rating are set to their values at the end of the prior season for each team, or to the lower quartile league value of the preceding season for expansions teams in their first season.

## Create dataset

### Desired

| Name           |  Type         | Description                                                                        
| -------------- |-------------- | ---------------------------------------------------------------|
| d_SRS          | double        |   Difference between pre-game SRS of home team and away team   |
| d_ELO          | double        |   Difference between pre-game ELO of home team and away team   |
| d_NetRtg       | double        |   Difference between average NetRtg of home team and away team |
| result         | int           |   1 = home team win                                            |

In [234]:
def margin_for_team(abbrev, margins):
    '''
    find the average margin of victory for a team over a period of time
    
    abbrev: string, 3 letter initial of NBA team
    margins: dict, key is team initials and value is list containing the margin of victory for each game
    
    returns average margin of victory for given team over time period encompassed by margins.
    '''
    return sum(margins[abbrev]) / len(margins[abbrev])

def weighted_margin_for_team(abbrev, margins):
    '''
    find the weighted average margin of victory for a team, where previous season average margin is weighted by ((82 - games played) / 82) and current season average margin is weighted by (games played / 82)
    
    abbrev: string, 3 letter initial of NBA team
    margins: dict, key is team initials and value is list containing the margin of victory for each game where first element equals last season average margin of victory
    
    returns weighted average margin of victory for given team over time period encompassed by margins.
    '''
    last_season_margin = margins[abbrev][0]
    this_season_margins = margins[abbrev][1:]
    gp = len(this_season_margins)
    if gp == 0:
        # no games played, return last season's margin
        return last_season_margin
    elif gp > 82:
        # team in playoffs, use only this season's margin
        return sum(this_season_margins) / len(this_season_margins)
    else:
        current_season_margin = sum(this_season_margins) / len(this_season_margins)
        return (gp / 82 * current_season_margin) + (((82 - gp) / 82)  * last_season_margin) 

def sos_for_team(abbrev, schedule, margins):
    '''
    find the strength of schedule for a team over a period of time, where sos is defined as average margin of victory of opponents faced so far, weighted by games played.
    
    abbrev: string, 3 letter initial of NBA team
    margins: dict, key is team initials and value is list containing the margin of victory for each game
    schedule: dict, key is team initials, value is dictionary where key is opponent initials, and value is # of games played vs. opponent during period in question
    
    returns strength of schedule for given team over time period encompassed by margins and schedule. 
    '''
    opp_movs = []
    for abbrev, gp in schedule[abbrev].items():
        opp_movs += [margin_for_team(abbrev, margins)] * gp
    return sum(opp_movs) / len(opp_movs)

def weighted_sos_for_team(abbrev, schedule, margins):
    '''
    see sos_for_team, but calculates average margin for each team using weighted_margin_for_team rather than margin_for_team
    
    abbrev: string, 3 letter initial of NBA team
    margins: dict, key is team initials and value is list containing the margin of victory for each game where first element equals last season average margin of victory
    schedule: dict, key is team initials, value is dictionary where key is opponent initials, and value is # of games played vs. opponent during period in question
    
    returns weighted strength of schedule for given team over time period encompassed by margins and schedule. 
    '''
    opp_movs = []
    for abbrev, gp in schedule[abbrev].items():
        opp_movs += [weighted_margin_for_team(abbrev, margins)] * gp
    return sum(opp_movs) / len(opp_movs)

def average_net_rating_for_team(abbrev, ratings):
    '''
    find the average net rating for a team during a period of time
    
    abbrev: string, 3 letter initial of NBA team
    ratings: dict, key is team initials and value is list containing net rating for each game played during period in question
    
    returns the average net rating of given team over time period encompassed by net_ratings
    '''
    return sum(ratings[abbrev]) / len(ratings[abbrev])

def weighted_average_net_rating_for_team(abbrev, ratings):
    '''
    see average_net_rating_for_team, but instead weighs previous season rating by ((82 - games played) / 82) and current season average net rating by (games played / 82)
    
    abbrev: string, 3 letter initial of NBA team
    ratings: dict, key is team initials and value is list containing net rating for each game played during period in question where first element equals last season's average net rating
    
    returns the average net rating of given team over time period encompassed by net_ratings
    '''
    last_season_rating = ratings[abbrev][0]
    this_season_ratings = ratings[abbrev][1:]
    gp = len(this_season_ratings)
    if gp == 0:
        # no games played, return last season's average net rating
        return last_season_rating
    elif gp > 82:
        # team in playoffs, use only this season's net ratings
        return sum(this_season_ratings) / len(this_season_ratings)
    else:
        current_season_rating = sum(this_season_ratings) / len(this_season_ratings)
        return (gp / 82 * current_season_rating) + (((82 - gp) / 82)  * last_season_rating)

def is_first_game_of_season(game, abbrev, boxscores_df):
    '''
    determine whether given game is the first game of the season for the given team
    
    game: pd.Series, see nba_boxscores_1984_2018.csv for format (game is single row)
    abbrev: string, 3 letter initial of NBA team
    boxscores_df: pd.DataFrame, see nba_boxscores_1984_2018.csv for format
    
    returns boolean
    '''
    first_game_indx = boxscores_df[boxscore_team_mask(boxscores_df, abbrev) & boxscore_season_range_mask(boxscores_df, game["season"], game["season"])].head(1).index[0]
    return game.name == first_game_indx

def regular_season_metrics(abbrev, season, season_summaries_df, rating_cols):
    '''
    determine end of regular season metrics for the given team and given season
    
    abbrev: string, 3 letter initial of NBA team
    season: int, season to get metrics for (2017 = 2016-17)
    season_summaries_df: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format
    rating_cols: list of metrics desired, see nba_season_summaries_1984_2018.csv columns for possible values
    
    returns pd.Series with requested metrics
    '''
    team_name = "Charlotte Hornets" if abbrev == "CHO" and season > 2014 else nba_1984_2018_initials_reversed[abbrev]
    return summary_season_query(season_summaries_df, season, team_name, rating_cols)

def abbrev_dict_for_season(season, season_summaries_df):
    '''
    generate a lookup table to map boxscore teams (3 letter initials) to season summary teams (full names)
    
    season: int, (2017 = 2016-17 NBA season)
    season_summaries_df: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format
    '''
    team_names = season_summaries_df.loc[season].index.tolist()
    team_names.remove("League Average")
    d = {k:v for k, v in nba_1984_2018_initials.items() if k in team_names}
     # handle edge case of boxscores using "CHO" for both Charlotte Bobcats and post 2014 Charlotte Hornets 
    if season > 2014:
        d["Charlotte Hornets"] = "CHO"
    return d

In [5]:
start_year = 1984
end_year = 2018
df_season_summaries = pd.read_csv("../Data/nba_season_summaries_{}_{}.csv".format(start_year, end_year), index_col=[0, 1])
# For MultiIndex slicing support
df_season_summaries.sort_index(inplace=True) 
df_boxscores = pd.read_csv("../Data/nba_boxscores_{}_{}.csv".format(start_year, end_year), index_col=0, parse_dates=[1], infer_datetime_format=True)

In [236]:
def compute_features_for_season(season, df_boxscores, df_season_summaries, debug=False, weighted=True):
    '''
    generate dataframe with desired model features for all the games in a given season
    
    season: int, (2017 = 2016-17 NBA season)
    df_boxscores: pd.DataFrame, see nba_boxscores_1984_2018.csv for format
    df_season_summaries: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format
    debug (optional, default=False): boolean, toggle debug print statements
    weighted (optional, default=True): boolean, determines whether SRS and average net rating calculations are weighted according to games played. When true, the average net rating of a team before a game equals (gp/82 * current_season_average_net) + ((82 - gp)/82) * last_season_average_net  
    
    returns a dataframe with model features (see Features for model --> Desired for details)
    '''
    previous_season = season - 1
    if debug:
        print("Computing features for season: {}".format(season))
        print("Performing setup for season")
        print("****************************")
    # Get end of regular season SRS, NetRtg, and MOV for previous season 
    last_season_team_names = df_season_summaries.loc[previous_season].index.tolist()
    last_season_team_names.remove("League Average")
    last_season_abbrev_dict = {v:k for k, v in abbrev_dict_for_season(previous_season, df_season_summaries).items()}
    last_season_metrics = summary_season_query(df_season_summaries, previous_season, last_season_team_names, ["SRS", "NetRtg", "MOV"]).loc[previous_season]
    if debug:
        print("Successfully received metrics for {} season".format(previous_season))
    # dictionary for mapping between team initials (used in boxscore) and full team names (used in summary)
    this_season_abbrev_dict = {v:k for k, v in abbrev_dict_for_season(season, df_season_summaries).items()}
    if debug:
        print("Successfully created abbreviation mapping for {} season".format(season))
    # dictionaries for storing margins of victory and schedule (used for SRS) and net ratings for each team on a per-game basis (used for avg. NetRtg)
    margins = {}
    schedule = {}
    net_ratings = {}
    for abbrev in this_season_abbrev_dict.keys():
        margins[abbrev] = []
        schedule[abbrev] = {}
        net_ratings[abbrev] = []
    if debug:
        print("Successfully initialized dictionaries for storing MOV, schedule, and net ratings for each team on per-game basis")
    # dictionary for storing feature data
    columns = [
       "season", "team1", "team2", "team1_SRS", "team2_SRS", "team1_NetRtg", "team2_NetRtg", "team1_ELO", "team2_ELO", "result"
    ]
    data = {c:[] for c in columns}
    if debug:
        print("Successfully initialized dictionaries for storing feature data")
        print("Finished setup for season")
        print("****************************")
        print("Beginning walkthrough of {} season".format(season))
        i = 0
    # walk through current season game by game, computing desired features for each game
    for _, game in df_boxscores[boxscore_season_range_mask(df_boxscores, season, season)].iterrows():
        team1 = game["team1"]
        team1_score = game["score1"]
        team2 = game["team2"]
        team2_score = game["score2"]
        data["team1"].append(team1)
        data["team2"].append(team2)
        data["season"].append(season)
        # determine result of game
        data["result"].append(1 if team1_score > team2_score else 0)
        if debug:
            print("Game {}".format(i))
            print("****************************")
            print("{} at {}: {} - {}".format(team2, team1, team2_score, team1_score))
            print("Recorded result as {} for {}".format(data["result"][-1], team1))
        # determine pre-game SRS and NetRtg for home and away team
        for t, key_prefix in zip([team1, team2], ["team1", "team2"]):
            if is_first_game_of_season(game, t, df_boxscores):
                if debug:
                    print("First game of season for {}. Attempt to use last season's metrics".format(t))
                # first game of season for team, use last season's SRS and NetRtg values. Add NetRtg and MOV from last season as first element of current season tally to reduce variance for early season games.
                try:
                    srs = last_season_metrics.loc[last_season_abbrev_dict[t]]["SRS"]
                    net = last_season_metrics.loc[last_season_abbrev_dict[t]]["NetRtg"]
                    mov = last_season_metrics.loc[last_season_abbrev_dict[t]]["MOV"]
                    data["{}_SRS".format(key_prefix)].append(srs)
                    data["{}_NetRtg".format(key_prefix)].append(net)
                    net_ratings[t].append(net)
                    margins[t].append(mov)
                    if debug:
                        print("Recorded SRS: {}, NetRtg: {} for {}".format(srs, net, t))
                        print("Added NetRtg: {} and MOV: {} to per-game dictionaries for {}".format(net, mov, t))
                except KeyError:
                    if debug:
                        print("Expansion team, no results avaiable from last season. Attempt to use lower quartile results of previous season")
                    # first season of expansion franchise. Set to lower quartile value of previous season
                    srs = last_season_metrics["SRS"].quantile(0.25)
                    net = last_season_metrics["NetRtg"].quantile(0.25)
                    mov = last_season_metrics["MOV"].quantile(0.25)
                    data["{}_SRS".format(key_prefix)].append(srs)
                    data["{}_NetRtg".format(key_prefix)].append(net)
                    net_ratings[t].append(net)
                    margins[t].append(mov)
                    if debug:
                        print("Recorded SRS: {}, NetRtg: {} for {}".format(srs, net, t))
                        print("Added NetRtg: {} and MOV: {} to per-game dictionaries for {}".format(net, mov, t))
            else:
                # compute pre-game SRS, avg. NetRtg
                net = weighted_average_net_rating_for_team(t, net_ratings) if weighted else average_net_rating_for_team(t, net_ratings)
                srs = (weighted_margin_for_team(t, margins) + weighted_sos_for_team(t, schedule, margins)) if weighted else (margin_for_team(t, margins) + sos_for_team(t, schedule, margins))
                data["{}_NetRtg".format(key_prefix)].append(net)
                data["{}_SRS".format(key_prefix)].append(srs)
                if debug:
                    print("Calculated pre-game SRS: {} and average NetRtg: {} for {}".format(srs, net, t))
            # determine pre-game ELO
            elo = game["elo1_pre"] if t == team1 else game["elo2_pre"]
            data["{}_ELO".format(key_prefix)].append(elo)
            if debug:
                print("Pre-game ELO: {} for {}".format(elo, t))
        # update margins
        mov_team1 = team1_score - team2_score
        mov_team2 = -mov_team1
        margins[team1].append(mov_team1)
        margins[team2].append(mov_team2)
        if debug:
            print("Updated margins for {}: {}, {}: {}".format(team1, mov_team1, team2, mov_team2))
        # update schedule
        gp = schedule[team1].get(team2, None)
        if gp is None:
            schedule[team1][team2] = 1
            schedule[team2][team1] = 1
        else:
            schedule[team1][team2] = gp + 1
            schedule[team2][team1] = gp + 1
        if debug:
            print("Updated schedule. {} now played {} {} times, and {} played {} {} times".format(team1, team2, schedule[team1][team2], team2, team1, schedule[team2][team1]))
        # update net ratings
        net_team1 = game["team1_NetRtg"]
        net_team2 = game["team2_NetRtg"]
        net_ratings[team1].append(net_team1)
        net_ratings[team2].append(net_team2)
        if debug:
            print("Updated net ratings for {}: {}, {}: {}".format(team1, net_team1, team2, net_team2))
            i += 1
            print("****************************")
    # assure shape match after feature processing
    total_games_for_season =  df_boxscores[df_boxscores["season"] == season].shape[0]
    for key in data.keys():
        assert len(data[key]) == total_games_for_season, "Mismatch for key {}".format(key)
    # Calculate differences
    df = pd.DataFrame(data)
    df["d_ELO"] = df["team1_ELO"] - df["team2_ELO"]
    df["d_NetRtg"] = df["team1_NetRtg"] - df["team2_NetRtg"]
    df["d_SRS"] = df["team1_SRS"] - df["team2_SRS"]
    df = df.drop(["team1_NetRtg", "team2_NetRtg", "team1_SRS", "team2_SRS", "team1_ELO", "team2_ELO"], axis=1)
    return df

In [237]:
start_year = 1985
end_year = 2017
unweighted_df = pd.concat([compute_features_for_season(season, df_boxscores, df_season_summaries, debug=False, weighted=False) for season in range(start_year, end_year+1)])
weighted_df = pd.concat([compute_features_for_season(season, df_boxscores, df_season_summaries, debug=False, weighted=True) for season in range(start_year, end_year+1)])

In [240]:
print(unweighted_df.shape)
unweighted_df.tail()

(39965, 7)


Unnamed: 0,result,season,team1,team2,d_ELO,d_NetRtg,d_SRS
1304,1,2017,GSW,CLE,159.455333,7.49011,7.566171
1305,1,2017,GSW,CLE,170.092273,7.871692,7.852413
1306,0,2017,CLE,GSW,-179.160392,-8.155207,-8.066312
1307,1,2017,CLE,GSW,-189.432191,-8.173078,-8.000196
1308,1,2017,GSW,CLE,143.730127,7.649889,7.4443


In [241]:
print(weighted_df.shape)
weighted_df.tail()

(39965, 7)


Unnamed: 0,result,season,team1,team2,d_ELO,d_NetRtg,d_SRS
1304,1,2017,GSW,CLE,159.455333,7.523863,7.598383
1305,1,2017,GSW,CLE,170.092273,7.909079,7.887791
1306,0,2017,CLE,GSW,-179.160392,-8.195135,-8.104033
1307,1,2017,CLE,GSW,-189.432191,-8.212771,-8.037329
1308,1,2017,GSW,CLE,143.730127,7.683859,7.475875


## Consolidate indexes

Feature dataframe indices don't match boxscore dataframe indices. Consolidate to support merging later on

In [243]:
# feature dataframes have game data from from 1985 to 2017
start_year_box = 1984
end_year_box = 2018
start_year_feature = 1985
end_year_feature = 2017

df_boxscores = pd.read_csv("../Data/nba_boxscores_{}_{}.csv".format(start_year_box, end_year_box), index_col=0, parse_dates=[1], infer_datetime_format=True)
df_boxscores = df_boxscores[boxscore_season_range_mask(df_boxscores, start_year_feature, end_year_feature)]
df_boxscores.shape

(39965, 36)

In [244]:
weighted_df.index = df_boxscores.index
unweighted_df.index = df_boxscores.index
print((df_boxscores.index == weighted_df.index).all())
print((df_boxscores.index == unweighted_df.index).all())

True
True


## $\checkmark$ Consolidate indexes

In [250]:
unweighted_df.to_csv("../Data/feature_df_unweighted.csv")
weighted_df.to_csv("../Data/feature_df_weighted.csv")

## $\checkmark$  Create dataset

# Training logistic regression model

## Split into training and test sets

In [155]:
weighted_df = pd.read_csv("../Data/feature_df_weighted.csv", index_col=0)
unweighted_df = pd.read_csv("../Data/feature_df_unweighted.csv", index_col=0)
print(weighted_df.shape)
print(unweighted_df.shape)

(39965, 7)
(39965, 7)


In [156]:
weighted_X = weighted_df.drop(["result", "team1", "team2"], axis=1)
unweighted_X = unweighted_df.drop(["result", "team1", "team2"], axis=1)

weighted_y = weighted_df[["result", "season"]]
unweighted_y = unweighted_df[["result", "season"]]

In [157]:
# 80-20 split of seasons
start_year = 1985
end_year = 2017
seasons = list(range(start_year, end_year+1))
train_seasons = np.random.choice(seasons, int(len(seasons) * .8), replace=False)
train_seasons

array([2015, 1994, 1987, 1996, 2006, 2012, 2000, 1989, 1991, 2017, 2014,
       2009, 2003, 2005, 2001, 1998, 1993, 1986, 1990, 1985, 2007, 2004,
       1992, 2011, 1997, 1999])

In [165]:
weighted_X_train = []
weighted_y_train = []
weighted_X_test = []
weighted_y_test = []
unweighted_X_train = []
unweighted_y_train = []
unweighted_X_test = []
unweighted_y_test = []
for _, g in weighted_X.iterrows():
    if g["season"] in train_seasons:
        weighted_X_train.append(g)
        weighted_y_train.append(weighted_y.loc[g.name])
    else:
        weighted_X_test.append(g)
        weighted_y_test.append(weighted_y.loc[g.name])
for _, g in unweighted_X.iterrows():
    if g["season"] in train_seasons:
        unweighted_X_train.append(g)
        unweighted_y_train.append(unweighted_y.loc[g.name])
    else:
        unweighted_X_test.append(g)
        unweighted_y_test.append(unweighted_y.loc[g.name])
weighted_X_train = pd.DataFrame(weighted_X_train).drop("season", axis=1)
weighted_y_train = pd.DataFrame(weighted_y_train)["result"]
weighted_X_test = pd.DataFrame(weighted_X_test).drop("season", axis=1)
weighted_y_test = pd.DataFrame(weighted_y_test)["result"]
unweighted_X_train = pd.DataFrame(unweighted_X_train).drop("season", axis=1)
unweighted_y_train = pd.DataFrame(unweighted_y_train)["result"]
unweighted_X_test = pd.DataFrame(unweighted_X_test).drop("season", axis=1)
unweighted_y_test = pd.DataFrame(unweighted_y_test)["result"]
Xs = [weighted_X_train, weighted_X_test, unweighted_X_train, unweighted_X_test]
ys = [weighted_y_train, weighted_y_test, unweighted_y_train, unweighted_y_test]
for X, y in zip(Xs, ys):
    print((X.index == y.index).all())

True
True
True
True


In [166]:
print("****************")
print("Training sets:")
print("****************")
print("Weighted X shape: {}".format(weighted_X_train.shape))
print("Weighted y shape: {}".format(weighted_y_train.shape))
print("Unweighted X shape: {}".format(unweighted_X_train.shape))
print("Unweighted y shape: {}".format(unweighted_y_train.shape))
print("****************")
print("Testing sets:")
print("****************")
print("Weighted X shape: {}".format(weighted_X_test.shape))
print("Weighted y shape: {}".format(weighted_y_test.shape))
print("Unweighted X shape: {}".format(unweighted_X_test.shape))
print("Unweighted y shape: {}".format(unweighted_y_test.shape))

****************
Training sets:
****************
Weighted X shape: (31244, 3)
Weighted y shape: (31244,)
Unweighted X shape: (31244, 3)
Unweighted y shape: (31244,)
****************
Testing sets:
****************
Weighted X shape: (8721, 3)
Weighted y shape: (8721,)
Unweighted X shape: (8721, 3)
Unweighted y shape: (8721,)


In [169]:
datasets = [weighted_X_train, weighted_y_train, weighted_X_test, weighted_y_test, unweighted_X_train, unweighted_y_train, unweighted_X_test, unweighted_y_test]
fns = ["../Data/weighted_X_train.pkl", "../Data/weighted_y_train.pkl", "../Data/weighted_X_test.pkl", "../Data/weighted_y_test.pkl", "../Data/unweighted_X_train.pkl", "../Data/unweighted_y_train.pkl", "../Data/unweighted_X_test.pkl", "../Data/unweighted_y_test.pkl"]

for dataset, fn in zip(datasets, fns):
    joblib.dump(dataset, fn)

## $\checkmark$ Split into training and test sets

## Find best hyperparameters for model

### Weighted 

In [170]:
log = LogisticRegression()
parameters = {
    "penalty": ["l1", "l2"],
    "C": list(range(1,11))
}
weighted_clf = GridSearchCV(log, parameters)
weighted_clf.fit(weighted_X_train, weighted_y_train)
pd.DataFrame(weighted_clf.cv_results_).sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
14,0.061062,0.001709,0.684451,0.684195,8,l1,"{'C': 8, 'penalty': 'l1'}",1,0.696303,0.678909,0.680941,0.685823,0.676109,0.687854,0.005488,3.8e-05,0.00861,0.003829
0,0.061173,0.364862,0.684419,0.684195,1,l1,"{'C': 1, 'penalty': 'l1'}",2,0.696111,0.679005,0.681037,0.685727,0.676109,0.687854,0.010179,0.51358,0.008509,0.003771
3,0.036153,0.001955,0.684387,0.684307,2,l2,"{'C': 2, 'penalty': 'l2'}",3,0.696495,0.678765,0.680749,0.685871,0.675917,0.688286,0.000972,7.6e-05,0.008786,0.004041
5,0.037497,0.002048,0.684387,0.684307,3,l2,"{'C': 3, 'penalty': 'l2'}",3,0.696495,0.678765,0.680749,0.685871,0.675917,0.688286,0.001878,0.000387,0.008786,0.004041
7,0.034664,0.001852,0.684387,0.684307,4,l2,"{'C': 4, 'penalty': 'l2'}",3,0.696495,0.678765,0.680749,0.685871,0.675917,0.688286,0.000662,0.000222,0.008786,0.004041
1,0.035736,0.001769,0.684387,0.684291,1,l2,"{'C': 1, 'penalty': 'l2'}",3,0.696495,0.678717,0.680749,0.685871,0.675917,0.688286,0.001052,1.6e-05,0.008786,0.004063
11,0.035461,0.002386,0.684355,0.684307,6,l2,"{'C': 6, 'penalty': 'l2'}",7,0.696495,0.678765,0.680749,0.685871,0.675821,0.688286,0.000317,0.000472,0.008817,0.004041
17,0.03779,0.002204,0.684355,0.684307,9,l2,"{'C': 9, 'penalty': 'l2'}",7,0.696495,0.678765,0.680749,0.685871,0.675821,0.688286,0.001791,0.000357,0.008817,0.004041
15,0.035734,0.001733,0.684355,0.684307,8,l2,"{'C': 8, 'penalty': 'l2'}",7,0.696495,0.678765,0.680749,0.685871,0.675821,0.688286,0.00112,3.2e-05,0.008817,0.004041
13,0.035171,0.001758,0.684355,0.684307,7,l2,"{'C': 7, 'penalty': 'l2'}",7,0.696495,0.678765,0.680749,0.685871,0.675821,0.688286,0.000842,2.3e-05,0.008817,0.004041


### Unweighted

In [171]:
log = LogisticRegression()
parameters = {
    "penalty": ["l1", "l2"],
    "C": list(range(1,11))
}
unweighted_clf = GridSearchCV(log, parameters)
unweighted_clf.fit(unweighted_X_train, unweighted_y_train)
pd.DataFrame(unweighted_clf.cv_results_).sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
19,0.039651,0.001707,0.684675,0.68538,10,l2,"{'C': 10, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.001079,2.8e-05,0.007176,0.003437
11,0.036957,0.001743,0.684675,0.68538,6,l2,"{'C': 6, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.001633,6.9e-05,0.007176,0.003437
3,0.040483,0.001798,0.684675,0.68538,2,l2,"{'C': 2, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685439,0.677261,0.689582,0.002909,3.7e-05,0.007176,0.003456
17,0.03819,0.001959,0.684675,0.68538,9,l2,"{'C': 9, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.001531,0.000277,0.007176,0.003437
5,0.036814,0.001747,0.684675,0.68538,3,l2,"{'C': 3, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.001627,4.2e-05,0.007176,0.003437
15,0.037386,0.001813,0.684675,0.68538,8,l2,"{'C': 8, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.001263,4.3e-05,0.007176,0.003437
7,0.037871,0.001787,0.684675,0.68538,4,l2,"{'C': 4, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.001316,1.1e-05,0.007176,0.003437
13,0.037705,0.001882,0.684675,0.68538,7,l2,"{'C': 7, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.002124,0.000128,0.007176,0.003437
9,0.037839,0.001775,0.684675,0.68538,5,l2,"{'C': 5, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685487,0.677261,0.689534,0.002932,1.8e-05,0.007176,0.003437
1,0.037757,0.001951,0.684675,0.68538,1,l2,"{'C': 1, 'penalty': 'l2'}",1,0.694383,0.681118,0.682381,0.685439,0.677261,0.689582,0.001225,0.000154,0.007176,0.003456


## $\checkmark$ Find best hyperparameters for model

## Evaluate best model on test sets, and compare to FiveThirtyEight's ELO prediction probability

In [172]:
best_weighted_clf = weighted_clf.best_estimator_
best_unweighted_clf = unweighted_clf.best_estimator_
print(best_weighted_clf)
print(best_unweighted_clf)

LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [250]:
start_year = 1984
end_year = 2018
df_boxscores = pd.read_csv("../Data/nba_boxscores_{}_{}.csv".format(start_year, end_year), index_col=0, parse_dates=[1], infer_datetime_format=True)
df = df_boxscores[boxscore_season_range_mask(df_boxscores, 1985, 2017)].copy()
elo_results = []
actual_results = []
for _, g in df.iterrows():
    elo_results.append(1 if g["elo_prob1"] > g["elo_prob2"] else 0) 
    actual_results.append(1 if g["score1"] > g["score2"] else 0)
df["elo_pred"] = elo_results 
df["result"] = actual_results

In [251]:
elo_test_pred = df.loc[weighted_X_test.index]["elo_pred"].values
y_test = df.loc[weighted_X_test.index]["result"].values
best_weighted_clf_pred = best_weighted_clf.predict(weighted_X_test)
best_unweighted_clf_pred = best_unweighted_clf.predict(unweighted_X_test)

In [258]:
elo_score = [1 if pred == actual else 0 for pred, actual in zip(elo_test_pred, y_test)]
weighted_score = [1 if pred == actual else 0 for pred, actual in zip(best_weighted_clf_pred, y_test)]
unweighted_score = [1 if pred == actual else 0 for pred, actual in zip(best_unweighted_clf_pred, y_test)]
print("Using FiveThirtyEight's ELO prediction probabilities, the result for the home team is predicted correctly at a rate of {}".format(round(sum(elo_score) / len(elo_score), 4)))
print("Using logistic regression classifier trained on weighted statistics, the result for the home team is predicted correctly at a rate of {}".format(round(sum(weighted_score) / len(weighted_score), 4)))
print("Using logistic regression classifier trained on unweighted statistics, the result for the home team is predicted correctly at a rate of {}".format(round(sum(unweighted_score) / len(unweighted_score), 4)))

Using FiveThirtyEight's ELO prediction probabilities, the result for the home team is predicted correctly at a rate of 0.682
Using logistic regression classifier trained on weighted statistics, the result for the home team is predicted correctly at a rate of 0.6832
Using logistic regression classifier trained on unweighted statistics, the result for the home team is predicted correctly at a rate of 0.6836


In [265]:
X = pd.concat([weighted_X_test, weighted_X_train]).sort_index()
df["weighted_clf_pred"] = best_weighted_clf.predict(X)
df["weighted_clf_pred_prob"] = best_weighted_clf.predict_proba(X)[:, 1]
df["unweighted_clf_pred"] = best_unweighted_clf.predict(X)
df["unweighted_clf_pred_prob"] = best_unweighted_clf.predict_proba(X)[:, 1]

df.to_csv("../Data/nba_boxscores_predictions_1985_2017.csv")

In [226]:
joblib.dump(best_weighted_clf, '../Data/best_weighted_clf.pkl') 
joblib.dump(best_unweighted_clf, '../Data/best_unweighted_clf.pkl'); 

## $\checkmark$ Evaluate best model on test sets, and compare to FiveThirtyEight's ELO prediction probability

## Train new model on all available data

In [22]:
weighted_X_train = joblib.load("../Data/weighted_X_train.pkl")
weighted_X_test = joblib.load("../Data/weighted_X_test.pkl")
weighted_y_train = joblib.load("../Data/weighted_y_train.pkl")
weighted_y_test = joblib.load("../Data/weighted_y_test.pkl")

X = pd.concat([weighted_X_test, weighted_X_train]).sort_index()
y = pd.concat([weighted_y_test, weighted_y_train]).sort_index()

print(X.shape)
print(y.shape)

(39965, 3)
(39965,)


In [23]:
log = LogisticRegression()
parameters = {
    "penalty": ["l1", "l2"],
    "C": list(range(1,11))
}
weighted_clf = GridSearchCV(log, parameters, cv=10)
weighted_clf.fit(X, y)
pd.DataFrame(weighted_clf.cv_results_).sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,split5_test_score,split5_train_score,split6_test_score,split6_train_score,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.1109,0.00113,0.683523,0.683682,1,l1,"{'C': 1, 'penalty': 'l1'}",1,0.678589,0.684377,0.694771,0.682245,0.70653,0.680939,0.70628,0.681161,0.670671,0.68509,0.657908,0.686452,0.670671,0.68509,0.696446,0.682226,0.670671,0.68534,0.682683,0.683894,0.025015,0.000244,0.015866,0.001816
4,0.102825,0.001061,0.683523,0.683693,3,l1,"{'C': 3, 'penalty': 'l1'}",1,0.678339,0.684294,0.694771,0.682245,0.70653,0.680994,0.70628,0.681161,0.670921,0.68509,0.658158,0.686424,0.67042,0.685062,0.695946,0.682393,0.670671,0.685368,0.683183,0.683894,0.015223,0.000216,0.015793,0.001788
2,0.129813,0.001379,0.683473,0.683698,2,l1,"{'C': 2, 'penalty': 'l1'}",3,0.67909,0.684294,0.694771,0.68219,0.70678,0.681133,0.70628,0.681105,0.670921,0.68509,0.657157,0.686424,0.66992,0.685007,0.696446,0.682338,0.670671,0.685451,0.682683,0.68395,0.033125,0.000647,0.016052,0.001789
12,0.096659,0.001027,0.683473,0.683615,7,l1,"{'C': 7, 'penalty': 'l1'}",3,0.67909,0.684322,0.694771,0.682218,0.70678,0.680939,0.705779,0.681078,0.670921,0.68509,0.657157,0.686397,0.66992,0.684868,0.696697,0.682143,0.670671,0.68534,0.682933,0.683755,0.02297,0.000136,0.016001,0.001808
11,0.060167,0.000931,0.683448,0.683626,6,l2,"{'C': 6, 'penalty': 'l2'}",5,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.001084,2.6e-05,0.015947,0.001857
18,0.089981,0.001074,0.683448,0.683684,10,l1,"{'C': 10, 'penalty': 'l1'}",5,0.678339,0.684266,0.694771,0.682273,0.70678,0.681217,0.70628,0.68105,0.671171,0.684923,0.657658,0.686424,0.67017,0.685201,0.696196,0.682365,0.67042,0.685285,0.682683,0.683839,0.013953,0.000204,0.015953,0.001761
17,0.06236,0.000934,0.683448,0.683626,9,l2,"{'C': 9, 'penalty': 'l2'}",5,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.004818,2.2e-05,0.015947,0.001857
15,0.060457,0.000947,0.683448,0.683626,8,l2,"{'C': 8, 'penalty': 'l2'}",5,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.000956,5.9e-05,0.015947,0.001857
13,0.060877,0.000963,0.683448,0.683626,7,l2,"{'C': 7, 'penalty': 'l2'}",5,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.002098,5.8e-05,0.015947,0.001857
10,0.105577,0.000938,0.683448,0.68367,6,l1,"{'C': 6, 'penalty': 'l1'}",5,0.678339,0.684294,0.694771,0.682245,0.70678,0.680966,0.70628,0.681133,0.670671,0.685118,0.657908,0.68648,0.66992,0.684979,0.696196,0.682365,0.670671,0.685368,0.682933,0.683755,0.015097,2.6e-05,0.015952,0.001802


In [24]:
log = LogisticRegression()
parameters = {
    "penalty": ["l1", "l2"],
    "C": list(range(1,11))
}
unweighted_clf = GridSearchCV(log, parameters, cv=10)
unweighted_clf.fit(X, y)
pd.DataFrame(unweighted_clf.cv_results_).sort_values("mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,split5_test_score,split5_train_score,split6_test_score,split6_train_score,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
16,0.091189,0.000955,0.683498,0.683648,9,l1,"{'C': 9, 'penalty': 'l1'}",1,0.678339,0.684211,0.694771,0.682245,0.70678,0.680939,0.70628,0.681105,0.670671,0.68509,0.657658,0.686508,0.67042,0.68509,0.696446,0.682226,0.670921,0.685146,0.682683,0.683922,0.022809,7.4e-05,0.015952,0.00181
10,0.103542,0.001022,0.683473,0.683665,6,l1,"{'C': 6, 'penalty': 'l1'}",2,0.678589,0.684377,0.694771,0.682218,0.70678,0.680994,0.70628,0.681022,0.670671,0.685062,0.657407,0.68648,0.67017,0.685062,0.696446,0.682254,0.670671,0.685368,0.682933,0.683811,0.02156,0.000229,0.016024,0.001829
9,0.061413,0.000991,0.683448,0.683626,5,l2,"{'C': 5, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.0027,6.9e-05,0.015947,0.001857
17,0.103211,0.001482,0.683448,0.683626,9,l2,"{'C': 9, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.018222,0.000456,0.015947,0.001857
15,0.069048,0.001195,0.683448,0.683626,8,l2,"{'C': 8, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.015031,0.000396,0.015947,0.001857
13,0.066716,0.001013,0.683448,0.683626,7,l2,"{'C': 7, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.005148,0.000143,0.015947,0.001857
11,0.06305,0.000965,0.683448,0.683626,6,l2,"{'C': 6, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.005689,8.4e-05,0.015947,0.001857
19,0.082553,0.001176,0.683448,0.683626,10,l2,"{'C': 10, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.021672,0.000267,0.015947,0.001857
7,0.059556,0.001022,0.683448,0.683626,4,l2,"{'C': 4, 'penalty': 'l2'}",3,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695696,0.682199,0.670921,0.685173,0.682683,0.684006,0.000543,0.000315,0.015947,0.001857
1,0.060792,0.000994,0.683423,0.683623,1,l2,"{'C': 1, 'penalty': 'l2'}",10,0.678589,0.684433,0.694521,0.682106,0.70728,0.680855,0.70603,0.680911,0.670921,0.685062,0.657157,0.686369,0.670671,0.685146,0.695445,0.682199,0.670921,0.685173,0.682683,0.683978,0.001782,0.000124,0.015928,0.001856


In [27]:
best_weighted_clf = weighted_clf.best_estimator_
best_unweighted_clf = unweighted_clf.best_estimator_
print(best_weighted_clf)
print(best_unweighted_clf)
joblib.dump(best_weighted_clf, "../Data/best_weighted_clf_all_data.pkl")
joblib.dump(best_unweighted_clf, "../Data/best_unweighted_clf_all_data.pkl");

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=9, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


## $\checkmark$ Train new model on all available data