In [1]:
import pandas as pd
from IPython.core.display import display, HTML

# Inject CSS so the notebook's `.container` element spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Render up to 50 columns when displaying wide DataFrames.
pd.options.display.max_columns = 50

In [2]:
# Maps full team names (as used in the season summaries) to the 3 letter initials used in the boxscores.
# NOTE: the mapping is many-to-one for 'NOP' (New Orleans Hornets and New Orleans Pelicans share it),
# so key order matters when the dictionary is reversed below.
nba_1984_2018_initials = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    # 'CHH' here; boxscores use "CHO" for the post 2014 Charlotte Hornets, which is
    # special-cased in regular_season_metrics and abbrev_dict_for_season.
    'Charlotte Hornets': 'CHH',
    'Charlotte Bobcats': 'CHO',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Kansas City Kings': 'KCK',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    # shares 'NOP' with 'New Orleans Pelicans' below
    'New Orleans Hornets' : 'NOP',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'New Orleans Pelicans': 'NOP',
    'New Jersey Nets': 'NJN',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Diego Clippers': 'SDC',
    'San Antonio Spurs': 'SAS',
    'Seattle SuperSonics': 'SEA',
    'Toronto Raptors': 'TOR',
    'Vancouver Grizzlies': 'VAN',
    'Utah Jazz': 'UTA',
    'Washington Bullets': 'WSB',
    'Washington Wizards': 'WAS'
}
# Initials -> full name. Duplicate initials collapse to the LAST full name listed above,
# so 'NOP' resolves to 'New Orleans Pelicans' and 'New Orleans Hornets' is not reachable from here.
nba_1984_2018_initials_reversed = {v:k for k, v in nba_1984_2018_initials.items()}

In [3]:
def boxscore_season_range_mask(df, start_year, end_year):
    '''
    boolean mask for boxscore rows whose "season" lies between start_year and end_year (both inclusive)
    '''
    seasons = df["season"]
    return seasons.ge(start_year) & seasons.le(end_year)


def boxscore_date_range_mask(df, start_date, end_date):
    '''
    boolean mask for boxscore rows whose "date" lies between start_date and end_date (both inclusive)
    '''
    dates = df["date"]
    return dates.ge(start_date) & dates.le(end_date)


def boxscore_team_mask(df, team_initials):
    '''
    boolean mask for boxscore rows where the given team appears as either "team1" or "team2"
    '''
    return df["team1"].eq(team_initials) | df["team2"].eq(team_initials)


def boxscore_regular_season_mask(df):
    '''
    boolean mask for boxscore rows whose "playoff" value is null
    '''
    return df["playoff"].isna()

def summary_season_range(df, start_year, end_year):
    '''
    slice the season summaries to the seasons between start_year and end_year (label based, both inclusive)

    df: pd.DataFrame, season summaries whose (sorted) index starts with the season
    start_year: int, first season to keep
    end_year: int, last season to keep

    returns the rows of df selected by df.loc[start_year:end_year]
    '''
    return df.loc[start_year:end_year]


def summary_season_query(df, years, teams, col_names):
    '''
    select the given columns for the given (season, team) combinations

    df: pd.DataFrame, season summaries indexed by (season, team name)
    years: season or list of seasons
    teams: team name or list of team names
    col_names: column name or list of column names

    returns the selection made by df.loc[(years, teams), col_names]
    '''
    return df.loc[(years, teams), col_names]


def summary_season_remove_league_average(df):
    '''
    drop the "League Average" rows from the season summaries

    df: pd.DataFrame, season summaries indexed by (season, team name)

    returns a new dataframe without the rows whose second index level equals "League Average"; df itself is not modified
    '''
    # The previous definition took no argument and returned an undefined global `df`.
    team_level = df.index.get_level_values(1)
    return df[team_level != "League Average"]

# Features for model

Visualizations in the data exploration notebook show that the strongest indicators of wins at the season level are net rating, SRS, and ELO rating. It is reasonable to hypothesize that the same metrics indicate win probability on a game-by-game basis. The first model will attempt to predict the home team's single-game win probability based solely on the pre-game net rating, SRS, and ELO rating of the home and away teams. Each team's initial net rating and SRS are set to its values at the end of the prior season, or to the lower-quartile league value of the preceding season for expansion teams in their first season.

## Creating dataset

### Desired

| Name           |  Type         | Description                                                                       |
| -------------  |-------------  | ----------------------------------------------------------------------------------|
| team1_SRS      | double        |   Pre-game SRS of home team                                                       |
| team2_SRS      | double        |   Pre-game SRS of away team                                                       |     
| team1_NetRtg   | double        |   Pre-game season average net rating of home team                                 |
| team2_NetRtg   | double        |   Pre-game season average net rating of away team                                 |
| team1_ELO      | double        |   Pre-game ELO rating of home team                                                |
| team2_ELO      | double        |   Pre-game ELO rating of away team                                                |
| result         | int           |   1 = home team win, 0 = home team loss                                           |

In [4]:
def margin_for_team(abbrev, margins):
    '''
    compute the mean margin of victory of one team

    abbrev: string, 3 letter initial of NBA team
    margins: dict, key is team initials and value is list containing the margin of victory for each game

    returns the arithmetic mean of the margins recorded for the given team (ZeroDivisionError if none are recorded)
    '''
    team_margins = margins[abbrev]
    total = sum(team_margins)
    return total / len(team_margins)

def weighted_margin_for_team(abbrev, margins, games_in_season=82):
    '''
    find the weighted average margin of victory for a team, where previous season average margin is weighted by ((games_in_season - games played) / games_in_season) and current season average margin is weighted by (games played / games_in_season)

    abbrev: string, 3 letter initial of NBA team
    margins: dict, key is team initials and value is list containing the margin of victory for each game where first element equals last season average margin of victory
    games_in_season (optional, default=82): int, number of regular season games; pass a smaller value for shortened seasons

    returns weighted average margin of victory for given team over time period encompassed by margins.
    '''
    last_season_margin = margins[abbrev][0]
    this_season_margins = margins[abbrev][1:]
    gp = len(this_season_margins)
    if gp == 0:
        # no games played, return last season's margin
        return last_season_margin
    current_season_margin = sum(this_season_margins) / gp
    if gp > games_in_season:
        # more games than a regular season (team in playoffs), use only this season's margin
        return current_season_margin
    # blend: the more games played, the less last season counts
    return (gp / games_in_season * current_season_margin) + (((games_in_season - gp) / games_in_season) * last_season_margin)

def sos_for_team(abbrev, schedule, margins):
    '''
    compute a team's strength of schedule: the average margin of victory of the opponents it has faced, with each opponent counted once per game played against it

    abbrev: string, 3 letter initial of NBA team
    schedule: dict, key is team initials, value is dictionary where key is opponent initials, and value is # of games played vs. opponent during period in question
    margins: dict, key is team initials and value is list containing the margin of victory for each game

    returns strength of schedule for the given team (ZeroDivisionError if the team has no opponents in schedule)
    '''
    opponent_margins = []
    for opponent, games_played in schedule[abbrev].items():
        # repeat the opponent's average margin once per meeting
        opponent_margins.extend([margin_for_team(opponent, margins)] * games_played)
    return sum(opponent_margins) / len(opponent_margins)

def weighted_sos_for_team(abbrev, schedule, margins):
    '''
    same as sos_for_team, except that each opponent's average margin comes from weighted_margin_for_team instead of margin_for_team

    abbrev: string, 3 letter initial of NBA team
    schedule: dict, key is team initials, value is dictionary where key is opponent initials, and value is # of games played vs. opponent during period in question
    margins: dict, key is team initials and value is list containing the margin of victory for each game where first element equals last season average margin of victory

    returns weighted strength of schedule for the given team (ZeroDivisionError if the team has no opponents in schedule)
    '''
    opponent_margins = []
    for opponent, games_played in schedule[abbrev].items():
        # repeat the opponent's weighted margin once per meeting
        opponent_margins.extend([weighted_margin_for_team(opponent, margins)] * games_played)
    return sum(opponent_margins) / len(opponent_margins)

def average_net_rating_for_team(abbrev, ratings):
    '''
    compute the mean net rating of one team

    abbrev: string, 3 letter initial of NBA team
    ratings: dict, key is team initials and value is list containing net rating for each game played during period in question

    returns the arithmetic mean of the net ratings recorded for the given team (ZeroDivisionError if none are recorded)
    '''
    team_ratings = ratings[abbrev]
    total = sum(team_ratings)
    return total / len(team_ratings)

def weighted_average_net_rating_for_team(abbrev, ratings, games_in_season=82):
    '''
    see average_net_rating_for_team, but instead weighs previous season rating by ((games_in_season - games played) / games_in_season) and current season average net rating by (games played / games_in_season)

    abbrev: string, 3 letter initial of NBA team
    ratings: dict, key is team initials and value is list containing net rating for each game played during period in question where first element equals last season's average net rating
    games_in_season (optional, default=82): int, number of regular season games; pass a smaller value for shortened seasons

    returns the weighted average net rating of given team over time period encompassed by ratings
    '''
    last_season_rating = ratings[abbrev][0]
    this_season_ratings = ratings[abbrev][1:]
    gp = len(this_season_ratings)
    if gp == 0:
        # no games played, return last season's average net rating
        return last_season_rating
    current_season_rating = sum(this_season_ratings) / gp
    if gp > games_in_season:
        # more games than a regular season (team in playoffs), use only this season's net ratings
        return current_season_rating
    # blend: the more games played, the less last season counts
    return (gp / games_in_season * current_season_rating) + (((games_in_season - gp) / games_in_season) * last_season_rating)

def is_first_game_of_season(game, abbrev, boxscores_df):
    '''
    check whether the given game is the first one listed for the given team in that game's season

    game: pd.Series, see nba_boxscores_1984_2018.csv for format (game is single row)
    abbrev: string, 3 letter initial of NBA team
    boxscores_df: pd.DataFrame, see nba_boxscores_1984_2018.csv for format

    returns boolean, True when the index label of game equals the index label of the first matching row of boxscores_df
    '''
    season = game["season"]
    involves_team = boxscore_team_mask(boxscores_df, abbrev)
    in_same_season = boxscore_season_range_mask(boxscores_df, season, season)
    team_games_this_season = boxscores_df[involves_team & in_same_season]
    # "first" follows the row order of boxscores_df
    return game.name == team_games_this_season.index[0]

def regular_season_metrics(abbrev, season, season_summaries_df, rating_cols):
    '''
    determine end of regular season metrics for the given team and given season

    abbrev: string, 3 letter initial of NBA team
    season: int, season to get metrics for (2017 = 2016-17)
    season_summaries_df: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format
    rating_cols: list of metrics desired, see nba_season_summaries_1984_2018.csv columns for possible values

    returns the requested metrics for the team, as selected by summary_season_query
    raises KeyError if abbrev is unknown or the resolved team name has no row for the season
    '''
    # Some initials stand for more than one full team name, so the reversed lookup alone is ambiguous.
    if abbrev == "CHO" and season > 2014:
        # boxscores use "CHO" for both Charlotte Bobcats and post 2014 Charlotte Hornets
        team_name = "Charlotte Hornets"
    elif abbrev == "NOP" and season < 2014:
        # "NOP" reverses to "New Orleans Pelicans"; before the 2013-14 season the team
        # appears as "New Orleans Hornets"
        team_name = "New Orleans Hornets"
    else:
        team_name = nba_1984_2018_initials_reversed[abbrev]
    return summary_season_query(season_summaries_df, season, team_name, rating_cols)

def abbrev_dict_for_season(season, season_summaries_df):
    '''
    build the lookup table between season summary teams (full names) and boxscore teams (3 letter initials) for one season

    season: int, (2017 = 2016-17 NBA season)
    season_summaries_df: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format

    returns dict, key is full team name and value is 3 letter initials, limited to teams listed for the given season
    '''
    season_teams = season_summaries_df.loc[season].index.tolist()
    # the summaries carry a "League Average" row per season, which is not a team
    season_teams.remove("League Average")
    lookup = {}
    for full_name, initials in nba_1984_2018_initials.items():
        if full_name in season_teams:
            lookup[full_name] = initials
    # handle edge case of boxscores using "CHO" for both Charlotte Bobcats and post 2014 Charlotte Hornets
    if season > 2014:
        lookup["Charlotte Hornets"] = "CHO"
    return lookup

In [5]:
start_year = 1984
end_year = 2018
# Season summaries: the first two CSV columns form the index, sorted for MultiIndex slicing support.
df_season_summaries = pd.read_csv(
    "../Data/nba_season_summaries_{}_{}.csv".format(start_year, end_year),
    index_col=[0, 1],
).sort_index()
# Boxscores: first CSV column is the index, the column at position 1 is parsed as dates.
df_boxscores = pd.read_csv(
    "../Data/nba_boxscores_{}_{}.csv".format(start_year, end_year),
    index_col=0,
    parse_dates=[1],
    infer_datetime_format=True,
)

In [7]:
def compute_features_for_season(season, df_boxscores, df_season_summaries, debug=False, weighted=True):
    '''
    generate dataframe with desired model features for all the games in a given season

    The season is walked game by game in the row order of df_boxscores. Features for a game are
    computed from the tallies of earlier games only; the game's own margin and net rating are
    added to the tallies afterwards.

    season: int, (2017 = 2016-17 NBA season)
    df_boxscores: pd.DataFrame, see nba_boxscores_1984_2018.csv for format
    df_season_summaries: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format
    debug (optional, default=False): boolean, toggle debug print statements
    weighted (optional, default=True): boolean, determines whether SRS and average net rating calculations are weighted according to games played. When true, the average net rating of a team before a game equals (gp/82 * current_season_average_net) + ((82 - gp)/82) * last_season_average_net

    returns a dataframe with model features (see Features for model --> Desired for details)
    raises KeyError if a boxscore team is missing from the abbreviation mapping of the season, and ValueError if the previous season has no "League Average" row
    '''
    previous_season = season - 1
    if debug:
        print("Computing features for season: {}".format(season))
        print("Performing setup for season")
        print("****************************")
    # Get end of regular season SRS, NetRtg, and MOV for previous season
    last_season_team_names = df_season_summaries.loc[previous_season].index.tolist()
    last_season_team_names.remove("League Average")
    last_season_abbrev_dict = {v: k for k, v in abbrev_dict_for_season(previous_season, df_season_summaries).items()}
    last_season_metrics = summary_season_query(df_season_summaries, previous_season, last_season_team_names, ["SRS", "NetRtg", "MOV"]).loc[previous_season]
    if debug:
        print("Successfully received metrics for {} season".format(previous_season))
    # dictionary for mapping between team initials (used in boxscore) and full team names (used in summary)
    this_season_abbrev_dict = {v: k for k, v in abbrev_dict_for_season(season, df_season_summaries).items()}
    if debug:
        print("Successfully created abbreviation mapping for {} season".format(season))
    # dictionaries for storing margins of victory and schedule (used for SRS) and net ratings for each team on a per-game basis (used for avg. NetRtg)
    margins = {abbrev: [] for abbrev in this_season_abbrev_dict}
    schedule = {abbrev: {} for abbrev in this_season_abbrev_dict}
    net_ratings = {abbrev: [] for abbrev in this_season_abbrev_dict}
    if debug:
        print("Successfully initialized dictionaries for storing MOV, schedule, and net ratings for each team on per-game basis")
    # dictionary for storing feature data
    columns = [
       "season", "team1", "team2", "team1_SRS", "team2_SRS", "team1_NetRtg", "team2_NetRtg", "team1_ELO", "team2_ELO", "result"
    ]
    data = {c: [] for c in columns}
    if debug:
        print("Successfully initialized dictionaries for storing feature data")
        print("Finished setup for season")
        print("****************************")
        print("Beginning walkthrough of {} season".format(season))
    # filter the season once; reused for the walkthrough and for the final shape check
    season_games = df_boxscores[boxscore_season_range_mask(df_boxscores, season, season)]
    # walk through current season game by game, computing desired features for each game
    for i, (_, game) in enumerate(season_games.iterrows()):
        team1 = game["team1"]
        team1_score = game["score1"]
        team2 = game["team2"]
        team2_score = game["score2"]
        data["team1"].append(team1)
        data["team2"].append(team2)
        data["season"].append(season)
        # determine result of game
        data["result"].append(1 if team1_score > team2_score else 0)
        if debug:
            print("Game {}".format(i))
            print("****************************")
            print("{} at {}: {} - {}".format(team2, team1, team2_score, team1_score))
            print("Recorded result as {} for {}".format(data["result"][-1], team1))
        # determine pre-game SRS and NetRtg for home and away team
        for t, key_prefix in zip([team1, team2], ["team1", "team2"]):
            # A team's tally stays empty until its first game of the season is processed, so an
            # empty tally identifies that game without re-filtering the whole boxscore frame.
            if not margins[t]:
                if debug:
                    print("First game of season for {}. Attempt to use last season's metrics".format(t))
                # first game of season for team, use last season's SRS and NetRtg values. Add NetRtg and MOV from last season as first element of current season tally to reduce variance for early season games.
                try:
                    team_metrics = last_season_metrics.loc[last_season_abbrev_dict[t]]
                    srs = team_metrics["SRS"]
                    net = team_metrics["NetRtg"]
                    mov = team_metrics["MOV"]
                except KeyError:
                    # initials not found in last season: first season of expansion franchise. Set to lower quartile value of previous season
                    # NOTE(review): the lookup is by initials, so a team whose initials changed between seasons also ends up here - confirm this is intended
                    if debug:
                        print("Expansion team, no results avaiable from last season. Attempt to use lower quartile results of previous season")
                    srs = last_season_metrics["SRS"].quantile(0.25)
                    net = last_season_metrics["NetRtg"].quantile(0.25)
                    mov = last_season_metrics["MOV"].quantile(0.25)
                data["{}_SRS".format(key_prefix)].append(srs)
                data["{}_NetRtg".format(key_prefix)].append(net)
                net_ratings[t].append(net)
                margins[t].append(mov)
                if debug:
                    print("Recorded SRS: {}, NetRtg: {} for {}".format(srs, net, t))
                    print("Added NetRtg: {} and MOV: {} to per-game dictionaries for {}".format(net, mov, t))
            else:
                # compute pre-game SRS (margin of victory + strength of schedule), avg. NetRtg
                if weighted:
                    net = weighted_average_net_rating_for_team(t, net_ratings)
                    srs = weighted_margin_for_team(t, margins) + weighted_sos_for_team(t, schedule, margins)
                else:
                    net = average_net_rating_for_team(t, net_ratings)
                    srs = margin_for_team(t, margins) + sos_for_team(t, schedule, margins)
                data["{}_NetRtg".format(key_prefix)].append(net)
                data["{}_SRS".format(key_prefix)].append(srs)
                if debug:
                    print("Calculated pre-game SRS: {} and average NetRtg: {} for {}".format(srs, net, t))
            # determine pre-game ELO
            elo = game["elo1_pre"] if t == team1 else game["elo2_pre"]
            data["{}_ELO".format(key_prefix)].append(elo)
            if debug:
                print("Pre-game ELO: {} for {}".format(elo, t))
        # update margins
        mov_team1 = team1_score - team2_score
        mov_team2 = -mov_team1
        margins[team1].append(mov_team1)
        margins[team2].append(mov_team2)
        if debug:
            print("Updated margins for {}: {}, {}: {}".format(team1, mov_team1, team2, mov_team2))
        # update schedule (head-to-head counts are kept symmetric)
        meetings = schedule[team1].get(team2, 0) + 1
        schedule[team1][team2] = meetings
        schedule[team2][team1] = meetings
        if debug:
            print("Updated schedule. {} now played {} {} times, and {} played {} {} times".format(team1, team2, schedule[team1][team2], team2, team1, schedule[team2][team1]))
        # update net ratings
        net_team1 = game["team1_NetRtg"]
        net_team2 = game["team2_NetRtg"]
        net_ratings[team1].append(net_team1)
        net_ratings[team2].append(net_team2)
        if debug:
            print("Updated net ratings for {}: {}, {}: {}".format(team1, net_team1, team2, net_team2))
            print("****************************")
    # assure shape match after feature processing
    total_games_for_season = season_games.shape[0]
    for key in data.keys():
        assert len(data[key]) == total_games_for_season, "Mismatch for key {}".format(key)
    return pd.DataFrame(data)

In [8]:
start_year = 1985
end_year = 2018
# One feature frame per season, stacked. range() stops at end_year - 1, so the 2018 season is not processed.
unweighted_df = pd.concat(
    [
        compute_features_for_season(yr, df_boxscores, df_season_summaries, debug=False, weighted=False)
        for yr in range(start_year, end_year)
    ]
)
weighted_df = pd.concat(
    [
        compute_features_for_season(yr, df_boxscores, df_season_summaries, debug=False, weighted=True)
        for yr in range(start_year, end_year)
    ]
)

In [9]:
# Sanity check: both feature frames should have as many rows as the boxscores of the processed seasons.
print(
    unweighted_df.shape,
    weighted_df.shape,
    df_boxscores[boxscore_season_range_mask(df_boxscores, start_year, end_year-1)].shape,
    sep="\n",
)

(39967, 10)
(39967, 10)
(39967, 36)


In [11]:
# Persist both feature sets to CSV. The index is written as the first CSV column; because the
# per-season frames were concatenated without ignore_index, it restarts at 0 for every season
# and is therefore not unique across the file.
unweighted_df.to_csv("../Data/feature_df_unweighted.csv")
weighted_df.to_csv("../Data/feature_df_weighted.csv")

## $\checkmark$  Creating dataset