In [2]:
import pandas as pd
# `IPython.display` is the public home of display/HTML; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width so wide frames fit.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Render up to 50 columns before pandas starts truncating DataFrame output.
pd.set_option('display.max_columns', 50)

In [87]:
# Full team name -> 3 letter initials.
# Entry order matters: two names share "NOP", and the reversed lookup below
# keeps whichever of them is listed last.
nba_1984_2018_initials = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Charlotte Hornets": "CHH",
    "Charlotte Bobcats": "CHO",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Kansas City Kings": "KCK",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Orleans Hornets": "NOP",
    "New Orleans/Oklahoma City Hornets": "NOK",
    "New Orleans Pelicans": "NOP",
    "New Jersey Nets": "NJN",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Diego Clippers": "SDC",
    "San Antonio Spurs": "SAS",
    "Seattle SuperSonics": "SEA",
    "Toronto Raptors": "TOR",
    "Vancouver Grizzlies": "VAN",
    "Utah Jazz": "UTA",
    "Washington Bullets": "WSB",
    "Washington Wizards": "WAS",
}
# 3 letter initials -> full team name (lossy: "NOP" resolves to "New Orleans Pelicans").
nba_1984_2018_initials_reversed = dict(
    (initials, full_name) for full_name, initials in nba_1984_2018_initials.items()
)

In [16]:
def boxscore_season_range_mask(df, start_year, end_year):
    '''
    boolean mask selecting rows whose "season" lies in [start_year, end_year] (both ends inclusive)
    '''
    seasons = df["season"]
    return (seasons >= start_year) & (seasons <= end_year)


def boxscore_date_range_mask(df, start_date, end_date):
    '''
    boolean mask selecting rows whose "date" lies in [start_date, end_date] (both ends inclusive)
    '''
    dates = df["date"]
    return (dates >= start_date) & (dates <= end_date)


def boxscore_team_mask(df, team_initials):
    '''
    boolean mask selecting rows where the given team appears as either "team1" or "team2"
    '''
    is_team1 = df["team1"] == team_initials
    is_team2 = df["team2"] == team_initials
    return is_team1 | is_team2


def boxscore_regular_season_mask(df):
    '''
    boolean mask selecting rows whose "playoff" value is null
    '''
    return df["playoff"].isnull()

def summary_season_range(df, start_year, end_year):
    '''
    label-slice the frame's index from start_year through end_year (both ends inclusive)
    '''
    return df.loc[start_year:end_year]


def summary_season_query(df, years, teams, col_names):
    '''
    select col_names for the rows addressed by the (years, teams) index key
    '''
    row_key = (years, teams)
    return df.loc[row_key, col_names]
# NOTE(review): this lambda takes no arguments and returns the name `df`, which
# is not bound in this cell, so a call only works if a global `df` happens to
# exist. Looks like an unfinished stub; presumably it was meant to accept a
# season summary frame and drop its "League Average" rows. TODO confirm and implement.
summary_season_remove_league_average = lambda: df

# Features for model

Visualizations in the data exploration notebook show that the most important indicators of wins at the season level are net rating, SRS, and ELO rating. It is reasonable to hypothesize that those same metrics are indicators of win probability on a game-by-game basis. The first model will attempt to predict the home team's single-game win probability based solely on the pre-game net rating, SRS, and ELO rating of the home and away teams. For each team, the initial net rating and SRS are set to their values at the end of the prior season.

## Creating dataset

### Desired

| Name           |  Type         | Description                                                                       |
| -------------  |-------------  | ----------------------------------------------------------------------------------|
| team1          | string        |   Home team initials                                                              |
| team2          | string        |   Away team initials                                                              |
| team1_SRS      | double        |   Pre-game SRS of home team                                                       |
| team2_SRS      | double        |   Pre-game SRS of away team                                                       |     
| team1_NetRtg   | double        |   Pre-game season average net rating of home team                                 |
| team2_NetRtg   | double        |   Pre-game season average net rating of away team                                 |
| team1_ELO      | double        |   Pre-game ELO rating of home team                                                |
| team2_ELO      | double        |   Pre-game ELO rating of away team                                                |
| result         | int           |   1 = home team win, 0 = home team loss                                           |

In [238]:
def margin_for_team(abbrev, margins):
    '''
    average margin of victory for one team across the games recorded so far

    abbrev: string, 3 letter initial of NBA team
    margins: dict, team initials -> list with one margin of victory per game played

    returns the mean of the team's recorded margins (the list must not be empty)
    '''
    team_margins = margins[abbrev]
    total_margin = sum(team_margins)
    return total_margin / len(team_margins)

def sos_for_team(abbrev, schedule, margins):
    '''
    strength of schedule for one team: the average margin of victory of the opponents it has faced, with each opponent counted once per game played against it

    abbrev: string, 3 letter initial of NBA team
    schedule: dict, team initials -> dict of opponent initials -> # of games played vs. that opponent
    margins: dict, team initials -> list with one margin of victory per game played

    returns the games-weighted mean of the opponents' average margins (the team must have at least one opponent)
    '''
    weighted_opponent_margins = []
    for opponent, games_played in schedule[abbrev].items():
        # repeat the opponent's average once per meeting so it is weighted by games played
        weighted_opponent_margins.extend([margin_for_team(opponent, margins)] * games_played)
    return sum(weighted_opponent_margins) / len(weighted_opponent_margins)

def is_first_game_of_season(game, abbrev, boxscores_df):
    '''
    tell whether the given game is the earliest row for the given team within that game's season

    game: pd.Series, a single row of boxscores_df (see nba_boxscores_1984_2018.csv for format)
    abbrev: string, 3 letter initial of NBA team
    boxscores_df: pd.DataFrame, see nba_boxscores_1984_2018.csv for format

    returns boolean; raises IndexError when the team has no rows in that season
    '''
    game_season = game["season"]
    involves_team = boxscore_team_mask(boxscores_df, abbrev)
    in_same_season = boxscore_season_range_mask(boxscores_df, game_season, game_season)
    team_season_games = boxscores_df[involves_team & in_same_season]
    # rows keep boxscores_df order, so the first index label is the team's first listed game
    return game.name == team_season_games.index[0]

def regular_season_metrics(abbrev, season, season_summaries_df, rating_cols):
    '''
    determine end of regular season metrics for the given team and given season
    
    abbrev: string, 3 letter initial of NBA team
    season: int, season to get metrics for (2017 = 2016-17)
    season_summaries_df: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format
    rating_cols: list of metrics desired, see nba_season_summaries_1984_2018.csv columns for possible values
    
    returns pd.Series with requested metrics
    raises KeyError if abbrev is unknown or the resolved team has no row for that season
    '''
    # Two abbreviations are shared by more than one team name, so the reversed
    # lookup alone picks the wrong name for some seasons.
    if abbrev == "CHO" and season > 2014:
        # boxscores use "CHO" for both Charlotte Bobcats and post 2014 Charlotte Hornets
        team_name = "Charlotte Hornets"
    elif abbrev == "NOP" and season < 2014:
        # "NOP" reverses to "New Orleans Pelicans"; the franchise was still the
        # New Orleans Hornets before the 2013-14 season
        team_name = "New Orleans Hornets"
    else:
        team_name = nba_1984_2018_initials_reversed[abbrev]
    return summary_season_query(season_summaries_df, season, team_name, rating_cols)

def abbrev_dict_for_season(season, season_summaries_df):
    '''
    build the lookup table of season summary team names (full names) -> boxscore initials (3 letters) for the teams listed in one season

    season: int, (2017 = 2016-17 NBA season)
    season_summaries_df: pd.DataFrame, see nba_season_summaries_1984_2018.csv for format

    returns dict of full team name -> initials; raises ValueError if the season has no "League Average" row
    '''
    names_in_season = season_summaries_df.loc[season].index.tolist()
    names_in_season.remove("League Average")
    lookup = {}
    for full_name, initials in nba_1984_2018_initials.items():
        if full_name in names_in_season:
            lookup[full_name] = initials
    # boxscores use "CHO" for both Charlotte Bobcats and post 2014 Charlotte Hornets
    if season > 2014:
        lookup["Charlotte Hornets"] = "CHO"
    return lookup

In [253]:
start_year = 1984
end_year = 2018

# Season summaries: the first two CSV columns form a two-level index, which is
# sorted right after loading so MultiIndex slicing is supported.
df_season_summaries = pd.read_csv(
    "../Data/nba_season_summaries_{}_{}.csv".format(start_year, end_year),
    index_col=[0, 1],
).sort_index()

# Boxscores: first CSV column is the index, the next one is parsed as dates.
df_boxscores = pd.read_csv(
    "../Data/nba_boxscores_{}_{}.csv".format(start_year, end_year),
    index_col=0,
    parse_dates=[1],
    infer_datetime_format=True,
)

In [275]:
# Scratch check of the lower-quartile SRS used as the fallback for teams with no prior season.
# NOTE(review): `last_season_metrics` is only assigned in a later cell, so this
# cell fails on a fresh kernel run top to bottom.
s = last_season_metrics["SRS"]
s.quantile(0.25)

-2.95

In [277]:
season = 2005

# Last season's final ratings seed each team's first game of this season.
last_season_team_names = df_season_summaries.loc[season-1].index.tolist()
last_season_team_names.remove("League Average")
last_season_abbrev_dict = {v:k for k, v in abbrev_dict_for_season(season-1, df_season_summaries).items()}
last_season_metrics = summary_season_query(df_season_summaries, season-1, last_season_team_names, ["SRS", "NetRtg"]).loc[season-1]

this_season_abbrev_dict = {v:k for k, v in abbrev_dict_for_season(season, df_season_summaries).items()}

# Used for computing SRS and avg. NetRtg during season
margins = {}      # team initials -> list of margins of victory, one per game played
schedule = {}     # team initials -> {opponent initials: games played vs. opponent}
net_ratings = {}  # team initials -> list of net ratings recorded for the team
for abbrev in this_season_abbrev_dict.keys():
    margins[abbrev] = []
    schedule[abbrev] = {}
    net_ratings[abbrev] = []
columns = [
    "team1", "team2", "team1_SRS", "team2_SRS", "team1_NetRtg", "team2_NetRtg", "team1_ELO", "team2_ELO", "result"
]
# One list per output column; every game appends exactly one value to each list.
data = {c:[] for c in columns}

for _, game in df_boxscores[boxscore_season_range_mask(df_boxscores, season, season)].iterrows():
    team1 = game["team1"]
    team1_score = game["score1"]
    team2 = game["team2"]
    team2_score = game["score2"]
    # Margin must be known before the result is recorded; it is only folded into
    # `margins` after the pre-game features have been captured.
    mov_team1 = team1_score - team2_score
    mov_team2 = -mov_team1
    # record teams
    data["team1"].append(team1)
    data["team2"].append(team2)
    # determine pre-game SRS, NetRtg and ELO
    for t, key_prefix, elo_col in [(team1, "team1", "elo1_pre"), (team2, "team2", "elo2_pre")]:
        if is_first_game_of_season(game, t, df_boxscores):
            # first game of season for team, use last season's SRS and NetRtg values
            try:
                prior = last_season_metrics.loc[last_season_abbrev_dict[t]]
                srs = prior["SRS"]
                net = prior["NetRtg"]
            except KeyError:
                # first season of expansion franchise. Set both to lower quartile value of previous season
                srs = last_season_metrics["SRS"].quantile(0.25)
                net = last_season_metrics["NetRtg"].quantile(0.25)
            net_ratings[t].append(net)
        else:
            # compute pre-game SRS, avg. NetRtg
            # NOTE(review): net_ratings[t] is only ever given the seed value above, so
            # this average stays equal to that seed. TODO add per-game net ratings.
            net = sum(net_ratings[t]) / len(net_ratings[t])
            srs = margin_for_team(t, margins) + sos_for_team(t, schedule, margins)
        data["{}_SRS".format(key_prefix)].append(srs)
        data["{}_NetRtg".format(key_prefix)].append(net)
        data["{}_ELO".format(key_prefix)].append(game[elo_col])
    # determine result of game
    data["result"].append(1 if mov_team1 > 0 else 0)
    # update margins
    margins[team1].append(mov_team1)
    margins[team2].append(mov_team2)
    # update schedule (head-to-head count is symmetric)
    gp = schedule[team1].get(team2, 0) + 1
    schedule[team1][team2] = gp
    schedule[team2][team1] = gp

# Every column must have received one value per game.
assert len({len(values) for values in data.values()}) == 1, "feature columns have unequal lengths"

First game for DAL
First game for SAC
First game for LAL
First game for DEN
First game for DET
First game for HOU
First game for SAS
First game for NJN
First game for MIA
First game for CLE
First game for IND
First game for NOP
First game for LAC
First game for SEA
First game for GSW
First game for POR
First game for ORL
First game for MIL
First game for MIN
First game for NYK
First game for BOS
First game for PHI
First game for PHO
First game for ATL
First game for MEM
First game for WAS
First game for TOR
First game for UTA
First game for CHO
CHO
First game for CHI
