In [None]:
import datetime
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

## Import historical game by game data from FiveThirtyEight GitHub Repo

[FiveThirtyEight Data Repo](https://github.com/fivethirtyeight/data)

In [None]:
# Download FiveThirtyEight's game-by-game Elo history, then convert the
# "date" column from text to datetime64 so it can be compared and sorted.
url = "https://projects.fivethirtyeight.com/nba-model/nba_elo.csv"
df = pd.read_csv(url)
df = df.astype({"date": "datetime64[ns]"})
df.head()

### Analyze fields

See [FiveThirtyEight NBA Predictions](https://projects.fivethirtyeight.com/2018-nba-predictions/) for explanation of Elo and CARM-Elo

| Name           |  Type         | Description                                                                       |
| -------------  |-------------  | ----------------------------------------------------------------------------------|
| date           | datetime      |   Game date                                                                       |                      
| season         | int           |   End year of season                                                              |
| neutral        | bool          |   1=neutral home court                                                            |
| playoff        | string        |   1946-2015: t = playoffs. Starting in 2016: q=conference quarterfinals, s=conference semifinals, c=conference finals, f=finals                                                               |
| team1          | string        |   Home team initials                                                              |
| team2          | string        |   Away team initials                                                              |
| elo1_pre       | double        |   Home team Elo before game                                                       |
| elo2_pre       | double        |   Away team Elo before game                                                       |
| elo_prob1      | double        |   Win probability based on elo1_pre                                               |
| elo_prob2      | double        |   Win probability based on elo2_pre                                               |
| elo1_post      | double        |   Home team Elo after game                                                        |
| elo2_post      | double        |   Away team Elo after game                                                        |
| carmelo1_pre\* | double        |   Home team CARM-Elo before game                                                  |
| carmelo2_pre\* | double        |   Away team CARM-Elo before game                                                  |
| carmelo1_post\*| double        |   Home team CARM-Elo after game                                                   |
| carmelo2_post\*| double        |   Away team CARM-Elo after game                                                   |
| carmelo_prob1\*| double        |   Win probability based on carmelo1_pre                                           |
| carmelo_prob2\*| double        |   Win probability based on carmelo2_pre                                           |
| score1         | int           |   Home team final score                                                           |
| score2         | int           |   Away team final score                                                           |

\* data only available from 2015-2018

#### Determine CARM-Elo starting point

In [None]:
# Keep only the identifying columns plus the four CARM-Elo ratings, then show
# the earliest games (by date) that actually carry a home-team CARM-Elo value.
carm_elo_columns = [
    "date", "season", "team1", "team2",
    "carmelo1_pre", "carmelo2_pre", "carmelo1_post", "carmelo2_post",
]
df_carm_elo = df[carm_elo_columns]
has_carm_elo = df_carm_elo["carmelo1_pre"].notna()
df_carm_elo[has_carm_elo].sort_values("date").head()

#### CARM-Elo data available starting with 2015-2016 season

## Import historical season data from basketball-reference

[Basketball-Reference Miscellaneous season stats](https://www.basketball-reference.com/leagues/NBA_2018.html#misc_stats::none)

In [None]:
def get_season_data(end_year):
    '''
    get cumulative statistics for season specified by end_year

    end_year: int, year to query (ex: 2018 queries 2017-2018 season)

    returns Pandas dataframe w/ basketball-reference.com's miscellaneous stats table for season specified by end_year,
    plus a "Season" column set to end_year

    raises requests.HTTPError if the page request returns an error status
    raises ValueError if the miscellaneous stats table cannot be found in the page
    '''
    url = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(end_year)
    result = requests.get(url)
    # fail here on an error response instead of trying to parse an error page
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    # html tree is strange...table is wrapped inside a comment
    wrapper = soup.find('div', id="all_misc_stats")
    if wrapper is None:
        raise ValueError("No 'all_misc_stats' section found at {}".format(url))
    comments = [c for c in wrapper.children if isinstance(c, Comment)]
    if not comments:
        raise ValueError("No commented-out table found inside 'all_misc_stats' at {}".format(url))
    # parse table with pandas; the markup is wrapped in StringIO because passing
    # literal html to read_html is deprecated in recent pandas releases
    df = pd.read_html(StringIO(str(comments[0])), header=1)[0]
    df["Season"] = end_year
    return df

#### Option 1) Download latest data

In [None]:
# Seasons are identified by their end year (ex: 2018 is the 2017-2018 season)
start_year = 1951
end_year = 2018
# One page request per season; the range is driven by the bounds above so the
# two cannot drift apart (end_year is inclusive)
dfs_1951_2018 = [get_season_data(season) for season in range(start_year, end_year + 1)]
df_1951_2018 = pd.concat(dfs_1951_2018)
# Strip the "*" marker from team names so the same team has the same name in
# every season (presumably a playoff marker on the site -- TODO confirm).
# regex=False treats "*" as a literal character.
df_1951_2018["Team"] = df_1951_2018["Team"].str.replace("*", "", regex=False)
df_1951_2018 = df_1951_2018.set_index(["Season", "Team"])
pd.set_option('display.max_columns', 30)
df_1951_2018.head()
# Save
# df_1951_2018.to_csv("../Data/nba_season_data.csv")

#### Option 2) Use saved offline file

In [None]:
# Widen the display limit first so the preview below shows up to 30 columns
pd.set_option('display.max_columns', 30)
# Read the saved season table, remove any "*" from team names, and index the
# result by (Season, Team)
df_1951_2018 = (
    pd.read_csv("../Data/nba_season_data.csv")
    .assign(Team=lambda frame: frame["Team"].map(lambda name: name.replace("*", "")))
    .set_index(["Season", "Team"])
)
df_1951_2018.head()

### Analyze fields

| Name      |  Type  | Description                                                                                   |
| --------- |--------| --------------------------------------------------------------------------------------------- |
| Rk        | double |   Rank (used to index for sorting within webpage)                                             |
| Age       | double |   Age of Player at the start of February 1st of that season.                                  |
| W         | double |   Wins                                                                                        |
| L         | double |   Losses                                                                                      |
| PW        | double |   Pythagorean wins, i.e., expected wins based on points scored and allowed                    |
| PL        | double |   Pythagorean losses, i.e., expected losses based on points scored and allowed                |
| MOV       | double |   Margin of Victory                                                                           |
| SOS       | double |   Strength of Schedule; a rating of strength of schedule. The rating is denominated in points above/below average, where zero is average.                                                                          |
| SRS       | double |   Simple Rating System; a team rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average.                |
| ORtg      | double |   An estimate of points produced (players) or scored (teams) per 100 possessions              |  
| DRtg      | double |   An estimate of points allowed per 100 possessions                                           |
| Pace      | double |   An estimate of possessions per 48 minutes                                                   |
| FTr       | double |   Number of FT Attempts Per FG Attempt                                                        |
| 3PAr\*\*  | double |   Percentage of FG Attempts from 3-Point Range                                                |
| TS%       | double |   A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws.                                                                                              |
| eFG%      | double |   Adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.                                                                                                                |
| TOV%\*    | double |   An estimate of turnovers committed per 100 plays.                                           |
| ORB%\*    | int    |   An estimate of the percentage of available offensive rebounds a team grabbed.               |
| FT/FGA    | int    |   Free Throws Per Field Goal Attempt                                                          |
| eFG%.1    | double |    Opponent Effective Field Goal Percentage                                                   |
| TOV%.1\*  | double |   Opponent Turnover Percentage                                                                |
| DRB%\*    | int    |   An estimate of the percentage of available defensive rebounds a team grabbed.               |
| FT/FGA.1\*| int    |   Opponent Free Throws Per Field Goal Attempt                                                 |    
| Arena     | string |   Home Arena                                                                                  |
| Attend.   | double |   Cumulative home attendance                                                                  |
| Attend./G | double |   Attendance per home game at the team's primary arena                                        |

\* data only available from 1974-2018

\*\* data only available from 1980-2018

NOTE: No data available for 1954-1955 Baltimore Bullets

## Import additional historical game by game data from basketball-reference

Starting from the 1983-1984 season, basketball-reference.com has game by game advanced statistics such as ORtg, DRtg, and eFG%

[Example boxscore](https://www.basketball-reference.com/boxscores/201803050CHI.html)

In [None]:
def boxscore_links_for_date(date):
    '''
    get list of basketball-reference links to boxscores for games on given date

    date: datetime.datetime object with year, month, and day specified
          (any object exposing .year, .month and .day attributes works)

    returns list of urls to basketball-reference single game boxscores for given date;
    the list is empty when the page lists no games

    raises requests.HTTPError if the page request returns an error status
    '''
    link = "https://www.basketball-reference.com/boxscores/?month={}&day={}&year={}".format(date.month, date.day, date.year)
    result = requests.get(link)
    # an error page would otherwise parse to "no games" and be indistinguishable
    # from a date on which nothing was played
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    links = []
    for game in soup.find_all("td", {"class": "right gamelink"}):
        anchor = game.find("a")
        # skip cells without a usable anchor instead of failing on None
        if anchor is None or not anchor.get("href"):
            continue
        links.append("http://www.basketball-reference.com" + anchor.get("href"))
    return links

In [None]:
def boxscore_dict_for_link(link):
    '''
    advanced box score stats as dictionary from basketball-reference boxscore link

    link: string, link to single game basketball-reference boxscore

    returns dictionary with advanced stats for home (team1) and away (team2) teams for boxscore linked;
    keys are "team1_<column>" / "team2_<column>" for every column of the four factors table,
    plus the derived "DRtg" and "NetRtg" columns
    NOTE: basketball-reference only supports single game advanced stats starting from 1983-1984 NBA season

    raises requests.HTTPError if the page request returns an error status
    raises AssertionError if the page has no four factors section
    raises ValueError if the four factors section does not contain a commented-out table
    '''
    result = requests.get(link)
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    four_factors = soup.find("div", id="all_four_factors")
    assert four_factors is not None, "Advanced box score metrics only available for dates with at least 1 NBA game starting from 1983-1984 season"
    # the table markup lives inside an html comment; pull out just the <table> element
    table_html = None
    for c in four_factors.children:
        if isinstance(c, Comment):
            s_ind = c.index("<table")
            e_ind = c.index("</table>")
            table_html = c[s_ind:e_ind + len("</table>")]
            break
    if table_html is None:
        raise ValueError("No commented-out four factors table found at {}".format(link))
    # wrapped in StringIO because passing literal html to read_html is deprecated in recent pandas releases
    df = pd.read_html(StringIO(str(table_html)), header=1, index_col=0)[0]
    # each team's defensive rating is its opponent's offensive rating, so the
    # ORtg column is reversed (this assumes the table has exactly two team rows)
    df["DRtg"] = df["ORtg"].values[::-1]
    df["NetRtg"] = df["ORtg"] - df["DRtg"]
    d = {}
    for i, team_name in enumerate(df.index):
        # first table row is treated as the away team (team2), second as the home team (team1)
        prefix = "team2_" if i == 0 else "team1_"
        for col in df.columns:
            d[prefix+col] = df.loc[team_name, col]
    return d

#### Crawling through nba_elo data to find advanced box score metrics for all games since 1983-1984 NBA season

In [None]:
# Basketball-reference.com has advanced metrics for single games starting w/ 1983-1984 NBA season
# NOTE(review): the range below starts at season 1985 (i.e. 1984-1985); if the
# 1983-1984 season should be crawled too, the start would need to be 1984 -- confirm.
season_start_dates = []
season_end_dates = []
for i in range(1985, 2019):
    # only games that have been played (have a final score) define the season's date span
    season_df = df[(df["season"] == i) & (df["score1"].notna())]
    if season_df.empty:
        # nothing played for this season in the data; skip it rather than fail
        continue
    # Take scalar Timestamps. A one-row Series (head(1)["date"]) has no
    # .year/.month/.day attributes and cannot be compared element-wise against
    # the full date column.
    season_start_dates.append(season_df["date"].min())
    season_end_dates.append(season_df["date"].max())
# Query basketball reference game by game, and save results for each year
for season_start_date, season_end_date in zip(season_start_dates, season_end_dates):
    df_slice = df[(df["date"] >= season_start_date) & (df["date"] <= season_end_date)].copy()
    current_date = season_start_date
    boxscores = boxscore_links_for_date(current_date)
    data = []
    for _, row in df_slice.iterrows():
        print("{} vs. {} on {}".format(row["team1"], row["team2"], row["date"]))
        if current_date != row["date"]:
            # get new boxscores for date
            print("New day ({}), getting boxscores".format(row["date"]))
            boxscores = boxscore_links_for_date(row["date"])
            current_date = row["date"]
        home_team = row["team1"]
        # the boxscore for a game is the link containing the home team's initials
        matching_links = [link for link in boxscores if home_team in link]
        if not matching_links:
            raise IndexError("No boxscore link found for home team {} on {}".format(home_team, row["date"]))
        boxscore_for_game = matching_links[0]
        d = boxscore_dict_for_link(boxscore_for_game)
        # row is a per-iteration copy from iterrows, so adding keys does not touch df
        for key, val in d.items():
            row[key] = val
        data.append(row)
    # save data for year
    print("Saving for season from {} to {}".format(season_start_date, season_end_date))
    pd.DataFrame(data).to_csv("nba_game_data_{}-{}-{}_to_{}-{}-{}.csv".format(season_start_date.year, season_start_date.month, season_start_date.day, season_end_date.year, season_end_date.month, season_end_date.day))