In [1]:
import glob
import time
import datetime
import subprocess
from subprocess import Popen, PIPE

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML

# Stretch the notebook's .container element to the full browser width so wide tables fit.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Show up to 50 columns before pandas elides the middle of a wide frame.
pd.options.display.max_columns = 50

## Import historical game by game data from FiveThirtyEight GitHub Repo

[FiveThirtyEight Data Repo](https://github.com/fivethirtyeight/data)

In [90]:
# Game-by-game Elo history, served as a single CSV.
url = "https://projects.fivethirtyeight.com/nba-model/nba_elo.csv"
df_elo_1946_2018 = pd.read_csv(url)
# Convert the date column to real timestamps so it sorts and filters chronologically.
df_elo_1946_2018 = df_elo_1946_2018.astype({"date": "datetime64[ns]"})
df_elo_1946_2018.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,carmelo1_pre,carmelo2_pre,carmelo1_post,carmelo2_post,carmelo_prob1,carmelo_prob2,score1,score2
0,1946-11-01,1947,0,,TRH,NYK,1300.0,1300.0,0.640065,0.359935,1293.2767,1306.7233,,,,,,,66.0,68.0
1,1946-11-02,1947,0,,CHS,NYK,1300.0,1306.7233,0.631101,0.368899,1309.6521,1297.0712,,,,,,,63.0,47.0
2,1946-11-02,1947,0,,PRO,BOS,1300.0,1300.0,0.640065,0.359935,1305.1542,1294.8458,,,,,,,59.0,53.0
3,1946-11-02,1947,0,,STB,PIT,1300.0,1300.0,0.640065,0.359935,1304.6908,1295.3092,,,,,,,56.0,51.0
4,1946-11-02,1947,0,,DTF,WSC,1300.0,1300.0,0.640065,0.359935,1279.6189,1320.3811,,,,,,,33.0,50.0


### Analyze fields

See [FiveThirtyEight NBA Predictions](https://projects.fivethirtyeight.com/2018-nba-predictions/) for explanation of Elo and CARM-Elo

| Name           |  Type         | Description                                                                       |
| -------------  |-------------  | ----------------------------------------------------------------------------------|
| date           | datetime      |   Game date                                                                       |                      
| season         | int           |   End year of season                                                              |
| neutral        | bool          |   1=neutral home court                                                            |
| playoff        | string        |   1946-2015: t = playoffs. Starting in 2016: q=conference quarterfinals, s=conference semifinals, c=conference final,f=finals                                                                 |
| team1          | string        |   Home team initials                                                              |
| team2          | string        |   Away team initials                                                              |
| elo1_pre       | double        |   Home team Elo before game                                                       |
| elo2_pre       | double        |   Away team Elo before game                                                       |
| elo_prob1      | double        |   Win probability based on elo1_pre                                               |
| elo_prob2      | double        |   Win probability based on elo2_pre                                               |
| elo1_post      | double        |   Home team Elo after game                                                        |
| elo2_post      | double        |   Away team Elo after game                                                        |
| carmelo1_pre\* | double        |   Home team CARM-Elo before game                                                  |
| carmelo2_pre\* | double        |   Away team CARM-Elo before game                                                  |
| carmelo1_post\*| double        |   Home team CARM-Elo after game                                                   |
| carmelo2_post\*| double        |   Away team CARM-Elo after game                                                   |
| carmelo_prob1\*| double        |   Win probability based on carmelo1_pre                                           |
| carmelo_prob2\*| double        |   Win probability based on carmelo2_pre                                           |
| score1         | int           |   Home team final score                                                           |
| score2         | int           |   Away team final score                                                           |

\* data only available from 2015-2018

#### Determine CARM-Elo starting point

In [8]:
# Keep only the identifying columns and the CARM-Elo ratings.
# The Elo frame loaded earlier in this notebook is df_elo_1946_2018; the bare name `df`
# is never defined at module level, so using it raised NameError on a fresh kernel.
df_carm_elo = df_elo_1946_2018[["date", "season", "team1", "team2", "carmelo1_pre", "carmelo2_pre", "carmelo1_post", "carmelo2_post"]]
# Earliest games (by date) that carry a CARM-Elo rating.
df_carm_elo[df_carm_elo["carmelo1_pre"].notna()].sort_values("date").head()

Unnamed: 0,date,season,team1,team2,carmelo1_pre,carmelo2_pre,carmelo1_post,carmelo2_post
63157,2015-10-27,2016,ATL,DET,1542.664875,1451.827385,1521.809995,1472.682265
63158,2015-10-27,2016,CHI,CLE,1564.372491,1732.025482,1570.473937,1725.924036
63159,2015-10-27,2016,GSW,NOP,1730.513765,1555.126845,1734.342589,1551.298021
63173,2015-10-28,2016,LAL,MIN,1317.548331,1345.379348,1312.320723,1350.606956
63172,2015-10-28,2016,SAC,LAC,1487.260869,1671.590488,1481.391159,1677.460198


#### CARM-Elo data available starting with 2015-2016 season

## Import historical season data from basketball-reference

[Basketball-Reference Miscellaneous season stats](https://www.basketball-reference.com/leagues/NBA_2018.html#misc_stats::none)

**NOTE: ORtg and DRtg are available starting from the 1950-1951 NBA season**

In [204]:
def get_season_data(end_year):
    '''
    get cumulative statistics for season specified by end_year

    end_year: int, year to query (ex: 2018 queries 2017-2018 season)

    returns Pandas dataframe w/ basketball-reference.com's miscellaneous stats table for season specified by end_year,
    with an extra "Season" column set to end_year

    raises requests.HTTPError if the page request returns an error status
    raises ValueError if the miscellaneous stats table cannot be located in the page
    '''
    from io import StringIO
    from bs4 import Comment
    html = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(end_year)
    result = requests.get(html)
    # Fail loudly on an error page instead of crashing later on a missing <div>.
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    container = soup.find('div', id="all_misc_stats")
    if container is None:
        raise ValueError("No 'all_misc_stats' section found for season {}".format(end_year))
    # html tree is strange...table is wrapped inside a comment
    comments = [c for c in container.children if isinstance(c, Comment)]
    if not comments:
        raise ValueError("No commented-out stats table found for season {}".format(end_year))
    # parse table with pandas; wrap the markup in a file-like object because newer
    # pandas versions deprecate passing literal html strings to read_html
    df = pd.read_html(StringIO(str(comments[0])), header=1)[0]
    df["Season"] = end_year
    return df

### Option 1) Download latest data

In [209]:
def generate_season_summaries_for(start_year, end_year, fetch_season=None):
    '''
    generate dataframe with basketball-reference season summary data (see https://www.basketball-reference.com/leagues/NBA_2018.html#misc_stats::none) for all seasons [start_year, end_year] (inclusive)

    start_year: int, lower bound for year (ex: 2000 = 1999-2000 NBA season)
    end_year: int, upper bound for year
    fetch_season: optional callable taking a year and returning that season's dataframe
                  (must contain "Team", "Season", "ORtg", "DRtg" and "Pace" columns).
                  Defaults to get_season_data; lets callers plug in a cached or offline loader.

    returns dataframe indexed by (Season, Team) with season summary data for all seasons [start_year, end_year] (inclusive),
    including a computed "NetRtg" column placed immediately before "Pace"

    raises AssertionError if start_year is earlier than 1950
    raises ValueError if end_year is earlier than start_year
    '''
    assert start_year >= 1950, "Start year must be 1950 or later"
    if end_year < start_year:
        raise ValueError("end_year ({}) must not be earlier than start_year ({})".format(end_year, start_year))
    if fetch_season is None:
        fetch_season = get_season_data
    df_season_summaries = pd.concat([fetch_season(year) for year in range(start_year, end_year + 1)])
    # Playoff teams labeled with * in basketball-reference data. Remove distinction for easier grouping.
    df_season_summaries["Team"] = df_season_summaries["Team"].str.replace("*", "", regex=False)
    df_season_summaries["NetRtg"] = df_season_summaries["ORtg"] - df_season_summaries["DRtg"]
    df_season_summaries = df_season_summaries.set_index(["Season", "Team"])
    # Re-order columns so NetRtg sits right before Pace. This must use the frame built
    # here: the previous version read a notebook-level global that only exists after the
    # offline CSV has been loaded, and relied on the removed DataFrame.reindex_axis.
    cols = [c for c in df_season_summaries.columns if c != "NetRtg"]
    pace_idx = cols.index("Pace")
    return df_season_summaries[cols[:pace_idx] + ["NetRtg"] + cols[pace_idx:]]

In [210]:
# ORtg and DRtg available starting from 1950-51 season
start_year, end_year = 1951, 2018
# Scrape every season in the range and save the combined table as one CSV.
generate_season_summaries_for(start_year, end_year).to_csv(
    "../Data/nba_season_summaries_{}_{}.csv".format(start_year, end_year)
)

### Option 2) Use saved offline file

In [2]:
start_year, end_year = 1951, 2018
# The first two CSV columns (Season, Team) form the MultiIndex.
df_season_summaries_1951_2018 = pd.read_csv(
    "../Data/nba_season_summaries_{}_{}.csv".format(start_year, end_year),
    index_col=[0, 1],
).sort_index()  # sorted index, for MultiIndex slicing support
df_season_summaries_1951_2018.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Rk,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NetRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
Season,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1951,Baltimore Bullets,8.0,,24.0,42.0,27.0,39.0,-2.29,0.35,-1.94,83.9,86.3,-2.4,96.8,0.364,,0.421,0.353,,,0.271,,,,,Baltimore Coliseum,,
1951,Boston Celtics,6.0,,39.0,30.0,34.0,35.0,-0.26,-0.15,-0.41,87.3,87.6,-0.3,95.9,0.431,,0.441,0.368,,,0.312,,,,,Boston Garden,197888.0,
1951,Fort Wayne Pistons,7.0,,32.0,36.0,29.0,39.0,-1.87,0.07,-1.81,82.0,83.8,-1.8,102.4,0.403,,0.41,0.338,,,0.29,,,,,North Side High School Gym,,
1951,Indianapolis Olympians,9.0,,31.0,37.0,27.0,41.0,-2.37,0.37,-2.0,83.6,86.0,-2.4,96.4,0.329,,0.42,0.363,,,0.236,,,,,Hinkle Fieldhouse,,
1951,League Average,,,,,32.0,32.0,0.0,0.0,0.0,85.1,85.1,0.0,97.4,0.399,,0.428,0.357,,,0.293,,,,,,197888.0,


### Analyze fields

| Name      |  Type  | Description                                                                                   |
| --------- |--------| --------------------------------------------------------------------------------------------- |
| Rk        | double |   Rank (used to index for sorting within webpage)                                             |
| Age       | double |   Age of Player at the start of February 1st of that season.                                  |
| W         | double |   Wins                                                                                        |
| L         | double |   Losses                                                                                      |
| PW        | double |   Pythagorean wins, i.e., expected wins based on points scored and allowed                    |
| PL        | double |   Pythagorean losses, i.e., expected losses based on points scored and allowed                |
| MOV       | double |   Margin of Victory                                                                           |
| SOS       | double |   Strength of Schedule; a rating of strength of schedule. The rating is denominated in points above/below average, where zero is average.                                                                          |
| SRS       | double |   Simple Rating System; a team rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average.                |
| ORtg      | double |   An estimate of points produced (players) or scored (teams) per 100 possessions              |  
| DRtg      | double |   An estimate of points allowed per 100 possessions                                           |
| NetRtg    | double |   ORtg - DRtg                                                                                 |
| Pace      | double |   An estimate of possessions per 48 minutes                                                   |
| FTr       | double |   Number of FT Attempts Per FG Attempt                                                        |
| 3PAr\*\*  | double |   Percentage of FG Attempts from 3-Point Range                                                |
| TS%       | double |   A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws.                                                                                              |
| eFG%      | double |   Adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.                                                                                                                |
| TOV%\*    | double |   An estimate of turnovers committed per 100 plays.                                           |
| ORB%\*    | double |   An estimate of the percentage of available offensive rebounds a team grabbed.               |
| FT/FGA    | double |   Free Throws Per Field Goal Attempt                                                          |
| eFG%.1    | double |   Opponent Effective Field Goal Percentage                                                    |
| TOV%.1\*  | double |   Opponent Turnover Percentage                                                                |
| DRB%\*    | double |   An estimate of the percentage of available defensive rebounds a team grabbed.               |
| FT/FGA.1\*| double |   Opponent Free Throws Per Field Goal Attempt                                                 |
| Arena     | string |   Home Arena                                                                                  |
| Attend.   | double |   Cumulative home attendance                                                                  |
| Attend./G | double |   Attendance per home game at the team's primary arena                                        |

\* data only available from 1974-2018

\*\* data only available from 1980-2018

NOTE: No data available for 1954-1955 Baltimore Bullets

## Import additional historical game by game data from basketball-reference

Starting from the 1983-1984 season, basketball-reference.com has game by game advanced statistics such as ORtg, DRtg, and eFG%

[Example boxscore](https://www.basketball-reference.com/boxscores/201803050CHI.html)

In [None]:
def boxscore_links_for_date(date):
    '''
    get list of basketball-reference links to boxscores for games on given date

    date: datetime.datetime object with year, month, and day specified

    returns list of urls to basketball-reference single game boxscores for given date
    (empty list when the page lists no games); urls use https like every other
    basketball-reference url in this notebook

    raises requests.HTTPError if the page request returns an error status
    '''
    link = "https://www.basketball-reference.com/boxscores/?month={}&day={}&year={}".format(date.month, date.day, date.year)
    result = requests.get(link)
    # An error page contains no game links, so without this check a failed request
    # would be indistinguishable from a date without games.
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    links = []
    for game in soup.find_all("td", {"class": "right gamelink"}):
        anchor = game.find("a")
        # Skip cells that carry no usable link instead of failing on None.
        if anchor is None or not anchor.get("href"):
            continue
        links.append("https://www.basketball-reference.com" + anchor.get("href"))
    return links

In [None]:
def boxscore_dict_for_link(link):
    '''
    advanced box score stats as dictionary from basketball-reference boxscore link

    link: string, link to single game basketball-reference boxscore

    returns dictionary with advanced stats for home (team1) and away (team2) teams for boxscore linked;
    keys are "team1_<column>" / "team2_<column>" for every column of the four factors table
    plus the derived "DRtg" and "NetRtg" columns
    NOTE: basketball-reference only supports single game advanced stats starting from 1983-1984 NBA season

    raises AssertionError if the page has no four factors section
    raises ValueError if the four factors section does not contain the commented-out table
    '''
    from io import StringIO
    from bs4 import Comment
    result = requests.get(link)
    soup = BeautifulSoup(result.content, "html.parser")
    four_factors = soup.find("div", id="all_four_factors")
    # Raised explicitly (same exception type as the former assert statement) so the
    # check is not stripped when Python runs with -O.
    if four_factors is None:
        raise AssertionError("Advanced box score metrics only available for dates with at least 1 NBA game starting from 1983-1984 season")
    # The table markup is wrapped inside an html comment; cut out the <table>...</table> span.
    table_html = None
    for c in four_factors.children:
        if isinstance(c, Comment):
            s_ind = c.index("<table")
            e_ind = c.index("</table>")
            table_html = c[s_ind:e_ind + len("</table>")]
            break
    if table_html is None:
        raise ValueError("Four factors table not found in {}".format(link))
    # File-like wrapper: newer pandas versions deprecate literal html strings in read_html.
    df = pd.read_html(StringIO(table_html), header=1, index_col=0)[0]
    # A team's defensive rating is its opponent's offensive rating, hence the reversed
    # column. This relies on the table having exactly two rows (one per team).
    df["DRtg"] = df["ORtg"].values[::-1]
    df["NetRtg"] = df["ORtg"] - df["DRtg"]
    d = {}
    # The first table row is treated as the away team (team2), any later row as the home team (team1).
    for i in range(len(df.index)):
        prefix = "team2_" if i == 0 else "team1_"
        for col in df.columns:
            # Positional access stays unambiguous even if both rows share an index label.
            d[prefix+col] = df[col].iloc[i]
    return d

### Option 1) Download data

#### Crawling through FiveThirtyEight nba_elo data to find advanced box score metrics for all games since 1983-1984 NBA season

In [None]:
start_year = 2017
end_year = 2018

# Popen only accepts str/bytes/path-like arguments, so the integer years are converted
# explicitly (passing them as ints raised TypeError). universal_newlines=True makes
# stdout/stderr text instead of bytes, so the printed log is readable.
session = subprocess.Popen(
    ['python', 'scrape_boxscores.py', str(start_year), str(end_year)],
    stdout=PIPE,
    stderr=PIPE,
    universal_newlines=True,
)
stdout, stderr = session.communicate()

print(stdout)
if stderr:
    raise Exception("Error "+str(stderr))
# A failing script does not necessarily write to stderr; check the exit status as well.
if session.returncode != 0:
    raise Exception("Error scrape_boxscores.py exited with code {}".format(session.returncode))

#### Concatenate outputs from script to one csv

In [None]:
# Read every per-run output file (in sorted filename order) and stack them into one frame.
df_boxscores_1984_2018 = pd.concat(
    [
        pd.read_csv(fn, index_col=0, parse_dates=[1], infer_datetime_format=True)
        for fn in sorted(glob.glob("../Data/nba_game_data*.csv"))
    ]
)
print(df_boxscores_1984_2018.shape)
# Drop games that lack a final score, a net rating or a pre-game Elo for either team.
df_boxscores_1984_2018 = df_boxscores_1984_2018.dropna(
    subset=[
        "score1",
        "score2",
        "team1_NetRtg",
        "team2_NetRtg",
        "elo1_pre",
        "elo2_pre",
    ]
)
print(df_boxscores_1984_2018.shape)
df_boxscores_1984_2018.to_csv("../Data/nba_boxscores_1984_2018.csv")

### Option 2) Load offline data

In [4]:
# Load the consolidated boxscore file; the first CSV column is the index and the
# column at position 1 is parsed as dates.
df_boxscores_1984_2018 = pd.read_csv(
    "../Data/nba_boxscores_1984_2018.csv",
    index_col=0,
    parse_dates=[1],
    infer_datetime_format=True,
)
df_boxscores_1984_2018.shape

(41937, 36)

### Analyze Fields

Source: NBA game data (1984-2018) from [FiveThirtyEight Data Repo](https://github.com/fivethirtyeight/data) and [basketball-reference.com's boxscore data](https://www.basketball-reference.com/boxscores/)

See [FiveThirtyEight NBA Predictions](https://projects.fivethirtyeight.com/2018-nba-predictions/) for explanation of Elo and CARM-Elo

| Name           |  Type         | Description                                                                       |
| -------------  |-------------  | ----------------------------------------------------------------------------------|
| date           | datetime      |   Game date                                                                       |                      
| season         | int           |   End year of season                                                              |
| neutral        | bool          |   1=neutral home court                                                            |
| playoff        | string        |   1946-2015: t = playoffs. Starting in 2016: q=conference quarterfinals, s=conference semifinals, c=conference final,f=finals                                                                 |
| team1          | string        |   Home team initials                                                              |
| team2          | string        |   Away team initials                                                              |
| elo1_pre       | double        |   Home team Elo before game                                                       |
| elo2_pre       | double        |   Away team Elo before game                                                       |
| elo_prob1      | double        |   Win probability based on elo1_pre                                               |
| elo_prob2      | double        |   Win probability based on elo2_pre                                               |
| elo1_post      | double        |   Home team Elo after game                                                        |
| elo2_post      | double        |   Away team Elo after game                                                        |
| carmelo1_pre\* | double        |   Home team CARM-Elo before game                                                  |
| carmelo2_pre\* | double        |   Away team CARM-Elo before game                                                  |
| carmelo1_post\*| double        |   Home team CARM-Elo after game                                                   |
| carmelo2_post\*| double        |   Away team CARM-Elo after game                                                   |
| carmelo_prob1\*| double        |   Win probability based on carmelo1_pre                                           |
| carmelo_prob2\*| double        |   Win probability based on carmelo2_pre                                           |
| score1         | int           |   Home team final score                                                           |
| score2         | int           |   Away team final score                                                           |
| team1_pace     | double        |   Home team possessions per 48 mins                                               |
| team1_eFG%     | double        |   Home team eFG%                                                                  |
| team1_TOV%     | double        |   Home team TOV%                                                                  |
| team1_ORB%     | double        |   Home team ORB%                                                                  |
| team1_FT/FGA   | double        |   Home team FT/FGA                                                                |
| team1_ORtg     | double        |   Home team ORtg                                                                  |
| team1_DRtg     | double        |   Home team DRtg                                                                  |
| team1_NetRtg   | double        |   Home team NetRtg                                                                |
| team2_pace     | double        |   Away team possessions per 48 mins                                               |
| team2_eFG%     | double        |   Away team eFG%                                                                  |
| team2_TOV%     | double        |   Away team TOV%                                                                  |
| team2_ORB%     | double        |   Away team ORB%                                                                  |
| team2_FT/FGA   | double        |   Away team FT/FGA                                                                |
| team2_ORtg     | double        |   Away team ORtg                                                                  |
| team2_DRtg     | double        |   Away team DRtg                                                                  |
| team2_NetRtg   | double        |   Away team NetRtg                                                                |


\* data only available from 2015-2018


# Consolidate datasets

1. Game by game ELO data from FiveThirtyEight starting from [1946 season](https://projects.fivethirtyeight.com/nba-model/nba_elo.csv)
2. basketball-reference season summaries w/ advanced stats available starting from [1951 season](https://www.basketball-reference.com/leagues/NBA_1951.html)
3. Game by game boxscores w/ advanced stats from basketball-reference starting from [1984 season](https://www.basketball-reference.com/boxscores/?month=10&day=28&year=1983)

**Current datasets** 
* nba_season_summaries_1951_2018.csv - 2
* nba_boxscores_1984_2018.csv - 1 + 3

**Desired datasets**
* Season summaries 1984-2018 with regular season ELO (ELO rating after last game of regular season)

## Outline

* Filter nba_season_summaries_1951_2018.csv for seasons between 1984 and 2018
* Find abbreviations for all team names from 1984-2018 (summaries use full name, boxscores use initials)
* Use nba_boxscores_1984_2018.csv to determine end of regular season ELO for every team from 1984 to 2018

**Filter nba_season_summaries_1951_2018.csv for seasons between 1984 and 2018**

In [197]:
# Label-based slice on the first index level (Season); both endpoints are inclusive.
# Copy so that columns added later do not touch the 1951-2018 frame.
df_season_summaries_1984_2018 = df_season_summaries_1951_2018.loc[slice(1984, 2018)].copy()
df_season_summaries_1984_2018.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Rk,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NetRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
Season,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1984,Atlanta Hawks,14.0,26.6,40.0,42.0,37.0,45.0,-1.29,0.22,-1.08,105.5,106.9,-1.4,95.3,0.355,0.016,0.529,0.476,14.4,31.6,0.27,0.482,15.1,65.2,0.268,Omni Coliseum,286049.0,
1984,Boston Celtics,1.0,28.0,62.0,20.0,57.0,25.0,6.56,-0.14,6.42,110.9,104.4,6.5,99.7,0.333,0.032,0.554,0.504,14.6,34.2,0.264,0.475,13.8,69.7,0.225,Boston Garden,606857.0,
1984,Chicago Bulls,23.0,24.4,27.0,55.0,28.0,54.0,-5.18,0.5,-4.69,102.4,107.5,-5.1,99.8,0.36,0.017,0.526,0.475,16.3,32.3,0.268,0.497,15.6,67.2,0.266,Chicago Stadium,256430.0,
1984,Cleveland Cavaliers,21.0,25.4,28.0,54.0,30.0,52.0,-4.26,0.55,-3.71,104.1,108.5,-4.4,97.4,0.301,0.023,0.512,0.468,14.0,33.5,0.224,0.49,13.2,70.8,0.28,Coliseum at Richfield,208094.0,
1984,Dallas Mavericks,11.0,24.7,43.0,39.0,42.0,40.0,0.43,-0.27,0.15,110.0,109.6,0.4,99.0,0.325,0.025,0.547,0.503,13.6,31.7,0.245,0.503,14.4,65.7,0.232,Reunion Arena,538162.0,


**$\checkmark$ Filter nba_season_summaries_1951_2018.csv for seasons between 1984 and 2018**

**Find abbreviations for all team names from 1984-2018**

In [198]:
# Distinct team names from the second index level, without the synthetic "League Average" row.
teams = set(df_season_summaries_1984_2018.index.get_level_values(1)) - {"League Average"}
teams

{'Atlanta Hawks',
 'Boston Celtics',
 'Brooklyn Nets',
 'Charlotte Bobcats',
 'Charlotte Hornets',
 'Chicago Bulls',
 'Cleveland Cavaliers',
 'Dallas Mavericks',
 'Denver Nuggets',
 'Detroit Pistons',
 'Golden State Warriors',
 'Houston Rockets',
 'Indiana Pacers',
 'Kansas City Kings',
 'Los Angeles Clippers',
 'Los Angeles Lakers',
 'Memphis Grizzlies',
 'Miami Heat',
 'Milwaukee Bucks',
 'Minnesota Timberwolves',
 'New Jersey Nets',
 'New Orleans Hornets',
 'New Orleans Pelicans',
 'New Orleans/Oklahoma City Hornets',
 'New York Knicks',
 'Oklahoma City Thunder',
 'Orlando Magic',
 'Philadelphia 76ers',
 'Phoenix Suns',
 'Portland Trail Blazers',
 'Sacramento Kings',
 'San Antonio Spurs',
 'San Diego Clippers',
 'Seattle SuperSonics',
 'Toronto Raptors',
 'Utah Jazz',
 'Vancouver Grizzlies',
 'Washington Bullets',
 'Washington Wizards'}

In [199]:
# Full team name (as used in the season summaries) -> three-letter initials
# (as used in the boxscore / Elo data).
nba_1984_2018_initials = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    # regular_season_elo switches CHH to CHO for seasons from 2015 onwards
    "Charlotte Hornets": "CHH",
    "Charlotte Bobcats": "CHO",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Kansas City Kings": "KCK",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    # Hornets and Pelicans share the same initials in this mapping
    "New Orleans Hornets": "NOP",
    "New Orleans/Oklahoma City Hornets": "NOK",
    "New Orleans Pelicans": "NOP",
    "New Jersey Nets": "NJN",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Diego Clippers": "SDC",
    "San Antonio Spurs": "SAS",
    "Seattle SuperSonics": "SEA",
    "Toronto Raptors": "TOR",
    "Vancouver Grizzlies": "VAN",
    "Utah Jazz": "UTA",
    "Washington Bullets": "WSB",
    "Washington Wizards": "WAS",
}

In [200]:
# Every team name found in the season summaries needs an abbreviation, and the mapping
# must not contain names that never occur. Comparing only the lengths would let a
# misspelled key through and fail later with a KeyError.
names_without_initials = teams - set(nba_1984_2018_initials)
names_not_in_summaries = set(nba_1984_2018_initials) - teams
assert not names_without_initials and not names_not_in_summaries, \
    "Team names without initials: {}; mapped names missing from summaries: {}".format(
        sorted(names_without_initials), sorted(names_not_in_summaries))
# Each abbreviation must occur at least once as a home team in the boxscore data.
# The set of home teams is built once instead of re-scanning the frame per initial.
home_team_initials = set(df_boxscores_1984_2018["team1"].unique())
for initial in nba_1984_2018_initials.values():
    assert initial in home_team_initials, "Non-existing initial {}".format(initial)

**$\checkmark$ Find abbreviations for all team names from 1984-2018**



**Use nba_boxscores_1984_2018.csv to determine end of regular season ELO for every team from 1984 to 2018**

In [201]:
def regular_season_elo(boxscore_df, team_initials, season):
    '''
    Find the end of regular season ELO rating for a team (or ELO after latest game played to date, if season ongoing)

    boxscore_df: DataFrame, see https://projects.fivethirtyeight.com/nba-model/nba_elo.csv for format;
                 rows are expected to be in chronological order, since the last matching row is used
    team_initials: string, official three letter initial for team
    season: int, year of season (ex: 2017 refers to 2016-17 season)

    returns the team's post-game ELO (elo1_post or elo2_post, depending on which side the team played)
    from its last regular season game of the season

    raises IndexError if the team has no regular season game in the given season
    '''
    # handle edge case where 'new' Charlotte Hornets use initials CHO in basketball-reference and ELO dataset
    # (open-ended check: the previous upper bound on the current calendar year skipped the
    # remap for a season whose end year is later than the current year)
    if team_initials == "CHH" and season >= 2015:
        team_initials = "CHO"
    team_mask = (boxscore_df["team1"] == team_initials) | (boxscore_df["team2"] == team_initials)
    season_mask = boxscore_df["season"] == season
    # regular season games have no playoff marker
    regular_season_mask = pd.isnull(boxscore_df["playoff"])
    games = boxscore_df[team_mask & season_mask & regular_season_mask]
    if games.empty:
        raise IndexError("No regular season games found for team {} in season {}".format(team_initials, season))
    # Take the last row by position; a label lookup would return several rows if the
    # index contained duplicate labels.
    last_game = games.iloc[-1]
    return last_game["elo1_post"] if last_game["team1"] == team_initials else last_game["elo2_post"]

In [202]:
# One (season, elo) pair per summary row, in row order. "League Average" rows get the
# placeholder -1.
seasons_and_elos = [
    (
        season,
        -1 if team == "League Average"
        else regular_season_elo(df_boxscores_1984_2018, nba_1984_2018_initials[team], season),
    )
    for season, team in df_season_summaries_1984_2018.index
]

In [203]:
# Replace each -1 placeholder (League Average rows) with the mean rating of the real
# teams of the same season; every other rating is kept as is.
season_elos = [
    elo_rating if elo_rating != -1
    else np.mean([r for s, r in seasons_and_elos if s == season and r != -1])
    for season, elo_rating in seasons_and_elos
]
df_season_summaries_1984_2018["ELO"] = season_elos

In [226]:
# Preview only the first rows instead of rendering the whole frame.
df_season_summaries_1984_2018.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Rk,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NetRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G,ELO
Season,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1984,Atlanta Hawks,14.0,26.6,40.0,42.0,37.0,45.0,-1.29,0.22,-1.08,105.5,106.9,-1.4,95.3,0.355,0.016,0.529,0.476,14.4,31.6,0.270,0.482,15.1,65.2,0.268,Omni Coliseum,286049.0,,1510.229000
1984,Boston Celtics,1.0,28.0,62.0,20.0,57.0,25.0,6.56,-0.14,6.42,110.9,104.4,6.5,99.7,0.333,0.032,0.554,0.504,14.6,34.2,0.264,0.475,13.8,69.7,0.225,Boston Garden,606857.0,,1686.844400
1984,Chicago Bulls,23.0,24.4,27.0,55.0,28.0,54.0,-5.18,0.50,-4.69,102.4,107.5,-5.1,99.8,0.360,0.017,0.526,0.475,16.3,32.3,0.268,0.497,15.6,67.2,0.266,Chicago Stadium,256430.0,,1334.640400
1984,Cleveland Cavaliers,21.0,25.4,28.0,54.0,30.0,52.0,-4.26,0.55,-3.71,104.1,108.5,-4.4,97.4,0.301,0.023,0.512,0.468,14.0,33.5,0.224,0.490,13.2,70.8,0.280,Coliseum at Richfield,208094.0,,1374.278700
1984,Dallas Mavericks,11.0,24.7,43.0,39.0,42.0,40.0,0.43,-0.27,0.15,110.0,109.6,0.4,99.0,0.325,0.025,0.547,0.503,13.6,31.7,0.245,0.503,14.4,65.7,0.232,Reunion Arena,538162.0,,1509.936500
1984,Denver Nuggets,15.0,27.0,38.0,44.0,38.0,44.0,-1.10,-0.17,-1.27,111.3,112.3,-1.0,110.5,0.337,0.032,0.553,0.498,12.8,29.3,0.276,0.522,14.6,66.6,0.277,McNichols Sports Arena,462407.0,,1500.419200
1984,Detroit Pistons,4.0,25.4,49.0,33.0,50.0,32.0,3.59,-0.06,3.52,111.5,108.1,3.4,103.8,0.322,0.018,0.532,0.482,12.7,36.7,0.250,0.500,14.8,67.7,0.263,Pontiac Silverdome,652865.0,,1604.563100
1984,Golden State Warriors,20.0,25.6,37.0,45.0,32.0,50.0,-3.40,0.05,-3.35,105.5,108.8,-3.3,103.3,0.342,0.030,0.520,0.471,14.9,35.6,0.254,0.519,17.0,64.2,0.250,Oakland-Alameda County Coliseum Arena,316844.0,,1435.733500
1984,Houston Rockets,18.0,26.8,29.0,53.0,33.0,49.0,-3.09,-0.04,-3.12,105.3,108.2,-2.9,104.7,0.284,0.020,0.535,0.497,15.6,32.8,0.210,0.486,14.1,67.5,0.285,The Summit,425755.0,,1374.411700
1984,Indiana Pacers,22.0,24.4,26.0,56.0,28.0,54.0,-4.82,0.57,-4.25,101.8,106.5,-4.7,102.1,0.297,0.029,0.531,0.487,15.9,28.1,0.228,0.497,16.2,66.8,0.255,Market Square Arena,410626.0,,1389.934900


**$\checkmark$ Use nba_boxscores_1984_2018.csv to determine end of regular season ELO for every team from 1984 to 2018**

In [None]:
# Write the 1984-2018 season summaries to disk as CSV.
df_season_summaries_1984_2018.to_csv("../Data/nba_season_summaries_1984_2018.csv")

# $\checkmark$ Consolidate datasets

