In [1]:
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import requests

## Import historical game-by-game data from the FiveThirtyEight GitHub repo

[FiveThirtyEight Data Repo](https://github.com/fivethirtyeight/data)

In [277]:
# Download FiveThirtyEight's complete game-by-game Elo history.
url = "https://projects.fivethirtyeight.com/nba-model/nba_elo.csv"
df = pd.read_csv(url)
# Dates arrive as text; cast them to timestamps so they sort and filter correctly.
df = df.astype({"date": "datetime64[ns]"})
df.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,carmelo1_pre,carmelo2_pre,carmelo1_post,carmelo2_post,carmelo_prob1,carmelo_prob2,score1,score2
0,1946-11-01,1947,0,,TRH,NYK,1300.0,1300.0,0.640065,0.359935,1293.2767,1306.7233,,,,,,,66.0,68.0
1,1946-11-02,1947,0,,CHS,NYK,1300.0,1306.7233,0.631101,0.368899,1309.6521,1297.0712,,,,,,,63.0,47.0
2,1946-11-02,1947,0,,PRO,BOS,1300.0,1300.0,0.640065,0.359935,1305.1542,1294.8458,,,,,,,59.0,53.0
3,1946-11-02,1947,0,,STB,PIT,1300.0,1300.0,0.640065,0.359935,1304.6908,1295.3092,,,,,,,56.0,51.0
4,1946-11-02,1947,0,,DTF,WSC,1300.0,1300.0,0.640065,0.359935,1279.6189,1320.3811,,,,,,,33.0,50.0


### Analyze fields

See [FiveThirtyEight NBA Predictions](https://projects.fivethirtyeight.com/2018-nba-predictions/) for explanation of Elo and CARM-Elo

| Name          |  Type         | Description                                                                        |
| ------------- |-------------  | ---------------------------------------------------------------------------------- |
| date          | datetime      |   Game date                                                                        |                              
| season        | int           |   End year of season                                                               |
| neutral       | bool          |   1=neutral home court                                                             |
| playoff       | string        |   q=conference quarterfinals, s=conference semifinals, c=conference final, f=finals|
| team1         | string        |   Home team initials                                                               |
| team2         | string        |   Away team initials                                                               |
| elo1_pre      | double        |   Home team Elo before game                                                        |
| elo2_pre      | double        |   Away team Elo before game                                                        |
| elo_prob1     | double        |   Home team win probability based on pre-game Elo ratings                          |
| elo_prob2     | double        |   Away team win probability based on pre-game Elo ratings                          |
| elo1_post     | double        |   Home team Elo after game                                                         |
| elo2_post     | double        |   Away team Elo after game                                                         |
| carmelo1_pre*  | double        |   Home team CARM-Elo before game                                                   |
| carmelo2_pre*  | double        |   Away team CARM-Elo before game                                                   |
| carmelo1_post* | double        |   Home team CARM-Elo after game                                                    |
| carmelo2_post* | double        |   Away team CARM-Elo after game                                                    |
| carmelo_prob1* | double        |   Home team win probability based on pre-game CARM-Elo ratings                     |
| carmelo_prob2* | double        |   Away team win probability based on pre-game CARM-Elo ratings                     |
| score1        | int           |   Home team final score                                                            |
| score2        | int           |   Away team final score                                                            |

\* Data only available for games from 2015 to 2018 (starting with the 2015-16 season).

#### Determine CARM-Elo starting point

In [None]:
# Stray scratch cell neutralized: the original `df_carm_elo.so` was an unfinished
# expression that referenced `df_carm_elo` before any cell defines it, so it raised
# on a fresh Restart & Run All. This cell is safe to delete.

In [308]:
# Keep only the game identifiers and the four CARM-Elo rating columns.
df_carm_elo = df[
    [
        "date",
        "season",
        "team1",
        "team2",
        "carmelo1_pre",
        "carmelo2_pre",
        "carmelo1_post",
        "carmelo2_post",
    ]
]
# Earliest games that actually carry a CARM-Elo rating.
df_carm_elo.loc[df_carm_elo["carmelo1_pre"].notnull()].sort_values(by="date").head()

Unnamed: 0,date,season,team1,team2,carmelo1_pre,carmelo2_pre,carmelo1_post,carmelo2_post
63157,2015-10-27,2016,ATL,DET,1542.664875,1451.827385,1521.809995,1472.682265
63158,2015-10-27,2016,CHI,CLE,1564.372491,1732.025482,1570.473937,1725.924036
63159,2015-10-27,2016,GSW,NOP,1730.513765,1555.126845,1734.342589,1551.298021
63173,2015-10-28,2016,LAL,MIN,1317.548331,1345.379348,1312.320723,1350.606956
63172,2015-10-28,2016,PHO,DAL,1485.316896,1488.680679,1465.523926,1508.473649


#### CARM-Elo data available starting with 2015-2016 season

## Import historical season data from basketball-reference

[Basketball-Reference Miscellaneous season stats](https://www.basketball-reference.com/leagues/NBA_2018.html#misc_stats::none)

In [309]:
def get_season_data(end_year):
    ''' 
    get cumulative statistics for season specified by end_year
    
    end_year: int, year to query (ex: 2018 queries 2017-2018 season)
    
    returns Pandas dataframe w/ basketball-reference.com's miscellaneous stats table for season specified by end_year,
    plus a "Season" column set to end_year
    
    raises requests.HTTPError if the page request returns an error status,
    ValueError if the miscellaneous stats table cannot be located in the page
    '''
    from io import StringIO
    from bs4 import Comment

    url = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(end_year)
    # timeout so one stalled connection cannot hang a multi-season download loop
    result = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of trying to parse an error page
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")

    wrapper = soup.find('div', id="all_misc_stats")
    if wrapper is None:
        raise ValueError(
            "no 'all_misc_stats' section found for season {}".format(end_year)
        )
    # html tree is strange...table is wrapped inside a comment
    comments = [c for c in wrapper.children if isinstance(c, Comment)]
    if not comments:
        raise ValueError(
            "misc stats table comment not found for season {}".format(end_year)
        )

    # parse table with pandas; the markup is wrapped in StringIO because passing
    # literal HTML strings to read_html is deprecated in recent pandas releases
    season_df = pd.read_html(StringIO(str(comments[0])), header=1)[0]
    season_df["Season"] = end_year
    return season_df

In [328]:
# First and last season (identified by ending year) to download; both bounds inclusive.
start_year = 1951
end_year = 2018
# One HTTP request per season (68 in total), driven by the bounds above rather
# than a second, hard-coded copy of the same years.
dfs_1951_2018 = [get_season_data(year) for year in range(start_year, end_year + 1)]

In [334]:
# Stack the per-season tables row-wise into a single frame.
df_1951_2018 = pd.concat(objs=dfs_1951_2018, axis=0)

In [335]:
# Strip the "*" marker from team names (presumably basketball-reference's
# playoff-team flag — confirm). The vectorized, literal (non-regex) replace also
# tolerates missing values, which the per-element lambda did not.
df_1951_2018["Team"] = df_1951_2018["Team"].str.replace("*", "", regex=False)

In [336]:
# Show up to 30 columns so the wide stats table is not truncated in the preview.
pd.set_option('display.max_columns', 30)
# Key every row by (season, team).
df_1951_2018 = df_1951_2018.set_index(keys=["Season", "Team"])
df_1951_2018.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Rk,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
Season,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
1951,Minneapolis Lakers,1.0,,44.0,24.0,49.0,19.0,5.41,-0.63,4.79,86.4,80.7,94.8,0.356,,0.436,0.373,,,0.262,,,,,Minneapolis Auditorium,,
1951,Philadelphia Warriors,2.0,,40.0,26.0,43.0,23.0,3.76,-0.36,3.4,84.8,81.0,99.3,0.385,,0.425,0.35,,,0.294,,,,,Philadelphia Arena,,
1951,Rochester Royals,3.0,,41.0,27.0,42.0,26.0,2.99,-0.44,2.54,89.3,86.2,92.3,0.418,,0.452,0.378,,,0.315,,,,,Edgerton Park Arena,,
1951,Syracuse Nationals,4.0,,32.0,34.0,34.0,32.0,0.53,0.09,0.62,86.4,85.9,98.1,0.491,,0.435,0.351,,,0.356,,,,,State Fair Coliseum,,
1951,New York Knicks,5.0,,36.0,30.0,34.0,32.0,0.41,0.07,0.49,88.0,87.6,94.8,0.415,,0.445,0.379,,,0.296,,,,,Madison Square Garden (III),,


### Analyze fields

| Name      |  Type  | Description                                                                                   |
| --------- |--------| --------------------------------------------------------------------------------------------- |
| Rk        | double |   Rank (used to index for sorting within webpage)                                             |
| Age       | double |   Average age of the team's players on February 1st of that season.                           |
| W         | double |   Wins                                                                                        |
| L         | double |   Losses                                                                                      |
| PW        | double |   Pythagorean wins, i.e., expected wins based on points scored and allowed                    |
| PL        | double |   Pythagorean losses, i.e., expected losses based on points scored and allowed                |
| MOV       | double |   Margin of Victory                                                                           |
| SOS       | double |   Strength of Schedule; a rating of strength of schedule. The rating is denominated in points above/below average, where zero is average.                                                                          |
| SRS       | double |   Simple Rating System; a team rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average.                |
| ORtg      | double |   An estimate of points produced (players) or scored (teams) per 100 possessions              |  
| DRtg      | double |   An estimate of points allowed per 100 possessions                                           |
| Pace      | double |   An estimate of possessions per 48 minutes                                                   |
| FTr       | double |   Number of FT Attempts Per FG Attempt                                                        |
| 3PAr      | double |   Percentage of FG Attempts from 3-Point Range                                                |
| TS%       | double |   True Shooting Percentage; a measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws.                                                                    |
| eFG%      | double |   Effective Field Goal Percentage; adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.                                                                               |
| TOV%      | double |   An estimate of turnovers committed per 100 plays.                                           |
| ORB%      | double |   An estimate of the percentage of available offensive rebounds a team grabbed.               |
| FT/FGA    | double |   Free Throws Per Field Goal Attempt                                                          |
| eFG%.1    | double |   Opponent Effective Field Goal Percentage                                                    |
| TOV%.1    | double |   Opponent Turnover Percentage                                                                |
| DRB%      | double |   An estimate of the percentage of available defensive rebounds a team grabbed.               |
| FT/FGA.1  | double |   Opponent Free Throws Per Field Goal Attempt                                                 |
| Arena     | string |   Home Arena                                                                                  |
| Attend.   | double |   Cumulative home attendance                                                                  |
| Attend./G | double |   Attendance per home game at the team's primary arena                                        |