# Beating the NBA Bookmakers Odds

In [1]:
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Loading the Data and Preprocessing

In [3]:
boxscores = pd.read_csv("data/games.csv", index_col=0)

display(boxscores.head())
display(boxscores.tail())

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False


Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
17767,240.0,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,...,34.2,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True
17768,240.0,240.0,37.0,74.0,0.5,13.0,25.0,0.52,26.0,37.0,...,25.0,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True
17769,240.0,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,...,25.6,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False
17770,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True
17771,240.0,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,...,51.5,36.2,141.0,114.0,BOS,117,0,2020,2020-09-19,False


In [4]:
boxscore = boxscores.sort_values("date")
boxscores = boxscores.reset_index(drop=True)
del boxscores["mp.1"]
del boxscores["mp_opp.1"]
del boxscores["index_opp"]

In [5]:
display(boxscores.head())
display(boxscores.tail())

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,0.778,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,0.842,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.75,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
17767,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,0.75,...,34.2,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True
17768,240.0,37.0,74.0,0.5,13.0,25.0,0.52,26.0,37.0,0.703,...,25.0,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True
17769,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,0.5,...,25.6,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False
17770,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,0.867,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True
17771,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,0.824,...,51.5,36.2,141.0,114.0,BOS,117,0,2020,2020-09-19,False


In [6]:
# Creating the Target Variable to Preiict the Winner of the Game
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

boxscores = boxscores.groupby("team", group_keys=False).apply(add_target)

boxscores 

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,39.0,81.0,0.481,6.0,20.0,0.300,14.0,18.0,0.778,...,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True,True
1,240.0,36.0,100.0,0.360,7.0,31.0,0.226,16.0,19.0,0.842,...,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False,False
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False,True
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True,False
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.750,...,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,0.750,...,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True,True
17768,240.0,37.0,74.0,0.500,13.0,25.0,0.520,26.0,37.0,0.703,...,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True,
17769,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,0.500,...,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False,
17770,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,0.867,...,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True,


In [7]:
boxscores["target"][pd.isnull(boxscores["target"])] = 2
boxscores["target"] = boxscores["target"].astype(int, errors="ignore")
boxscores 

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,39.0,81.0,0.481,6.0,20.0,0.300,14.0,18.0,0.778,...,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True,1
1,240.0,36.0,100.0,0.360,7.0,31.0,0.226,16.0,19.0,0.842,...,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False,0
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False,1
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True,0
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.750,...,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,0.750,...,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True,1
17768,240.0,37.0,74.0,0.500,13.0,25.0,0.520,26.0,37.0,0.703,...,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True,2
17769,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,0.500,...,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False,2
17770,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,0.867,...,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True,2


In [8]:
display(boxscores[boxscores["team"] == "NYK"].head)
display(boxscores[boxscores["team"] == "LAL"].head)
display(boxscores[boxscores["team"] == "BOS"].head)


<bound method NDFrame.head of           mp    fg   fga    fg%    3p   3pa    3p%    ft   fta    ft%  ...  \
114    240.0  36.0  86.0  0.419   3.0  18.0  0.167  16.0  20.0  0.800  ...   
121    240.0  41.0  88.0  0.466   3.0  18.0  0.167  25.0  26.0  0.962  ...   
206    240.0  39.0  91.0  0.429   8.0  28.0  0.286  22.0  32.0  0.688  ...   
220    240.0  39.0  87.0  0.448  12.0  29.0  0.414  10.0  12.0  0.833  ...   
228    240.0  36.0  82.0  0.439  13.0  30.0  0.433   7.0  11.0  0.636  ...   
...      ...   ...   ...    ...   ...   ...    ...   ...   ...    ...  ...   
17636  240.0  36.0  88.0  0.409   5.0  22.0  0.227  18.0  20.0  0.900  ...   
17639  240.0  40.0  82.0  0.488  13.0  32.0  0.406  18.0  24.0  0.750  ...   
17701  240.0  40.0  88.0  0.455   7.0  21.0  0.333  19.0  24.0  0.792  ...   
17745  240.0  37.0  82.0  0.451   9.0  28.0  0.321  14.0  15.0  0.933  ...   
17765  240.0  35.0  97.0  0.361   8.0  29.0  0.276  17.0  21.0  0.810  ...   

       usg%_max_opp  ortg_max_opp

<bound method NDFrame.head of           mp    fg   fga    fg%    3p   3pa    3p%    ft   fta    ft%  ...  \
37     240.0  34.0  82.0  0.415  11.0  40.0  0.275  17.0  23.0  0.739  ...   
44     240.0  39.0  78.0  0.500  10.0  30.0  0.333  28.0  43.0  0.651  ...   
55     240.0  32.0  90.0  0.356   4.0  19.0  0.211  13.0  17.0  0.765  ...   
58     240.0  43.0  91.0  0.473   8.0  26.0  0.308  16.0  23.0  0.696  ...   
74     240.0  43.0  90.0  0.478  10.0  30.0  0.333  14.0  17.0  0.824  ...   
...      ...   ...   ...    ...   ...   ...    ...   ...   ...    ...  ...   
17547  240.0  31.0  86.0  0.360   6.0  23.0  0.261  22.0  26.0  0.846  ...   
17585  240.0  43.0  86.0  0.500  14.0  29.0  0.483  23.0  28.0  0.821  ...   
17586  240.0  36.0  82.0  0.439   7.0  23.0  0.304  17.0  23.0  0.739  ...   
17592  240.0  31.0  83.0  0.373   3.0  21.0  0.143  12.0  16.0  0.750  ...   
17615  240.0  47.0  85.0  0.553  19.0  41.0  0.463  26.0  30.0  0.867  ...   

       usg%_max_opp  ortg_max_opp

<bound method NDFrame.head of           mp    fg   fga    fg%    3p   3pa    3p%    ft   fta    ft%  ...  \
43     240.0  42.0  91.0  0.462  17.0  42.0  0.405  17.0  25.0  0.680  ...   
135    240.0  42.0  86.0  0.488  16.0  32.0  0.500  17.0  23.0  0.739  ...   
141    240.0  45.0  83.0  0.542  12.0  33.0  0.364  26.0  28.0  0.929  ...   
176    240.0  30.0  81.0  0.370   7.0  25.0  0.280  26.0  36.0  0.722  ...   
181    240.0  42.0  89.0  0.472  12.0  31.0  0.387  12.0  14.0  0.857  ...   
...      ...   ...   ...    ...   ...   ...    ...   ...   ...    ...  ...   
17662  240.0  36.0  89.0  0.404  12.0  34.0  0.353  15.0  17.0  0.882  ...   
17677  240.0  39.0  93.0  0.419  12.0  39.0  0.308  15.0  19.0  0.789  ...   
17690  240.0  28.0  88.0  0.318   5.0  28.0  0.179  11.0  12.0  0.917  ...   
17734  265.0  39.0  97.0  0.402  14.0  37.0  0.378  24.0  31.0  0.774  ...   
17770  240.0  41.0  85.0  0.482   9.0  26.0  0.346  26.0  30.0  0.867  ...   

       usg%_max_opp  ortg_max_opp

In [9]:
display(boxscores["won"].value_counts())
# display(boxscores["target"].value_counts())

True     8886
False    8886
Name: won, dtype: int64

In [10]:
# Determine the columns that can be trained on
# We will ignore the columns that are not available at the start of the game (that are null)

nulls = pd.isnull(boxscores).sum()
nulls = nulls[nulls > 0]
non_null_columns = boxscores.columns[~boxscores.columns.isin(nulls.index)]

non_null_columns


Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [11]:
boxscores = boxscores[non_null_columns].copy()
display(boxscores.head())
display(boxscores.tail())

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,0.778,...,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True,1
1,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,0.842,...,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False,0
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False,1
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True,0
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.75,...,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False,1


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
17767,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,0.75,...,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True,1
17768,240.0,37.0,74.0,0.5,13.0,25.0,0.52,26.0,37.0,0.703,...,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True,2
17769,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,0.5,...,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False,2
17770,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,0.867,...,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True,2
17771,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,0.824,...,36.2,141.0,114.0,BOS,117,0,2020,2020-09-19,False,2


In [12]:
# Min-Max Scaler:
# We will use the Min-Max Scaler to all statistical features in the dataset to get them wihtin a range of 0 to 1
# We do this to normalize and reduces the variance of the data while preserving the relationships between the different 
# features as well as the relative order, distribution, and distnace of the datapoints

scaler = MinMaxScaler()

non_stat_columns = ["season", "date", "won", "target", "team", "team_opp"]
stat_columns = boxscores.columns[~boxscores.columns.isin(non_stat_columns)]

boxscores[stat_columns] = scaler.fit_transform(boxscores[stat_columns])

display(boxscores.head())
display(boxscores.tail())

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.454545,0.308824,0.507177,0.206897,0.242424,0.356295,0.302326,0.269841,0.740957,...,0.088575,0.421801,0.4,DAL,0.298077,1.0,2016,2015-12-09,True,1
1,0.0,0.386364,0.588235,0.217703,0.241379,0.409091,0.268409,0.348837,0.285714,0.815636,...,0.134788,0.298578,0.4,ATL,0.326923,0.0,2016,2015-12-09,False,0
2,0.0,0.409091,0.367647,0.397129,0.275862,0.227273,0.5,0.372093,0.349206,0.695449,...,0.112965,0.279621,0.458824,SAS,0.413462,1.0,2018,2017-10-18,False,1
3,0.0,0.5,0.426471,0.45933,0.275862,0.257576,0.452494,0.372093,0.285714,0.87748,...,0.112965,0.232227,0.482353,MIN,0.336538,0.0,2018,2017-10-18,True,0
4,0.0,0.181818,0.382353,0.107656,0.206897,0.333333,0.274347,0.325581,0.301587,0.708285,...,0.112965,0.322275,0.152941,MEM,0.269231,1.0,2021,2021-04-30,False,1


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
17767,0.0,0.363636,0.308824,0.389952,0.37931,0.333333,0.502375,0.604651,0.555556,0.708285,...,0.148909,0.336493,0.482353,OKC,0.269231,0.0,2019,2018-10-19,True,1
17768,0.0,0.409091,0.205882,0.552632,0.448276,0.318182,0.617577,0.581395,0.571429,0.653442,...,0.101412,0.236967,0.611765,ORL,0.423077,1.0,2017,2016-12-14,True,2
17769,0.0,0.522727,0.426471,0.485646,0.482759,0.439394,0.503563,0.209302,0.301587,0.416569,...,0.100128,0.407583,0.576471,LAC,0.471154,0.0,2017,2016-12-14,False,2
17770,0.0,0.5,0.367647,0.509569,0.310345,0.333333,0.410926,0.581395,0.460317,0.844807,...,0.064185,0.2891,0.576471,MIA,0.403846,1.0,2020,2020-09-19,True,2
17771,0.0,0.318182,0.367647,0.284689,0.413793,0.606061,0.324228,0.627907,0.52381,0.794632,...,0.181001,0.246445,0.435294,BOS,0.509615,0.0,2020,2020-09-19,False,2


In [13]:
boxscores.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [14]:
# In our model we will be training on data based on an n-game lookback window (rolling average)
# Our objective in doing so is to better model and take into account the team's recent form and performance 
# when predicting the outcome of a game

# We will create a function that will create a rolling average of the data for each team for each game
bs_rolling = boxscores[list(stat_columns) + ["won", "team", "season"]]

def team_averages(group):
    group[stat_columns] = group[stat_columns].rolling(10).mean()
    return group

# We will then apply this function to each team in the dataset
bs_rolling = bs_rolling.groupby(["team", "season"], group_keys=False).apply(team_averages)


In [15]:
bs_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,,,,,,,,,,,...,,,,,,,,True,ATL,2016
1,,,,,,,,,,,...,,,,,,,,False,DAL,2016
2,,,,,,,,,,,...,,,,,,,,False,MIN,2018
3,,,,,,,,,,,...,,,,,,,,True,SAS,2018
4,,,,,,,,,,,...,,,,,,,,False,ORL,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.025,0.434091,0.413235,0.392344,0.389655,0.378788,0.465083,0.553488,0.461905,0.803734,...,0.0699,0.282914,0.215019,0.408057,0.480000,0.525000,0.4,True,LAC,2019
17768,0.000,0.465909,0.295588,0.533014,0.382759,0.346970,0.496200,0.460465,0.412698,0.753092,...,0.0520,0.334696,0.122593,0.329858,0.544706,0.365385,0.4,True,LAC,2017
17769,0.050,0.495455,0.419118,0.455742,0.351724,0.343939,0.446793,0.316279,0.304762,0.698250,...,0.0821,0.307547,0.127599,0.409005,0.465882,0.398077,0.5,False,ORL,2017
17770,0.025,0.502273,0.427941,0.460526,0.427586,0.463636,0.436698,0.393023,0.322222,0.800000,...,0.1086,0.322117,0.150578,0.440284,0.528235,0.420192,0.3,True,BOS,2020


In [16]:
bs_rolling.columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'blk%_max_opp', 'tov%_max_opp', 'usg%_max_opp', 'ortg_max_opp',
       'drtg_max_opp', 'total_opp', 'home_opp', 'won', 'team', 'season'],
      dtype='object', length=139)

In [17]:
rolling_cols = [f"{col}_10" for col in bs_rolling.columns]
bs_rolling.columns = rolling_cols
boxscores = pd.concat([boxscores, bs_rolling], axis=1)

boxscores = boxscores.dropna()
boxscores = boxscores.reset_index(drop=True)

boxscores

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,team_10,season_10
0,0.0,0.431818,0.264706,0.521531,0.275862,0.272727,0.432304,0.325581,0.301587,0.708285,...,0.0735,0.389937,0.243261,0.434123,0.485882,0.375962,0.6,True,MIL,2018
1,0.0,0.272727,0.235294,0.332536,0.275862,0.363636,0.339667,0.372093,0.349206,0.695449,...,0.0535,0.294444,0.165854,0.337441,0.468235,0.358654,0.5,False,SAS,2018
2,0.0,0.681818,0.529412,0.576555,0.206897,0.333333,0.274347,0.279070,0.206349,0.917153,...,0.0527,0.299790,0.160847,0.342180,0.490588,0.360577,0.5,True,SAS,2018
3,0.0,0.568182,0.544118,0.442584,0.724138,0.606061,0.566508,0.209302,0.174603,0.805134,...,0.0639,0.400105,0.175225,0.462085,0.560000,0.337500,0.6,False,MIA,2022
4,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.1438,0.320335,0.151091,0.432701,0.512941,0.372115,0.7,True,GSW,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15877,0.0,0.363636,0.308824,0.389952,0.379310,0.333333,0.502375,0.604651,0.555556,0.708285,...,0.0699,0.282914,0.215019,0.408057,0.480000,0.525000,0.4,True,LAC,2019
15878,0.0,0.409091,0.205882,0.552632,0.448276,0.318182,0.617577,0.581395,0.571429,0.653442,...,0.0520,0.334696,0.122593,0.329858,0.544706,0.365385,0.4,True,LAC,2017
15879,0.0,0.522727,0.426471,0.485646,0.482759,0.439394,0.503563,0.209302,0.301587,0.416569,...,0.0821,0.307547,0.127599,0.409005,0.465882,0.398077,0.5,False,ORL,2017
15880,0.0,0.500000,0.367647,0.509569,0.310345,0.333333,0.410926,0.581395,0.460317,0.844807,...,0.1086,0.322117,0.150578,0.440284,0.528235,0.420192,0.3,True,BOS,2020


In [18]:
# We are now shifting columns to determine the next opponent and game date for each team per instance

def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

boxscores["home_next"] = add_col(boxscores, "home")
boxscores["team_opp_next"] = add_col(boxscores, "team_opp")
boxscores["date_next"] = add_col(boxscores, "date")

boxscores

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,team_10,season_10,home_next,team_opp_next,date_next
0,0.0,0.431818,0.264706,0.521531,0.275862,0.272727,0.432304,0.325581,0.301587,0.708285,...,0.434123,0.485882,0.375962,0.6,True,MIL,2018,1.0,IND,2018-01-03
1,0.0,0.272727,0.235294,0.332536,0.275862,0.363636,0.339667,0.372093,0.349206,0.695449,...,0.337441,0.468235,0.358654,0.5,False,SAS,2018,1.0,MIN,2018-03-17
2,0.0,0.681818,0.529412,0.576555,0.206897,0.333333,0.274347,0.279070,0.206349,0.917153,...,0.342180,0.490588,0.360577,0.5,True,SAS,2018,0.0,NYK,2021-05-13
3,0.0,0.568182,0.544118,0.442584,0.724138,0.606061,0.566508,0.209302,0.174603,0.805134,...,0.462085,0.560000,0.337500,0.6,False,MIA,2022,1.0,ATL,2022-04-19
4,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.432701,0.512941,0.372115,0.7,True,GSW,2022,1.0,HOU,2016-04-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15877,0.0,0.363636,0.308824,0.389952,0.379310,0.333333,0.502375,0.604651,0.555556,0.708285,...,0.408057,0.480000,0.525000,0.4,True,LAC,2019,0.0,ORL,2016-12-14
15878,0.0,0.409091,0.205882,0.552632,0.448276,0.318182,0.617577,0.581395,0.571429,0.653442,...,0.329858,0.544706,0.365385,0.4,True,LAC,2017,,,
15879,0.0,0.522727,0.426471,0.485646,0.482759,0.439394,0.503563,0.209302,0.301587,0.416569,...,0.409005,0.465882,0.398077,0.5,False,ORL,2017,,,
15880,0.0,0.500000,0.367647,0.509569,0.310345,0.333333,0.410926,0.581395,0.460317,0.844807,...,0.440284,0.528235,0.420192,0.3,True,BOS,2020,,,


In [19]:
# We are creating a matchup dataframe by merging the home and opponenet team data for each game

matchup = boxscores.merge(boxscores[rolling_cols + ["team_opp_next", "date_next", "team"]], 
                          left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

matchup

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,team_10_y,season_10_y,team_opp_next_y,team_y
0,0.00,0.477273,0.426471,0.430622,0.379310,0.409091,0.421615,0.069767,0.079365,0.611435,...,0.176380,0.632701,0.523529,0.363462,0.4,False,DET,2016,IND,DET
1,0.00,0.272727,0.308824,0.272727,0.448276,0.439394,0.467933,0.302326,0.365079,0.513419,...,0.149294,0.457820,0.425882,0.384615,0.5,False,IND,2018,DET,IND
2,0.00,0.545455,0.338235,0.595694,0.413793,0.348485,0.527316,0.348837,0.380952,0.579930,...,0.121566,0.390521,0.441176,0.381731,0.5,False,MIL,2018,DET,MIL
3,0.00,0.409091,0.308824,0.449761,0.482759,0.530303,0.426366,0.372093,0.333333,0.735123,...,0.170218,0.408531,0.502353,0.341346,0.8,True,GSW,2022,DAL,GSW
4,0.00,0.477273,0.558824,0.332536,0.379310,0.500000,0.352732,0.093023,0.095238,0.666278,...,0.138126,0.366825,0.371765,0.375000,0.7,True,BRK,2016,PHI,BRK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15533,0.25,0.522727,0.544118,0.392344,0.275862,0.378788,0.327791,0.465116,0.476190,0.623104,...,0.266496,0.357346,0.450588,0.454808,0.3,False,NYK,2017,MIL,NYK
15534,0.25,0.409091,0.485294,0.308612,0.241379,0.272727,0.377672,0.581395,0.507937,0.752625,...,0.183569,0.577725,0.535294,0.490385,0.4,False,DET,2020,PHO,DET
15535,0.00,0.454545,0.250000,0.566986,0.310345,0.272727,0.485748,0.558140,0.412698,0.913652,...,0.129782,0.418483,0.464706,0.380769,0.3,True,LAC,2016,MEM,LAC
15536,0.00,0.568182,0.294118,0.672249,0.482759,0.469697,0.475059,0.372093,0.269841,0.934656,...,0.145058,0.338863,0.482353,0.423077,0.6,False,OKC,2018,LAC,OKC


In [20]:
matchup[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,IND,DET,DET,IND,2017-11-08
1,DET,IND,IND,DET,2017-11-08
2,DET,MIL,MIL,DET,2017-12-06
3,DAL,GSW,GSW,DAL,2022-01-05
4,PHI,BRK,BRK,PHI,2016-03-15
...,...,...,...,...,...
15533,MIL,NYK,NYK,MIL,2018-12-25
15534,PHO,DET,DET,PHO,2022-01-16
15535,MEM,LAC,LAC,MEM,2021-02-26
15536,LAC,OKC,OKC,LAC,2018-10-19


In [21]:
non_stat_columns = list(matchup.columns[matchup.dtypes == "object"]) + non_stat_columns

non_stat_columns

['team_x',
 'team_opp',
 'date',
 'team_10_x',
 'team_opp_next_x',
 'date_next',
 'team_10_y',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [22]:
# We are now splitting our matchup data into the training and target matrices
# We will be removing the non-statistical columns from the training matrix and using this as X
# We will be using the target column as y

X_columns = matchup.columns[~matchup.columns.isin(non_stat_columns)]
X = matchup[X_columns]
y = matchup["target"]

In [23]:
X

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10_y,blk%_max_opp_10_y,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y
0,0.00,0.477273,0.426471,0.430622,0.379310,0.409091,0.421615,0.069767,0.079365,0.611435,...,0.0658,0.0801,0.314256,0.176380,0.632701,0.523529,0.363462,0.4,False,2016
1,0.00,0.272727,0.308824,0.272727,0.448276,0.439394,0.467933,0.302326,0.365079,0.513419,...,0.0591,0.1264,0.406709,0.149294,0.457820,0.425882,0.384615,0.5,False,2018
2,0.00,0.545455,0.338235,0.595694,0.413793,0.348485,0.527316,0.348837,0.380952,0.579930,...,0.0435,0.0727,0.331237,0.121566,0.390521,0.441176,0.381731,0.5,False,2018
3,0.00,0.409091,0.308824,0.449761,0.482759,0.530303,0.426366,0.372093,0.333333,0.735123,...,0.0510,0.0871,0.246436,0.170218,0.408531,0.502353,0.341346,0.8,True,2022
4,0.00,0.477273,0.558824,0.332536,0.379310,0.500000,0.352732,0.093023,0.095238,0.666278,...,0.0828,0.0943,0.353145,0.138126,0.366825,0.371765,0.375000,0.7,True,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15533,0.25,0.522727,0.544118,0.392344,0.275862,0.378788,0.327791,0.465116,0.476190,0.623104,...,0.1096,0.1056,0.346541,0.266496,0.357346,0.450588,0.454808,0.3,False,2017
15534,0.25,0.409091,0.485294,0.308612,0.241379,0.272727,0.377672,0.581395,0.507937,0.752625,...,0.0586,0.0920,0.326310,0.183569,0.577725,0.535294,0.490385,0.4,False,2020
15535,0.00,0.454545,0.250000,0.566986,0.310345,0.272727,0.485748,0.558140,0.412698,0.913652,...,0.0633,0.0403,0.589099,0.129782,0.418483,0.464706,0.380769,0.3,True,2016
15536,0.00,0.568182,0.294118,0.672249,0.482759,0.469697,0.475059,0.372093,0.269841,0.934656,...,0.0593,0.0923,0.422222,0.145058,0.338863,0.482353,0.423077,0.6,False,2018


In [24]:
y

0        1
1        1
2        1
3        0
4        0
        ..
15533    1
15534    1
15535    0
15536    1
15537    1
Name: target, Length: 15538, dtype: int32

In [25]:
y.value_counts()

1    7772
0    7766
Name: target, dtype: int64

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Building the RNN Model

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim

import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class TransformerClassifier(nn.Module):
    def __init__(self, num_features, num_classes, dim_model=64, num_heads=4, num_layers=3, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        self.linear_in = nn.Linear(num_features, dim_model)
        self.pos_encoder = PositionalEncoding(dim_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model=dim_model, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.linear_out = nn.Linear(dim_model, num_classes)

    def forward(self, x):
        x = self.linear_in(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.linear_out(x)
        return x




In [28]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, device):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.device = device

    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(self.device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :]) 

        return out

input_dim = 413    # Number of input features
hidden_dim = 100   # Example value, you might need to experiment with this
layer_dim = 2      # Example value, can be 1, 2, or more
output_dim = 2     # For binary classification




In [29]:
import numpy as np


X_train = np.array(X_train.values, dtype = np.float32)
y_train = np.array(y_train.values, dtype = np.float32)

In [79]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.optim.lr_scheduler import StepLR



X_tensor = torch.tensor(X_train)
print(X_tensor.dtype)
y_tensor = torch.tensor(y_train)
# Create DataLoader for batch processing
batch_size = 32
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Instantiate the model and move it to the device (GPU or CPU)
model = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim, device).to(device)
# model = TransformerClassifier(num_features=input_dim, num_classes=2, dim_model=128, num_heads=8, num_layers=4, dropout=0.2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# scheduler = StepLR(optimizer, step_size=10, gamma=0.1)


num_epochs=25

# Function to calculate accuracy
def calculate_accuracy(y_true, y_pred):
    predicted = torch.argmax(y_pred, dim=1)
    correct = (predicted == y_true).float().sum()
    return correct / y_true.shape[0]

# Training loop
num_epochs = 10
best_accuracy = 0.0

for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    all_predictions = []
    all_targets = []
    for batch, (inputs, targets) in enumerate(dataloader):
        # Move data to device
        inputs, targets = inputs.unsqueeze(1).to(device).float(), targets.to(device).long()

        #inputs.shape should be [32, 1, 413]

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # scheduler.step()


        # Calculate batch loss and accuracy
        total_loss += loss.item()
        accuracy = calculate_accuracy(targets, outputs)
        total_accuracy += accuracy.item()

        # Collect predictions and true labels
        _, predicted = torch.max(outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

    # Calculate metrics
    average_loss = total_loss / len(dataloader)
    average_accuracy = total_accuracy / len(dataloader)
    precision = precision_score(all_targets, all_predictions)
    recall = recall_score(all_targets, all_predictions)
    f1 = f1_score(all_targets, all_predictions)

    # Print metrics
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Accuracy: {average_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Save the model if it has the best accuracy so far
    if average_accuracy > best_accuracy:
        best_accuracy = average_accuracy
        torch.save(model.state_dict(), 'best_lstm_classifier.pth')
        print(f"Saved model with accuracy: {best_accuracy:.4f}")

# Save the model after training
torch.save(model.state_dict(), 'lstm_classifier.pth')

torch.float32
Using device: cpu
Epoch 1/10, Loss: 0.6939, Accuracy: 0.4934, Precision: 0.4940, Recall: 0.4507, F1 Score: 0.4713
Saved model with accuracy: 0.4934
Epoch 2/10, Loss: 0.6934, Accuracy: 0.5020, Precision: 0.5023, Recall: 0.5320, F1 Score: 0.5167
Saved model with accuracy: 0.5020
Epoch 3/10, Loss: 0.6932, Accuracy: 0.5056, Precision: 0.5068, Recall: 0.5029, F1 Score: 0.5048
Saved model with accuracy: 0.5056
Epoch 4/10, Loss: 0.6934, Accuracy: 0.4921, Precision: 0.4919, Recall: 0.4299, F1 Score: 0.4588
Epoch 5/10, Loss: 0.6932, Accuracy: 0.5038, Precision: 0.5114, Recall: 0.2089, F1 Score: 0.2966
Epoch 6/10, Loss: 0.6932, Accuracy: 0.5006, Precision: 0.5007, Recall: 0.8135, F1 Score: 0.6199
Epoch 7/10, Loss: 0.6932, Accuracy: 0.5034, Precision: 0.5056, Recall: 0.3469, F1 Score: 0.4115
Epoch 8/10, Loss: 0.6932, Accuracy: 0.4945, Precision: 0.4969, Recall: 0.7434, F1 Score: 0.5956
Epoch 9/10, Loss: 0.6932, Accuracy: 0.4983, Precision: 0.4993, Recall: 0.5479, F1 Score: 0.5224
Ep

In [80]:
from sklearn.metrics import classification_report

# Convert your test data to a type that TensorFlow supports
X_test = torch.tensor(np.array(X_test, dtype = np.float32))
y_test = torch.tensor(np.array(y_test, dtype = np.float32))
print(X_test.shape, y_test.shape)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    
    inputs, targets = X_test.unsqueeze(1).to(device).float(), y_test.to(device).long()
    outputs = model(inputs)
    _, predicted = torch.max(outputs.data, 1)
    total += y_test.size(0)
    correct += (predicted == targets).sum().item()


accuracy = accuracy_score(y_test, np.array(predicted.cpu()))
print("Accuracy:", accuracy)

print(classification_report(y_test, np.array(predicted.cpu())))
print(model)


torch.Size([3108, 413]) torch.Size([3108])
Accuracy: 0.5019305019305019
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67      1560
         1.0       0.00      0.00      0.00      1548

    accuracy                           0.50      3108
   macro avg       0.25      0.50      0.33      3108
weighted avg       0.25      0.50      0.34      3108

LSTMClassifier(
  (lstm): LSTM(413, 100, num_layers=2, batch_first=True)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)


# Load in and Simulate Model with Betting Odds

In [35]:
bets = pd.read_csv("data/games.csv", index_col=0)
bets = bets[bets['season'] == 2020]
bets = bets.sort_values("date")
bets = bets[(bets['date'] >= '2019-10-01') & (bets['date'] <= '2020-01-26')]
bets

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
976,240.0,240.0,37.0,85.0,0.435,13.0,33.0,0.394,15.0,21.0,...,22.2,42.4,203.0,116.0,LAC,112,1,2020,2019-10-22,False
977,240.0,240.0,42.0,81.0,0.519,11.0,31.0,0.355,17.0,24.0,...,25.0,35.8,300.0,125.0,LAL,102,0,2020,2019-10-22,True
7883,265.0,265.0,42.0,103.0,0.408,14.0,40.0,0.350,32.0,38.0,...,25.0,37.7,146.0,124.0,NOP,122,0,2020,2019-10-22,True
7882,265.0,265.0,43.0,102.0,0.422,19.0,45.0,0.422,17.0,20.0,...,22.2,35.7,158.0,114.0,TOR,130,1,2020,2019-10-22,False
16465,240.0,240.0,49.0,98.0,0.500,11.0,33.0,0.333,15.0,18.0,...,53.2,36.6,144.0,121.0,SAC,95,0,2020,2019-10-23,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14969,240.0,240.0,54.0,93.0,0.581,15.0,35.0,0.429,29.0,34.0,...,17.9,35.0,181.0,148.0,WAS,133,0,2020,2020-01-26,True
14968,240.0,240.0,42.0,88.0,0.477,11.0,33.0,0.333,38.0,42.0,...,38.1,36.4,168.0,131.0,ATL,152,1,2020,2020-01-26,False
14267,240.0,240.0,44.0,92.0,0.478,16.0,36.0,0.444,19.0,29.0,...,50.0,33.8,160.0,127.0,BOS,108,0,2020,2020-01-26,True
14266,240.0,240.0,41.0,89.0,0.461,8.0,33.0,0.242,18.0,22.0,...,39.3,33.7,173.0,111.0,NOP,123,1,2020,2020-01-26,False


In [36]:
display(bets[bets['date'] == '2019-10-22'])
display(bets[bets['date'] == '2019-10-22']['team'])

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
976,240.0,240.0,37.0,85.0,0.435,13.0,33.0,0.394,15.0,21.0,...,22.2,42.4,203.0,116.0,LAC,112,1,2020,2019-10-22,False
977,240.0,240.0,42.0,81.0,0.519,11.0,31.0,0.355,17.0,24.0,...,25.0,35.8,300.0,125.0,LAL,102,0,2020,2019-10-22,True
7883,265.0,265.0,42.0,103.0,0.408,14.0,40.0,0.35,32.0,38.0,...,25.0,37.7,146.0,124.0,NOP,122,0,2020,2019-10-22,True
7882,265.0,265.0,43.0,102.0,0.422,19.0,45.0,0.422,17.0,20.0,...,22.2,35.7,158.0,114.0,TOR,130,1,2020,2019-10-22,False


976     LAL
977     LAC
7883    TOR
7882    NOP
Name: team, dtype: object

In [37]:
odds = pd.read_csv("data/nbaodds2019-20.csv", index_col=0)
odds

Unnamed: 0_level_0,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1022,501,V,NewOrleans,30,31,25,31,122,231.5,229.5,230,113
1022,502,H,Toronto,27,29,32,29,130,6.5,6.5,-280,6
1022,503,V,LALakers,25,29,31,17,102,227,3.5,-180,5
1022,504,H,LAClippers,22,40,23,27,112,1.5,224,150,110.5
1023,505,V,Detroit,27,27,29,36,119,210,211,240,104
...,...,...,...,...,...,...,...,...,...,...,...,...
126,522,H,NewYork,30,25,30,25,110,221.5,2.5,-140,0.5
126,523,V,Washington,33,39,30,31,133,240,241.5,-105,1.5
126,524,H,Atlanta,31,47,41,33,152,1.5,1,-115,119
126,525,V,Indiana,27,29,37,36,129,220.5,221,130,1.5


In [38]:
display(sorted(bets['team'].unique()))

['ATL',
 'BOS',
 'BRK',
 'CHI',
 'CHO',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GSW',
 'HOU',
 'IND',
 'LAC',
 'LAL',
 'MEM',
 'MIA',
 'MIL',
 'MIN',
 'NOP',
 'NYK',
 'OKC',
 'ORL',
 'PHI',
 'PHO',
 'POR',
 'SAC',
 'SAS',
 'TOR',
 'UTA',
 'WAS']

In [39]:
team_abbreviations = {
    'Atlanta': 'ATL',
    'Boston': 'BOS',
    'Brooklyn': 'BRK',
    'Charlotte': 'CHO',
    'Chicago': 'CHI',
    'Cleveland': 'CLE',
    'Dallas': 'DAL',
    'Denver': 'DEN',
    'Detroit': 'DET',
    'GoldenState': 'GSW',
    'Houston': 'HOU',
    'Indiana': 'IND',
    'LAClippers': 'LAC',
    'LALakers': 'LAL',
    'Memphis': 'MEM',
    'Miami': 'MIA',
    'Milwaukee': 'MIL',
    'Minnesota': 'MIN',
    'NewOrleans': 'NOP',
    'NewYork': 'NYK',
    'OklahomaCity': 'OKC',
    'Orlando': 'ORL',
    'Philadelphia': 'PHI',
    'Phoenix': 'PHO',
    'Portland': 'POR',
    'Sacramento': 'SAC',
    'SanAntonio': 'SAS',
    'Toronto': 'TOR',
    'Utah': 'UTA',
    'Washington': 'WAS'
}

odds['Team'] = odds['Team'].map(team_abbreviations)
odds


Unnamed: 0_level_0,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1022,501,V,NOP,30,31,25,31,122,231.5,229.5,230,113
1022,502,H,TOR,27,29,32,29,130,6.5,6.5,-280,6
1022,503,V,LAL,25,29,31,17,102,227,3.5,-180,5
1022,504,H,LAC,22,40,23,27,112,1.5,224,150,110.5
1023,505,V,DET,27,27,29,36,119,210,211,240,104
...,...,...,...,...,...,...,...,...,...,...,...,...
126,522,H,NYK,30,25,30,25,110,221.5,2.5,-140,0.5
126,523,V,WAS,33,39,30,31,133,240,241.5,-105,1.5
126,524,H,ATL,31,47,41,33,152,1.5,1,-115,119
126,525,V,IND,27,29,37,36,129,220.5,221,130,1.5


In [40]:
num_nan = odds['Team'].isnull().sum()
print(num_nan)


0


## Testing on this Seasons Data and Simulating the Bet Predictions
### We will assume a standard $100 bet per game, and will not include any forms of transaction costs
### We will determine the overall performance of the model the profit/loss from betting $100 on every game this season
### using our model thus far

In [41]:
bets = bets.sort_values("date")
bets = bets.reset_index(drop=True)
bets

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,85.0,0.435,13.0,33.0,0.394,15.0,21.0,...,22.2,42.4,203.0,116.0,LAC,112,1,2020,2019-10-22,False
1,240.0,240.0,42.0,81.0,0.519,11.0,31.0,0.355,17.0,24.0,...,25.0,35.8,300.0,125.0,LAL,102,0,2020,2019-10-22,True
2,265.0,265.0,42.0,103.0,0.408,14.0,40.0,0.350,32.0,38.0,...,25.0,37.7,146.0,124.0,NOP,122,0,2020,2019-10-22,True
3,265.0,265.0,43.0,102.0,0.422,19.0,45.0,0.422,17.0,20.0,...,22.2,35.7,158.0,114.0,TOR,130,1,2020,2019-10-22,False
4,240.0,240.0,33.0,88.0,0.375,9.0,34.0,0.265,10.0,15.0,...,34.0,32.2,140.0,95.0,ORL,94,1,2020,2019-10-23,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,240.0,240.0,48.0,93.0,0.516,13.0,28.0,0.464,20.0,21.0,...,42.9,33.4,250.0,134.0,POR,139,1,2020,2020-01-26,False
1380,240.0,240.0,44.0,84.0,0.524,8.0,19.0,0.421,18.0,26.0,...,100.0,31.7,126.0,111.0,PHO,109,0,2020,2020-01-26,True
1381,240.0,240.0,49.0,92.0,0.533,6.0,19.0,0.316,6.0,13.0,...,34.0,35.1,238.0,124.0,BRK,97,0,2020,2020-01-26,True
1382,240.0,240.0,40.0,88.0,0.455,11.0,33.0,0.333,19.0,27.0,...,22.2,29.2,167.0,117.0,DEN,117,1,2020,2020-01-26,False


In [42]:
del bets["mp.1"]
del bets["mp_opp.1"]
del bets["index_opp"]
bets

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,37.0,85.0,0.435,13.0,33.0,0.394,15.0,21.0,0.714,...,22.2,42.4,203.0,116.0,LAC,112,1,2020,2019-10-22,False
1,240.0,42.0,81.0,0.519,11.0,31.0,0.355,17.0,24.0,0.708,...,25.0,35.8,300.0,125.0,LAL,102,0,2020,2019-10-22,True
2,265.0,42.0,103.0,0.408,14.0,40.0,0.350,32.0,38.0,0.842,...,25.0,37.7,146.0,124.0,NOP,122,0,2020,2019-10-22,True
3,265.0,43.0,102.0,0.422,19.0,45.0,0.422,17.0,20.0,0.850,...,22.2,35.7,158.0,114.0,TOR,130,1,2020,2019-10-22,False
4,240.0,33.0,88.0,0.375,9.0,34.0,0.265,10.0,15.0,0.667,...,34.0,32.2,140.0,95.0,ORL,94,1,2020,2019-10-23,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,240.0,48.0,93.0,0.516,13.0,28.0,0.464,20.0,21.0,0.952,...,42.9,33.4,250.0,134.0,POR,139,1,2020,2020-01-26,False
1380,240.0,44.0,84.0,0.524,8.0,19.0,0.421,18.0,26.0,0.692,...,100.0,31.7,126.0,111.0,PHO,109,0,2020,2020-01-26,True
1381,240.0,49.0,92.0,0.533,6.0,19.0,0.316,6.0,13.0,0.462,...,34.0,35.1,238.0,124.0,BRK,97,0,2020,2020-01-26,True
1382,240.0,40.0,88.0,0.455,11.0,33.0,0.333,19.0,27.0,0.704,...,22.2,29.2,167.0,117.0,DEN,117,1,2020,2020-01-26,False


In [43]:
bets = bets.groupby("team", group_keys=False).apply(add_target)
bets["target"][pd.isnull(bets["target"])] = 2
bets["target"] = bets["target"].astype(int, errors="ignore")
bets

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,37.0,85.0,0.435,13.0,33.0,0.394,15.0,21.0,0.714,...,42.4,203.0,116.0,LAC,112,1,2020,2019-10-22,False,1
1,240.0,42.0,81.0,0.519,11.0,31.0,0.355,17.0,24.0,0.708,...,35.8,300.0,125.0,LAL,102,0,2020,2019-10-22,True,1
2,265.0,42.0,103.0,0.408,14.0,40.0,0.350,32.0,38.0,0.842,...,37.7,146.0,124.0,NOP,122,0,2020,2019-10-22,True,0
3,265.0,43.0,102.0,0.422,19.0,45.0,0.422,17.0,20.0,0.850,...,35.7,158.0,114.0,TOR,130,1,2020,2019-10-22,False,0
4,240.0,33.0,88.0,0.375,9.0,34.0,0.265,10.0,15.0,0.667,...,32.2,140.0,95.0,ORL,94,1,2020,2019-10-23,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,240.0,48.0,93.0,0.516,13.0,28.0,0.464,20.0,21.0,0.952,...,33.4,250.0,134.0,POR,139,1,2020,2020-01-26,False,2
1380,240.0,44.0,84.0,0.524,8.0,19.0,0.421,18.0,26.0,0.692,...,31.7,126.0,111.0,PHO,109,0,2020,2020-01-26,True,2
1381,240.0,49.0,92.0,0.533,6.0,19.0,0.316,6.0,13.0,0.462,...,35.1,238.0,124.0,BRK,97,0,2020,2020-01-26,True,2
1382,240.0,40.0,88.0,0.455,11.0,33.0,0.333,19.0,27.0,0.704,...,29.2,167.0,117.0,DEN,117,1,2020,2020-01-26,False,2


In [44]:
bets["won"].value_counts()
bets["target"].value_counts()

1    677
0    677
2     30
Name: target, dtype: int64

In [45]:
nulls = pd.isnull(bets).sum()
nulls = nulls[nulls > 0]
non_null_columns = bets.columns[~bets.columns.isin(nulls.index)]
bets = bets[non_null_columns].copy()
bets

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,37.0,85.0,0.435,13.0,33.0,0.394,15.0,21.0,0.714,...,42.4,203.0,116.0,LAC,112,1,2020,2019-10-22,False,1
1,240.0,42.0,81.0,0.519,11.0,31.0,0.355,17.0,24.0,0.708,...,35.8,300.0,125.0,LAL,102,0,2020,2019-10-22,True,1
2,265.0,42.0,103.0,0.408,14.0,40.0,0.350,32.0,38.0,0.842,...,37.7,146.0,124.0,NOP,122,0,2020,2019-10-22,True,0
3,265.0,43.0,102.0,0.422,19.0,45.0,0.422,17.0,20.0,0.850,...,35.7,158.0,114.0,TOR,130,1,2020,2019-10-22,False,0
4,240.0,33.0,88.0,0.375,9.0,34.0,0.265,10.0,15.0,0.667,...,32.2,140.0,95.0,ORL,94,1,2020,2019-10-23,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,240.0,48.0,93.0,0.516,13.0,28.0,0.464,20.0,21.0,0.952,...,33.4,250.0,134.0,POR,139,1,2020,2020-01-26,False,2
1380,240.0,44.0,84.0,0.524,8.0,19.0,0.421,18.0,26.0,0.692,...,31.7,126.0,111.0,PHO,109,0,2020,2020-01-26,True,2
1381,240.0,49.0,92.0,0.533,6.0,19.0,0.316,6.0,13.0,0.462,...,35.1,238.0,124.0,BRK,97,0,2020,2020-01-26,True,2
1382,240.0,40.0,88.0,0.455,11.0,33.0,0.333,19.0,27.0,0.704,...,29.2,167.0,117.0,DEN,117,1,2020,2020-01-26,False,2


In [46]:
non_stat_columns = ["season", "date", "won", "target", "team", "team_opp"]
stat_columns = bets.columns[~bets.columns.isin(non_stat_columns)]

bets[stat_columns] = scaler.fit_transform(bets[stat_columns])
bets

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.410256,0.415094,0.409877,0.454545,0.452381,0.511945,0.342857,0.380952,0.499124,...,0.247059,0.484043,0.4500,LAC,0.453488,1.0,2020,2019-10-22,False,1
1,0.0,0.538462,0.339623,0.617284,0.363636,0.404762,0.445392,0.400000,0.452381,0.488616,...,0.160784,1.000000,0.5625,LAL,0.337209,0.0,2020,2019-10-22,True,1
2,0.5,0.538462,0.754717,0.343210,0.500000,0.619048,0.436860,0.828571,0.785714,0.723292,...,0.185621,0.180851,0.5500,NOP,0.569767,0.0,2020,2019-10-22,True,0
3,0.5,0.564103,0.735849,0.377778,0.727273,0.738095,0.559727,0.400000,0.357143,0.737303,...,0.159477,0.244681,0.4250,TOR,0.662791,1.0,2020,2019-10-22,False,0
4,0.0,0.307692,0.471698,0.261728,0.272727,0.476190,0.291809,0.200000,0.238095,0.416813,...,0.113725,0.148936,0.1875,ORL,0.244186,1.0,2020,2019-10-23,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,0.0,0.692308,0.566038,0.609877,0.454545,0.333333,0.631399,0.485714,0.380952,0.915937,...,0.129412,0.734043,0.6750,POR,0.767442,1.0,2020,2020-01-26,False,2
1380,0.0,0.589744,0.396226,0.629630,0.227273,0.119048,0.558020,0.428571,0.500000,0.460595,...,0.107190,0.074468,0.3875,PHO,0.418605,0.0,2020,2020-01-26,True,2
1381,0.0,0.717949,0.547170,0.651852,0.136364,0.119048,0.378840,0.085714,0.190476,0.057793,...,0.151634,0.670213,0.5500,BRK,0.279070,0.0,2020,2020-01-26,True,2
1382,0.0,0.487179,0.471698,0.459259,0.363636,0.452381,0.407850,0.457143,0.523810,0.481611,...,0.074510,0.292553,0.4625,DEN,0.511628,1.0,2020,2020-01-26,False,2


In [47]:
bets.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

home
0.0    0.449422
1.0    0.550578
dtype: float64

In [48]:
bets_rolling = bets[list(stat_columns) + ["won", "team", "season"]]
bets_rolling = bets_rolling.groupby(["team", "season"], group_keys=False).apply(team_averages)
bets_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,,,,,,,,,,,...,,,,,,,,False,LAL,2020
1,,,,,,,,,,,...,,,,,,,,True,LAC,2020
2,,,,,,,,,,,...,,,,,,,,True,TOR,2020
3,,,,,,,,,,,...,,,,,,,,False,NOP,2020
4,,,,,,,,,,,...,,,,,,,,False,CLE,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,0.0,0.579487,0.441509,0.584198,0.336364,0.288095,0.515358,0.320000,0.314286,0.571804,...,0.131853,0.322919,0.149542,0.248404,0.51750,0.434884,0.7,False,IND,2020
1380,0.0,0.610256,0.515094,0.565185,0.381818,0.414286,0.464164,0.400000,0.383333,0.656042,...,0.176255,0.448649,0.130980,0.335638,0.52375,0.469767,0.3,True,MEM,2020
1381,0.0,0.512821,0.541509,0.442963,0.259091,0.340476,0.363823,0.311429,0.335714,0.523643,...,0.207915,0.278811,0.138693,0.421809,0.44875,0.444186,0.3,True,NYK,2020
1382,0.0,0.510256,0.533962,0.441481,0.445455,0.619048,0.394369,0.508571,0.507143,0.640280,...,0.205212,0.290595,0.181961,0.359043,0.50375,0.496512,0.5,False,HOU,2020


In [49]:
rolling_cols = [f"{col}_10" for col in bets_rolling.columns]
bets_rolling.columns = rolling_cols
bets = pd.concat([bets, bets_rolling], axis=1)

bets = bets.dropna()
bets = bets.reset_index(drop=True)

bets

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,team_10,season_10
0,0.0,0.461538,0.433962,0.454321,0.454545,0.452381,0.511945,0.342857,0.309524,0.707531,...,0.223359,0.286054,0.137647,0.300532,0.47875,0.445349,0.5,False,DET,2020
1,0.0,0.538462,0.415094,0.555556,0.181818,0.190476,0.382253,0.400000,0.285714,1.000000,...,0.247297,0.416757,0.189412,0.326596,0.46250,0.559302,0.5,False,GSW,2020
2,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,0.286873,0.394595,0.134902,0.359043,0.39625,0.430233,0.6,False,CHI,2020
3,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,0.181467,0.311027,0.126797,0.270213,0.42500,0.377907,0.3,False,OKC,2020
4,0.0,0.666667,0.377358,0.733333,0.363636,0.333333,0.510239,0.371429,0.309524,0.805604,...,0.140347,0.253081,0.114641,0.284574,0.48000,0.451163,0.7,True,MIL,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,0.0,0.692308,0.566038,0.609877,0.454545,0.333333,0.631399,0.485714,0.380952,0.915937,...,0.131853,0.322919,0.149542,0.248404,0.51750,0.434884,0.7,False,IND,2020
1110,0.0,0.589744,0.396226,0.629630,0.227273,0.119048,0.558020,0.428571,0.500000,0.460595,...,0.176255,0.448649,0.130980,0.335638,0.52375,0.469767,0.3,True,MEM,2020
1111,0.0,0.717949,0.547170,0.651852,0.136364,0.119048,0.378840,0.085714,0.190476,0.057793,...,0.207915,0.278811,0.138693,0.421809,0.44875,0.444186,0.3,True,NYK,2020
1112,0.0,0.487179,0.471698,0.459259,0.363636,0.452381,0.407850,0.457143,0.523810,0.481611,...,0.205212,0.290595,0.181961,0.359043,0.50375,0.496512,0.5,False,HOU,2020


In [50]:
bets["home_next"] = add_col(bets, "home")
bets["team_opp_next"] = add_col(bets, "team_opp")
bets["date_next"] = add_col(bets, "date")

bets

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,team_10,season_10,home_next,team_opp_next,date_next
0,0.0,0.461538,0.433962,0.454321,0.454545,0.452381,0.511945,0.342857,0.309524,0.707531,...,0.300532,0.47875,0.445349,0.5,False,DET,2020,1.0,MIN,2019-11-11
1,0.0,0.538462,0.415094,0.555556,0.181818,0.190476,0.382253,0.400000,0.285714,1.000000,...,0.326596,0.46250,0.559302,0.5,False,GSW,2020,1.0,UTA,2019-11-11
2,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,0.359043,0.39625,0.430233,0.6,False,CHI,2020,1.0,NYK,2019-11-12
3,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,0.270213,0.42500,0.377907,0.3,False,OKC,2020,0.0,IND,2019-11-12
4,0.0,0.666667,0.377358,0.733333,0.363636,0.333333,0.510239,0.371429,0.309524,0.805604,...,0.284574,0.48000,0.451163,0.7,True,MIL,2020,1.0,CHI,2019-11-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,0.0,0.692308,0.566038,0.609877,0.454545,0.333333,0.631399,0.485714,0.380952,0.915937,...,0.248404,0.51750,0.434884,0.7,False,IND,2020,,,
1110,0.0,0.589744,0.396226,0.629630,0.227273,0.119048,0.558020,0.428571,0.500000,0.460595,...,0.335638,0.52375,0.469767,0.3,True,MEM,2020,,,
1111,0.0,0.717949,0.547170,0.651852,0.136364,0.119048,0.378840,0.085714,0.190476,0.057793,...,0.421809,0.44875,0.444186,0.3,True,NYK,2020,,,
1112,0.0,0.487179,0.471698,0.459259,0.363636,0.452381,0.407850,0.457143,0.523810,0.481611,...,0.359043,0.50375,0.496512,0.5,False,HOU,2020,,,


In [51]:
bet_matchup = bets.merge(bets[rolling_cols + ["team_opp_next", "date_next", "team"]], 
                          left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

bet_matchup

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,team_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,0.254118,0.384574,0.37250,0.422093,0.6,False,NYK,2020,CHI,NYK
1,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,0.109935,0.263830,0.46375,0.382558,0.5,True,IND,2020,OKC,IND
2,0.0,0.666667,0.377358,0.733333,0.363636,0.333333,0.510239,0.371429,0.309524,0.805604,...,0.137124,0.337234,0.39500,0.402326,0.5,True,CHI,2020,MIL,CHI
3,0.0,0.692308,0.528302,0.637037,0.272727,0.142857,0.607509,0.028571,0.000000,0.649737,...,0.126797,0.270213,0.42500,0.377907,0.3,False,OKC,2020,IND,OKC
4,0.0,0.512821,0.490566,0.474074,0.545455,0.523810,0.551195,0.057143,0.023810,0.707531,...,0.129020,0.259574,0.44125,0.375581,0.6,True,PHI,2020,ORL,PHI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073,0.0,0.666667,0.528302,0.609877,0.363636,0.428571,0.426621,0.657143,0.619048,0.718039,...,0.153856,0.447340,0.53250,0.424419,0.6,True,DEN,2020,HOU,DEN
1074,0.0,0.487179,0.433962,0.483951,0.500000,0.500000,0.522184,0.342857,0.309524,0.707531,...,0.151111,0.412766,0.55625,0.560465,0.4,False,NOP,2020,BOS,NOP
1075,0.0,0.384615,0.509434,0.323457,0.409091,0.547619,0.392491,0.314286,0.333333,0.539405,...,0.123660,0.327660,0.53750,0.476744,0.5,True,LAC,2020,ORL,LAC
1076,0.0,0.435897,0.358491,0.479012,0.409091,0.309524,0.597270,0.428571,0.595238,0.299475,...,0.156471,0.416489,0.56125,0.443023,0.3,True,BOS,2020,NOP,BOS


In [52]:
bet_matchup['date_next']

0       2019-11-12
1       2019-11-12
2       2019-11-14
3       2019-11-12
4       2019-11-13
           ...    
1073    2020-01-26
1074    2020-01-26
1075    2020-01-26
1076    2020-01-26
1077    2020-01-26
Name: date_next, Length: 1078, dtype: object

In [53]:
bet_matchup = bet_matchup.sort_values("date_next")
bet_matchup


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,team_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,0.254118,0.384574,0.37250,0.422093,0.6,False,NYK,2020,CHI,NYK
1,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,0.109935,0.263830,0.46375,0.382558,0.5,True,IND,2020,OKC,IND
3,0.0,0.692308,0.528302,0.637037,0.272727,0.142857,0.607509,0.028571,0.000000,0.649737,...,0.126797,0.270213,0.42500,0.377907,0.3,False,OKC,2020,IND,OKC
5,0.0,0.256410,0.433962,0.224691,0.272727,0.404762,0.334471,0.371429,0.476190,0.369527,...,0.134902,0.359043,0.39625,0.430233,0.6,False,CHI,2020,NYK,CHI
21,0.0,0.487179,0.490566,0.444444,0.227273,0.571429,0.199659,0.200000,0.190476,0.595447,...,0.151242,0.333511,0.37250,0.305814,0.4,False,ORL,2020,PHI,ORL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,0.0,0.743590,0.452830,0.755556,0.636364,0.333333,0.875427,0.257143,0.190476,0.865149,...,0.121699,0.307447,0.49375,0.516279,0.6,False,POR,2020,IND,POR
1060,0.0,0.461538,0.396226,0.481481,0.227273,0.357143,0.310580,0.400000,0.380952,0.667250,...,0.135163,0.360638,0.55750,0.475581,0.4,True,MEM,2020,PHO,MEM
1055,0.0,0.641026,0.528302,0.582716,0.818182,0.595238,0.757679,0.257143,0.238095,0.649737,...,0.143660,0.197872,0.51500,0.394186,0.7,True,IND,2020,POR,IND
1068,0.0,0.512821,0.339623,0.585185,0.636364,0.500000,0.668942,0.457143,0.428571,0.695271,...,0.131895,0.364362,0.53500,0.456977,0.6,False,SAS,2020,TOR,SAS


In [54]:
bet_matchup[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,CHI,NYK,NYK,CHI,2019-11-12
1,OKC,IND,IND,OKC,2019-11-12
3,IND,OKC,OKC,IND,2019-11-12
5,NYK,CHI,CHI,NYK,2019-11-12
21,PHI,ORL,ORL,PHI,2019-11-13
...,...,...,...,...,...
1061,IND,POR,POR,IND,2020-01-26
1060,PHO,MEM,MEM,PHO,2020-01-26
1055,POR,IND,IND,POR,2020-01-26
1068,TOR,SAS,SAS,TOR,2020-01-26


In [55]:
bet_matchup[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]][bet_matchup['date_next'] == '2019-10-25']


Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next


In [56]:
non_stat_columns = list(bet_matchup.columns[bet_matchup.dtypes == "object"]) + non_stat_columns

non_stat_columns

['team_x',
 'team_opp',
 'date',
 'team_10_x',
 'team_opp_next_x',
 'date_next',
 'team_10_y',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [62]:
# X_columns_bet = bet_matchup.columns[~bet_matchup.columns.isin(non_stat_columns)]
# X_bet = bet_matchup[X_columns_bet]
# y_bet = bet_matchup["target"]
# display(X_bet)
# display(y_bet)

In [59]:
# y_bet.value_counts()

1    539
0    539
Name: target, dtype: int64

In [60]:
# X_bet = X_bet.astype('float32')
# y_bet = y_bet.astype('float32')

In [63]:
# y_bet_pred = rnn_model.predict(X_bet)

In [64]:
# y_bet_pred_classes = np.argmax(y_bet_pred, axis=1)
# bet_accuracy = accuracy_score(y_bet, y_bet_pred_classes)
# print("Accuracy:", bet_accuracy)

# print(classification_report(y_bet, y_bet_pred_classes))

In [65]:
odds

Unnamed: 0_level_0,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1022,501,V,NOP,30,31,25,31,122,231.5,229.5,230,113
1022,502,H,TOR,27,29,32,29,130,6.5,6.5,-280,6
1022,503,V,LAL,25,29,31,17,102,227,3.5,-180,5
1022,504,H,LAC,22,40,23,27,112,1.5,224,150,110.5
1023,505,V,DET,27,27,29,36,119,210,211,240,104
...,...,...,...,...,...,...,...,...,...,...,...,...
126,522,H,NYK,30,25,30,25,110,221.5,2.5,-140,0.5
126,523,V,WAS,33,39,30,31,133,240,241.5,-105,1.5
126,524,H,ATL,31,47,41,33,152,1.5,1,-115,119
126,525,V,IND,27,29,37,36,129,220.5,221,130,1.5


In [66]:
odds.reset_index(inplace=True)
odds['date'] = odds.index
odds


Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,date
0,1022,501,V,NOP,30,31,25,31,122,231.5,229.5,230,113,0
1,1022,502,H,TOR,27,29,32,29,130,6.5,6.5,-280,6,1
2,1022,503,V,LAL,25,29,31,17,102,227,3.5,-180,5,2
3,1022,504,H,LAC,22,40,23,27,112,1.5,224,150,110.5,3
4,1023,505,V,DET,27,27,29,36,119,210,211,240,104,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,126,522,H,NYK,30,25,30,25,110,221.5,2.5,-140,0.5,1379
1380,126,523,V,WAS,33,39,30,31,133,240,241.5,-105,1.5,1380
1381,126,524,H,ATL,31,47,41,33,152,1.5,1,-115,119,1381
1382,126,525,V,IND,27,29,37,36,129,220.5,221,130,1.5,1382


In [67]:
odds_test = odds.copy()
odds_test

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,date
0,1022,501,V,NOP,30,31,25,31,122,231.5,229.5,230,113,0
1,1022,502,H,TOR,27,29,32,29,130,6.5,6.5,-280,6,1
2,1022,503,V,LAL,25,29,31,17,102,227,3.5,-180,5,2
3,1022,504,H,LAC,22,40,23,27,112,1.5,224,150,110.5,3
4,1023,505,V,DET,27,27,29,36,119,210,211,240,104,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,126,522,H,NYK,30,25,30,25,110,221.5,2.5,-140,0.5,1379
1380,126,523,V,WAS,33,39,30,31,133,240,241.5,-105,1.5,1380
1381,126,524,H,ATL,31,47,41,33,152,1.5,1,-115,119,1381
1382,126,525,V,IND,27,29,37,36,129,220.5,221,130,1.5,1382


In [68]:
# Convert all values in date to strings and add a '-' between every two characters
odds_test['Date'] = odds_test['Date'].astype(str).apply(lambda x: '-'.join(x[i:i+2] for i in range(0, len(x), 2)))
odds_test['Date']


0       10-22
1       10-22
2       10-22
3       10-22
4       10-23
        ...  
1379     12-6
1380     12-6
1381     12-6
1382     12-6
1383     12-6
Name: Date, Length: 1384, dtype: object

In [69]:
odds_test['Date'] = odds_test['Date'].apply(lambda x: '2019-' + x if x[:2] in ['10', '11', '12'] else '2020-' + x)
odds_test

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,date
0,2019-10-22,501,V,NOP,30,31,25,31,122,231.5,229.5,230,113,0
1,2019-10-22,502,H,TOR,27,29,32,29,130,6.5,6.5,-280,6,1
2,2019-10-22,503,V,LAL,25,29,31,17,102,227,3.5,-180,5,2
3,2019-10-22,504,H,LAC,22,40,23,27,112,1.5,224,150,110.5,3
4,2019-10-23,505,V,DET,27,27,29,36,119,210,211,240,104,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,2019-12-6,522,H,NYK,30,25,30,25,110,221.5,2.5,-140,0.5,1379
1380,2019-12-6,523,V,WAS,33,39,30,31,133,240,241.5,-105,1.5,1380
1381,2019-12-6,524,H,ATL,31,47,41,33,152,1.5,1,-115,119,1381
1382,2019-12-6,525,V,IND,27,29,37,36,129,220.5,221,130,1.5,1382


In [70]:
odds_test = odds_test.drop('date', axis=1)
odds_test

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H
0,2019-10-22,501,V,NOP,30,31,25,31,122,231.5,229.5,230,113
1,2019-10-22,502,H,TOR,27,29,32,29,130,6.5,6.5,-280,6
2,2019-10-22,503,V,LAL,25,29,31,17,102,227,3.5,-180,5
3,2019-10-22,504,H,LAC,22,40,23,27,112,1.5,224,150,110.5
4,2019-10-23,505,V,DET,27,27,29,36,119,210,211,240,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,2019-12-6,522,H,NYK,30,25,30,25,110,221.5,2.5,-140,0.5
1380,2019-12-6,523,V,WAS,33,39,30,31,133,240,241.5,-105,1.5
1381,2019-12-6,524,H,ATL,31,47,41,33,152,1.5,1,-115,119
1382,2019-12-6,525,V,IND,27,29,37,36,129,220.5,221,130,1.5


In [71]:
merged_df = bet_matchup.merge(odds_test, left_on=['team_x', 'date_next'], right_on=['Team', 'Date'])
print(merged_df.shape)
merged_df

(700, 438)


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H
0,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,CHI,30,30,25,35,120,6,6.5,-275,2
1,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,OKC,25,17,20,23,85,209.5,212.5,100,3
2,0.0,0.692308,0.528302,0.637037,0.272727,0.142857,0.607509,0.028571,0.000000,0.649737,...,IND,31,21,34,25,111,3,1.5,-120,104
3,0.0,0.256410,0.433962,0.224691,0.272727,0.404762,0.334471,0.371429,0.476190,0.369527,...,NYK,22,32,31,17,102,213,212,225,104
4,0.0,0.487179,0.490566,0.444444,0.227273,0.571429,0.199659,0.200000,0.190476,0.595447,...,PHI,30,23,29,15,97,1.5,207.5,110,107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.0,0.307692,0.415094,0.293827,0.136364,0.333333,0.204778,0.657143,0.571429,0.819615,...,IND,29,38,29,19,115,211.5,3.5,-155,108
696,0.0,0.282051,0.264151,0.362963,0.045455,0.142857,0.180887,0.657143,0.738095,0.513135,...,CLE,25,18,26,28,97,217,217.5,350,1
697,0.0,0.358974,0.603774,0.244444,0.409091,0.619048,0.351536,0.628571,0.523810,0.870403,...,LAC,28,29,31,17,105,7,6.5,-280,107
698,0.0,0.256410,0.283019,0.316049,0.181818,0.428571,0.213311,0.714286,0.738095,0.611208,...,BOS,24,26,30,29,109,6.5,7.5,-400,2.5


In [72]:
filtered_df = merged_df.drop(columns=odds_test.columns)
print(filtered_df.shape)
filtered_df


(700, 425)


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,team_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,0.254118,0.384574,0.37250,0.422093,0.6,False,NYK,2020,CHI,NYK
1,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,0.109935,0.263830,0.46375,0.382558,0.5,True,IND,2020,OKC,IND
2,0.0,0.692308,0.528302,0.637037,0.272727,0.142857,0.607509,0.028571,0.000000,0.649737,...,0.126797,0.270213,0.42500,0.377907,0.3,False,OKC,2020,IND,OKC
3,0.0,0.256410,0.433962,0.224691,0.272727,0.404762,0.334471,0.371429,0.476190,0.369527,...,0.134902,0.359043,0.39625,0.430233,0.6,False,CHI,2020,NYK,CHI
4,0.0,0.487179,0.490566,0.444444,0.227273,0.571429,0.199659,0.200000,0.190476,0.595447,...,0.151242,0.333511,0.37250,0.305814,0.4,False,ORL,2020,PHI,ORL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.0,0.307692,0.415094,0.293827,0.136364,0.333333,0.204778,0.657143,0.571429,0.819615,...,0.168366,0.313298,0.50500,0.422093,0.5,False,PHI,2020,IND,PHI
696,0.0,0.282051,0.264151,0.362963,0.045455,0.142857,0.180887,0.657143,0.738095,0.513135,...,0.125229,0.394681,0.48625,0.411628,0.3,False,TOR,2020,CLE,TOR
697,0.0,0.358974,0.603774,0.244444,0.409091,0.619048,0.351536,0.628571,0.523810,0.870403,...,0.084052,0.303191,0.46625,0.398837,0.5,False,SAC,2020,LAC,SAC
698,0.0,0.256410,0.283019,0.316049,0.181818,0.428571,0.213311,0.714286,0.738095,0.611208,...,0.133464,0.271277,0.42625,0.373256,0.6,False,CHO,2020,BOS,CHO


In [73]:
X_columns_bet = filtered_df.columns[~filtered_df.columns.isin(non_stat_columns)]
X_bet = filtered_df[X_columns_bet]
y_bet = filtered_df["target"]
display(X_bet)
display(y_bet)

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10_y,blk%_max_opp_10_y,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y
0,0.0,0.410256,0.660377,0.269136,0.045455,0.428571,0.052901,0.371429,0.309524,0.805604,...,0.158756,0.157336,0.340108,0.254118,0.384574,0.37250,0.422093,0.6,False,2020
1,0.0,0.615385,0.528302,0.558025,0.636364,0.666667,0.530717,0.257143,0.285714,0.485114,...,0.195622,0.180116,0.423784,0.109935,0.263830,0.46375,0.382558,0.5,True,2020
2,0.0,0.692308,0.528302,0.637037,0.272727,0.142857,0.607509,0.028571,0.000000,0.649737,...,0.121429,0.181467,0.311027,0.126797,0.270213,0.42500,0.377907,0.3,False,2020
3,0.0,0.256410,0.433962,0.224691,0.272727,0.404762,0.334471,0.371429,0.476190,0.369527,...,0.112673,0.286873,0.394595,0.134902,0.359043,0.39625,0.430233,0.6,False,2020
4,0.0,0.487179,0.490566,0.444444,0.227273,0.571429,0.199659,0.200000,0.190476,0.595447,...,0.107834,0.173359,0.298595,0.151242,0.333511,0.37250,0.305814,0.4,False,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.0,0.307692,0.415094,0.293827,0.136364,0.333333,0.204778,0.657143,0.571429,0.819615,...,0.153917,0.122780,0.280000,0.168366,0.313298,0.50500,0.422093,0.5,False,2020
696,0.0,0.282051,0.264151,0.362963,0.045455,0.142857,0.180887,0.657143,0.738095,0.513135,...,0.179724,0.207722,0.396324,0.125229,0.394681,0.48625,0.411628,0.3,False,2020
697,0.0,0.358974,0.603774,0.244444,0.409091,0.619048,0.351536,0.628571,0.523810,0.870403,...,0.123041,0.144981,0.271676,0.084052,0.303191,0.46625,0.398837,0.5,False,2020
698,0.0,0.256410,0.283019,0.316049,0.181818,0.428571,0.213311,0.714286,0.738095,0.611208,...,0.122811,0.282432,0.402919,0.133464,0.271277,0.42625,0.373256,0.6,False,2020


0      1
1      0
2      1
3      0
4      0
      ..
695    1
696    0
697    1
698    1
699    0
Name: target, Length: 700, dtype: int32