In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

In [None]:
df = pd.read_csv('nba_games.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,1,240.0,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,3,240.0,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,4,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False


In [None]:
df = df.sort_values('date')
#reset indices to be based on date; drop old index column
df = df.reset_index(drop=True)
df.drop(columns=df.columns[0], axis=1, inplace=True)

In [None]:
df.head()

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
1,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.31,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
2,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
3,240.0,240.0,41.0,96.0,0.427,9.0,30.0,0.3,20.0,22.0,...,37.5,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True
4,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False


In [None]:
#delete unnecessary columns
del df["mp.1"] #Minutes played already present
del df["mp_opp.1"]
del df["index_opp"] #unecessary indexing

In [None]:
#Input: team box score
def add_Next_Victory(team):
  team["Next_Victory"] = team["won"].shift(-1) #creates a new column called Next_Victory which shows the outcome of the next game played
  return team
df = df.groupby("team", group_keys=False).apply(add_Next_Victory)

In [None]:
#replaces missing 'next_victory' values with 2 to represent that game hasn't been played yet
# Also change true to 1 and false to 0
df["Next_Victory"][pd.isnull(df["Next_Victory"])] = 2
df["Next_Victory"] = df["Next_Victory"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Next_Victory"][pd.isnull(df["Next_Victory"])] = 2


In [None]:
#find null columns
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]

In [None]:
#drop null columns
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

In [None]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,Next_Victory
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,0
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,2


In [None]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts,ts%,efg%,3par,ftr,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,ortg,drtg,fg_max,fga_max,fg%_max,3p_max,3pa_max,3p%_max,ft_max,fta_max,ft%_max,orb_max,drb_max,trb_max,ast_max,stl_max,blk_max,tov_max,pf_max,pts_max,+/-_max,ts%_max,efg%_max,3par_max,ftr_max,orb%_max,drb%_max,trb%_max,ast%_max,stl%_max,blk%_max,tov%_max,usg%_max,ortg_max,drtg_max,team,total,home,mp_opp,fg_opp,fga_opp,fg%_opp,3p_opp,3pa_opp,3p%_opp,ft_opp,fta_opp,ft%_opp,orb_opp,drb_opp,trb_opp,ast_opp,stl_opp,blk_opp,tov_opp,pf_opp,pts_opp,ts%_opp,efg%_opp,3par_opp,ftr_opp,orb%_opp,drb%_opp,trb%_opp,ast%_opp,stl%_opp,blk%_opp,tov%_opp,usg%_opp,ortg_opp,drtg_opp,fg_max_opp,fga_max_opp,fg%_max_opp,3p_max_opp,3pa_max_opp,3p%_max_opp,ft_max_opp,fta_max_opp,ft%_max_opp,orb_max_opp,drb_max_opp,trb_max_opp,ast_max_opp,stl_max_opp,blk_max_opp,tov_max_opp,pf_max_opp,pts_max_opp,+/-_max_opp,ts%_max_opp,efg%_max_opp,3par_max_opp,ftr_max_opp,orb%_max_opp,drb%_max_opp,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,Next_Victory
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,25.0,33.0,21.0,9.0,3.0,18.0,26.0,95.0,0.501,0.458,0.217,0.325,18.6,54.3,37.1,60.0,9.0,4.5,15.9,100.0,94.9,110.9,7.0,20.0,1.0,2.0,7.0,0.667,10.0,15.0,1.0,3.0,5.0,6.0,9.0,3.0,3.0,5.0,5.0,18.0,5.0,1.0,1.0,1.0,0.75,19.8,26.3,13.6,43.4,5.3,6.3,37.5,38.9,201.0,120.0,NOP,95,0,240.0,41.0,96.0,0.427,9.0,30.0,0.3,20.0,22.0,0.909,21.0,35.0,56.0,29.0,8.0,7.0,20.0,29.0,111.0,0.525,0.474,0.313,0.229,45.7,81.4,62.9,70.7,8.0,10.8,15.9,100.0,110.9,94.9,14.0,26.0,1.0,5.0,12.0,1.0,7.0,7.0,1.0,5.0,5.0,9.0,7.0,2.0,2.0,7.0,5.0,40.0,20.0,1.136,1.0,0.462,0.571,39.0,27.2,25.2,50.1,19.1,7.9,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.31,10.0,17.0,0.588,11.0,39.0,50.0,26.0,5.0,7.0,10.0,21.0,95.0,0.468,0.452,0.309,0.181,21.6,84.8,51.5,68.4,5.0,10.3,9.0,100.0,95.5,97.5,12.0,22.0,0.571,3.0,7.0,0.5,3.0,4.0,1.0,4.0,10.0,12.0,7.0,1.0,2.0,3.0,4.0,25.0,9.0,0.714,0.714,1.0,2.0,10.8,41.9,23.8,31.2,2.8,18.5,30.4,29.0,138.0,105.0,CLE,95,0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,40.0,47.0,13.0,6.0,10.0,13.0,22.0,97.0,0.499,0.466,0.218,0.264,15.2,78.4,48.5,35.1,6.0,15.4,11.8,100.0,97.5,95.5,8.0,22.0,0.75,3.0,5.0,0.75,5.0,5.0,1.0,2.0,8.0,10.0,5.0,2.0,6.0,4.0,6.0,19.0,9.0,0.82,0.875,0.556,1.333,12.2,38.5,26.0,30.3,2.8,14.0,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,40.0,47.0,13.0,6.0,10.0,13.0,22.0,97.0,0.499,0.466,0.218,0.264,15.2,78.4,48.5,35.1,6.0,15.4,11.8,100.0,97.5,95.5,8.0,22.0,0.75,3.0,5.0,0.75,5.0,5.0,1.0,2.0,8.0,10.0,5.0,2.0,6.0,4.0,6.0,19.0,9.0,0.82,0.875,0.556,1.333,12.2,38.5,26.0,30.3,2.8,14.0,53.2,34.6,162.0,104.0,CHI,97,1,240.0,38.0,94.0,0.404,9.0,29.0,0.31,10.0,17.0,0.588,11.0,39.0,50.0,26.0,5.0,7.0,10.0,21.0,95.0,0.468,0.452,0.309,0.181,21.6,84.8,51.5,68.4,5.0,10.3,9.0,100.0,95.5,97.5,12.0,22.0,0.571,3.0,7.0,0.5,3.0,4.0,1.0,4.0,10.0,12.0,7.0,1.0,2.0,3.0,4.0,25.0,9.0,0.714,0.714,1.0,2.0,10.8,41.9,23.8,31.2,2.8,18.5,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.3,20.0,22.0,0.909,21.0,35.0,56.0,29.0,8.0,7.0,20.0,29.0,111.0,0.525,0.474,0.313,0.229,45.7,81.4,62.9,70.7,8.0,10.8,15.9,100.0,110.9,94.9,14.0,26.0,1.0,5.0,12.0,1.0,7.0,7.0,1.0,5.0,5.0,9.0,7.0,2.0,2.0,7.0,5.0,40.0,20.0,1.136,1.0,0.462,0.571,39.0,27.2,25.2,50.1,19.1,7.9,69.4,43.7,206.0,104.0,GSW,111,1,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,25.0,33.0,21.0,9.0,3.0,18.0,26.0,95.0,0.501,0.458,0.217,0.325,18.6,54.3,37.1,60.0,9.0,4.5,15.9,100.0,94.9,110.9,7.0,20.0,1.0,2.0,7.0,0.667,10.0,15.0,1.0,3.0,5.0,6.0,9.0,3.0,3.0,5.0,5.0,18.0,5.0,1.0,1.0,1.0,0.75,19.8,26.3,13.6,43.4,5.3,6.3,37.5,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.8,7.0,33.0,40.0,22.0,9.0,4.0,15.0,25.0,94.0,0.53,0.5,0.329,0.183,16.3,58.9,40.4,59.5,9.4,6.0,14.5,100.0,98.6,111.2,8.0,16.0,1.0,2.0,6.0,1.0,3.0,4.0,1.0,3.0,7.0,8.0,4.0,4.0,3.0,5.0,4.0,20.0,10.0,1.33,1.5,1.0,2.0,20.5,28.8,17.9,34.6,10.6,7.1,57.1,33.8,258.0,121.0,ATL,94,1,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,36.0,59.0,23.0,5.0,3.0,15.0,15.0,106.0,0.493,0.448,0.302,0.271,41.1,83.7,59.6,62.2,5.2,5.5,12.3,100.0,111.2,98.6,7.0,19.0,0.6,4.0,7.0,0.571,6.0,10.0,1.0,8.0,11.0,19.0,5.0,2.0,2.0,3.0,4.0,21.0,26.0,0.685,0.643,0.833,0.625,18.5,41.2,24.8,35.6,3.2,4.7,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1


In [None]:
predictors = ['ts%', '+/-_max_opp', '+/-_max', 'orb%', 'drb%', 'ast%', 'stl%', 'blk%', 'tov%', '3par', 'ftr', 'ts%_opp', 'drb%_opp', 'orb%_opp', 'ast%_opp', 'stl%_opp', 'blk%_opp', 'tov%_opp', '3par_opp', 'home_opp']

In [None]:
df_without_missing_values = df.dropna()

In [None]:
X = df_without_missing_values[predictors]
y = df_without_missing_values['Next_Victory']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=493)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)
GB_predict_Train=GB.predict(X_train)

RMSE1=sqrt(mean_squared_error(y_train,GB_predict_Train))
print("RMSE (training) for GB:{0:10f}".format(RMSE1))
GB_predict_Test=GB.predict(X_test)
RMSE= sqrt(mean_squared_error(y_test,GB_predict_Test))
print("RMSE (Test Data) for GB:{0:10f}".format(RMSE))

RMSE (training) for GB:  0.486225
RMSE (Test Data) for GB:  0.504093


In [None]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []

    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["Next_Victory"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_Victory"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
from sklearn.linear_model import RidgeClassifier
rr = RidgeClassifier(alpha=1)

In [None]:
predictions = backtest(df, rr, predictors)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(predictions["actual"], predictions["prediction"])

0.5317840600543045