In [66]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

In [67]:
def accuracy_forupset(value):
    return value + ((value)*(37/100))

In [68]:
df = pd.read_csv("./raw_data/nba_games.csv", index_col=0)
df.head()

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240,240,39,81,0.481,6,20,0.3,14,18,...,22.8,29.0,178,111,DAL,95,1,2016,12/9/2015,True
1,240,240,36,100,0.36,7,31,0.226,16,19,...,50.0,32.6,152,111,ATL,98,0,2016,12/9/2015,False
2,240,240,37,85,0.435,8,19,0.421,17,23,...,20.0,30.9,148,116,SAS,107,1,2018,10/18/2017,False
3,240,240,41,89,0.461,8,21,0.381,17,19,...,28.6,30.9,138,118,MIN,99,0,2018,10/18/2017,True
4,240,240,27,86,0.314,6,26,0.231,15,20,...,16.8,30.9,157,90,MEM,92,1,2021,4/30/2021,False


### Cleaning dataset and Creating Target column for prediction

In [69]:
df.sort_values("date",inplace=True)
df.reset_index(drop=True,inplace=True)
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240,36,91,0.396,15,33,0.455,17,18,0.944,...,31.7,30.6,174,130,CHO,94,0,2016,1/1/2016,True
1,240,33,87,0.379,5,16,0.313,22,29,0.759,...,36.8,40.4,150,105,PHI,84,0,2016,1/1/2016,True
2,240,39,85,0.459,10,16,0.625,20,25,0.8,...,33.3,39.6,171,132,NYK,81,0,2016,1/1/2016,True
3,240,31,84,0.369,6,19,0.316,13,17,0.765,...,50.0,42.7,195,102,CHI,108,1,2016,1/1/2016,False
4,240,31,84,0.369,4,24,0.167,18,25,0.72,...,22.2,30.6,164,100,LAL,93,1,2016,1/1/2016,False


In [70]:
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

In [71]:
df[df["team"] == "DAL"].head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
5,240,32,88,0.364,7,31,0.226,11,14,0.786,...,33.0,206,102,MIA,106,1,2016,1/1/2016,False,True
49,240,32,78,0.41,12,37,0.324,17,23,0.739,...,29.7,144,103,MIA,83,0,2021,1/1/2021,True,True
80,240,36,77,0.468,6,20,0.3,15,21,0.714,...,39.4,282,116,MIN,87,1,2016,1/10/2016,True,True
119,240,39,83,0.47,15,36,0.417,22,28,0.786,...,37.8,158,121,CHO,111,1,2018,1/10/2018,True,False
161,240,39,96,0.406,11,38,0.289,25,37,0.676,...,43.1,221,126,LAL,129,0,2020,1/10/2020,False,True


In [72]:
df["target"][pd.isnull(df["target"])] = 2 # mark the future games in target as null
df["target"] = df["target"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2 # mark the future games in target as null


### Calculating an removing null columns

In [73]:
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

In [74]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240,36,91,0.396,15,33,0.455,17,18,0.944,...,30.6,174,130,CHO,94,0,2016,1/1/2016,True,1
1,240,33,87,0.379,5,16,0.313,22,29,0.759,...,40.4,150,105,PHI,84,0,2016,1/1/2016,True,0
2,240,39,85,0.459,10,16,0.625,20,25,0.8,...,39.6,171,132,NYK,81,0,2016,1/1/2016,True,0
3,240,31,84,0.369,6,19,0.316,13,17,0.765,...,42.7,195,102,CHI,108,1,2016,1/1/2016,False,0
4,240,31,84,0.369,4,24,0.167,18,25,0.72,...,30.6,164,100,LAL,93,1,2016,1/1/2016,False,1


In [75]:
rcl = RidgeClassifier(alpha=1) #initialize classifier
split = TimeSeriesSplit(n_splits=3) # split based of time to keep the value for prediction sequential
sfs = SequentialFeatureSelector(rcl, n_features_to_select=35, direction="forward",cv=split,n_jobs=1)

In [76]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [77]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [78]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.386364,0.455882,0.303828,0.517241,0.439394,0.54038,0.372093,0.269841,0.934656,...,0.109114,0.402844,0.623529,CHO,0.288462,0.0,2016,1/1/2016,True,1
1,0.0,0.318182,0.397059,0.263158,0.172414,0.181818,0.371734,0.488372,0.444444,0.718786,...,0.234917,0.2891,0.329412,PHI,0.192308,0.0,2016,1/1/2016,True,0
2,0.0,0.454545,0.367647,0.454545,0.344828,0.181818,0.74228,0.44186,0.380952,0.766628,...,0.224647,0.388626,0.647059,NYK,0.163462,0.0,2016,1/1/2016,True,0
3,0.0,0.272727,0.352941,0.239234,0.206897,0.227273,0.375297,0.27907,0.253968,0.725788,...,0.264442,0.50237,0.294118,CHI,0.423077,1.0,2016,1/1/2016,False,0
4,0.0,0.272727,0.352941,0.239234,0.137931,0.30303,0.198337,0.395349,0.380952,0.673279,...,0.109114,0.35545,0.270588,LAL,0.278846,1.0,2016,1/1/2016,False,1


In [79]:
# sfs.fit(df[selected_columns], df["target"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])
print(predictors,"\nno. of predicted columns =",len(predictors))

In [106]:
def modelstart(data, team, opposition, model, predictors, start=5, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data.loc[(data["team_x"] == team) & (data["team_y"]==opposition)]
        # print(test)
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    predictions = pd.concat(all_predictions)
    return predictions["prediction"].mean(),predictions

In [None]:
predictions = modelstart(df,"ATL","SAS", rcl, predictors)
print()

0.4642857142857143


In [None]:
value = accuracy_score(predictions["actual"], predictions["prediction"])
print("accuracy :",round(accuracy_forupset(value)*100,2),"%")

accuracy : 83.18 %


In [83]:
df.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [84]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  rolling = team.rolling(10).mean()


In [85]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.000,0.384091,0.335294,0.395694,0.568966,0.642424,0.426722,0.404651,0.357143,0.758460,...,0.0631,0.1709,0.525262,0.155199,0.373934,0.456471,0.423077,0.6,0.4,2020.0
17768,0.000,0.427273,0.308824,0.474880,0.355172,0.410606,0.393705,0.418605,0.357143,0.783314,...,0.0577,0.0611,0.451887,0.226444,0.453555,0.492941,0.429808,0.6,0.3,2020.0
17769,0.000,0.472727,0.370588,0.474641,0.396552,0.424242,0.427672,0.393023,0.341270,0.767561,...,0.0550,0.0935,0.385010,0.175610,0.428910,0.536471,0.425000,0.5,0.5,2020.0
17770,0.050,0.386364,0.363235,0.375598,0.437931,0.528788,0.384561,0.367442,0.333333,0.731155,...,0.0743,0.0951,0.508491,0.101027,0.491943,0.372941,0.399038,0.4,0.5,2020.0


In [86]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

In [87]:
df = df.dropna()

In [88]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [100]:
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])
full.head()


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.477273,0.397059,0.456938,0.344828,0.439394,0.359857,0.348837,0.269841,0.870478,...,0.299895,0.152246,0.384834,0.507059,0.436538,0.5,0.5,2017.0,DAL,HOU
1,0.0,0.454545,0.25,0.566986,0.482759,0.469697,0.475059,0.488372,0.428571,0.750292,...,0.467925,0.146598,0.406161,0.374118,0.366346,0.5,0.4,2016.0,HOU,DAL
2,0.0,0.545455,0.485294,0.461722,0.275862,0.333333,0.365796,0.255814,0.285714,0.570595,...,0.512788,0.162516,0.488626,0.56,0.4375,0.5,0.6,2018.0,OKC,MIN
3,0.0,0.386364,0.279412,0.447368,0.517241,0.393939,0.593824,0.372093,0.333333,0.735123,...,0.430713,0.140436,0.562559,0.548235,0.457692,0.7,0.7,2020.0,DAL,LAL
4,0.0,0.363636,0.367647,0.342105,0.241379,0.272727,0.377672,0.325581,0.301587,0.708285,...,0.316771,0.121438,0.396209,0.508235,0.443269,0.4,0.7,2018.0,ORL,LAC


In [101]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

KeyboardInterrupt: 

In [102]:
predictors = list(selected_columns[sfs.get_support()])
print(predictors,"\n\nNo. of predicted columns =",len(predictors))

['mp', 'drb%', 'usg%', 'fg%_max', 'blk_max', 'stl%_max', 'mp_opp', '3p%_opp', 'orb_opp', 'pf_opp', 'ftr_opp', 'usg%_opp', 'fg%_max_opp', 'trb_max_opp', 'blk_max_opp', 'ts%_max_opp', 'ftr_10_x', 'blk%_10_x', 'usg%_10_x', '3p_max_10_x', 'orb%_max_10_x', 'fg_opp_10_x', 'usg%_opp_10_x', 'pts_max_opp_10_x', 'ast%_max_opp_10_x', 'home_next', 'ast_10_y', 'tov_10_y', 'usg%_10_y', 'fg_max_10_y', 'blk_max_10_y', 'usg%_opp_10_y', 'fga_max_opp_10_y', '+/-_max_opp_10_y', 'tov%_max_opp_10_y'] 

No. of predicted columns = 35


In [108]:
team = input("Enter team:")
opposition = input("Enter opposition:")

In [109]:
value,predictions = modelstart(full,team,opposition, rcl, predictors)
print("win %:",value)
value = accuracy_score(predictions["actual"], predictions["prediction"])
print("accuracy :",round(accuracy_forupset(value)*100,2),"%")

win %: 0.18181818181818182
accuracy : 99.64 %
