### NBA Game Prediction 

### i.) Importing Libraries, Dependencies, Dataframe Building/Filtering/Cleaning

In [120]:
# Import libraries and dependencies 
# Note: Use nba_games.csv for full dataset in csv format

import pandas as pd
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

In [84]:
# Import .csv and convert to workable dataframe
# Each index row constitutes one NBA game

nba_games_path = Path("nba_games.csv")

df = pd.read_csv(nba_games_path, index_col=0)
display(df.head(3))
display(df.tail(3))

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False


Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
17769,240.0,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,...,25.6,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False
17770,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True
17771,240.0,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,...,51.5,36.2,141.0,114.0,BOS,117,0,2020,2020-09-19,False


In [85]:
# Sort values by the date column to get the dataframe in chrono order & drop = True to remove 'old index' after reset_index

df = df.sort_values("date")
df = df.reset_index(drop=True)
df = df.drop(columns={"mp.1", "mp_opp.1", "index_opp"})

In [86]:
display(df.head(3))
display(df.tail(3))

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
1,240.0,38.0,94.0,0.404,9.0,29.0,0.31,10.0,17.0,0.588,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,33.3,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,33.3,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False
17771,240.0,38.0,92.0,0.413,19.0,46.0,0.413,8.0,8.0,1.0,...,100.0,42.6,141.0,126.0,BOS,90,1,2022,2022-06-16,True


In [88]:
df["team"]

0        NOP
1        CLE
2        CHI
3        GSW
4        ATL
        ... 
17767    BOS
17768    GSW
17769    BOS
17770    BOS
17771    GSW
Name: team, Length: 17772, dtype: object

In [87]:
# Predict how team does on next game ('target' game)

def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

# Split df dataframe into smaller dataframe groupby 'team' and group_keys=False so we don't add extra indexing
df = df.groupby("team", group_keys=False).apply(add_target)

In [90]:
# i.e. looking at one specific team 
# target is the ideal predicted value of the next game 

df[df["team"] == "WAS"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
11,240.0,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,...,25.2,134.0,98.0,ORL,87,1,2016,2015-10-28,True,True
63,240.0,35.0,68.0,0.515,12.0,21.0,0.571,36.0,44.0,0.818,...,32.4,138.0,122.0,MIL,113,1,2016,2015-10-30,True,False
69,240.0,38.0,90.0,0.422,6.0,22.0,0.273,28.0,33.0,0.848,...,28.7,153.0,108.0,NYK,117,0,2016,2015-10-31,False,True
124,240.0,42.0,87.0,0.483,8.0,23.0,0.348,10.0,14.0,0.714,...,30.0,160.0,109.0,SAS,99,0,2016,2015-11-04,True,False
161,240.0,36.0,88.0,0.409,8.0,25.0,0.320,18.0,23.0,0.783,...,41.6,146.0,103.0,BOS,118,1,2016,2015-11-06,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17471,240.0,38.0,82.0,0.463,8.0,21.0,0.381,18.0,20.0,0.900,...,34.4,245.0,116.0,BOS,144,1,2022,2022-04-03,False,True
17483,240.0,56.0,101.0,0.554,13.0,30.0,0.433,7.0,14.0,0.500,...,32.8,168.0,132.0,MIN,114,1,2022,2022-04-05,True,False
17510,240.0,42.0,86.0,0.488,10.0,35.0,0.286,9.0,11.0,0.818,...,60.2,159.0,114.0,ATL,118,1,2022,2022-04-06,False,False
17538,240.0,35.0,69.0,0.507,9.0,28.0,0.321,13.0,17.0,0.765,...,35.5,135.0,104.0,NYK,114,0,2022,2022-04-08,False,False


In [94]:
# Processing required on the newly created future ideally predicted 'target' column 
# Finds all the 'NaN' missing outcome values that were pulled back and no game was played
# Replace all the 'NaN' values of each of the 30 NBA teams with an int= 2


df["target"][pd.isnull(df["target"])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [95]:
# Convert all the boolean True/False of the 'target' values into binary, 0=False, 1=True
df["target"] = df["target"].astype(int, errors="ignore")

In [97]:
# Since every NBA game will have a winner and loser we want to make sure it was sum zero outcome and 50/50 win/losses
df["won"].value_counts()

False    8886
True     8886
Name: won, dtype: int64

In [98]:
# Remove all 'null' values so machine learning script will function better
nulls = pd.isnull(df)

In [103]:
# Counts the number of nulls (represented as 'True'), most are 'False' (no nulls) however
nulls = nulls.sum()
nulls = nulls[nulls > 0]
nulls

+/-             17772
mp_max          17772
mp_max.1        17772
+/-_opp         17772
mp_max_opp      17772
mp_max_opp.1    17772
dtype: object

In [104]:
nulls.index

Index(['+/-', 'mp_max', 'mp_max.1', '+/-_opp', 'mp_max_opp', 'mp_max_opp.1'], dtype='object')

In [106]:
# Create a list (pandas index) of valid columns, where null = 0
# ~ (tilde) is a negation operator for a pandas series to check if the dataframe column is in the above 'nulls' index
# only keeping the ones not included with the tilde, so filtering out the tilde df.columns.isn(nulls.index) columns

valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [107]:
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [108]:
df = df[valid_columns].copy()

In [109]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,0
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,2


### ii.) Machine Learning Analysis

In [115]:
# Instantiate the ML model RidgeClassifier
# Uses ridge regression to classify whether a team will lose or win the future game
# alpha parameter

rr = RidgeClassifier(alpha=1)

In [116]:
# Instantiate the TimeSeriesSplit model_selection

split = TimeSeriesSplit(n_splits=3)

In [117]:
# Instantiate the SequentialFeatureSelector
# SequeSequentialFeatureSelector pass in the 'rr' ML model and the selector trains the model using different features (column)
# Picks the best features
# Pick n-features
# Pick 'forward' -> starts with 0-features then iterates up to n=30 features

sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)


In [118]:
# Scale some of the columns 
# Ridge Regression works best if the data is scaled
# Remove columns or isolate the ones we don't want to scale

removed_columns = ["season", "date", "won", "target", "team", "team_opp"] 

In [119]:
# Define selected_columns that we want to scale

selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [121]:
# Instantiate the MinMaxSacler scaler model
# Scale data

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [122]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.182285,0.208531,0.411765,GSW,0.413462,0.0,2022,2022-06-10,False,0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.120668,0.459716,0.400000,GSW,0.375000,0.0,2022,2022-06-16,False,2


In [123]:
# Fit the SequSequentialFeatureSelector
# where sfs.fit(X, y) X = features, y = target

sfs.fit(df[selected_columns], df["target"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30)

In [124]:
# Pull out the list of feature/columns that the sfs model thinks we should use in our models

predictors = list(selected_columns[sfs.get_support()])
predictors

['mp',
 'fg%',
 '3p%',
 'orb',
 'ts%',
 'usg%',
 '3p%_max',
 'ft_max',
 'fta_max',
 '+/-_max',
 'drb%_max',
 'trb%_max',
 'tov%_max',
 'usg%_max',
 'mp_opp',
 'fg_opp',
 '3p_opp',
 'ft%_opp',
 'blk_opp',
 'usg%_opp',
 'fga_max_opp',
 '3p_max_opp',
 'ft_max_opp',
 'ft%_max_opp',
 'blk_max_opp',
 'pf_max_opp',
 'pts_max_opp',
 'drb%_max_opp',
 'blk%_max_opp',
 'usg%_max_opp']

In [None]:
# Create backtest function to start making predictions
# start = 2, start with first two seasons to predict third, then iterate with step = 1, 2nd iteration first 3 seasons to predict 4th, etc.

def backtest (data, model, predictors, start=2, step=1)
    all_predictions =[]
    
    # Create list of seasons that exist in data
    seasons = sorted(data["season"].unique())
    
    # i.e. for i in range (2, 7, 1)
    for i in range(start, len(seasons), step):
        season = season[i]
        
        # data we use to train our ML algorithm
        # train is all data before test season
        # test is data from current season
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

In [126]:
sorted(df["season"].unique())

[2016, 2017, 2018, 2019, 2020, 2021, 2022]