In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("nba_games.csv", index_col=0)

In [4]:
df = df.sort_values("date")

In [5]:
df = df.reset_index(drop=True) #Changes the index values to match

In [7]:
del df["mp.1"]
del df["mp_opp.1"] #Extra Columns which are not needed
del df["index_opp"]

In [None]:
def add_target(team):
    """Return one team's games with a ``target`` column holding the next game's result.

    Parameters
    ----------
    team : pandas.DataFrame
        Box scores for a single team, one row per game, with a ``won`` column.
        Rows are used in their existing order.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``team`` plus ``target``; ``target`` is ``won``
        shifted up by one row, so the last row has a missing target.
        The input frame is left unmodified.
    """
    # assign() builds a new frame instead of writing into the caller's object,
    # which keeps the function safe to use inside groupby(...).apply().
    return team.assign(target=team["won"].shift(-1))

df = df.groupby("team", group_keys=False).apply(add_target) 

In [None]:
df[df["team"] == "NOP"]

In [None]:
df["team"]

In [None]:
df["target"][pd.isnull(df["target"])] = 2
df["target"] = df["target"].astype(int, errors ="ignore") #Changes datatype from boolean to an integer

In [None]:
df["won"].value_counts()

In [None]:
df["target"].value_counts()

In [15]:
# Machine learning models don't work well with null values so we need to remove or replace them
nulls = pd.isnull(df).sum()

In [16]:
nulls = nulls[nulls > 0] #Shows the columns where null is more than 0

In [17]:
valid_columns = df.columns[~df.columns.isin(nulls.index)] #Getting rid of null columns

In [18]:
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'orb',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=138)

In [19]:
df = df.loc[:, valid_columns].copy()  # keep only null-free columns, as an independent copy

In [21]:
# Select a smaller subset of columns to train the model with, because using too many columns can hurt the model (e.g. through overfitting)
from sklearn.model_selection import TimeSeriesSplit #Split the data so you can train one half to make a prediction on the other half
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

# Forward selection: starting from no features, repeatedly add the feature that gives the
# best cross-validated score until 30 features have been chosen.
sfs = SequentialFeatureSelector(
    estimator=rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
)

In [22]:
#Ridge regression works best when you scale the data by subtracting mean and dividing by standard deviation
#Scaling columns so they fall between 0 and 1
removed_columns=["season", "date", "won", "target", "team", "team_opp"]

In [23]:
selected_columns = df.columns[~df.columns.isin(removed_columns)] #Selecting all the columns except the removed columns

In [24]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every selected column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows at once, so the min/max of seasons that are
# later used for testing influence the training features - consider fitting per training fold.
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit(df[selected_columns]).transform(df[selected_columns])

In [26]:
sfs.fit(df[selected_columns], df["target"])

In [27]:
predictors = list(selected_columns[sfs.get_support()])

In [28]:
predictors

['fg',
 'ft',
 'pf',
 'pts',
 '3par',
 'ftr',
 'usg%',
 'stl_max',
 'pf_max',
 'pts_max',
 'gmsc_max',
 'orb%_max',
 'trb%_max',
 'stl%_max',
 'ortg_max',
 'total',
 'fga_opp',
 '3pa_opp',
 'pf_opp',
 '3par_opp',
 'ast%_opp',
 'usg%_opp',
 'tov_max_opp',
 'ftr_max_opp',
 'orb%_max_opp',
 'drb%_max_opp',
 'trb%_max_opp',
 'ast%_max_opp',
 'blk%_max_opp',
 'ortg_max_opp']

In [29]:
def backtest(data, model, predictors, start=2, step=1):
    """Evaluate ``model`` season by season, always training on earlier seasons only.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``season`` column, a ``target`` column and every column
        named in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted for
        every tested season.
    predictors : list
        Column names used as model inputs.
    start : int, default 2
        Position (in the sorted list of unique seasons) of the first season to test.
    step : int, default 1
        Stride between tested season positions.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` and ``prediction``, indexed like the tested rows of
        ``data``, with the tested seasons stacked in order.
    """
    ordered_seasons = sorted(data["season"].unique())
    test_seasons = [ordered_seasons[pos] for pos in range(start, len(ordered_seasons), step)]

    folds = []
    for held_out in test_seasons:
        history = data.loc[data["season"] < held_out]
        holdout = data.loc[data["season"] == held_out]

        model.fit(history[predictors], history["target"])

        # predict() returns a plain array; wrap it so it lines up with the held-out rows
        predicted = pd.Series(model.predict(holdout[predictors]), index=holdout.index)

        folds.append(
            pd.concat([holdout["target"], predicted], axis=1, keys=["actual", "prediction"])
        )

    return pd.concat(folds)

In [30]:
predictions = backtest(df, rr, predictors)

In [32]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["prediction"])

0.550949050949051

In [None]:
# Winning percentage for each value of "home" (home vs. away games): the mean of the
# boolean "won == 1" within each group. Vectorised, so no per-group Python lambda and no
# deprecated GroupBy.apply call on a frame that still contains the grouping column.
(df["won"] == 1).groupby(df["home"]).mean()

In [None]:
# Keep the feature columns plus 'won', 'team' and 'season', which the rolling step needs
df_rolling = df.loc[:, [*selected_columns, "won", "team", "season"]]

# Function to calculate rolling averages for numeric columns
def find_team_averages(team, window=20):
    """Return rolling means of the numeric columns of ``team``.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows for one group, in the order the window should move over them.
        Must contain the columns ``won``, ``team`` and ``season``.
    window : int, default 20
        Number of consecutive rows averaged; the first ``window - 1`` rows of
        every numeric column are NaN.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``: the rolling mean of every numeric column,
        followed by whichever of ``won``/``team``/``season`` are not numeric,
        carried over unchanged.
    """
    # Only numeric columns can be averaged
    numeric_columns = team.select_dtypes(include="number").columns
    rolling = team[numeric_columns].rolling(window).mean()
    # Re-attach the identifier columns that were not averaged; the ones that are numeric
    # are already present in `rolling`, so they are skipped to avoid duplicate columns
    passthrough = [col for col in ("won", "team", "season") if col not in numeric_columns]
    return rolling.join(team[passthrough])

# Compute the rolling averages separately for every (team, season) pair so that a window
# never mixes two teams or two seasons. The groups are iterated explicitly instead of using
# GroupBy.apply, because find_team_averages reads the 'team' and 'season' columns and
# GroupBy.apply is deprecated from passing grouping columns to the function (pandas >= 2.2).
df_rolling = pd.concat(
    [find_team_averages(games) for _, games in df_rolling.groupby(["team", "season"])]
).reindex(df_rolling.index)  # put the rows back in their original order

In [None]:
df_rolling

In [37]:
rolling_cols = [f"{col}_20" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

In [38]:
df = df.dropna(axis=0, how="any")  # drops rows containing NaN (games without a full 20-game rolling window)

In [None]:
def shift_col(team, col_name):
    """Return column ``col_name`` of ``team`` moved up by one row.

    Each row receives the value of the following row; the last row becomes
    missing. The index of ``team`` is kept.
    """
    return team.loc[:, col_name].shift(periods=-1)

def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in the same team's next row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``team`` column and ``col_name``; rows of one team are
        taken in their existing order.
    col_name : str
        Column whose next-row value is wanted.

    Returns
    -------
    pandas.Series
        Indexed like ``df`` and named ``col_name``; the last row of each team
        is missing.
    """
    # A grouped shift always yields a Series aligned with df, whatever the number of teams,
    # and does not go through GroupBy.apply.
    return df.groupby("team")[col_name].shift(-1)

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [41]:
df = df.copy()

In [42]:
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]], 
    left_on=["team", "date_next"], 
    right_on=["team_opp_next", "date_next"]
)

In [None]:
full

In [None]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

In [45]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [None]:
removed_columns

In [47]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

In [48]:
predictors = list(selected_columns[sfs.get_support()])

In [49]:
predictors

['mp',
 'fg%',
 'fta',
 'pf',
 '3par',
 'usg%',
 'drb%_max',
 'mp_opp',
 'usg%_opp',
 '3p_max_opp',
 'orb_max_opp',
 'stl_max_opp',
 'mp_20_x',
 'ts%_20_x',
 'usg%_20_x',
 'gmsc_max_20_x',
 'mp_opp_20_x',
 '3p%_opp_20_x',
 'pts_opp_20_x',
 'usg%_opp_20_x',
 'ast_max_opp_20_x',
 'blk_max_opp_20_x',
 'ast%_max_opp_20_x',
 'fg%_20_y',
 'ts%_20_y',
 'usg%_20_y',
 'gmsc_max_20_y',
 'usg%_opp_20_y',
 'pts_max_opp_20_y',
 'blk%_max_opp_20_y']

In [50]:
predictions = backtest(full, rr, predictors)

In [51]:
accuracy_score(predictions["actual"], predictions["prediction"])
#Initial prediction using home/away : 0.557

0.6520210896309314