In [1]:
import pandas as pd
import tensorflow as tf




In [2]:
# Load the game log, order it chronologically so that any "previous game"
# lookup only ever sees earlier rows, then renumber the rows from zero.
df = (
    pd.read_csv("nba_games_v2.csv", index_col=0)
    .sort_values("date")
    .reset_index(drop=True)
)

In [3]:
# Columns the analysis treats as redundant; they are removed in one call.
redundant_columns = [
    "mp", "mp.1", "mp_opp.1", "mp_max.1", "mp_max_opp.1",
    "+/-", "+/-_opp", "+/-_max", "+/-_max_opp",
    "pts", "pts_opp", "pts_max", "pts_max_opp",
    "usg%", "usg%_max", "usg%_opp", "usg%_max_opp",
]
df = df.drop(columns=redundant_columns)


In [4]:
# Per-column count of missing values, restricted to columns that have any.
nulls = df.isna().sum().loc[lambda counts: counts > 0]
# Keep only the columns that are fully populated.
valid_columns = df.columns.drop(nulls.index)
df = df.loc[:, valid_columns].copy()

In [5]:
# Derived per-game features for both sides: net rating, assist-to-turnover
# ratio and rebound differential.
# NOTE(review): a game with zero turnovers would produce inf in ast_tov /
# ast_tov_opp; nothing in this notebook guards against that.
df = df.assign(
    net_rtg=df["ortg"].sub(df["drtg"]),
    net_rtg_opp=df["ortg_opp"].sub(df["drtg_opp"]),
    ast_tov=df["ast"].div(df["tov"]),
    ast_tov_opp=df["ast_opp"].div(df["tov_opp"]),
    rb_diff=df["trb"].sub(df["trb_opp"]),
    rb_diff_opp=df["trb_opp"].sub(df["trb"]),
)




In [6]:
# Prevent fragmentation: after inserting several columns one at a time, take a
# fresh copy so the frame is stored contiguously again (the remedy pandas
# itself suggests in its "highly fragmented" PerformanceWarning).
df = df.copy()

In [7]:
# The opponent's result is the logical negation of this team's result.
# Assumes `won` is a boolean column; on integers `~` would be a bitwise NOT.
df = df.assign(won_opp=~df["won"])

In [8]:
# Columns that are NOT turned into rolling history (identifiers, schedule
# flags, totals). Every other column is rolled.
no_roll_columns = [
    "season", "date", "team", "team_opp",
    "b2b", "b2b_opp", "home", "home_opp",
    "total", "total_opp",
]
roll_columns = df.columns.drop(no_roll_columns, errors="ignore")
# Split by side: any name containing "_opp" belongs to the opponent.
roll_away_columns = list(filter(lambda name: "_opp" in name, roll_columns))
roll_home_columns = list(filter(lambda name: "_opp" not in name, roll_columns))

In [9]:
# Rolling windows (in games) used to summarise each team's recent form.
window_sizes = [5, 82]


def add_rolling_features(frame, columns, group_col, window_sizes):
    """Append per-group rolling means of ``columns`` to ``frame``.

    For every window size the values of ``columns`` are grouped by
    ``group_col``, shifted down by one row inside each group (so the current
    game never contributes to its own average) and averaged over the previous
    ``window_size`` rows. ``min_periods=1`` means a partial window is still
    averaged; only the first row of each group, which has no history at all,
    comes out as NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source frame; it is not modified.
    columns : iterable of str
        Columns to average.
    group_col : str
        Column whose values define the groups (e.g. ``"team"``).
    window_sizes : iterable of int
        Rolling window lengths; one set of output columns is produced per size.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame with the rolling columns appended (named
        ``<column>_<window_size>``, ordered by window size) and the list of
        the appended column names in the same order.
    """
    columns = list(columns)
    # Selecting the value columns on the GroupBy keeps the grouping column out
    # of the frames handed to apply(), which newer pandas versions require.
    grouped = frame.groupby(group_col, group_keys=False)[columns]

    rolled_frames = []
    for window_size in window_sizes:
        rolled = grouped.apply(
            # Bind window_size as a default so each lambda keeps its own value.
            lambda team, window_size=window_size: (
                team.shift(1).rolling(window_size, min_periods=1).mean()
            )
        )
        # Suffix with the window size to avoid clashing with existing columns.
        rolled_frames.append(rolled.add_suffix(f"_{window_size}"))

    new_columns = [name for rolled in rolled_frames for name in rolled.columns]
    # A single concat builds the wide frame in one go instead of once per window.
    return pd.concat([frame, *rolled_frames], axis=1), new_columns


# Own-team history is keyed by "team", opponent history by "team_opp".
df, home_rolling_columns = add_rolling_features(df, roll_home_columns, "team", window_sizes)
df, away_rolling_columns = add_rolling_features(df, roll_away_columns, "team_opp", window_sizes)
all_rolling_columns = home_rolling_columns + away_rolling_columns

In [10]:
# Drop every row that still contains a missing value. The rolling features are
# built from shift(1) data, so the first row of each team group has no history
# and is NaN.
df = df.dropna()

In [11]:
# Choose the columns to scale: identifier columns and the win/loss labels keep
# their raw values, everything else is scaled.
to_not_scale = ["season", "date", "won", "team", "team_opp", "won_opp"]
scale_columns = df.columns[[name not in to_not_scale for name in df.columns]]

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max, then rescale the selected columns in place.
# NOTE(review): the scaler is fitted on every row, including the rows later
# held out for testing.
scaler = MinMaxScaler().fit(df[scale_columns])
df[scale_columns] = scaler.transform(df[scale_columns])

In [13]:
# Take a fresh copy of the scaled frame (presumably the same de-fragmentation
# idiom used earlier in the notebook).
df = df.copy()

In [14]:
# Cast the result columns to integers for the models below.
# NOTE(review): errors="ignore" leaves the data unchanged if the cast fails,
# so a malformed "won" column would go unnoticed here — confirm this is intended.
df[["won", "won_opp"]] = df[["won", "won_opp"]].astype(int, errors="ignore")

In [15]:
# Candidate model inputs: every rolling-history column plus the home flags.
features_to_pick = [*all_rolling_columns, "home", "home_opp"]

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier on all candidate features and read its
# impurity-based feature importances, which drive the feature selection below.
# NOTE(review): the fit uses every row, including the most recent games that
# are later held out as the test set, so the selection has seen the test data.
X = df[features_to_pick]
y = df["won"]
features = X.columns
# The name `rf` is kept for compatibility even though this is not a random
# forest. A fixed random_state makes the ranking reproducible: the tree
# builder permutes features at each split, so ties can otherwise resolve
# differently between runs.
rf = GradientBoostingClassifier(random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_

In [17]:
# Pair every feature name with its importance score.
feature_importance_tuples = [(name, score) for name, score in zip(features, importances)]

# Rank the pairs from most to least important (stable sort on the score).
sorted_feature_importance = list(feature_importance_tuples)
sorted_feature_importance.sort(key=lambda pair: pair[1], reverse=True)
sorted_feature_importance

[('net_rtg_82', 0.14808480850277572),
 ('net_rtg_opp_82', 0.09332637524113249),
 ('net_rtg_5', 0.0773366255391942),
 ('net_rtg_opp_5', 0.04216184979609045),
 ('won_82', 0.02468583128019011),
 ('won_opp_82', 0.02412017697178458),
 ('efg%_opp_82', 0.01898016040800889),
 ('3par_max_5', 0.013432329773888346),
 ('efg%_max_82', 0.012368904264386903),
 ('drb%_opp_5', 0.009614743330487814),
 ('3p%_5', 0.009530960822534447),
 ('trb%_82', 0.008578514067614895),
 ('ft_max_opp_82', 0.008478475273826856),
 ('stl%_max_opp_5', 0.008463118181016976),
 ('stl%_82', 0.008444846121391169),
 ('efg%_5', 0.007960358080207125),
 ('ast_82', 0.007937102009833785),
 ('ast%_max_5', 0.007906344598041806),
 ('ast%_82', 0.007896937635610224),
 ('ast_tov_82', 0.007872367100175828),
 ('ast_max_5', 0.007326459013214925),
 ('trb_82', 0.007061793087314523),
 ('ast_opp_82', 0.00695079096535944),
 ('ts%_82', 0.00691662283921374),
 ('fg_max_5', 0.006910207200991821),
 ('ortg_5', 0.00690181837659336),
 ('ft_max_5', 0.0066061

In [18]:
# Keep the names of the 50 highest-ranked features, skipping any whose
# importance is not strictly positive (so fewer than 50 may remain).
top_features = [
    name
    for name, importance in sorted_feature_importance[:50]
    if importance > 0
]
top_features

['net_rtg_82',
 'net_rtg_opp_82',
 'net_rtg_5',
 'net_rtg_opp_5',
 'won_82',
 'won_opp_82',
 'efg%_opp_82',
 '3par_max_5',
 'efg%_max_82',
 'drb%_opp_5',
 '3p%_5',
 'trb%_82',
 'ft_max_opp_82',
 'stl%_max_opp_5',
 'stl%_82',
 'efg%_5',
 'ast_82',
 'ast%_max_5',
 'ast%_82',
 'ast_tov_82',
 'ast_max_5',
 'trb_82',
 'ast_opp_82',
 'ts%_82',
 'fg_max_5',
 'ortg_5',
 'ft_max_5',
 'ortg_max_5',
 'stl%_max_5',
 'drb%_82',
 'ast%_opp_82',
 'fg%_max_82',
 'rb_diff_82',
 'drb%_opp_82',
 'stl_max_82',
 'drb_82',
 'fta_max_opp_82',
 'ast_5',
 '3par_82',
 'stl%_5',
 'ts%_max_82',
 'blk%_max_opp_82',
 '3p%_max_5',
 'ast%_opp_5',
 '3par_max_opp_5',
 '3p_max_opp_82',
 'pf_max_opp_82',
 'fga_max_82',
 'drb_max_5',
 'ft%_opp_82']

In [36]:
# Candidate features followed by the target column, as one frame.
tensortotal = df.loc[:, [*features_to_pick, "won"]]


In [37]:
# Convert the selected features and the target into TensorFlow tensors.
tensorX = tf.convert_to_tensor(tensortotal.loc[:, top_features].to_numpy())
tensorY = tf.convert_to_tensor(tensortotal.loc[:, "won"].to_numpy())


In [38]:
# Display the feature tensor to check its shape and dtype.
tensorX

<tf.Tensor: shape=(10165, 50), dtype=float64, numpy=
array([[0.08527132, 0.60122699, 0.08527132, ..., 0.33333333, 0.2       ,
        0.14346895],
       [0.29767442, 0.35787321, 0.29767442, ..., 0.26666667, 0.2       ,
        0.48394004],
       [0.57829457, 0.88752556, 0.57829457, ..., 0.13333333, 0.3       ,
        0.67880086],
       ...,
       [0.54592551, 0.58214874, 0.55658915, ..., 0.40406504, 0.4       ,
        0.55301092],
       [0.54163358, 0.58234825, 0.47813953, ..., 0.40894309, 0.42      ,
        0.54817987],
       [0.597353  , 0.58803432, 0.60620155, ..., 0.48943089, 0.62      ,
        0.59745652]])>

In [39]:
# Number of trailing rows held out for testing. The frame was sorted by date,
# so these are the most recent games.
num_test_rows = 82

# Everything before the last `num_test_rows` rows trains; the rest tests.
train_rows = slice(None, -num_test_rows)
test_rows = slice(-num_test_rows, None)

X_train, y_train = tensorX[train_rows], tensorY[train_rows]
X_test, y_test = tensorX[test_rows], tensorY[test_rows]

In [40]:
# Set TensorFlow's global random seed.
tf.random.set_seed(42)

# Take the input width from the training data instead of hard-coding 50: the
# feature-selection step can return fewer than 50 columns (it filters out
# zero-importance features), and a fixed width would then fail at fit time.
n_features = X_train.shape[1]

# Small fully connected binary classifier: two tanh hidden layers followed by
# a sigmoid unit that outputs the win probability.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(32, activation="tanh"),
    tf.keras.layers.Dense(16, activation="tanh"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model.fit(X_train, y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x16efaebbd90>

In [41]:
# Score the model on the held-out rows; with the compile settings above the
# result is [binary cross-entropy loss, accuracy].
model.evaluate(X_test, y_test)



[0.679732620716095, 0.5609756112098694]