# Creating Prediction Models: Random Forest, XGBoost, Neural Networks. With Time-Sensitive Cross-Validation and Principal Component Analysis

In [2]:
#First, we import the libraries and classes we'll use
import sklearn as skl
import pandas as pd
import numpy as np
import random
import statistics
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from pathlib import Path
from itertools import product
import datetime

base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games['game_date'] = pd.to_datetime(games['game_date'])

## A Heads-Up on the games DataFrame

Let's explain the relevant columns in the games DataFrame:
- 2 team name columns: 'team_name_home', 'team_name_away'. Note that at some point in time, the Charlotte Bobcats changed their name to the Charlotte Hornets and the New Orleans Hornets became the New Orleans Pelicans. I rename them for consistency. These will then be OneHotEncoded.
- 1 date column: 'game_date'. Since many models can't process timeframes, we set the earliest date in the DF to 0 and the rest are transformed into the number of days since the earliest date.
- 6 win-loss % columns: 'record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses', 'home_wl%', 'away_wl%'.
- 2 Elo columns: 'elo_home', 'elo_away'. These are Elos *before* the game. For more details on these ratings, see Step 3 of the "Data Collection for RF, XGB, NN" notebook or the ReadMe in https://github.com/Matija-Sreckovic/NBA-Simple-Prediction-Models.
- 2 team OffRtg-DefRtg-GmSc ratings columns: 'rating_home', 'rating_away'. For more details on these scores, see Step 5 of "Data Collection for RF, XGB, NN" notebook or the ReadMe in https://github.com/Matija-Sreckovic/NBA-Prediciton-Model.
- 12 individual OffRtg-DefRtg-GmSc ratings columns: 'rating_home_player1', ..., 'rating_home_player5', 'rating_away_player1', ..., 'rating_away_player5'. 5 for each of the teams' best players' ratings.
- 8 Offensive/Defensive Rating columns: 'ortg_home', 'ortg_away', 'drtg_home', 'drtg_away', 'home_ortg_last_season', ... Contains teams' offensive/defensive ratings up to that point in the season, or for all of last season.

In [5]:
def load_games_dataframe(games, columns_to_drop):
    """Here, we just load the games DataFrame and create a bunch of useful columns (difference between Elos, "Offrtg-Defrtg-GmSc" ratings, home/away net 
    ratings, difference between net ratings, WL% difference, ...
    The columns_to_drop argument should take values 0, 1 or 2: if it's 0, we don't remove any columns. 
    If it's 1, we remove a lot of them: individual Elos, individual team "ODG" ratings, individual player ODG ratings, individual offensive/defensive 
    ratings for both this season and last, individual WL%, and individual net ratings last season. It turns out the leaving the individual net ratings
    and player ODG ratings in helps the model perform better.
    If it's 2, we remove all individual columns and keep only the differences between the teams' stats.
    """
    #drop the obsolete 'matchup_home' column, and game_id since it's just an index column
    games = games.drop(columns = ['matchup_home'])
    games = games.drop(columns = ['game_id'])

    #turn the game_date into datetime format
    games['game_date'] = pd.to_datetime(games['game_date'])
    
    #drop a column to prevent data leakage - this is the margin of victory of the home team (negative if the home team lost)
    games = games.drop(columns = ['plus_minus_home'])
    
    #add the difference in Elo ratings as a column
    games['elo_diff'] = games['elo_home'] - games['elo_away']
    #add the gamescore-offrtg-defrtg rating difference as a column
    games['rating_diff'] = games['rating_home'] - games['rating_away']
    #add the rating *difference* between the nth best players, but don't drop the individual ratings
    for i in list(range(1,6)):
        games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #add home and away net rating as difference of offrtg and defrtg
    games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
    games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
    #add home net rating and away net rating from last season
    games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
    games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
    #add difference in win-loss percentage
    games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
    #add net rating difference
    games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
    #add netrtg difference from last season
    games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']

    #add games played home/away (games played so far this season)
    games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
    games['GP_away'] = games['record_away_wins'] + games['record_away_losses']

    #in 2013-14, the Charlotte Hornets were called the Bobcats, and (maybe) the New Orleans Pelicans were called the Hornets
    games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
    games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)

    if columns_to_drop not in [0,1,2]:
        raise ValueError("Wrong value of columns_to_drop: please enter 0, 1 or 2.")

    if (columns_to_drop == 1) or (columns_to_drop == 2):
        games = games.drop(columns = ['elo_home', 'elo_away'])
        games = games.drop(columns = ['rating_home', 'rating_away'])
        games = games.drop(columns = ['ortg_home', 'drtg_home'])
        games = games.drop(columns = ['ortg_away', 'drtg_away'])
        games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
        games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
        games = games.drop(columns = ['home_wl%', 'away_wl%'])
        games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])

    if columns_to_drop == 2:
        games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
        games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])

    #turning dates into days since first game in table
    games['game_date'] = (games['game_date'] - games['game_date'].min()) / pd.Timedelta(days=1)

    #get target column
    wl_home = games['wl_home']
    games = games.drop(columns = 'wl_home')
    
    return games, wl_home

In [52]:
def get_preprocessor(games, classifier, use_pca):
    """
    The preprocessing step. The classifier argument takes values "RF", "XGB" or "NN". 
    The PCA argument should be 1 (use PCA) or 0 (don't use PCA).
    If we use PCA after transforming the columns, we need to rescale the numerical features.
    NB: the target column needs to be removed from the games table before applying the preprocessor!
    """
    categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
    numerical_columns = [col for col in games.columns if col not in categorical_columns]
    if classifier not in ["RF", "XGB", "NN"]:
        raise ValueError("The classifier argument should take values in [\"RF\", \"XGB\", \"NN\"]")
    if use_pca:
        transformer = ColumnTransformer(
            transformers=[
                ('team_names', OneHotEncoder(), categorical_columns),  # Apply OneHotEncoder to team columns
                ('standard_scale_num_cols', StandardScaler(), numerical_columns) # Scale numerical columns for PCA
            ],
        )
        pca = PCA(n_components=30)  # Reduce to 30 dimensions
        return make_pipeline(transformer, pca)
    elif classifier in ["RF", "XGB"]:
        if not use_pca:
            preprocessor = ColumnTransformer(
                transformers=[
                    ('team_names', OneHotEncoder(), categorical_columns)  # Apply OneHotEncoder to team columns
                ],
                remainder='passthrough'  # Keep other columns as is (e.g., numeric columns)
            )
            return preprocessor    
    else:
        numerical_columns = [col for col in games.columns if col not in categorical_columns]
        preprocessor = ColumnTransformer(
            transformers=[
                ('team_names', OneHotEncoder(), categorical_columns),  # Apply OneHotEncoder to team columns
                ('numerical_for_rescaling', StandardScaler(), numerical_columns),
            ],
        )
        return preprocessor

In [7]:
# Function to assign seasons
def assign_season(game_date):
    year = game_date.year
    if game_date.month < 10:  # If before October, it's part of the previous season
        year -= 1
    return f"{year}-{year + 1}"

# Add season column
games['season'] = games['game_date'].apply(assign_season)

In [8]:
def get_cross_val_indices(games, n_folds=3):
    """
    We need time-dependent cross-validation, meaning that no row of the test set can go before the train set. 
    However, I also want the train and test sets to consist of entire *seasons*; with TimeSeriesSplit they end at a random point in the season.
    I want to train on at least 5 seasons, hence current_training_number = 4 and n_folds <= 7 (recall that there are 12 seasons, from
    2013-14 to 2024-25).
    I want the training_set:test_set ratio to be at least 4:1, hence the line new_testing_number <= new_number // 4.
    So I create n_folds train-test pairs, where each new training set is a bit bigger than the last (at least 5 seasons, but less than 12).
    The test set consists of seasons immediately following the train set.
    """
    if n_folds > 7:
        raise ValueError("Too many folds!")
    training_numbers = []
    training_indices = [] # an element of this list is a list of indices
    testing_numbers = []
    testing_indices = [] # an element of this list is a list of indices
    n_folds_left = n_folds
    current_training_number = 4
    for i in list(range(0, n_folds)):
        new_number = random.randint(current_training_number + 1 , 12 - n_folds_left)
        training_numbers.append(new_number)
        new_testing_number = random.randint(1, new_number // 4)
        testing_numbers.append(new_testing_number) # these indicate on how *many* seasons we test
        training_indices_to_append = []
        testing_indices_to_append = []
        for j in list(range(2013, 2013 + new_number)):
            season_rows = games[games['season'] == str(j) + "-" + str(j+1)]
            season_indices = season_rows.index.tolist()
            for idx in season_indices:
                training_indices_to_append.append(idx)
        training_indices.append(training_indices_to_append)
        for j in list(range(2013 + new_number, 2013 + new_number + testing_numbers[i])):
            season_rows = games[games['season'] == str(j) + "-" + str(j+1)]
            season_indices = season_rows.index.tolist()
            for idx in season_indices:
                testing_indices_to_append.append(idx)
        testing_indices.append(testing_indices_to_append)
        n_folds_left -= 1
        current_training_number = new_number
    return training_indices, testing_indices

In [9]:
games.columns

Index(['Unnamed: 0', 'game_id', 'game_date', 'team_name_home',
       'team_name_away', 'matchup_home', 'wl_home', 'record_home_wins',
       'record_home_losses', 'record_away_wins', 'record_away_losses',
       'elo_home', 'elo_away', 'plus_minus_home', 'season_type', 'home_wl%',
       'away_wl%', 'rating_home', 'rating_away', 'rating_home_player1',
       'rating_away_player1', 'rating_home_player2', 'rating_away_player2',
       'rating_home_player3', 'rating_away_player3', 'rating_home_player4',
       'rating_away_player4', 'rating_home_player5', 'rating_away_player5',
       'ortg_home', 'drtg_home', 'ortg_away', 'drtg_away',
       'home_ortg_last_season', 'home_drtg_last_season',
       'away_ortg_last_season', 'away_drtg_last_season', 'season'],
      dtype='object')

In [10]:
#Random Forest hyperparameter tuning:
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games['game_date'] = pd.to_datetime(games['game_date'])

# Add season column
games['season'] = games['game_date'].apply(assign_season)

#Get the training and testing indices for cross-validation
training_indices, testing_indices = get_cross_val_indices(games)

#each element of these two lists is itself a 3-element list, containing the best scores for each of the 3 train-test splits
max_keys = []
max_values = []
best_keys_per_fold_and_setting = [] 
best_scores_per_fold_and_setting = []
for i in [True, False]:
    for j in list(range(0,3)):
        max_key_folds = [0,0,0]
        max_value_folds = [0,0,0]
        
        games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
        games['game_date'] = pd.to_datetime(games['game_date'])
    
        # Add season column
        games['season'] = games['game_date'].apply(assign_season)
    
        # Load the games DataFrame - drop relevant columns
        games, wl_home = load_games_dataframe(games, j)
    
        #Get the preprocessor - the i parameter decides if we use PCA or not.
        preprocessor = get_preprocessor(games, "XGB", i)
        
        #Candidate parameter values
        n_estimators_list = [50, 100, 400]
        min_samples_split_list = [2, 10, 100]
        min_samples_leaf_list = [1, 10, 100]
        max_features_list = ['sqrt', None]
        max_leaf_nodes_list = [100, 1000, None]
        
        #Cartesian product of the lists
        all_combinations = list(product(
            n_estimators_list,
            min_samples_split_list,
            min_samples_leaf_list,
            max_features_list,
            max_leaf_nodes_list
        ))
                
        # Create a copy of games since we're going to be changing the seasons column, and we don't want it affected for the other models.
        games_rf = games.copy()
        
        # Create a mapping dictionary
        seasons = [f"{year}-{year+1}" for year in range(2013, 2025)]
        season_mapping = {season: k for k, season in enumerate(seasons)}
        
        # Map the 'season' column to numeric values
        games_rf['season'] = games_rf['season'].map(season_mapping)
        
        #Accuracy dict
        accuracies = {combination : [] for combination in all_combinations}
        avg_accuracies = {combination : 0 for combination in all_combinations}
        
        #Access individual elements of a combination and fit and train the model. Keep all of the scores, but especially the best score for each
        #combination and each fold of the cross-validation.
        for combination in all_combinations:
            clear_output(wait=True)
            print(combination)
            n_estimators = combination[0]       # First element
            min_samples_split = combination[1]  # Second element
            min_samples_leaf = combination[2]   # Third element
            max_features = combination[3]       # Fourth element
            max_leaf_nodes = combination[4]     # Fifth element
            classifier = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                               max_features = max_features, max_leaf_nodes=max_leaf_nodes, n_jobs=8)
            model = make_pipeline(preprocessor, classifier)
            for k in list(range(0,3)):
                data_train = games_rf.loc[training_indices[k]]
                target_train = wl_home.loc[training_indices[k]]
                data_test = games_rf.loc[testing_indices[k]]
                target_test = wl_home.loc[testing_indices[k]]
                model.fit(data_train, target_train)
                score = model.score(data_test, target_test)
                accuracies[combination].append(score)
                if score > max_value_folds[k]:
                    max_value_folds[k] = score
                    max_key_folds[k] = combination
            avg_accuracies[combination] = statistics.mean(accuracies[combination])
        accuracies_series = pd.Series(accuracies)
        accuracies_series.to_csv(base_dir / ("tuning_rf_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        avg_accuracies_series = pd.Series(avg_accuracies)
        avg_accuracies_series.to_csv(base_dir / ("tuning_rf_avg_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        max_key = max(avg_accuracies, key=avg_accuracies.get)
        max_keys.append(max_key)
        max_values.append(avg_accuracies[max_key])
        best_keys_per_fold_and_setting.append(max_key_folds)
        best_scores_per_fold_and_setting.append(max_value_folds)

for i in range(0,6):
    print(str(max_keys[i]) + ": " + str(max_values[i]))
    print(str(best_keys_per_fold_and_setting[i]) + ": " + str(best_scores_per_fold_and_setting[i])))

SyntaxError: unmatched ')' (3105312796.py, line 100)

In [45]:
max(training_indices[0]), min(testing_indices[0]), max(training_indices[1]), min(testing_indices[1]), max(training_indices[2]), min(testing_indices[2])

(11502, 11503, 12186, 12188, 14135, 14136)

In [29]:
#The program crashed multiple times, so all I'm left with are the csv files. Here I'm reconstructing the best_keys/scores_per_fold_and_setting

best_keys_per_fold_and_setting = [] 
best_scores_per_fold_and_setting = []

for i in [True, False]:
    for j in list(range(3)):
        df = pd.read_csv(base_dir / ("tuning_rf_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        df.columns = ['n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_features', 'max_leaf_nodes', 'scores']
        # Split the single column into three columns
        df_split = df['scores'].str.strip("[]").str.split(",", expand=True)

        # Convert each column to float
        df_split = df_split.astype(float)

        # Assign new column names if desired
        df_split.columns = ["score1", "score2", "score3"]

        # Add the new columns back to the original DataFrame
        df = pd.concat([df, df_split], axis=1)

        max_key_folds = [0,0,0]
        max_value_folds = [0,0,0]

        for idx, row in df.iterrows():
            for k in list(range(3)):
                if row['score' + str(k+1)] > max_value_folds[k]:
                    max_key_folds[k] = [row['n_estimators'], row['min_samples_split'], row['min_samples_leaf'], row['max_features'], row['max_leaf_nodes']]
                    max_value_folds[k] = row['score' + str(k+1)]

        best_keys_per_fold_and_setting.append(max_key_folds)
        best_scores_per_fold_and_setting.append(max_value_folds)

for k in list(range(6)):
    print(str(best_keys_per_fold_and_setting[k]) + ": " + str(best_scores_per_fold_and_setting[k]))

[[50, 100, 10, 'sqrt', 1000.0], [50, 100, 1, nan, 100.0], [50, 2, 10, nan, nan]]: [0.6536270413976453, 0.6709601873536299, 0.6940874035989717]
[[100, 100, 100, nan, nan], [100, 10, 10, nan, 100.0], [400, 2, 1, 'sqrt', 1000.0]]: [0.6521078617546525, 0.6703747072599532, 0.6889460154241646]
[[50, 10, 100, 'sqrt', 100.0], [50, 100, 10, 'sqrt', 1000.0], [50, 2, 1, nan, 1000.0]]: [0.6502088872009115, 0.6692037470725996, 0.6915167095115681]
[[50, 2, 1, 'sqrt', 100.0], [50, 2, 10, 'sqrt', 1000.0], [50, 2, 10, 'sqrt', 1000.0]]: [0.6585643752373718, 0.6692037470725996, 0.6812339331619537]
[[100, 2, 100, 'sqrt', 1000.0], [50, 2, 100, nan, nan], [50, 100, 1, nan, 100.0]]: [0.6578047854158754, 0.6703747072599532, 0.6760925449871465]
[[50, 100, 100, 'sqrt', nan], [50, 100, 100, nan, 1000.0], [50, 100, 10, nan, 1000.0]]: [0.6574249905051273, 0.6686182669789227, 0.6812339331619537]


## Conclusion for the Random Forest Model

The best performing random forest model is *without* PCA, with *no* "individual" columns removed (and differences between the teams' scores added). The best scoring parameters are n_estimators = 50, min_samples_split = 2, min_samples_leaf = 10, max_features = 'sqrt', max_leaf_nodes = 1000. Average accuracy of that model on the randomly selected cross-validation split (3-fold): 66.7%.  

- First train-test split: train from 2013-14 to 2021-22 season, test on 2022-23 and 2023-24 seasons. Best result on this split: no PCA, no column dropping, n_estimators = 50, min_samples_split = 2, min_samples_leaf = 10, max_features = None, max_leaf_nodes = None. Accuracy: 65.9%
- Second train-test split: train from 2013-14 to 2022-23 season, test on 2023-24 and 2024-25 up to Dec. 16. Best result on this split: with PCA, no column dropping, n_estimators = 50, min_samples_split = 100, min_samples_leaf = 1, max_features = None, max_leaf_nodes = 100. Accuracy: 67.1%
- Third train-test split: train from 2013-14 to 2023-24 season, test on 2024-25 up to Dec. 16. Best result on this split: with PCA, no column dropping, n_estimators = 50, min_samples_split = 100, min_samples_leaf = 1, max_features = None, max_leaf_nodes = 100. Accuracy: 69.4%. Larger accuracy here probably due to very small testing set.

The conclusion is that reducing the number of components from ~90 to 30 probably **neither increases nor decreases the model's performance.** I was wrong about dropping columns; **the best results are obtained with the maximal number of columns.**
Now we move on to the XGBClassifier! We use the same train-test splits in order to compare it to RandomForestClassifier.

In [3]:
training_indices = [0,0,0]
testing_indices = [0,0,0]
training_indices[0] = list(range(11503))
testing_indices[0] = list(range(11503, 14136))
training_indices[1] = list(range(12187))
testing_indices[1] = list(range(12188, 14525))
training_indices[2] = list(range(14136))
testing_indices[2] = list(range(14136, 14525))

In [44]:
#XGBClassifier hyperparameter tuning:
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games['game_date'] = pd.to_datetime(games['game_date'])

# Add season column
games['season'] = games['game_date'].apply(assign_season)

#each element of these two lists is itself a 3-element list, containing the best scores for each of the 3 train-test splits
max_keys = []
max_values = []
best_keys_per_fold_and_setting = [] 
best_scores_per_fold_and_setting = []
for i in [True, False]:
    for j in list(range(0,3)):
        max_key_folds = [0,0,0]
        max_value_folds = [0,0,0]
        
        games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
        games['game_date'] = pd.to_datetime(games['game_date'])
    
        # Add season column
        games['season'] = games['game_date'].apply(assign_season)
    
        # Load the games DataFrame - drop relevant columns
        games, wl_home = load_games_dataframe(games, j)
        wl_home = wl_home.replace({'W': 1, 'L': 0})
    
        #Get the preprocessor - the i parameter decides if we use PCA or not.
        preprocessor = get_preprocessor(games, "XGB", i)
        
        #Candidate parameter values
        eta_list = [0.01, 0.1, 0.2]
        gamma_list = [0, 1, 5]
        max_depth_list = [3, 6, 10]
        min_child_weight_list = [0, 5, 20]
        subsample_list = [0.5, 1]
        n_estimators_list = [100, 200, 400]
        colsample_bytree_list = [0.7, 1.0]
        
        #Cartesian product of the lists
        all_combinations = list(product(
            eta_list,
            gamma_list,
            max_depth_list,
            min_child_weight_list,
            subsample_list,
            n_estimators_list,
            colsample_bytree_list,
        ))
                
        # Create a copy of games since we're going to be changing the seasons column, and we don't want it affected for the other models.
        games_xgb = games.copy()
        
        # Create a mapping dictionary
        seasons = [f"{year}-{year+1}" for year in range(2013, 2025)]
        season_mapping = {season: k for k, season in enumerate(seasons)}
        
        # Map the 'season' column to numeric values
        games_xgb['season'] = games_xgb['season'].map(season_mapping)
        
        #Accuracy dict
        accuracies = {combination : [] for combination in all_combinations}
        avg_accuracies = {combination : 0 for combination in all_combinations}
        
        #Access individual elements of a combination and fit and train the model. Keep all of the scores, but especially the best score for each
        #combination and each fold of the cross-validation.
        for combination in all_combinations:
            clear_output(wait=True)
            print(combination)
            eta = combination[0]       # First element
            gamma = combination[1]  # Second element
            max_depth = combination[2]   # Third element
            min_child_weight = combination[3]       # Fourth element
            subsample = combination[4]     # Fifth element
            n_estimators = combination[5]
            colsample_bytree = combination[6]
            
            classifier = XGBClassifier(eta=eta, gamma=gamma, max_depth=max_depth,
                                    min_child_weight=min_child_weight, subsample=subsample, n_jobs=8, n_estimators=n_estimators,
                                      colsample_bytree = colsample_bytree)
            for k in list(range(0,3)):
                data_train = games_xgb.loc[training_indices[k]]
                target_train = wl_home.loc[training_indices[k]]
                data_test = games_xgb.loc[testing_indices[k]]
                target_test = wl_home.loc[testing_indices[k]]
                data_train_processed = preprocessor.fit_transform(data_train)
                data_test_processed = preprocessor.transform(data_test)
                classifier.fit(data_train_processed, target_train)
                score = classifier.score(data_test_processed, target_test)
                accuracies[combination].append(score)
                if score > max_value_folds[k]:
                    max_value_folds[k] = score
                    max_key_folds[k] = combination
            avg_accuracies[combination] = statistics.mean(accuracies[combination])
        accuracies_series = pd.Series(accuracies)
        accuracies_series.to_csv(base_dir / ("tuning_xgb_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        avg_accuracies_series = pd.Series(avg_accuracies)
        avg_accuracies_series.to_csv(base_dir / ("tuning_xgb_avg_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        max_key = max(avg_accuracies, key=avg_accuracies.get)
        max_keys.append(max_key)
        max_values.append(avg_accuracies[max_key])
        best_keys_per_fold_and_setting.append(max_key_folds)
        best_scores_per_fold_and_setting.append(max_value_folds)

for i in range(0,6):
    print(str(max_keys[i]) + ": " + str(max_values[i]))
    print(str(best_keys_per_fold_and_setting[i]) + ": " + str(best_scores_per_fold_and_setting[i]))

(0.2, 5, 10, 20, 1, 400, 1.0)
(0.01, 5, 10, 0, 1, 200, 0.7): 0.663298278477765
[(0.01, 0, 3, 5, 0.5, 100, 1.0), (0.01, 1, 6, 20, 1, 200, 1.0), (0.2, 0, 6, 0, 1, 100, 0.7)]: [0.6536270413976453, 0.6611039794608472, 0.699228791773779]
(0.1, 1, 6, 20, 0.5, 100, 1.0): 0.6605877338954586
[(0.01, 5, 10, 0, 1, 400, 1.0), (0.01, 0, 10, 20, 0.5, 200, 0.7), (0.1, 1, 6, 20, 0.5, 100, 1.0)]: [0.6559058108621344, 0.660676080445015, 0.7069408740359897]
(0.01, 1, 10, 5, 0.5, 200, 1.0): 0.6645830755227319
[(0.01, 5, 6, 0, 1, 200, 1.0), (0.01, 0, 6, 0, 0.5, 100, 1.0), (0.1, 5, 10, 0, 0.5, 400, 1.0)]: [0.6536270413976453, 0.660676080445015, 0.6966580976863753]
(0.01, 0, 6, 0, 0.5, 400, 1.0): 0.6637200981937426
[(0.2, 5, 3, 0, 1, 100, 1.0), (0.01, 0, 3, 20, 1, 400, 1.0), (0.2, 0, 6, 0, 0.5, 100, 0.7)]: [0.65894417014812, 0.6662387676508345, 0.6889460154241646]
(0.1, 0, 3, 0, 0.5, 100, 0.7): 0.6628178753669687
[(0.01, 0, 3, 20, 0.5, 100, 0.7), (0.1, 0, 3, 0, 0.5, 100, 0.7), (0.2, 5, 10, 0, 0.5, 100, 0.7)]

## XGBoost Performance and Conclusion

The best performing model, on average, was the one with PCA and all individual columns dropped (meaning we only kept the differences between the scores, not the individual scores themselves). Accuracy: 66.5%. The train-test split is the same as the one used for random forest.

- Best performance on the first train-test split: no PCA, no columns dropped; accuracy 65.9%.
- Second: no PCA, no columns dropped; accuracy 66.6%
- Third: PCA, some columns dropped (kept the individual net ratings and individual player "ODG" ratings); accuracy 70.7%. High accuracy again probably due to very small test set.

### RF-XGB Comparison

The performances of the Random Forest model and the XGBoost model are very similar. The best average accuracy is slightly better with RF; on large test sets, RF performs slightly better. On a small test set, XGB performed significantly better.

In [36]:
#load the data, use the maximum number of columns, so don't drop anything
base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games = games.drop(columns = ['matchup_home'])
games['game_date'] = pd.to_datetime(games['game_date'])
games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
#drop a column to prevent data leakage
games = games.drop(columns = ['plus_minus_home'])
games['elo_diff'] = games['elo_home'] - games['elo_away']
#games = games.drop(columns = ['elo_home', 'elo_away'])
games['rating_diff'] = games['rating_home'] - games['rating_away']
#games = games.drop(columns = ['rating_home', 'rating_away'])
games = games.drop(columns = ['game_id'])
for i in list(range(1,6)):
    games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])
games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
#games = games.drop(columns = ['ortg_home', 'drtg_home'])
games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
#games = games.drop(columns = ['ortg_away', 'drtg_away'])
games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
#games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
games['GP_away'] = games['record_away_wins'] + games['record_away_losses']
#games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
#games = games.drop(columns = ['home_wl%', 'away_wl%'])
games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
#games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']
#games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])
games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)
games.to_csv(base_dir / "games_differences.csv")
games_train = games[(games['game_date'] <= datetime.datetime(2023,9,30))]
target_train = games_train['wl_home']
data_train = games_train.drop(columns = ['wl_home'])
games_test = games[games['game_date'] >= datetime.datetime(2023,10,1)]
target_test = games_test['wl_home']
data_test = games_test.drop(columns = ['wl_home'])


#preprocessing - turning dates into days since first game in table and one-hot encoding team names
#now we also rescale the data since this is necessary for neural networks!
data_train['game_date'] = (data_train['game_date'] - data_train['game_date'].min()) / pd.Timedelta(days=1)
data_test['game_date'] = (data_test['game_date'] - data_test['game_date'].min()) / pd.Timedelta(days=1)
categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
numerical_columns = [col for col in data_train.columns if col not in categorical_columns]
preprocessor = ColumnTransformer(
    transformers=[
        ('team_names', OneHotEncoder(), categorical_columns),  # Apply OneHotEncoder to team columns
        ('numerical_for_rescaling', StandardScaler(), numerical_columns),
    ],
)
data_train_processed = preprocessor.fit_transform(data_train)
data_test_processed = preprocessor.transform(data_test)

#add early stopping and learning rate scheduler to reduce overfitting!
early_stopping = EarlyStopping(
    min_delta = 0.001,
    patience = 20,
    restore_best_weights=True
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
)

#define and compile the neural network
model_nn = keras.Sequential([
    layers.Dense(256, activation='relu',input_shape=[data_train_processed.shape[1]], kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(1,activation='sigmoid')
])
model_nn.compile(
    optimizer = 'adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

#finally, train and test the neural network
history = model_nn.fit(data_train_processed, target_train,
                       validation_split=0.2,  # Use a portion of training data for validation
                       epochs=5000,            # Adjust epochs as needed
                       batch_size=256,        # Mini-batch size
                       callbacks = [early_stopping, lr_scheduler],
                       verbose=1)            # Set verbose to 1 to see the training process
test_loss, test_accuracy = model_nn.evaluate(data_test_processed, target_test)
print(f"Test Accuracy: {test_accuracy:.5f}")

  games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})


Epoch 1/5000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - binary_accuracy: 0.6223 - loss: 4.1597 - val_binary_accuracy: 0.6310 - val_loss: 2.7299 - learning_rate: 0.0010
Epoch 2/5000
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6553 - loss: 2.3877 - val_binary_accuracy: 0.6349 - val_loss: 1.6435 - learning_rate: 0.0010
Epoch 3/5000
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - binary_accuracy: 0.6611 - loss: 1.4638 - val_binary_accuracy: 0.6381 - val_loss: 1.1107 - learning_rate: 0.0010
Epoch 4/5000
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6700 - loss: 1.0085 - val_binary_accuracy: 0.6368 - val_loss: 0.8613 - learning_rate: 0.0010
Epoch 5/5000
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6730 - loss: 0.7968 - val_binary_accuracy: 0.6357 - val_loss: 0.7484 - learning_rate: 0.0010
Epoch 6/5000

In [53]:
def tune_neural_network(pca_arg, drop_columns, lr, optimizer, batch_size, n_layers, initial_layer_neurons, activation, dropout_rate, training_indices, testing_indices):
    games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
    games['game_date'] = pd.to_datetime(games['game_date'])
    
    # Add season column
    games['season'] = games['game_date'].apply(assign_season)

    # Load the games DataFrame - drop relevant columns
    games, wl_home = load_games_dataframe(games, drop_columns)
    wl_home = wl_home.replace({'W': 1, 'L': 0})
    
    #Get the preprocessor - the pca_arg parameter decides if we use PCA or not.
    preprocessor = get_preprocessor(games, "NN", pca_arg)
            
    # Create a mapping dictionary
    seasons = [f"{year}-{year+1}" for year in range(2013, 2025)]
    season_mapping = {season: k for k, season in enumerate(seasons)}
        
    # Map the 'season' column to numeric values
    games['season'] = games['season'].map(season_mapping)

    #add early stopping and learning rate scheduler to reduce overfitting!
    early_stopping = EarlyStopping(
        min_delta = 0.001,
        patience = 20,
        restore_best_weights=True
    )
    lr_scheduler = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
    )

    test_accuracy = [0,0,0]
    
    for k in list(range(0,3)):
        #load the training and testing sets (use the same train-test split as for RF and XGB)
        data_train = games.loc[training_indices[k]]
        target_train = wl_home.loc[training_indices[k]]
        data_test = games.loc[testing_indices[k]]
        target_test = wl_home.loc[testing_indices[k]]
        #preprocess the train and test sets
        data_train_processed = preprocessor.fit_transform(data_train)
        data_test_processed = preprocessor.transform(data_test)
        #define the model; use the initial_layer_neurons parameter for the first layer, then divide it by 2 for each subsequent layer.
        model = keras.Sequential()
        model.add(layers.Dense(initial_layer_neurons, activation=activation, input_shape=[data_train_processed.shape[1]], kernel_regularizer=l2(0.01)))
        model.add(Dropout(dropout_rate))
        n_layers -= 1
        initial_layer_neurons //= 2
        while n_layers > 0:
            model.add(layers.Dense(initial_layer_neurons, activation=activation, kernel_regularizer=l2(0.01)))
            model.add(Dropout(dropout_rate))
            n_layers -= 1
            initial_layer_neurons //= 2
        #finally, add the output layer with sigmoid activation
        model.add(layers.Dense(1, activation='sigmoid'))
        #raise exception if optimizer parameter is wrong
        if optimizer not in ['adam', 'SGD']:
            raise ValueError("The optimizer must be either 'adam' or 'SGD'!")
        if optimizer == 'adam':
            my_optimizer = keras.optimizers.Adam(learning_rate=lr)
        else : my_optimizer = keras.optimizers.SGD(learning_rate=lr)
        #compile and fit the model
        model.compile(
        optimizer = my_optimizer,
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
        )
        
        history = model.fit(data_train_processed, target_train,
                       validation_split=0.2,  # Use a portion of training data for validation
                       epochs=5000,            # Adjust epochs as needed
                       batch_size=batch_size,        # Mini-batch size
                       callbacks = [early_stopping, lr_scheduler],
                       verbose=0)
        test_loss, test_accuracy[k] = model.evaluate(data_test_processed, target_test)
    return test_accuracy


In [54]:
#Tuning the Neural Network!
#each element of these 4 lists is itself a 3-element list, containing the best scores for each of the 3 train-test splits and their resp. keys
max_keys = []
max_values = []
best_keys_per_fold_and_setting = [] 
best_scores_per_fold_and_setting = []
for i in [True, False]:
    for j in list(range(0,3)):
        max_key_folds = [0,0,0]
        max_value_folds = [0,0,0]

        #Candidate parameter values
        lr_list = [1e-3, 1e-2, 0.1]
        optimizer_list = ['adam']
        batch_size_list = [32, 256]
        no_of_layers_list = [3, 5]
        activation_list = ['relu', 'swish']
        dropout_rate_list = [0.4]
        
        #Cartesian product of the lists
        all_combinations = list(product(
            lr_list,
            optimizer_list,
            batch_size_list,
            no_of_layers_list,
            activation_list,
            dropout_rate_list,
        ))

        #Accuracy dict
        accuracies = {}
        avg_accuracies = {}
        
        #Access individual elements of a combination and fit and train the modelfor that combination.
        #Keep all of the scores, but especially the best score for each
        #combination and each fold of the cross-validation.
        #Compute the best average scores in the end.
        for combination in all_combinations:
            clear_output(wait=True)
            print(combination)
            lr = combination[0]       # First element
            optimizer = combination[1]  # Second element
            batch_size = combination[2]   # Third element
            no_of_layers = combination[3]       # Fourth element
            activation = combination[4]     # Fifth element
            dropout_rate = combination[5]
            if no_of_layers == 3:
                for initial_layer_neurons in [64, 128, 256]:
                    new_combo = combination + (initial_layer_neurons,)
                    accuracies[new_combo] = tune_neural_network(i, j, lr, optimizer, batch_size, no_of_layers, initial_layer_neurons, activation, dropout_rate, training_indices, testing_indices)
                    for k in list(range(3)):
                        if accuracies[new_combo][k] > max_value_folds[k]:
                            max_value_folds[k] = accuracies[new_combo][k]
                            max_key_folds[k] = new_combo
                    avg_accuracies[new_combo] = statistics.mean(accuracies[new_combo])
            if no_of_layers == 5:
                for initial_layer_neurons in [256, 512]:
                    new_combo = combination + (initial_layer_neurons,)
                    accuracies[new_combo] = tune_neural_network(i, j, lr, optimizer, batch_size, no_of_layers, initial_layer_neurons, activation, dropout_rate, training_indices, testing_indices)
                    for k in list(range(3)):
                        if accuracies[new_combo][k] > max_value_folds[k]:
                            max_value_folds[k] = accuracies[new_combo][k]
                            max_key_folds[k] = new_combo
                    avg_accuracies[new_combo] = statistics.mean(accuracies[new_combo])
        accuracies_series = pd.Series(accuracies)
        accuracies_series.to_csv(base_dir / ("tuning_nn_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        avg_accuracies_series = pd.Series(avg_accuracies)
        avg_accuracies_series.to_csv(base_dir / ("tuning_nn_avg_accuracies_" + str(j) + "-" + str(i) + ".csv"))
        max_key = max(avg_accuracies, key=avg_accuracies.get)
        max_keys.append(max_key)
        max_values.append(avg_accuracies[max_key])
        best_keys_per_fold_and_setting.append(max_key_folds)
        best_scores_per_fold_and_setting.append(max_value_folds)

for i in range(0,6):
    print(str(max_keys[i]) + ": " + str(max_values[i]))
    print(str(best_keys_per_fold_and_setting[i]) + ": " + str(best_scores_per_fold_and_setting[i]))

(0.1, 'adam', 256, 5, 'swish', 0.4)
(11503, 85)
(11503,)


  wl_home = wl_home.replace({'W': 1, 'L': 0})
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 8: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.

Epoch 13: ReduceLROnPlateau reducing learning rate to 0.02500000037252903.

Epoch 19: ReduceLROnPlateau reducing learning rate to 0.012500000186264515.
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - binary_accuracy: 0.5858 - loss: 0.6802 
(12187, 85)
(12187,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 9: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.

Epoch 25: ReduceLROnPlateau reducing learning rate to 0.02500000037252903.

Epoch 34: ReduceLROnPlateau reducing learning rate to 0.012500000186264515.

Epoch 50: ReduceLROnPlateau reducing learning rate to 0.0062500000931322575.

Epoch 56: ReduceLROnPlateau reducing learning rate to 0.0031250000465661287.

Epoch 76: ReduceLROnPlateau reducing learning rate to 0.0015625000232830644.

Epoch 86: ReduceLROnPlateau reducing learning rate to 0.0007812500116415322.
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - binary_accuracy: 0.6456 - loss: 0.6220
(14136, 85)
(14136,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 8: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.

Epoch 20: ReduceLROnPlateau reducing learning rate to 0.02500000037252903.

Epoch 27: ReduceLROnPlateau reducing learning rate to 0.012500000186264515.

Epoch 38: ReduceLROnPlateau reducing learning rate to 0.0062500000931322575.

Epoch 51: ReduceLROnPlateau reducing learning rate to 0.0031250000465661287.

Epoch 63: ReduceLROnPlateau reducing learning rate to 0.0015625000232830644.

Epoch 73: ReduceLROnPlateau reducing learning rate to 0.0007812500116415322.
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 0.6425 - loss: 0.6305 
(11503, 85)
(11503,)


  wl_home = wl_home.replace({'W': 1, 'L': 0})
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 17: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.

Epoch 24: ReduceLROnPlateau reducing learning rate to 0.02500000037252903.

Epoch 34: ReduceLROnPlateau reducing learning rate to 0.012500000186264515.

Epoch 39: ReduceLROnPlateau reducing learning rate to 0.0062500000931322575.
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 0.5858 - loss: 0.6807 
(12187, 85)
(12187,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 8: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.

Epoch 27: ReduceLROnPlateau reducing learning rate to 0.02500000037252903.

Epoch 37: ReduceLROnPlateau reducing learning rate to 0.012500000186264515.

Epoch 44: ReduceLROnPlateau reducing learning rate to 0.0062500000931322575.

Epoch 51: ReduceLROnPlateau reducing learning rate to 0.0031250000465661287.

Epoch 58: ReduceLROnPlateau reducing learning rate to 0.0015625000232830644.

Epoch 66: ReduceLROnPlateau reducing learning rate to 0.0007812500116415322.

Epoch 77: ReduceLROnPlateau reducing learning rate to 0.0003906250058207661.

Epoch 87: ReduceLROnPlateau reducing learning rate to 0.00019531250291038305.
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - binary_accuracy: 0.6361 - loss: 0.6221 
(14136, 85)
(14136,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 14: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.

Epoch 24: ReduceLROnPlateau reducing learning rate to 0.02500000037252903.

Epoch 31: ReduceLROnPlateau reducing learning rate to 0.012500000186264515.

Epoch 45: ReduceLROnPlateau reducing learning rate to 0.0062500000931322575.

Epoch 55: ReduceLROnPlateau reducing learning rate to 0.0031250000465661287.

Epoch 61: ReduceLROnPlateau reducing learning rate to 0.0015625000232830644.

Epoch 69: ReduceLROnPlateau reducing learning rate to 0.0007812500116415322.

Epoch 74: ReduceLROnPlateau reducing learning rate to 0.0003906250058207661.
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - binary_accuracy: 0.6530 - loss: 0.6320 
(0.001, 'adam', 32, 3, 'relu', 0.4, 64): 0.6608890493710836
[(0.1, 'adam', 256, 3, 'relu', 0.4, 64), (0.01, 'adam', 32, 3, 'relu', 0.4, 256), (0.001, 'adam', 32, 3, 'relu', 0.4, 64)]: [0.6543866395950317, 0.6615318655967712, 0.6812339425086975]
(0.001, 'adam', 32, 3

## Neural Network Performance

On average, the neural network model performed best without PCA and with all individual columns dropped. The best parameters were the following: learning_rate = 0.001, batch_size = 256, 3 hidden layers with 128/64/32 neurons, 'relu' activation function. Average accuracy: 66.4%.

- Best performance on the first train-test split: with PCA, all individual columns dropped. learning_rate = 0.01, batch_size = 32, 5 hidden layers with 256/128/64/32/16 neurons, 'relu' activation function. Accuracy: 65.6%
- Best performance on the second train-test split: with PCA, most individual columnns dropped. learning_rate = 0.001, batch_size = 32, 3 hidden layers with 256/128/64 neurons, 'relu' activation function. Accuracy: 66.6%
- Best performance on the third train-test split: joint between:
   1) without PCA, no individual columns dropped. learning_rate = 0.01, batch_size = 32, 3 hidden layers with 128/64/32 neurons, 'relu' activation function, and
   2) without PCA, most individual columns dropped. learning_rate = 0.001, batch_size = 256, 3 hidden layers with 128/64/32 neurons, 'relu' acrivation function.
Accuracy of both: 68.6%

The neural network model performed slightly worse than the RF and XGB models. This could be due to the fact that the parameter grid for tuning had less elements than for the RF and XGB models, which is itself due tot he fact that the NN model takes longer to train.

## Conclusion

All three models displayed very similar performance, with RF and XGB slightly ahead of NN. Hyperparameter tuning did not cause dramatic improvement. The best performance was by far on the third train-test split, which is possibly due to the very small test set (389 games compared to 1500-2000 in the first two), which causes higher variance of accuracy w/r/t change in hyperparameter values.

My main hope is that the next jump in model performance could be achieved by measuring teams' scores only on their last 10/20/30 games. Although a decent indicator of a player's quality, performance over a full season, and especially over the last 2/3/5 seasons is probably too long a period to make near-future predictions.

NB: Below are initial versions of the models. Probably not worth looking at.

In [4]:
games.columns

Index(['Unnamed: 0', 'game_id', 'game_date', 'team_name_home',
       'team_name_away', 'matchup_home', 'wl_home', 'record_home_wins',
       'record_home_losses', 'record_away_wins', 'record_away_losses',
       'elo_home', 'elo_away', 'plus_minus_home', 'season_type', 'home_wl%',
       'away_wl%', 'rating_home', 'rating_away', 'rating_home_player1',
       'rating_away_player1', 'rating_home_player2', 'rating_away_player2',
       'rating_home_player3', 'rating_away_player3', 'rating_home_player4',
       'rating_away_player4', 'rating_home_player5', 'rating_away_player5',
       'ortg_home', 'drtg_home', 'ortg_away', 'drtg_away',
       'home_ortg_last_season', 'home_drtg_last_season',
       'away_ortg_last_season', 'away_drtg_last_season'],
      dtype='object')

In [7]:
length_test_set = 0
correct_predictions = 0
for k in list(range(0,3)):
    data_test = games.loc[testing_indices[k]]
    length_test_set = len(data_test)
    correct_predictions = len(data_test.loc[((data_test['wl_home']=='W') & (data_test['rating_home']+3 >= data_test['rating_away'])) | ((data_test['wl_home']=='L') & (data_test['rating_home']+3 < data_test['rating_away']))])
    print(str(correct_predictions) + "/" + str(length_test_set) + ", " + str(float(correct_predictions)/float(length_test_set)))

1736/2633, 0.6593239650588683
1562/2337, 0.6683782627299957
256/389, 0.6580976863753213


In [4]:
#Random forest with time series cross-validation

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import TimeSeriesSplit
from pathlib import Path
import datetime

#load the data
base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games = games.drop(columns = ['matchup_home'])
games['game_date'] = pd.to_datetime(games['game_date'])
games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
#drop a column to prevent data leakage
games = games.drop(columns = ['plus_minus_home'])
#add the difference in Elo ratings as a column, drop the individual Elo ratings
games['elo_diff'] = games['elo_home'] - games['elo_away']
games = games.drop(columns = ['elo_home', 'elo_away'])
#add the gamescore-offrtg-defrtg rating difference as a column, drop the individual team ratings
games['rating_diff'] = games['rating_home'] - games['rating_away']
games = games.drop(columns = ['rating_home', 'rating_away'])
#drop game_id, it's just an index column
games = games.drop(columns = ['game_id'])
#add the rating *difference* between the nth best players, but don't drop the individual ratings
for i in list(range(1,6)):
    games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])
#add home net rating instead of having both offrtg and defrtg
games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
games = games.drop(columns = ['ortg_home', 'drtg_home'])
#add away net rating...
games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
games = games.drop(columns = ['ortg_away', 'drtg_away'])
#add home net rating and away net rating from last season
games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
#add games played home/away (games played so far this season)
games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
games['GP_away'] = games['record_away_wins'] + games['record_away_losses']
games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
#add difference in win-loss percentage, remove individual win-loss percentages
games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
games = games.drop(columns = ['home_wl%', 'away_wl%'])
#add net rating difference, but keep individual home/away net ratings - apparently the model works better with them kept in
games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
#games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
#add netrtg difference from last season, drop individual net rtgs
games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']
games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])
#in 2013-14, the charlotte hornets were called the bobcats, and (maybe) the new orleans pelicans were called the hornets
games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)

#preprocessing - turning dates into days since first game in table and one-hot encoding team names
games['game_date'] = (games['game_date'] - games['game_date'].min()) / pd.Timedelta(days=1)
categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
preprocessor = ColumnTransformer(
    transformers=[
        ('team_names', OneHotEncoder(), categorical_columns)  # Apply OneHotEncoder to team columns
    ],
    remainder='passthrough'  # Keep other columns as is (e.g., numeric columns)
)


#fit and test the model - the n_estimators was tuned - other parameters gave the best results with the default values
classifier = RandomForestClassifier(n_estimators = 400, n_jobs = 2, random_state = 0)
model = make_pipeline(preprocessor, classifier)
tscv = TimeSeriesSplit(n_splits=3)
indices = []
accuracies = []
for train_index, test_index in tscv.split(games):
    indices.append([train_index, test_index])
    data_train = games.iloc[train_index]
    target_train = data_train['wl_home']
    data_train = data_train.drop(columns = ['wl_home'])
    data_test = games.iloc[test_index]
    target_test = data_test['wl_home']
    data_test = data_test.drop(columns = ['wl_home'])
    model.fit(data_train, target_train)
    accuracies.append(model.score(data_test, target_test))
accuracies, indices

  games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})


([0.6593225006885156, 0.6356375654089782, 0.6518865326356376],
 [[array([   0,    1,    2, ..., 3629, 3630, 3631]),
   array([3632, 3633, 3634, ..., 7260, 7261, 7262])],
  [array([   0,    1,    2, ..., 7260, 7261, 7262]),
   array([ 7263,  7264,  7265, ..., 10891, 10892, 10893])],
  [array([    0,     1,     2, ..., 10891, 10892, 10893]),
   array([10894, 10895, 10896, ..., 14522, 14523, 14524])]])

In [6]:
#Random forest for

import sklearn as skl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from pathlib import Path
from xgboost import XGBClassifier
import datetime

#load the data
base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games = games.drop(columns = ['matchup_home'])
games['game_date'] = pd.to_datetime(games['game_date'])
games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
#drop a column to prevent data leakage
games = games.drop(columns = ['plus_minus_home'])
#add the difference in Elo ratings as a column, drop the individual Elo ratings
games['elo_diff'] = games['elo_home'] - games['elo_away']
games = games.drop(columns = ['elo_home', 'elo_away'])
#add the gamescore-offrtg-defrtg rating difference as a column, drop the individual team ratings
games['rating_diff'] = games['rating_home'] - games['rating_away']
games = games.drop(columns = ['rating_home', 'rating_away'])
#drop game_id, it's just an index column
games = games.drop(columns = ['game_id'])
#add the rating *difference* between the nth best players, but don't drop the individual ratings
for i in list(range(1,6)):
    games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])
#add home net rating instead of having both offrtg and defrtg
games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
games = games.drop(columns = ['ortg_home', 'drtg_home'])
#add away net rating...
games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
games = games.drop(columns = ['ortg_away', 'drtg_away'])
#add home net rating and away net rating from last season
games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
#add games played home/away (games played so far this season)
games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
games['GP_away'] = games['record_away_wins'] + games['record_away_losses']
games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
#add difference in win-loss percentage, remove individual win-loss percentages
games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
games = games.drop(columns = ['home_wl%', 'away_wl%'])
#add net rating difference, but keep individual home/away net ratings - apparently the model works better with them kept in
games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
#games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
#add netrtg difference from last season, drop individual net rtgs
games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']
games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])
#in 2013-14, the charlotte hornets were called the bobcats, and (maybe) the new orleans pelicans were called the hornets
games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)

#preprocessing - turning dates into days since first game in table and one-hot encoding team names
games['game_date'] = (games['game_date'] - games['game_date'].min()) / pd.Timedelta(days=1)
categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
preprocessor = ColumnTransformer(
    transformers=[
        ('team_names', OneHotEncoder(), categorical_columns)  # Apply OneHotEncoder to team columns
    ],
    remainder='passthrough'  # Keep other columns as is (e.g., numeric columns)
)


#fit and test the model - the n_estimators was tuned - other parameters gave the best results with the default values
tscv = TimeSeriesSplit(n_splits=3)
indices = []
accuracies = []
for train_index, test_index in tscv.split(games):
    indices.append([train_index, test_index])
    data_train = games.iloc[train_index]
    target_train = data_train['wl_home']
    data_train = data_train.drop(columns = ['wl_home'])
    data_test = games.iloc[test_index]
    target_test = data_test['wl_home']
    data_test = data_test.drop(columns = ['wl_home'])
    #now use xgboost on he same data...
    data_train_processed = preprocessor.fit_transform(data_train)
    data_test_processed = preprocessor.transform(data_test)
    model_xgb = XGBClassifier(random_state=0, learning_rate=0.01, n_estimators=200)
    model_xgb.fit(data_train_processed, target_train)
    accuracies.append(model_xgb.score(data_test_processed, target_test))
accuracies, indices

  games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})


([0.6565684384467089, 0.619388598182319, 0.6480308454971082],
 [[array([   0,    1,    2, ..., 3629, 3630, 3631]),
   array([3632, 3633, 3634, ..., 7260, 7261, 7262])],
  [array([   0,    1,    2, ..., 7260, 7261, 7262]),
   array([ 7263,  7264,  7265, ..., 10891, 10892, 10893])],
  [array([    0,     1,     2, ..., 10891, 10892, 10893]),
   array([10894, 10895, 10896, ..., 14522, 14523, 14524])]])

In [8]:
#Neural network with time series cross-validation!

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

#load the data, use the maximum number of columns, so don't drop anything
base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games = games.drop(columns = ['matchup_home'])
games['game_date'] = pd.to_datetime(games['game_date'])
games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
#drop a column to prevent data leakage
games = games.drop(columns = ['plus_minus_home'])
games['elo_diff'] = games['elo_home'] - games['elo_away']
#games = games.drop(columns = ['elo_home', 'elo_away'])
games['rating_diff'] = games['rating_home'] - games['rating_away']
#games = games.drop(columns = ['rating_home', 'rating_away'])
games = games.drop(columns = ['game_id'])
for i in list(range(1,6)):
    games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])
games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
#games = games.drop(columns = ['ortg_home', 'drtg_home'])
games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
#games = games.drop(columns = ['ortg_away', 'drtg_away'])
games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
#games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
games['GP_away'] = games['record_away_wins'] + games['record_away_losses']
#games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
#games = games.drop(columns = ['home_wl%', 'away_wl%'])
games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
#games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']
#games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])
games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)


#preprocessing - turning dates into days since first game in table and one-hot encoding team names
#now we also rescale the data since this is necessary for neural networks!
games['game_date'] = (games['game_date'] - games['game_date'].min()) / pd.Timedelta(days=1)
categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
numerical_columns = [col for col in data_train.columns if col not in categorical_columns]
preprocessor = ColumnTransformer(
    transformers=[
        ('team_names', OneHotEncoder(), categorical_columns),  # Apply OneHotEncoder to team columns
        ('numerical_for_rescaling', StandardScaler(), numerical_columns),
    ],
)

#add early stopping and learning rate scheduler to reduce overfitting!
early_stopping = EarlyStopping(
    min_delta = 0.001,
    patience = 20,
    restore_best_weights=True
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
)

#define and compile the neural network
model_nn = keras.Sequential([
    layers.Dense(256, activation='relu',input_shape=[data_train_processed.shape[1]], kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(1,activation='sigmoid')
])
model_nn.compile(
    optimizer = 'adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

tscv = TimeSeriesSplit(n_splits=3)
indices = []
accuracies = []
for train_index, test_index in tscv.split(games):
    indices.append([train_index, test_index])
    data_train = games.iloc[train_index]
    target_train = data_train['wl_home']
    data_train = data_train.drop(columns = ['wl_home'])
    data_test = games.iloc[test_index]
    target_test = data_test['wl_home']
    data_test = data_test.drop(columns = ['wl_home'])
    data_train_processed = preprocessor.fit_transform(data_train)
    data_test_processed = preprocessor.transform(data_test)
    #finally, train and test the neural network
    history = model_nn.fit(data_train_processed, target_train,
                           validation_split=0.2,  # Use a portion of training data for validation
                           epochs=5000,            # Adjust epochs as needed
                           batch_size=256,        # Mini-batch size
                           callbacks = [early_stopping, lr_scheduler],
                           verbose=1)            # Set verbose to 1 to see the training process
    test_loss, test_accuracy = model_nn.evaluate(data_test_processed, target_test)
    accuracies.append(test_accuracy)
accuracies, indices

  games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - binary_accuracy: 0.5274 - loss: 4.4111 - val_binary_accuracy: 0.6919 - val_loss: 3.7519 - learning_rate: 0.0010
Epoch 2/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - binary_accuracy: 0.6398 - loss: 3.6070 - val_binary_accuracy: 0.6795 - val_loss: 3.0843 - learning_rate: 0.0010
Epoch 3/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6632 - loss: 2.9749 - val_binary_accuracy: 0.6809 - val_loss: 2.5420 - learning_rate: 0.0010
Epoch 4/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6635 - loss: 2.4554 - val_binary_accuracy: 0.6795 - val_loss: 2.1038 - learning_rate: 0.0010
Epoch 5/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6594 - loss: 2.0417 - val_binary_accuracy: 0.6823 - val_loss: 1.7557 - learning_rate: 0.001

([0.6626273989677429, 0.6405948996543884, 0.6524373292922974],
 [[array([   0,    1,    2, ..., 3629, 3630, 3631]),
   array([3632, 3633, 3634, ..., 7260, 7261, 7262])],
  [array([   0,    1,    2, ..., 7260, 7261, 7262]),
   array([ 7263,  7264,  7265, ..., 10891, 10892, 10893])],
  [array([    0,     1,     2, ..., 10891, 10892, 10893]),
   array([10894, 10895, 10896, ..., 14522, 14523, 14524])]])

In [24]:
# Now I want to apply PCA!

import sklearn as skl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from pathlib import Path
import datetime

#load the data
base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games = games.drop(columns = ['matchup_home'])
games['game_date'] = pd.to_datetime(games['game_date'])
games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
#drop a column to prevent data leakage
games = games.drop(columns = ['plus_minus_home'])
#add the difference in Elo ratings as a column, drop the individual Elo ratings
games['elo_diff'] = games['elo_home'] - games['elo_away']
games = games.drop(columns = ['elo_home', 'elo_away'])
#add the gamescore-offrtg-defrtg rating difference as a column, drop the individual team ratings
games['rating_diff'] = games['rating_home'] - games['rating_away']
games = games.drop(columns = ['rating_home', 'rating_away'])
#drop game_id, it's just an index column
games = games.drop(columns = ['game_id'])
#add the rating *difference* between the nth best players, but don't drop the individual ratings
for i in list(range(1,6)):
    games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])
#add home net rating instead of having both offrtg and defrtg
games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
games = games.drop(columns = ['ortg_home', 'drtg_home'])
#add away net rating...
games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
games = games.drop(columns = ['ortg_away', 'drtg_away'])
#add home net rating and away net rating from last season
games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
#add games played home/away (games played so far this season)
games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
games['GP_away'] = games['record_away_wins'] + games['record_away_losses']
games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
#add difference in win-loss percentage, remove individual win-loss percentages
games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
games = games.drop(columns = ['home_wl%', 'away_wl%'])
#add net rating difference, but keep individual home/away net ratings - apparently the model works better with them kept in
games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
#games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
#add netrtg difference from last season, drop individual net rtgs
games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']
games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])
#in 2013-14, the charlotte hornets were called the bobcats, and (maybe) the new orleans pelicans were called the hornets
games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)
#split into training and testing set
# games_train = games[(games['game_date'] <= datetime.datetime(2023,9,30))]
# target_train = games_train['wl_home']
# data_train = games_train.drop(columns = ['wl_home'])
# games_test = games[games['game_date'] >= datetime.datetime(2023,10,1)]
# target_test = games_test['wl_home']
# data_test = games_test.drop(columns = ['wl_home'])

#preprocessing - turning dates into days since first game in table and one-hot encoding team names
games['game_date'] = (games['game_date'] - games['game_date'].min()) / pd.Timedelta(days=1)
wl_home = games['wl_home']
games = games.drop(columns = ['wl_home'])
categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
numerical_columns = [col for col in games.columns if col not in categorical_columns]
preprocessor = ColumnTransformer(
    transformers=[
        ('team_names', OneHotEncoder(), categorical_columns),  # Apply OneHotEncoder to team columns
        ('standard_scale_num_cols', StandardScaler(), numerical_columns) # Scale numerical columns for PCA
    ],
)


#fit and test the model - the n_estimators was tuned - other parameters gave the best results with the default values
accuracies_rf = []
accuracies_xgb = []
indices = []
classifier = RandomForestClassifier(n_estimators = 400, n_jobs = 2, random_state = 0)
pca = PCA(n_components=30)  # Reduce to 30 dimensions
model = make_pipeline(preprocessor, pca, classifier)
preprocessor_xgb = make_pipeline(preprocessor, pca)
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(games):
    indices.append([train_index, test_index])
    data_train = games.iloc[train_index]
    data_test = games.iloc[test_index]
    target_train = wl_home[train_index]
    target_test = wl_home[test_index]
    model.fit(data_train, target_train)
    accuracies_rf.append(model.score(data_test, target_test))
    #now use xgboost on the same data...
    data_train_processed = preprocessor_xgb.fit_transform(data_train)
    data_test_processed = preprocessor_xgb.transform(data_test)
    model_xgb = XGBClassifier(random_state=0, learning_rate=0.01, n_estimators=200)
    model_xgb.fit(data_train_processed, target_train)
    accuracies_xgb.append(model_xgb.score(data_test_processed, target_test))
print("Indices:" + str(indices))
print("With RF: " + str(accuracies_rf))
print("With XGB: " + str(accuracies_xgb))

  games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})


Indices:[[array([   0,    1,    2, ..., 3629, 3630, 3631]), array([3632, 3633, 3634, ..., 7260, 7261, 7262])], [array([   0,    1,    2, ..., 7260, 7261, 7262]), array([ 7263,  7264,  7265, ..., 10891, 10892, 10893])], [array([    0,     1,     2, ..., 10891, 10892, 10893]), array([10894, 10895, 10896, ..., 14522, 14523, 14524])]]
With RF: [0.6582208757917929, 0.621041035527403, 0.6505095015147342]
With XGB: [0.6527127513081795, 0.6240705039933903, 0.646653814376205]


In [28]:
#Finally, we add PCA to the neural network.

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.decomposition import PCA
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

#load the data, use the maximum number of columns, so don't drop anything
base_dir = Path.cwd()
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games = games.drop(columns = ['matchup_home'])
games['game_date'] = pd.to_datetime(games['game_date'])
games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
#drop a column to prevent data leakage
games = games.drop(columns = ['plus_minus_home'])
games['elo_diff'] = games['elo_home'] - games['elo_away']
#games = games.drop(columns = ['elo_home', 'elo_away'])
games['rating_diff'] = games['rating_home'] - games['rating_away']
#games = games.drop(columns = ['rating_home', 'rating_away'])
games = games.drop(columns = ['game_id'])
for i in list(range(1,6)):
    games['rating_player_diff' + str(i)] = games['rating_home_player' + str(i)] - games['rating_away_player' + str(i)]
    #games = games.drop(columns = ['rating_home_player' + str(i), 'rating_away_player' + str(i)])
games['home_netrtg'] = games['ortg_home'] - games['drtg_home']
#games = games.drop(columns = ['ortg_home', 'drtg_home'])
games['away_netrtg'] = games['ortg_away'] - games['drtg_away']
#games = games.drop(columns = ['ortg_away', 'drtg_away'])
games['home_netrtg_last_season'] = games['home_ortg_last_season'] - games['home_drtg_last_season']
games['away_netrtg_last_season'] = games['away_ortg_last_season'] - games['away_drtg_last_season']
#games = games.drop(columns = ['home_ortg_last_season', 'home_drtg_last_season', 'away_ortg_last_season', 'away_drtg_last_season'])
games['GP_home'] = games['record_home_wins'] + games['record_home_losses']
games['GP_away'] = games['record_away_wins'] + games['record_away_losses']
#games = games.drop(columns = ['record_home_wins', 'record_home_losses', 'record_away_wins', 'record_away_losses'])
games['wl%_diff'] = games['home_wl%'] - games['away_wl%']
#games = games.drop(columns = ['home_wl%', 'away_wl%'])
games['netrg_diff'] = games['home_netrtg'] - games['away_netrtg']
#games = games.drop(columns = ['home_netrtg', 'away_netrtg'])
games['netrtg_diff_last_season'] = games['home_netrtg_last_season'] - games['away_netrtg_last_season']
#games = games.drop(columns = ['home_netrtg_last_season', 'away_netrtg_last_season'])
games.replace('Charlotte Bobcats', 'Charlotte Hornets', inplace=True)
games.replace('New Orleans Hornets', 'New Orleans Pelicans', inplace=True)


#preprocessing - turning dates into days since first game in table and one-hot encoding team names
#now we also rescale the data since this is necessary for neural networks!
games['game_date'] = (games['game_date'] - games['game_date'].min()) / pd.Timedelta(days=1)
wl_home = games['wl_home']
games = games.drop(columns = ['wl_home'])
categorical_columns = ['team_name_home', 'team_name_away', 'season_type']
numerical_columns = [col for col in data_train.columns if col not in categorical_columns]
preprocessor = ColumnTransformer(
    transformers=[
        ('team_names', OneHotEncoder(), categorical_columns),  # Apply OneHotEncoder to team columns
        ('numerical_for_rescaling', StandardScaler(), numerical_columns),
    ],
)
pca = PCA(n_components=30)  # Reduce to 30 dimensions
preprocessor_pca = make_pipeline(preprocessor, pca)

#add early stopping and learning rate scheduler to reduce overfitting!
early_stopping = EarlyStopping(
    min_delta = 0.001,
    patience = 20,
    restore_best_weights=True
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
)

#define and compile the neural network
model_nn = keras.Sequential([
    layers.Dense(256, activation='relu',input_shape=[30], kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    layers.Dense(1,activation='sigmoid')
])
model_nn.compile(
    optimizer = 'adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

tscv = TimeSeriesSplit(n_splits=3)
indices = []
accuracies = []
for train_index, test_index in tscv.split(games):
    indices.append([train_index, test_index])
    data_train = games.iloc[train_index]
    target_train = wl_home[train_index]
    data_test = games.iloc[test_index]
    target_test = wl_home[test_index]
    data_train_processed = preprocessor_pca.fit_transform(data_train)
    data_test_processed = preprocessor_pca.transform(data_test)
    #finally, train and test the neural network
    history = model_nn.fit(data_train_processed, target_train,
                           validation_split=0.2,  # Use a portion of training data for validation
                           epochs=5000,            # Adjust epochs as needed
                           batch_size=256,        # Mini-batch size
                           callbacks = [early_stopping, lr_scheduler],
                           verbose=1)            # Set verbose to 1 to see the training process
    test_loss, test_accuracy = model_nn.evaluate(data_test_processed, target_test)
    accuracies.append(test_accuracy)
accuracies, indices

Epoch 1/5000


  games['wl_home'] = games['wl_home'].replace({'W': 1, 'L': 0})
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - binary_accuracy: 0.5834 - loss: 3.6660 - val_binary_accuracy: 0.6919 - val_loss: 3.1907 - learning_rate: 0.0010
Epoch 2/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6424 - loss: 3.1133 - val_binary_accuracy: 0.6905 - val_loss: 2.7410 - learning_rate: 0.0010
Epoch 3/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - binary_accuracy: 0.6634 - loss: 2.6722 - val_binary_accuracy: 0.6864 - val_loss: 2.3524 - learning_rate: 0.0010
Epoch 4/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - binary_accuracy: 0.6562 - loss: 2.2936 - val_binary_accuracy: 0.6809 - val_loss: 2.0206 - learning_rate: 0.0010
Epoch 5/5000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.6590 - loss: 1.9634 - val_binary_accuracy: 0.6699 - val_loss: 1.7418 - learning_rate: 0.0010
Epoch 6/5000

([0.6681355237960815, 0.6414210796356201, 0.6513357162475586],
 [[array([   0,    1,    2, ..., 3629, 3630, 3631]),
   array([3632, 3633, 3634, ..., 7260, 7261, 7262])],
  [array([   0,    1,    2, ..., 7260, 7261, 7262]),
   array([ 7263,  7264,  7265, ..., 10891, 10892, 10893])],
  [array([    0,     1,     2, ..., 10891, 10892, 10893]),
   array([10894, 10895, 10896, ..., 14522, 14523, 14524])]])

In [9]:
games = pd.read_csv(base_dir / "games_almost_ready_for_training.csv")
games['game_date'] = pd.to_datetime(games['game_date'])
games_train = games[games['game_date'] <= datetime.datetime(2023,9,30)]
games_test = games[games['game_date'] >= datetime.datetime(2023,10,1)]
n_of_games_train = len(games_train)
correct_prediction_list = []
hca_values = list(range(0,11))
for hca in hca_values:
    correct_predictions = 0
    for idx, row in games_train.iterrows():
        if ((row['rating_home'] + hca >= row['rating_away']) and (row['wl_home'] == 'W')) or ((row['rating_home'] + hca < row['rating_away']) and (row['wl_home'] == 'L')):
            correct_predictions += 1
    correct_prediction_list.append(correct_predictions)
accuracies = []
for pred in correct_prediction_list:
    accuracies.append(float(pred)/float(n_of_games_train))
n_of_games_train, correct_prediction_list, accuracies
#Conclusion : hca = 5.

(12817,
 [8296, 8341, 8389, 8396, 8421, 8432, 8419, 8409, 8392, 8398, 8392],
 [0.6472653507060935,
  0.6507763127096825,
  0.654521338846844,
  0.6550674884918468,
  0.657018022938285,
  0.657876258094718,
  0.65686198018257,
  0.6560817664039947,
  0.6547554029804167,
  0.6552235312475618,
  0.6547554029804167])

In [13]:
#Time to test:
#Interestingly, hca = 3 works better :) I think it's because home-court advantage is starting to matter less and less, and the change is noticeable in the last 10 years
games_test = games[(games['game_date'] >= datetime.datetime(2023,10,1)) & ((games['game_date'] <= datetime.datetime(2024,10,1)))]
n_of_games_test = len(games_test)
correct_predictions = 0
for idx, row in games_test.iterrows():
    if ((row['rating_home'] + 3 >= row['rating_away']) and (row['wl_home'] == 'W')) or ((row['rating_home'] + 3 < row['rating_away']) and (row['wl_home'] == 'L')):
        correct_predictions += 1
accuracy = float(correct_predictions)/float(n_of_games_test)
n_of_games_test, correct_predictions, accuracy

(1319, 900, 0.6823351023502654)