In [1]:
import numpy as np
import pandas as pd
import tensorflow
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

The first think being done is this notebook is collecting the feature data and label data for my model which in this case is 2-point field goals made 2P, 3-point field goals made 3P, free throws made FT and the points scored PTS. For this model I will be applying weights to each of my features to proportionately represent each goal type's contribution to the total points, since each type of goal is worth a different amount of points. 

Basketball-Reference doesn't have a specific 2-point field goals made column for each gamelog, so to find this I can find the difference between total field goals FG and 3-point field goals 3P, where the left over numbers are 2-point field goals 2P.

In [2]:
# The names of each player I've collected data for and which seasons I've collected for each player in order.

name = ['James Harden', 'Anthony Davis', 'LeBron James', 'Giannis Antetokounmpo', 'Kevin Durant', 'Russell Westbrook', 'Victor Oladipo', 'Paul George', 'Joel Embiid', 'Devin Booker', 'Bradley Beal', 'Trae Young', 'Luka Doncic']
season = [['2018','2019','2020'],['2018','2020'],['2018'],['2018','2019','2020'],['2018','2019'],['2018'],['2018'],['2019'],['2019'],['2019'],['2019'],['2020'],['2020']]

# First I'm going to load all of the feature data into a single dataframe.

feature_training = pd.DataFrame()
feature_testing = pd.DataFrame()
label_training = pd.DataFrame()
label_testing = pd.DataFrame()



for i in range(len(name)):
    for j in range(len(season[i])):

        # Downloading the file data from the appropriate file path.
        # Loading a player's feature and label data for one season.

        feature_data = pd.read_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\{}'.format(name[i],season[i][j]),index_col=0)
        label_data = pd.read_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\Points\{}'.format(name[i],season[i][j]),index_col=0)

        feature_data['2P'] = feature_data['FG'] - feature_data['3P'] # finding the 2P fields goals for each row.

        # Splitting the data for each csv into train/test data

        feature_train, feature_test, label_train, label_test = train_test_split(feature_data,label_data,test_size=0.2,random_state=5) # using the same random state will ensure the same indices are used for the train/test split on each set of data.

        # After splitting the data I will load it into different train and test dataframes for both the features and label data.
        
        feature_training = pd.concat([feature_training,feature_train])
        feature_testing = pd.concat([feature_testing,feature_test])
        label_training = pd.concat([label_training,label_train])
        label_testing = pd.concat([label_testing,label_test])

In [3]:
# Only selecting the most significant features for our model.

features = ['2P','3P','FT']
feature_training = feature_training[features]
feature_testing = feature_testing[features]

feature_training.reset_index(drop='True', inplace=True)
label_training.reset_index(drop='True', inplace=True)
feature_testing.reset_index(drop='True', inplace=True)
label_testing.reset_index(drop='True', inplace=True)

In [4]:
# Checking for nan values (which our network cannot process)
print(feature_training.isnull().values.any())
print(label_training.isnull().values.any())
print(feature_testing.isnull().values.any())
print(label_testing.isnull().values.any())

True
True
False
False


So we have NaN values in all of our training data which the network won't be able to process.

In [5]:
null_f = [] # list to contain the indices of rows with nan values in the feature training data
for i in range(len(feature_training)):
    if str(feature_training.iloc[i].isnull().values.any()) == 'True':
        null_f.append(i)

null_l = [] # list to contain the indices of rows with nan values in the label training data
for i in range(len(label_training)):
    if str(label_training.iloc[i].isnull().values.any()) == 'True':
        null_l.append(i)

So the rows that contain null values are the same in both the feature and label training data so we can now remove them.

In [6]:
feature_training = feature_training.drop(null_f,axis=0)
label_training = label_training.drop(null_l,axis=0)

Keras also can't read Pandas Dataframes so, now I'll convert my dataframe into Numpy arrays.

In [7]:
feature_training = np.array(feature_training)
label_training = np.array(label_training)
feature_testing = np.array(feature_testing)
label_testing = np.array(label_testing)

Now to create the neural network I'll be using to generate the model.

In [35]:
tensorflow.keras.backend.clear_session()
# Creating a fully connected neural network with 3 inputs and 4 hidden layers

def wide_model():
    model = Sequential()
    model.add(Dense(256,input_dim=feature_training.shape[1],activation='relu'))
    model.add(Dense(256,activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error']) # using mean squared error as my loss function and RMSprop as my optimizer

    return model

def deep_model():
    model = Sequential()
    model.add(Dense(16,input_dim=feature_training.shape[1],activation='relu'))
    model.add(Dense(16,activation='relu'))
    model.add(Dense(32,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(25,activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error']) # using mean squared error as my loss function and RMSprop as my optimizer

    return model

def deep_wide_model():
    model = Sequential()
    model.add(Dense(32,input_dim=feature_training.shape[1],activation='relu'))
    model.add(Dense(32,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(128,activation='relu'))
    model.add(Dense(256,activation='relu'))
    model.add(Dense(87,activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error']) # using mean squared error as my loss function and RMSprop as my optimizer

    return model

In [37]:
scaler = StandardScaler() # scaler to normalise the dataset

wide = [] 
wide.append(('standardize', scaler))
wide.append(('mlp', KerasRegressor(build_fn=wide_model, epochs=50, batch_size=5, verbose=0))) # putting the pipeline process which will standardise the data and create the model
pipeline = Pipeline(wide)

stratkf = StratifiedKFold(n_splits=5,shuffle=True) # using 5 splits since I only have 1077 training samples and 279 testing samples
results = cross_val_score(pipeline, feature_training, label_training, cv=stratkf, scoring='neg_mean_squared_error') # finding the mean squared error for each split of the data
print('The MSE for training with a wide neural network is',results.mean()) # the average mean squared error for each valuation model

predictions = cross_val_predict(pipeline, feature_testing, label_testing) # generating predictions
accuracy = accuracy_score(np.round(label_testing).flatten(), np.round(predictions)) # accuracy of predictions
print('The accuracy for predictions with a wide neural network is', accuracy)

The MSE for training with a wide neural network is -0.019620121505194973
The accuracy for predictions with a wide neural network is 0.978494623655914


In [38]:
deep = [] 
deep.append(('standardize', scaler))
deep.append(('mlp', KerasRegressor(build_fn=deep_model, epochs=50, batch_size=5, verbose=0))) # putting the pipeline process which will standardise the data and create the model
pipeline = Pipeline(deep)

stratkf = StratifiedKFold(n_splits=5,shuffle=True)
results = cross_val_score(pipeline, feature_training, label_training, cv=stratkf, scoring='neg_mean_squared_error') # finding the mean squared error for each split of the data
print('The MSE for training with a deep neural network is',results.mean()) # the average mean squared error for each valuation model

predictions = cross_val_predict(pipeline, feature_testing, label_testing) # generating predictions
accuracy = accuracy_score(np.round(label_testing).flatten(), np.round(predictions)) # accuracy of predictions
print('The accuracy for predictions with a deep neural network is', accuracy)

The MSE for training with a deep neural network is -1.2359708570091965
The accuracy for predictions with a deep neural network is 0.5663082437275986


In [39]:
deep_wide = [] 
deep_wide.append(('standardize', scaler))
deep_wide.append(('mlp', KerasRegressor(build_fn=deep_wide_model, epochs=50, batch_size=5, verbose=0))) # putting the pipeline process which will standardise the data and create the model
pipeline = Pipeline(deep_wide)

stratkf = StratifiedKFold(n_splits=5,shuffle=True)
results = cross_val_score(pipeline, feature_training, label_training, cv=stratkf, scoring='neg_mean_squared_error') # finding the mean squared error for each split of the data
print('The MSE for training with a neural network thats both deep and widening',results.mean()) # the average mean squared error for each valuation model

predictions = cross_val_predict(pipeline, feature_testing, label_testing) # generating predictions
accuracy = accuracy_score(np.round(label_testing).flatten(), np.round(predictions)) # accuracy of predictions
print('The accuracy for predictions with a neural network thats both deep and widening', accuracy)

The MSE for training with a neural network thats both deep and widening -0.1056580626963582
The accuracy for predictions with a neural network thats both deep and widening 0.6953405017921147


For the above evaluations I created three neural networks; a wide network, a deep network and a network that increases the number of neurons in each layer. I tried to keep the number of parameters in each network roughly the same, so that the number of weights and connections were kept approximately constant, while changing the architecture of the network.
Surprisingly I've found that the wide network is the best network