In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
# The names of each player I've collected data for and which seasons I've collected for each player in order.

name = ['James Harden', 'Anthony Davis', 'LeBron James', 'Giannis Antetokounmpo', 'Kevin Durant', 'Russell Westbrook', 'Victor Oladipo', 'Paul George', 'Joel Embiid', 'Devin Booker', 'Bradley Beal', 'Trae Young', 'Luka Doncic']
season = [['2018','2019','2020'],['2018','2020'],['2018'],['2018','2019','2020'],['2018','2019'],['2018'],['2018'],['2019'],['2019'],['2019'],['2019'],['2020'],['2020']]

The below cell downloads the data I've collected from my files, and takes a split of the data before composing the training and testing data in order to get a reasonable distribution of data for different player's and their style of play in both the training and testing data.
This is done to avoid overfitting of the training data, if for example most of the training data contains information from guards who tend to take and make more 3-point field goals compared to forwards or centers who take more 2-point field goals and free throws.

In [3]:
# First I'm going to load all of the feature data into a single dataframe.

feature_training = pd.DataFrame()
feature_testing = pd.DataFrame()
label_training = pd.DataFrame()
label_testing = pd.DataFrame()



for i in range(len(name)):
    for j in range(len(season[i])):

        # Downloading the file data from the appropriate file path.
        # Loading a player's feature and label data for one season.

        feature_data = pd.read_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\{}'.format(name[i],season[i][j]),index_col=0)
        label_data = pd.read_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\Points\{}'.format(name[i],season[i][j]),index_col=0)

        # Splitting the data for each csv into train/test data

        feature_train, feature_test, label_train, label_test = train_test_split(feature_data,label_data,test_size=0.2,random_state=5) # using the same random state will ensure the same indices are used for the train/test split on each set of data.

        # After splitting the data I will load it into different train and test dataframes for both the features and label data.
        
        feature_training = pd.concat([feature_training,feature_train])
        feature_testing = pd.concat([feature_testing,feature_test])
        label_training = pd.concat([label_training,label_train])
        label_testing = pd.concat([label_testing,label_test])

feature_training.reset_index(drop='True', inplace=True)
label_training.reset_index(drop='True', inplace=True)
feature_testing.reset_index(drop='True', inplace=True)
label_testing.reset_index(drop='True', inplace=True)

In [4]:
# Finding and removing NaN values from the datasets

print(feature_training.isnull().values.any())
print(label_training.isnull().values.any())
print(feature_testing.isnull().values.any())
print(label_testing.isnull().values.any())

null_f = [] # list to contain the indices of rows with nan values in the feature training data
for i in range(len(feature_training)):
    if str(feature_training.iloc[i].isnull().values.any()) == 'True':
        null_f.append(i)

null_l = [] # list to contain the indices of rows with nan values in the label training data
for i in range(len(label_training)):
    if str(label_training.iloc[i].isnull().values.any()) == 'True':
        null_l.append(i)

feature_training = feature_training.drop(null_f,axis=0)
label_training = label_training.drop(null_l,axis=0)

print(feature_training.isnull().values.any())
print(label_training.isnull().values.any())
print(feature_testing.isnull().values.any())
print(label_testing.isnull().values.any())

True
True
False
False
False
False
False
False


In [5]:
# Creating the ordinary least squares model and generating predictions from this.

stratkf = StratifiedKFold(n_splits=10, shuffle=True)
scaler = StandardScaler()
model = LinearRegression()

# Creating a pipeline so that data is normalised before predictions
estimators = []
estimators.append(('normalise', scaler))
estimators.append(('linear model', model))
pipeline  = Pipeline(estimators)

# Training and making predictions from the model

mean_squared_errors = cross_val_score(pipeline, feature_training, label_training, cv=stratkf, scoring='neg_mean_squared_error')
predictions = cross_val_predict(pipeline, feature_testing, label_testing).flatten() # generating predictions
accuracy = accuracy_score(np.round(np.array(label_testing)).flatten(), np.round(predictions))
mse = np.mean(mean_squared_errors) # average MSE from each split of the data
print('MSE:', mse)
print('Accuracy:', accuracy)

MSE: -4.8901894369184547e-29
Accuracy: 1.0


Already from this we have a 100% accurate model with almost no mean squared error (because predicted values are rounded). The model might be more efficient if we use less features, and only consider features actually contributing to the points scored by a player; specifically FG, 3P, and FT.

In [6]:
# Only selecting the most significant features for our model.

features = ['FG','3P','FT']
feature_training = feature_training[features]
feature_testing = feature_testing[features]

stratkf = StratifiedKFold(n_splits=10, shuffle=True)
model = LinearRegression()
mean_squared_errors = cross_val_score(model, feature_training, label_training, cv=stratkf, scoring='neg_mean_squared_error')
predictions = cross_val_predict(model, feature_testing, label_testing).flatten() # generating predictions
accuracy = accuracy_score(np.round(np.array(label_testing)).flatten(), np.round(predictions))
mse = np.mean(mean_squared_errors) # average MSE from each split of the data
print('MSE:', mse)
print('Accuracy:', accuracy)

MSE: -2.2712133847180432e-29
Accuracy: 1.0
