In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# The names of each player I've collected data for and which seasons I've collected for each player in order.

name = ['James Harden', 'Anthony Davis', 'LeBron James', 'Giannis Antetokounmpo', 'Kevin Durant', 'Russell Westbrook', 'Victor Oladipo', 'Paul George', 'Joel Embiid', 'Devin Booker', 'Bradley Beal', 'Trae Young', 'Luka Doncic']
season = [['2018','2019','2020'],['2018','2020'],['2018'],['2018','2019','2020'],['2018','2019'],['2018'],['2018'],['2019'],['2019'],['2019'],['2019'],['2020'],['2020']]

In [3]:
# First I'm going to load all of the feature data into a single dataframe.

feature_training = pd.DataFrame()
feature_testing = pd.DataFrame()
label_training = pd.DataFrame()
label_testing = pd.DataFrame()



for i in range(len(name)):
    for j in range(len(season[i])):

        # Downloading the file data from the appropriate file path.
        # Loading a player's feature and label data for one season.

        feature_data = pd.read_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\{}'.format(name[i],season[i][j]),index_col=0)
        label_data = pd.read_csv(r'C:\Users\frank\OneDrive\Documents\DS\ML Basketball Data\Player Data\{}\Points\{}'.format(name[i],season[i][j]),index_col=0)

        # Splitting the data for each csv into train/test data

        feature_train, feature_test, label_train, label_test = train_test_split(feature_data,label_data,test_size=0.2,random_state=5) # using the same random state will ensure the same indices are used for the train/test split on each set of data.

        # After splitting the data I will load it into different train and test dataframes for both the features and label data.
        
        feature_training = pd.concat([feature_training,feature_train])
        feature_testing = pd.concat([feature_testing,feature_test])
        label_training = pd.concat([label_training,label_train])
        label_testing = pd.concat([label_testing,label_test])

In [4]:
from sklearn.preprocessing import StandardScaler

# Scaling the data before using it to fit the model.
# It's important to fit the scaler only to a single data set (feature training, feature testing etc.) as this will exclude information from the test sets.

scaler = StandardScaler()
f_train_scaled = pd.DataFrame(scaler.fit_transform(feature_training),columns=feature_training.columns.values.tolist())
f_test_scaled = pd.DataFrame(scaler.fit_transform(feature_testing),columns=feature_testing.columns.values.tolist())
l_train_scaled = pd.DataFrame(scaler.fit_transform(label_training),columns=label_training.columns.values.tolist())
l_test_scaled = pd.DataFrame(scaler.fit_transform(label_testing),columns=label_testing.columns.values.tolist())

In [5]:
# Creating the ordinary least squares model and generating predictions from this.

model = sm.OLS(l_train_scaled,f_train_scaled,missing='drop').fit()
predictions = model.predict(f_test_scaled)
preds = scaler.inverse_transform(predictions)
comparison = pd.DataFrame()
comparison['Actual Points'] = np.array(label_testing).flatten()
comparison['Predicted Points'] = np.array(preds)
comparison['Difference'] = comparison['Actual Points'] - comparison['Predicted Points']
comparison

Unnamed: 0,Actual Points,Predicted Points,Difference
0,25.0,24.951432,0.048568
1,48.0,48.855793,-0.855793
2,24.0,23.706731,0.293269
3,56.0,57.252383,-1.252383
4,41.0,41.266647,-0.266647
...,...,...,...
274,42.0,42.368958,-0.368958
275,35.0,35.242513,-0.242513
276,29.0,29.081077,-0.081077
277,27.0,27.161144,-0.161144


In [6]:
model.summary()

0,1,2,3
Dep. Variable:,PTS,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,1.6419999999999998e+32
Date:,"Sat, 17 Oct 2020",Prob (F-statistic):,0.0
Time:,20:05:21,Log-Likelihood:,35781.0
No. Observations:,1077,AIC:,-71550.0
Df Residuals:,1069,BIC:,-71510.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FG,0.7183,6.76e-17,1.06e+16,0.000,0.718,0.718
FGA,4.684e-17,6.62e-17,0.707,0.480,-8.31e-17,1.77e-16
3P,0.2190,5.56e-17,3.94e+15,0.000,0.219,0.219
3PA,7.685e-16,6.16e-17,12.467,0.000,6.48e-16,8.89e-16
FT,0.4472,9.61e-17,4.65e+15,0.000,0.447,0.447
FTA,2.914e-16,9.18e-17,3.176,0.002,1.11e-16,4.72e-16
USG%,1.735e-17,4.38e-17,0.396,0.692,-6.85e-17,1.03e-16
ORtg,-1.284e-16,4.95e-17,-2.592,0.010,-2.26e-16,-3.12e-17

0,1,2,3
Omnibus:,55.102,Durbin-Watson:,0.841
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62.328
Skew:,-0.579,Prob(JB):,2.92e-14
Kurtosis:,3.22,Cond. No.,9.08


From the above summary of the model, we can see that the the FGA, 3PA, FTA, USG%, and ORtg statistics have very little effect on the regression model.

In [7]:
# Only selecting the most significant features for our model.

features = ['FG','3P','FT']
feature_training = feature_training[features]
feature_testing = feature_testing[features]

# Scaling the data once again.

f_train_scaled = pd.DataFrame(scaler.fit_transform(feature_training),columns=feature_training.columns.values.tolist())
f_test_scaled = pd.DataFrame(scaler.fit_transform(feature_testing),columns=feature_testing.columns.values.tolist())
l_train_scaled = pd.DataFrame(scaler.fit_transform(label_training),columns=label_training.columns.values.tolist())
l_test_scaled = pd.DataFrame(scaler.fit_transform(label_testing),columns=label_testing.columns.values.tolist())

model = sm.OLS(l_train_scaled,f_train_scaled,missing='drop').fit()
predictions = model.predict(f_test_scaled)
preds = scaler.inverse_transform(predictions)
comparison = pd.DataFrame()
comparison['Actual Points'] = np.array(label_testing).flatten()
comparison['Predicted Points'] = np.array(preds)
comparison['Difference'] = comparison['Actual Points'] - comparison['Predicted Points']
comparison

Unnamed: 0,Actual Points,Predicted Points,Difference
0,25.0,24.951432,0.048568
1,48.0,48.855793,-0.855793
2,24.0,23.706731,0.293269
3,56.0,57.252383,-1.252383
4,41.0,41.266647,-0.266647
...,...,...,...
274,42.0,42.368958,-0.368958
275,35.0,35.242513,-0.242513
276,29.0,29.081077,-0.081077
277,27.0,27.161144,-0.161144


In [8]:
model.summary()

0,1,2,3
Dep. Variable:,PTS,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,1.336e+33
Date:,"Sat, 17 Oct 2020",Prob (F-statistic):,0.0
Time:,20:06:48,Log-Likelihood:,36379.0
No. Observations:,1077,AIC:,-72750.0
Df Residuals:,1074,BIC:,-72740.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FG,0.7183,1.69e-17,4.25e+16,0.000,0.718,0.718
3P,0.2190,1.69e-17,1.3e+16,0.000,0.219,0.219
FT,0.4472,1.61e-17,2.78e+16,0.000,0.447,0.447

0,1,2,3
Omnibus:,181.769,Durbin-Watson:,1.281
Prob(Omnibus):,0.0,Jarque-Bera (JB):,329.119
Skew:,1.031,Prob(JB):,3.41e-72
Kurtosis:,4.757,Cond. No.,1.48
