# Imports

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Load the data

In [2]:
# Read the raw player table; the first CSV column is used as the row index.
raw_csv_path = '../data/nba_data.csv'
nba_data = pd.read_csv(raw_csv_path, index_col=0)

# Preview the first ten rows.
nba_data.head(n=10)

Unnamed: 0,NAME,SEASON,PTS,REB,AST,BLK,STL,G,FG%,3P%,FT%,TS%,USG%,PER,WS,BPM,VORP,SALARY
0,Patrick Patterson,2017-2018,318.0,193.0,57.0,23.0,48.0,82,0.398,0.386,0.87,0.555,10.8,10.0,2.6,-1.3,0.2,4325064.0
1,E'Twaun Moore,2017-2018,1022.0,238.0,187.0,12.0,79.0,82,0.508,0.425,0.706,0.593,16.1,12.1,4.4,-0.2,1.2,8680000.0
2,Bismack Biyombo,2017-2018,468.0,468.0,66.0,95.0,21.0,82,0.52,0.0,0.65,0.556,14.9,14.1,2.9,-1.6,0.2,17000000.0
3,Khris Middleton,2017-2018,1652.0,429.0,328.0,21.0,119.0,82,0.466,0.359,0.884,0.577,24.6,17.4,6.9,0.4,1.8,17520690.0
4,Cory Joseph,2017-2018,649.0,263.0,260.0,18.0,80.0,82,0.424,0.353,0.745,0.503,14.8,11.0,3.1,-1.2,0.5,7472500.0
5,Jordan Clarkson,2017-2018,1124.0,217.0,221.0,6.0,58.0,81,0.451,0.352,0.8,0.542,26.5,16.4,3.0,-1.9,0.1,12500000.0
6,Thaddeus Young,2017-2018,955.0,512.0,152.0,36.0,135.0,81,0.487,0.32,0.598,0.528,17.3,14.8,5.5,1.5,2.3,12500000.0
7,Dwight Howard,2017-2018,1347.0,1012.0,105.0,131.0,48.0,81,0.555,0.143,0.574,0.577,24.2,20.5,6.8,-0.3,1.1,23752240.0
8,Jeremy Lamb,2017-2018,1033.0,324.0,186.0,32.0,61.0,80,0.457,0.37,0.861,0.559,22.4,17.0,4.9,0.9,1.5,7000000.0
9,Bojan Bogdanovic,2017-2018,1141.0,270.0,119.0,8.0,55.0,80,0.474,0.402,0.868,0.605,19.0,13.9,5.4,-0.7,0.8,10500000.0


# Train/Test split

In [3]:
# Separate the target column (SALARY) from the remaining columns, then hold
# out 30% of the rows for testing. The fixed random_state keeps the split
# reproducible across runs.
features = nba_data.drop(columns='SALARY')
labels = nba_data.SALARY
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=77,
)

In [4]:
# Show the shape of each of the four splits as a single tuple.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((110, 17), (48, 17), (110,), (48,))

Here I split the data into two groups, training data and testing data, and split each of those into two more groups: features and labels. The features are the player statistics that will be used to predict the label, which in this case is the salary the player earned in the year those stats were recorded. The training data will be used to train the models, while the testing data will be used to test them. I also checked the shapes of the four groups to ensure they are correct.

In [5]:
# Identifier columns that are kept aside rather than used as model features.
name_and_season_list = ['NAME', 'SEASON']

# Keep the identifier columns for each split so rows can still be traced back
# to a player/season after they are removed from the feature frames.
name_and_season_train = X_train[name_and_season_list]
name_and_season_test = X_test[name_and_season_list]

# Rebind to new frames instead of dropping in place: the frames returned by
# train_test_split are selections of nba_data, and mutating them in place is a
# discouraged pandas idiom (it can emit SettingWithCopyWarning on some versions).
X_train = X_train.drop(columns=name_and_season_list)
X_test = X_test.drop(columns=name_and_season_list)

# Display the resulting shapes of the feature frames.
X_train.shape, X_test.shape

((110, 15), (48, 15))

Here I saved the 'NAME' and 'SEASON' columns from the train and test data. Then I removed those columns from the train and test data and once again checked the shape of these two groups.

In [6]:
# Display the dtype of every column left in the training features.
X_train.dtypes

PTS     float64
REB     float64
AST     float64
BLK     float64
STL     float64
G         int64
FG%     float64
3P%     float64
FT%     float64
TS%     float64
USG%    float64
PER     float64
WS      float64
BPM     float64
VORP    float64
dtype: object

In [7]:
# Display the dtype of every column left in the testing features.
X_test.dtypes

PTS     float64
REB     float64
AST     float64
BLK     float64
STL     float64
G         int64
FG%     float64
3P%     float64
FT%     float64
TS%     float64
USG%    float64
PER     float64
WS      float64
BPM     float64
VORP    float64
dtype: object

This confirms that all the features in the train and test groups are numeric.

# Save the data

In [8]:
# Write the four splits to the data directory. The paths are built with
# os.path.join rather than hard-coded backslashes so the files land in
# ../data on every platform (on POSIX a literal '..\data\X_train.csv' would
# be treated as a single file name in the working directory).
data_dir = os.path.join('..', 'data')
X_train.to_csv(os.path.join(data_dir, 'X_train.csv'))
y_train.to_csv(os.path.join(data_dir, 'y_train.csv'))
X_test.to_csv(os.path.join(data_dir, 'X_test.csv'))
y_test.to_csv(os.path.join(data_dir, 'y_test.csv'))