In [1]:
# Settings the warnings to be ignored 

import warnings 
warnings.filterwarnings('ignore') 

In [2]:
# Import 'pandas' for working with data frames
import pandas as pd

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score


In [3]:
data = pd.read_csv("nba_final.csv")

In [4]:
data.dtypes

Rk              int64
Player.x       object
Player_ID      object
Pos1           object
Pos2           object
Age             int64
Tm             object
G               int64
GS              int64
MP            float64
FG            float64
FGA           float64
FG.           float64
X3P           float64
X3PA          float64
X3P.          float64
X2P           float64
X2PA          float64
X2P.          float64
eFG.          float64
FT            float64
FTA           float64
FT.           float64
ORB           float64
DRB           float64
TRB           float64
AST           float64
STL           float64
BLK           float64
TOV           float64
PF            float64
PTS           float64
Salary        float64
mean_views    float64
Season         object
Conference     object
Role           object
Fvot            int64
FRank           int64
Pvot          float64
PRank         float64
Mvot          float64
MRank         float64
Score         float64
Play           object
dtype: obj

In [5]:
data.isnull().sum()

Rk               0
Player.x         0
Player_ID        0
Pos1             0
Pos2          1396
Age              0
Tm               0
G                0
GS               0
MP               0
FG               0
FGA              0
FG.              4
X3P              0
X3PA             0
X3P.            99
X2P              0
X2PA             0
X2P.            15
eFG.             4
FT               0
FTA              0
FT.             47
ORB              0
DRB              0
TRB              0
AST              0
STL              0
BLK              0
TOV              0
PF               0
PTS              0
Salary          62
mean_views     138
Season           0
Conference       0
Role             0
Fvot             0
FRank            0
Pvot           159
PRank          159
Mvot           404
MRank          404
Score            0
Play             0
dtype: int64

In [6]:
# axis = 1 means, it will show number of missing values for each row,
data.isnull().sum(axis=1)

0       6
1       5
2       5
3       3
4       3
       ..
1403    1
1404    1
1405    4
1406    1
1407    1
Length: 1408, dtype: int64

In [7]:
data.shape

(1408, 45)

Note: There are two things we can do to solve missing value problem:

1. Remove the row or column in which they exists
2. We can fill it with most suitable valuessuch as mean, median, and mode and surrounding values of the missing value.

In [8]:
#The column 'Pos2' having more number(i.e., 1396) of null values, hence filling with 'None'
data['Pos2'] = data['Pos2'].replace(np.nan, 'None')

## Below is also perform same operation as above
#data["Pos1'].fillna('None')

Before filling the missing values for numerical column with mean or median, we need to check for outliers in the column. then it is better to fill it with median as opposed to mean value.

In [9]:
data['Salary']

0              NaN
1        2700000.0
2        4351320.0
3        2022240.0
4        7680965.0
           ...    
1403     3628920.0
1404    19500000.0
1405       77250.0
1406     2393887.0
1407     2615160.0
Name: Salary, Length: 1408, dtype: float64

In [10]:
#following will show the outliers, observe the columns pvot and mvot
data.describe

<bound method NDFrame.describe of        Rk         Player.x  Player_ID Pos1  Pos2  Age   Tm   G  GS    MP  ...  \
0     170     A.J. Hammons  hammoaj01    C  None   24  DAL  22   0   7.4  ...   
1      58     Aaron Brooks  brookaa01   PG  None   32  IND  65   0  13.8  ...   
2     157     Aaron Gordon  gordoaa01   SF  None   21  ORL  80  72  28.7  ...   
3     352    Adreian Payne  paynead01   PF  None   25  MIN  18   0   7.5  ...   
4      10  Al-Farouq Aminu  aminual01   PF  None   26  POR  61  25  29.1  ...   
...   ...              ...        ...  ...   ...  ...  ...  ..  ..   ...  ...   
1403  109     Zach Collins  colliza01    C  None   21  POR  77   0  17.6  ...   
1404  294      Zach LaVine  lavinza01   SG  None   23  CHI  63  62  34.5  ...   
1405  308      Zach Lofton  loftoza01   SG  None   26  DET   1   0   4.0  ...   
1406  393    Zaza Pachulia  pachuza01    C  None   34  DET  68   3  12.9  ...   
1407  457     Zhaire Smith  smithzh01   SG  None   19  PHI   6   2  18.5  .

In [11]:
data['Salary'] = data['Salary'].fillna(data['Salary'].median())

In [12]:
data.isnull().sum()

Rk              0
Player.x        0
Player_ID       0
Pos1            0
Pos2            0
Age             0
Tm              0
G               0
GS              0
MP              0
FG              0
FGA             0
FG.             4
X3P             0
X3PA            0
X3P.           99
X2P             0
X2PA            0
X2P.           15
eFG.            4
FT              0
FTA             0
FT.            47
ORB             0
DRB             0
TRB             0
AST             0
STL             0
BLK             0
TOV             0
PF              0
PTS             0
Salary          0
mean_views    138
Season          0
Conference      0
Role            0
Fvot            0
FRank           0
Pvot          159
PRank         159
Mvot          404
MRank         404
Score           0
Play            0
dtype: int64

In [13]:
data.describe

<bound method NDFrame.describe of        Rk         Player.x  Player_ID Pos1  Pos2  Age   Tm   G  GS    MP  ...  \
0     170     A.J. Hammons  hammoaj01    C  None   24  DAL  22   0   7.4  ...   
1      58     Aaron Brooks  brookaa01   PG  None   32  IND  65   0  13.8  ...   
2     157     Aaron Gordon  gordoaa01   SF  None   21  ORL  80  72  28.7  ...   
3     352    Adreian Payne  paynead01   PF  None   25  MIN  18   0   7.5  ...   
4      10  Al-Farouq Aminu  aminual01   PF  None   26  POR  61  25  29.1  ...   
...   ...              ...        ...  ...   ...  ...  ...  ..  ..   ...  ...   
1403  109     Zach Collins  colliza01    C  None   21  POR  77   0  17.6  ...   
1404  294      Zach LaVine  lavinza01   SG  None   23  CHI  63  62  34.5  ...   
1405  308      Zach Lofton  loftoza01   SG  None   26  DET   1   0   4.0  ...   
1406  393    Zaza Pachulia  pachuza01    C  None   34  DET  68   3  12.9  ...   
1407  457     Zhaire Smith  smithzh01   SG  None   19  PHI   6   2  18.5  .

In [14]:
#filling other null values with 0
data = data.fillna(0)

In [15]:
# Dropping few columns which are not important for prediction 
data.drop(columns=['Player.x', 'Player_ID'], inplace = True)

In [16]:
data.duplicated().sum()

0

In [17]:
data.drop_duplicates(inplace=True)

In [18]:
#Print all the column which have 'object' have dtype
data.select_dtypes('object')

Unnamed: 0,Pos1,Pos2,Tm,Season,Conference,Role,Play
0,C,,DAL,2016-17,West,Front,No
1,PG,,IND,2016-17,Est,Back,No
2,SF,,ORL,2016-17,Est,Front,No
3,PF,,MIN,2016-17,West,Front,No
4,PF,,POR,2016-17,West,Front,No
...,...,...,...,...,...,...,...
1403,C,,POR,2018-19,West,Front,No
1404,SG,,CHI,2018-19,Est,Back,No
1405,SG,,DET,2018-19,Est,Back,No
1406,C,,DET,2018-19,Est,Front,No


In [19]:
#To get the column names of object data type
data.select_dtypes('object').columns

Index(['Pos1', 'Pos2', 'Tm', 'Season', 'Conference', 'Role', 'Play'], dtype='object')

In [20]:
cat_cols = data.select_dtypes('object').columns
cat_cols

Index(['Pos1', 'Pos2', 'Tm', 'Season', 'Conference', 'Role', 'Play'], dtype='object')

#### Encoding of the object Columns

In [21]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [22]:
for col in cat_cols:
    data[col] = le.fit_transform(data[col])

Note: in the above line we used fit_transform method, where fit learns the number of categories present in any categorical/object column, then transform is responsible for finally encoding the text values into 0/1 type of numbers.

In [52]:
data['Play'].value_counts()

Play
0    1335
1      73
Name: count, dtype: int64

In [53]:
## Above results that the data is in imbalance format

## Machine Learning Process

In [23]:
X = data.drop(columns = 'Play')
y = data['Play']

### Scaling of the data is very important in case of PCA and LDA

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
scaler = StandardScaler()

In [26]:
X = scaler.fit_transform(X)

# Splitting the data into train and test

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=100, stratify=y)

# Apply Logistic Regression on the data

In [28]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [29]:
y_pred = log_reg.predict(X_test)

In [30]:
from sklearn.metrics import roc_auc_score

In [31]:
accuracy_score(y_test, y_pred)

0.9787234042553191

In [32]:
roc_auc_score(y_test, y_pred)

0.9258426966292135

In [33]:
# Since our data is imbalance we need to use roc_auc_score

In [34]:
data.shape

(1408, 43)

### Transform the Data with PCA

In [35]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 0.95)
pca_data = pca.fit_transform(X)

In [36]:
X.shape

(1408, 42)

In [37]:
pca_data.shape

(1408, 22)

In [38]:
# With the above activities we are able to convert 42 columns into 22 PCA components

In [39]:
# Now, we can use latest PCA to split the data

In [40]:
X_train, X_test, y_train, y_test = train_test_split(pca_data, y, test_size=0.2, random_state=100, stratify=y)

In [41]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [42]:
accuracy_score(y_test, y_pred)

0.9787234042553191

In [43]:
roc_auc_score(y_test, y_pred)

0.9258426966292135

In [44]:
## after using the PCA to split the data the auuracy_score and roc_acu_score are similar of using 42 columns

### Hence it is proved that we can replace PCA lesser number of columns(22) over the actual 42 columns

### Transform the Data with LDA

In [45]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [46]:
lda = LinearDiscriminantAnalysis()
X = lda.fit_transform(X, y)

In [47]:
X.shape

(1408, 1)

#above, we can observe LDA converted 42 columns to 1 column

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=100, stratify=y)

In [49]:
log_reg3 = LogisticRegression()
log_reg3.fit(X_train, y_train)

In [50]:
y_pred3 = log_reg3.predict(X_test)

In [51]:
roc_auc_score(y_test, y_pred3)

0.8610486891385768

In [54]:
accuracy_score(y_test, y_pred3)

0.975177304964539