# NBA PLayers position classification using Random forest classifier
## The data we have is the data of NBA players during 2014-2015 season
### Kaggle: https://www.kaggle.com/drgilermo/nba-players-stats-20142015

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("players_stats.csv")
data.head()

Unnamed: 0,Name,Games Played,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,Age,Birth_Place,Birthdate,Collage,Experience,Height,Pos,Team,Weight,BMI
0,AJ Price,26,324,133,51,137,37.2,15,57,26.3,...,29.0,us,"October 7, 1986",University of Connecticut,5,185.0,PG,PHO,81.45,23.798393
1,Aaron Brooks,82,1885,954,344,817,42.1,121,313,38.7,...,30.0,us,"January 14, 1985",University of Oregon,6,180.0,PG,CHI,72.45,22.361111
2,Aaron Gordon,47,797,243,93,208,44.7,13,48,27.1,...,20.0,us,"September 16, 1995",University of Arizona,R,202.5,PF,ORL,99.0,24.142661
3,Adreian Payne,32,740,213,91,220,41.4,1,9,11.1,...,24.0,us,"February 19, 1991",Michigan State University,R,205.0,PF,ATL,106.65,25.377751
4,Al Horford,76,2318,1156,519,965,53.8,11,36,30.6,...,29.0,do,"June 3, 1986",University of Florida,7,205.0,C,ATL,110.25,26.234384


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490 entries, 0 to 489
Data columns (total 34 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          490 non-null    object 
 1   Games Played  490 non-null    int64  
 2   MIN           490 non-null    int64  
 3   PTS           490 non-null    int64  
 4   FGM           490 non-null    int64  
 5   FGA           490 non-null    int64  
 6   FG%           490 non-null    float64
 7   3PM           490 non-null    int64  
 8   3PA           490 non-null    int64  
 9   3P%           490 non-null    float64
 10  FTM           490 non-null    int64  
 11  FTA           490 non-null    int64  
 12  FT%           490 non-null    float64
 13  OREB          490 non-null    int64  
 14  DREB          490 non-null    int64  
 15  REB           490 non-null    int64  
 16  AST           490 non-null    int64  
 17  STL           490 non-null    int64  
 18  BLK           490 non-null    

In [4]:
data.describe()

Unnamed: 0,Games Played,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,BLK,TOV,PF,EFF,AST/TOV,STL/TOV,Age,Height,Weight,BMI
count,490.0,490.0,490.0,490.0,490.0,490.0,490.0,490.0,490.0,490.0,...,490.0,490.0,490.0,490.0,490.0,490.0,422.0,422.0,422.0,422.0
mean,53.014286,1214.714286,502.108163,188.338776,419.526531,43.099184,39.387755,112.52449,25.519184,86.042857,...,24.07551,68.826531,101.483673,564.330612,1.465837,0.626,27.507109,197.440758,99.469194,25.427747
std,24.175437,820.570132,422.084232,156.265752,337.367125,9.625231,47.880909,127.38575,15.796603,91.315316,...,31.662852,58.480701,65.326807,464.428031,0.825037,0.367443,4.220603,8.74025,12.364228,1.715794
min,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-3.0,0.0,0.0,20.0,172.5,72.45,20.411523
25%,33.0,492.25,145.25,55.5,139.0,39.6,1.0,6.0,15.5,18.5,...,5.0,23.0,45.5,165.0,0.92,0.3925,24.0,190.0,90.0,24.286136
50%,61.0,1193.0,423.0,156.0,357.5,42.9,18.0,58.0,31.3,58.0,...,14.0,56.5,103.0,490.5,1.345,0.53,27.0,197.5,99.0,25.420833
75%,74.0,1905.75,774.0,286.0,642.75,47.575,66.0,192.0,36.4,126.75,...,29.75,100.0,148.75,837.0,1.92,0.82,30.0,205.0,108.0,26.4375
max,83.0,2981.0,2217.0,659.0,1471.0,100.0,286.0,646.0,100.0,715.0,...,200.0,321.0,285.0,2202.0,6.5,3.0,39.0,222.5,162.0,32.723141


## Data preprocessing

In [5]:
# dropping unnecessary attributes and nulls
data = data.dropna()
data = data.drop(['Birth_Place','Birthdate', 'Collage', 'Experience'], axis=1)

In [6]:
# encoding object attributes
data["Pos"] = data["Pos"].astype('category').cat.codes
data["Team"] = data["Team"].astype('category').cat.codes

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 0 to 487
Data columns (total 30 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          350 non-null    object 
 1   Games Played  350 non-null    int64  
 2   MIN           350 non-null    int64  
 3   PTS           350 non-null    int64  
 4   FGM           350 non-null    int64  
 5   FGA           350 non-null    int64  
 6   FG%           350 non-null    float64
 7   3PM           350 non-null    int64  
 8   3PA           350 non-null    int64  
 9   3P%           350 non-null    float64
 10  FTM           350 non-null    int64  
 11  FTA           350 non-null    int64  
 12  FT%           350 non-null    float64
 13  OREB          350 non-null    int64  
 14  DREB          350 non-null    int64  
 15  REB           350 non-null    int64  
 16  AST           350 non-null    int64  
 17  STL           350 non-null    int64  
 18  BLK           350 non-null    

In [8]:
# splitting data for training and testing
X = data.drop(["Pos", "Name"], axis=1)
y = data["Pos"]
X_train,  X_test,y_train, y_test = train_test_split(X, y, test_size=0.333, random_state=42)

## The Random Forest Classifier algorithm is an ensemble approach since it uses the Decision Tree Classifier technique but creates several Decision Trees instead of just one.

In [9]:
# creating our Random forest classifier model
rf_clf = RandomForestClassifier(random_state=42)
# fitting the classifier
rf_clf.fit(X_train, y_train);

In [10]:
# predict output from test data
y_pred = rf_clf.predict(X_test)
y_pred

array([4, 4, 3, 3, 4, 2, 2, 4, 4, 1, 3, 2, 4, 4, 4, 2, 2, 1, 4, 1, 0, 1,
       4, 3, 3, 4, 1, 3, 1, 4, 3, 4, 4, 0, 3, 4, 2, 4, 2, 2, 4, 3, 2, 4,
       2, 2, 0, 2, 1, 4, 4, 1, 4, 4, 2, 0, 1, 4, 1, 1, 1, 4, 3, 4, 2, 1,
       1, 2, 4, 3, 2, 2, 2, 3, 4, 4, 0, 4, 4, 2, 0, 1, 3, 1, 4, 4, 3, 1,
       1, 3, 1, 0, 1, 1, 2, 4, 3, 1, 4, 1, 4, 2, 2, 2, 1, 1, 1, 3, 1, 2,
       2, 3, 1, 4, 0, 3, 1], dtype=int8)

In [11]:
# score report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.50      0.64        14
           1       0.62      0.82      0.71        22
           2       0.88      0.73      0.80        30
           3       0.74      0.74      0.74        19
           4       0.75      0.84      0.79        32

    accuracy                           0.75       117
   macro avg       0.77      0.73      0.73       117
weighted avg       0.77      0.75      0.75       117



## Model accuracy 75%