### Problem statement
Use the K-nearest neighbors (KNN) algorithm to predict how many points
NBA players scored in the 2013-2014 season.

In [1]:
import pandas as pd

In [3]:
# Load the per-player season statistics and preview the first five rows.
data = pd.read_csv('nba_2013.csv')
data.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(481, 31)

In [5]:
# Summarize each column's dtype and non-null count; a non-null count below the
# number of rows means that column contains missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   player        481 non-null    object 
 1   pos           481 non-null    object 
 2   age           481 non-null    int64  
 3   bref_team_id  481 non-null    object 
 4   g             481 non-null    int64  
 5   gs            481 non-null    int64  
 6   mp            481 non-null    int64  
 7   fg            481 non-null    int64  
 8   fga           481 non-null    int64  
 9   fg.           479 non-null    float64
 10  x3p           481 non-null    int64  
 11  x3pa          481 non-null    int64  
 12  x3p.          414 non-null    float64
 13  x2p           481 non-null    int64  
 14  x2pa          481 non-null    int64  
 15  x2p.          478 non-null    float64
 16  efg.          479 non-null    float64
 17  ft            481 non-null    int64  
 18  fta           481 non-null    

### Checking for NaN values

In [6]:
# Names of the columns that contain at least one missing value, in column order.
nan_cols = data.columns[data.isnull().any()].tolist()
nan_cols

['fg.', 'x3p.', 'x2p.', 'efg.', 'ft.']

#### Impute NaN values with the column mean

In [7]:
def impute_nan(df,nan_cols):
    """Fill missing values in the given columns of ``df`` with each column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    nan_cols : iterable of str
        Names of the columns whose NaN entries should be replaced. Each column
        must support ``.mean()``.

    Returns
    -------
    None
        The result is written back into ``df``; nothing is returned.
    """
    for col in nan_cols:
        # Assign the filled column back to the frame rather than calling
        # fillna(inplace=True) on the df[col] selection: that chained in-place
        # form may act on a temporary copy (it warns in pandas 2.x and has no
        # effect under Copy-on-Write), leaving the NaNs in place.
        df[col] = df[col].fillna(df[col].mean())

In [8]:
# Replace the NaNs in the columns listed in nan_cols with each column's mean;
# the function works on `data` directly and returns nothing.
impute_nan(data,nan_cols)

In [9]:
# Count the missing values remaining in each column after imputation.
data.isnull().sum()

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [11]:
# Keep only the numeric columns for modelling. Selecting with include='number'
# (instead of exclude='object') keeps boolean, datetime, categorical and
# string-dtype columns out of the feature frame as well, so everything that
# reaches the scaler is a real number.
data_numerical=data.select_dtypes(include='number')
data_numerical.head(2)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,season_end
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,72,144,216,28,23,26,30,122,171,2013
1,20,81,20,1197,93,185,0.503,0,0,0.285111,...,142,190,332,43,40,57,71,203,265,2013


In [12]:
# Split off the object-dtype (text) columns and preview the first two rows.
data_categorical=data.select_dtypes(include='object')
data_categorical.head(2)

Unnamed: 0,player,pos,bref_team_id,season
0,Quincy Acy,SF,TOT,2013-2014
1,Steven Adams,C,OKC,2013-2014


In [13]:
# Target is the `pts` column; the features are every other numeric column.
# NOTE(review): the shooting totals (fg, x2p, x3p, ft) presumably determine pts
# almost exactly — confirm whether they should remain in the feature set.
y = data_numerical['pts']
X_data = data_numerical.drop(columns=['pts'])

#### Scaling numerical values

In [14]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column before fitting the distance-based KNN model.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the features — consider fitting it on the
# training rows only.
scalar = StandardScaler()
X = scalar.fit(X_data).transform(X_data)

### Model creation

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=355,
)

In [16]:
from sklearn.neighbors import KNeighborsRegressor
# Baseline model: a KNN regressor with scikit-learn's default hyperparameters,
# fitted on the training split.
knn=KNeighborsRegressor()
knn.fit(X_train,y_train)

KNeighborsRegressor()

In [29]:
# Inspect the leaf_size the baseline model is using.
knn.leaf_size

30

In [78]:
# Baseline score on the training split (for regressors, score() reports R^2).
knn.score(X_train,y_train)

0.975444256767941

In [17]:
# Baseline score on the held-out split (for regressors, score() reports R^2).
knn.score(X_test,y_test)

0.9692389180620461

In [18]:
# Predictions of the baseline model for the held-out rows.
y_pred=knn.predict(X_test)

In [21]:
# Side-by-side table of true and predicted points for the held-out rows;
# show the last ten of them.
Predicted = pd.DataFrame(
    data={
        'Actual Points': y_test.tolist(),
        'Predicted Points': y_pred.tolist(),
    }
)
Predicted.tail(10)

Unnamed: 0,Actual Points,Predicted Points
111,356,440.0
112,240,329.2
113,1851,1573.8
114,362,393.2
115,629,741.0
116,1010,944.4
117,1226,1202.0
118,408,476.4
119,477,536.8
120,987,919.6


### Hyperparameter tuning

In [22]:
# Search space for the grid search: 3 algorithms x 7 leaf sizes x 8 neighbor
# counts = 168 candidate settings.
# NOTE(review): algorithm and leaf_size presumably affect only neighbor-search
# speed, not the predictions — confirm whether they are worth searching over.
param_grid = dict(
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=[18, 20, 25, 27, 30, 32, 34],
    n_neighbors=[3, 5, 7, 9, 10, 11, 12, 13],
)

In [23]:
from sklearn.model_selection import GridSearchCV
# Exhaustively cross-validate every combination in param_grid on the training
# split; verbose=3 prints one line per individual fit.
gridsearch = GridSearchCV(estimator=knn, param_grid=param_grid, verbose=3)
gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 168 candidates, totalling 840 fits
[CV 1/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=3; total time=   0.1s
[CV 2/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=5; total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=5; total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=5; total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=5; total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=5; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=18, n_neighbors=7; total time=   0.0s
[CV 2/5] END algorithm=ball_tree, l

[CV 5/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=11; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=13; total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=13; total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=13; total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=13; total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=25, n_neighbors=13; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=27, n_neighbors=3; total time=   0.0s
[CV 2

[CV 5/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=5; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=32, n_neighbors=10; total time=   0.0s
[CV 2/5] END al

[CV 3/5] END algorithm=kd_tree, leaf_size=18, n_neighbors=13; total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=18, n_neighbors=13; total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=18, n_neighbors=13; total time=   0.0s
[CV 1/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=3; total time=   0.0s
[CV 2/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=3; total time=   0.0s
[CV 3/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=3; total time=   0.0s
[CV 4/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=3; total time=   0.0s
[CV 5/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=3; total time=   0.0s
[CV 1/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=5; total time=   0.0s
[CV 2/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=5; total time=   0.0s
[CV 3/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=5; total time=   0.0s
[CV 4/5] END .algorithm=kd_tree, leaf_size=20, n_neighbors=5; total time=   0.0s
[CV 5/5] END .algorithm=kd_t

[CV 4/5] END algorithm=kd_tree, leaf_size=27, n_neighbors=13; total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=27, n_neighbors=13; total time=   0.0s
[CV 1/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=3; total time=   0.0s
[CV 2/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=3; total time=   0.0s
[CV 3/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=3; total time=   0.0s
[CV 4/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=3; total time=   0.0s
[CV 5/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=3; total time=   0.0s
[CV 1/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=5; total time=   0.0s
[CV 2/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=5; total time=   0.0s
[CV 3/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=5; total time=   0.0s
[CV 4/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=5; total time=   0.0s
[CV 5/5] END .algorithm=kd_tree, leaf_size=30, n_neighbors=5; total time=   0.0s
[CV 1/5] END .algorithm=kd_t

[CV 4/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=12; total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=12; total time=   0.0s
[CV 1/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=13; total time=   0.0s
[CV 2/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=13; total time=   0.0s
[CV 3/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=13; total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=13; total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=34, n_neighbors=13; total time=   0.0s
[CV 1/5] END ...algorithm=brute, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 2/5] END ...algorithm=brute, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 3/5] END ...algorithm=brute, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 4/5] END ...algorithm=brute, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 5/5] END ...algorithm=brute, leaf_size=18, n_neighbors=3; total time=   0.0s
[CV 1/5] END ...algorithm=br

[CV 2/5] END ..algorithm=brute, leaf_size=25, n_neighbors=10; total time=   0.0s
[CV 3/5] END ..algorithm=brute, leaf_size=25, n_neighbors=10; total time=   0.0s
[CV 4/5] END ..algorithm=brute, leaf_size=25, n_neighbors=10; total time=   0.0s
[CV 5/5] END ..algorithm=brute, leaf_size=25, n_neighbors=10; total time=   0.0s
[CV 1/5] END ..algorithm=brute, leaf_size=25, n_neighbors=11; total time=   0.0s
[CV 2/5] END ..algorithm=brute, leaf_size=25, n_neighbors=11; total time=   0.0s
[CV 3/5] END ..algorithm=brute, leaf_size=25, n_neighbors=11; total time=   0.0s
[CV 4/5] END ..algorithm=brute, leaf_size=25, n_neighbors=11; total time=   0.0s
[CV 5/5] END ..algorithm=brute, leaf_size=25, n_neighbors=11; total time=   0.0s
[CV 1/5] END ..algorithm=brute, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 2/5] END ..algorithm=brute, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 3/5] END ..algorithm=brute, leaf_size=25, n_neighbors=12; total time=   0.0s
[CV 4/5] END ..algorithm=bru

[CV 4/5] END ...algorithm=brute, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 5/5] END ...algorithm=brute, leaf_size=32, n_neighbors=7; total time=   0.0s
[CV 1/5] END ...algorithm=brute, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 2/5] END ...algorithm=brute, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 3/5] END ...algorithm=brute, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 4/5] END ...algorithm=brute, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 5/5] END ...algorithm=brute, leaf_size=32, n_neighbors=9; total time=   0.0s
[CV 1/5] END ..algorithm=brute, leaf_size=32, n_neighbors=10; total time=   0.0s
[CV 2/5] END ..algorithm=brute, leaf_size=32, n_neighbors=10; total time=   0.0s
[CV 3/5] END ..algorithm=brute, leaf_size=32, n_neighbors=10; total time=   0.0s
[CV 4/5] END ..algorithm=brute, leaf_size=32, n_neighbors=10; total time=   0.0s
[CV 5/5] END ..algorithm=brute, leaf_size=32, n_neighbors=10; total time=   0.0s
[CV 1/5] END ..algorithm=bru

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [18, 20, 25, 27, 30, 32, 34],
                         'n_neighbors': [3, 5, 7, 9, 10, 11, 12, 13]},
             verbose=3)

In [24]:
# Hyperparameter combination that achieved the best cross-validated score.
gridsearch.best_params_

{'algorithm': 'ball_tree', 'leaf_size': 18, 'n_neighbors': 5}

In [30]:
# Build the tuned model from the hyperparameters selected by the grid search.
# Constructing KNeighborsRegressor() with no arguments would only recreate the
# untuned baseline.
knn2=KNeighborsRegressor(**gridsearch.best_params_)
knn2.fit(X_train,y_train)

KNeighborsRegressor()

In [31]:
# Score knn2 — the model fitted after the grid search — on the training split,
# rather than re-scoring the baseline `knn`.
knn2.score(X_train,y_train)

0.975444256767941

In [32]:
# Score knn2 — the model fitted after the grid search — on the held-out split,
# rather than re-scoring the baseline `knn`.
knn2.score(X_test,y_test)

0.9692389180620461