# NBA player 5-year career longevity prediction with K-nearest neighbors.

Implementing a basic classification model using K-nearest neighbors.
The dataset nba_logreg.csv contains statistics of NBA players. Predicting whether a player will have a career of at least five years in the NBA.

In [1]:
# importing required libraries.
import pandas as pd
import numpy as np
from math import sqrt
import operator

In [2]:
ds = pd.read_csv('nba_logreg.csv') # reading the csv file and storing the dataset in ds.
print(len(ds)) # length of dataframe ds.
print(np.shape(ds)) # dimensions of dataframe ds.
ds.head() # display top 5 rows of the dataset ds.

1340
(1340, 21)


Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0


In [3]:
# A function to check the missing values and the data type of each column.
def eda(df):
    datatype = df.dtypes
    missing_values = df.isnull().sum()
    per_missing = (df.isnull().sum() / len(df))*100
    count_unique = list(df.nunique())
    v_len = len(df)
    dx = pd.DataFrame({'Total number of values': v_len, 'Total Missing Values': missing_values, 'Percentage missing': per_missing, ' Count of unique values': count_unique, 'Data type': datatype})
    return dx

In [4]:
eda(ds) # check for missing values (null values).

Unnamed: 0,Total number of values,Total Missing Values,Percentage missing,Count of unique values,Data type
Name,1340,0,0.0,1294,object
GP,1340,0,0.0,70,int64
MIN,1340,0,0.0,325,float64
PTS,1340,0,0.0,191,float64
FGM,1340,0,0.0,87,float64
FGA,1340,0,0.0,159,float64
FG%,1340,0,0.0,284,float64
3P Made,1340,0,0.0,23,float64
3PA,1340,0,0.0,54,float64
3P%,1340,11,0.820896,254,float64


As there are some null values in column '3P%', let us see the summary of this column.

In [5]:
# summary of column 3P%
print(ds['3P%'].describe()) 
print('--------------------')
print(ds['3P%'].mode())
#print(ds['3P%'].mean())

count    1329.000000
mean       19.308126
std        16.022916
min         0.000000
25%         0.000000
50%        22.400000
75%        32.500000
max       100.000000
Name: 3P%, dtype: float64
--------------------
0    0.0
dtype: float64


In [6]:
ds = ds.fillna(ds['3P%'].mean()) # replacing null values in column '3P%' with the mean value.

#keeping columns with only numerical values.
#ds1 = ds.drop(['Name'], axis =1)
eda(ds) # checking again for missing values.

Unnamed: 0,Total number of values,Total Missing Values,Percentage missing,Count of unique values,Data type
Name,1340,0,0.0,1294,object
GP,1340,0,0.0,70,int64
MIN,1340,0,0.0,325,float64
PTS,1340,0,0.0,191,float64
FGM,1340,0,0.0,87,float64
FGA,1340,0,0.0,159,float64
FG%,1340,0,0.0,284,float64
3P Made,1340,0,0.0,23,float64
3PA,1340,0,0.0,54,float64
3P%,1340,0,0.0,255,float64


In [7]:
print(ds.head())

              Name  GP   MIN  PTS  FGM  FGA   FG%  3P Made  3PA   3P%  ...  \
0   Brandon Ingram  36  27.4  7.4  2.6  7.6  34.7      0.5  2.1  25.0  ...   
1  Andrew Harrison  35  26.9  7.2  2.0  6.7  29.6      0.7  2.8  23.5  ...   
2   JaKarr Sampson  74  15.3  5.2  2.0  4.7  42.2      0.4  1.7  24.4  ...   
3      Malik Sealy  58  11.6  5.7  2.3  5.5  42.6      0.1  0.5  22.6  ...   
4      Matt Geiger  48  11.5  4.5  1.6  3.0  52.4      0.0  0.1   0.0  ...   

   FTA   FT%  OREB  DREB  REB  AST  STL  BLK  TOV  TARGET_5Yrs  
0  2.3  69.9   0.7   3.4  4.1  1.9  0.4  0.4  1.3          0.0  
1  3.4  76.5   0.5   2.0  2.4  3.7  1.1  0.5  1.6          0.0  
2  1.3  67.0   0.5   1.7  2.2  1.0  0.5  0.3  1.0          0.0  
3  1.3  68.9   1.0   0.9  1.9  0.8  0.6  0.1  1.0          1.0  
4  1.9  67.4   1.0   1.5  2.5  0.3  0.3  0.4  0.8          1.0  

[5 rows x 21 columns]


In [8]:
dsn = ds.columns[1:20]
print(dsn)

Index(['GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3P Made', '3PA', '3P%', 'FTM',
       'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV'],
      dtype='object')


In [14]:
ds[dsn] = ds[dsn].values.tolist()

In [15]:
print(ds[dsn].values.tolist())

[[36.0, 27.4, 7.4, 2.6, 7.6, 34.7, 0.5, 2.1, 25.0, 1.6, 2.3, 69.9, 0.7, 3.4, 4.1, 1.9, 0.4, 0.4, 1.3], [35.0, 26.9, 7.2, 2.0, 6.7, 29.6, 0.7, 2.8, 23.5, 2.6, 3.4, 76.5, 0.5, 2.0, 2.4, 3.7, 1.1, 0.5, 1.6], [74.0, 15.3, 5.2, 2.0, 4.7, 42.2, 0.4, 1.7, 24.4, 0.9, 1.3, 67.0, 0.5, 1.7, 2.2, 1.0, 0.5, 0.3, 1.0], [58.0, 11.6, 5.7, 2.3, 5.5, 42.6, 0.1, 0.5, 22.6, 0.9, 1.3, 68.9, 1.0, 0.9, 1.9, 0.8, 0.6, 0.1, 1.0], [48.0, 11.5, 4.5, 1.6, 3.0, 52.4, 0.0, 0.1, 0.0, 1.3, 1.9, 67.4, 1.0, 1.5, 2.5, 0.3, 0.3, 0.4, 0.8], [75.0, 11.4, 3.7, 1.5, 3.5, 42.3, 0.3, 1.1, 32.5, 0.4, 0.5, 73.2, 0.2, 0.7, 0.8, 1.8, 0.4, 0.0, 0.7], [62.0, 10.9, 6.6, 2.5, 5.8, 43.5, 0.0, 0.1, 50.0, 1.5, 1.8, 81.1, 0.5, 1.4, 2.0, 0.6, 0.2, 0.1, 0.7], [48.0, 10.3, 5.7, 2.3, 5.4, 41.5, 0.4, 1.5, 30.0, 0.7, 0.8, 87.5, 0.8, 0.9, 1.7, 0.2, 0.2, 0.1, 0.7], [65.0, 9.9, 2.4, 1.0, 2.4, 39.2, 0.1, 0.5, 23.3, 0.4, 0.5, 71.4, 0.2, 0.6, 0.8, 2.3, 0.3, 0.0, 1.1], [42.0, 8.5, 3.7, 1.4, 3.5, 38.3, 0.1, 0.3, 21.4, 1.0, 1.4, 67.8, 0.4, 0.7, 1.1, 0.3

In [16]:
#ds3 = ds2.values.tolist()

## Standardizing the data

Standardization is a rescaling technique that refers to centering the distribution of the data on the value 0 and the standard deviation to the value 1.

In [18]:
# Building a list of functions in order to scale the data
# Using the formulae below to get the standardized data.
# mean = sum(values) / total values
# standard deviation = sqrt( (value_i - mean)^2 / (total values-1))
# standardized value = (value - mean) / standard deviation

data = ds[dsn].values.tolist()
# calculating column means
def column_means(data):
    means = [0 for i in range(len(data[0])-1)]
    for i in range(len(data[0])-1):
        col_values = [row[i] for row in data]
        means[i] = sum(col_values) / float(len(data))
    return means

# Calculating column standard deviations
def column_sds(data, means):
    sds = [0 for i in range(len(data[0])-1)]
    for i in range(len(data[0])-1):
        variance = [pow(row[i]-means[i], 2) for row in data]
        sds[i] = sum(variance)
    stdevs = [sqrt(x/(float(len(data)-1))) for x in sds]
    return sds

# Standardizing the dataset (Standard Scaling)
def standardize_dataset(data, means, sds):
    for row in data:
        for i in range(len(row)-1):
            row[i] = (row[i] - means[i]) / sds[i]
            
# Standardize dataset

# Estimate mean and standard deviation
means = column_means(data)
sds = column_sds(data, means)
#print(means)
#print(sds)
# standardize dataset
standardize_dataset(data, means, sds)

In [48]:
# storing the scaled data in ds4.
ds[dsn] = data
#print(ds)

In [25]:
ds1 = ds.values.tolist()
ds1

[['Brandon Ingram',
  -5.9988479241442025e-05,
  0.0001057702634123986,
  2.3539934118710032e-05,
  -7.668749957968028e-06,
  9.916888682448861e-05,
  -0.000187730007540535,
  0.0012803624205527792,
  0.0008748611925875077,
  1.6694523906906085e-05,
  0.0002316466504651988,
  0.0002039821519247097,
  -2.671510604272059e-06,
  -0.00038262069966541596,
  0.0005548859576774936,
  0.00018792589769389852,
  0.00012059062336290617,
  -0.0009719174284445772,
  0.0001274625901082478,
  1.3,
  0.0],
 ['Andrew Harrison',
  -6.24455956156903e-05,
  0.00010036022627320423,
  1.5673721719939102e-05,
  -0.0001657629798607216,
  4.711784556134123e-05,
  -0.0002888370192461084,
  0.0022949606722031195,
  0.0013385143925689089,
  1.229495573869705e-05,
  0.00099789328493414,
  0.0006733385239775083,
  4.1375543430380315e-05,
  -0.0006299490824689171,
  -1.0395637002373636e-05,
  -0.00011190264618248568,
  0.0007416979980187452,
  0.0021416705083075146,
  0.0005331629956784655,
  1.6,
  0.0],
 ['JaKarr 

In [26]:
#splitting the 1st 1000 rows in training data and last 340 rows in testing data.
train_ds = ds1[0:1000]
test_ds = ds1[1000:]
print(len(train_ds))
print(len(test_ds))

1000
340


Firstly, calculating the euclidean distance. The euclidean is defined as the square root of the sum of the squared differences between the two arrays of numbers.

In [33]:
# function to calculate euclidean distance.
def Euc_dist(x, y, leng):
    dis = 0.0
    for i in range(1, leng):
        dis += pow(float(x[i])- float(y[i]),2)
    return sqrt(dis)

After calculating the distance from each point, we use it collect the k most similar points for the given test data. Creating a function that  returns k most similar neighbors from the training set for a given test data.

In [34]:
# function to get the neighbors
def getNb(train_ds, test, k):
    dist = []
    leng = len(test)-1
    for a in range(1, len(train_ds)):
        d = Euc_dist(test, train_ds[a], leng)
        dist.append((train_ds[a], d))
    dist.sort(key=operator.itemgetter(1))
    nb = []
    for a in range(k):
        nb.append(dist[a][0])
    return nb

Predicting the class: Predict the response based on the neighbors. Allowing each neighbor to vote for their class attribute, and taking the majority vote as the prediction. Creating a function for getting the majority voted response from a number of neighbors.

In [35]:
# function to get response
def getRes(nb):
        class_v = {}
        for a in range(1, len(nb)):
                res = nb[a][-1]
                if res in class_v:
                        class_v[res] += 1
                else:
                        class_v[res] = 1
        sort_v = sorted(class_v.items(), key=operator.itemgetter(1), reverse=True)
        return sort_v[0][0] 

Checking the accurcy: To calculate the accuracy of the model we need to calculate a ratio of the total correct predictions out of all predictions made. Creating a function which sums the total correct predictions and returns the accuracy as a percentage of correct classifications.

In [36]:
# function to get accuracy
def getAcc(test_ds, predictions):
    acc = 0
    for a in range(1, len(test_ds)):
        if test_ds[a][-1] == predictions[a]:
            acc += 1
    return (acc/float(len(test_ds))) * 100.0

## Predicting for k = 5

In [54]:
predictions=[]
k = 5
for a in range(len(test_ds)):
    nb = getNb(train_ds, test_ds[a], k)
    result = getRes(nb)
    predictions.append(result)
    #print(predictions)   
    #print('predicted=' + repr(result) + ', actual=' + repr(test_ds[a][-1]))
    if result == test_ds[a][-1]:
        print (test_ds[a][0])
    

Gilbert Arenas
Richard Jefferson
Speedy Claxton
Brendan Haywood
Tyson Chandler
Jason Collins
Eddy Curry
Will Solomon
Lamar Patterson
Etan Thomas
Loren Woods
Brandon Armstrong
Willie Reed
Kenyon Martin
Marc Jackson
Mike Miller
Darius Miles
Morris Peterson
Marcus Fizer
Courtney Alexander
Chris Mihm
Kelly Oubre
Mark Blount
Stromile Swift
Mike Penberthy
Lee Nailon
Paul McPherson
DeShawn Stevenson
Elton Brand
Lamar Odom
Steve Francis
Metta World Peace
Adrian Griffin
Andre Miller
James Posey
Kenny Thomas
Shawn Marion
Anthony Carter
Jason Terry
Richard Hamilton
Baron Davis
Eddie Robinson
Michael Ruffin
Quincy Lewis
William Avery
Jason Williams
Vince Carter
Mike Bibby
Paul Pierce
Michael Dickerson
Jamie Feick
Cuttino Mobley
Antawn Jamison
Matt Harpring
Larry Hughes
Cory Carr
Keon Clark
Damon Jones
Jelani McCoy
Brad Miller
Ricky Davis
J.R. Henderson
Corey Benjamin
Toby Bailey
Marlon Garnett
Gerald Brown
Keith Van Horn
Ron Mercer
Brevin Knight
Cedric Henderson
Bobby Jackson
Anthony Johnson
Derek

In [52]:
accuracy = round(getAcc(test_ds, predictions), 2)
print('Accuracy =' + repr(accuracy) + '%')

Accuracy =61.47%


## Predicting for k = 25

In [46]:
predictions=[]
k = 25
for a in range(len(test_ds)):
    nb = getNb(train_ds, test_ds[a], k)
    result = getRes(nb)
    predictions.append(result)
    #if result == test_ds[a][-1]:
     #   print (test_ds[a][0])
    
    
accuracy = round(getAcc(test_ds, predictions), 2)
print('Accuracy =' + repr(accuracy) + '%')

Accuracy =65.0%


## Predicting for k = 51

In [47]:
predictions=[]
k = 51
for a in range(len(test_ds)):
    nb = getNb(train_ds, test_ds[a], k)
    result = getRes(nb)
    predictions.append(result)
    #if result == test_ds[a][-1]:
     #   print (test_ds[a][0])
    
    
accuracy = round(getAcc(test_ds, predictions), 2)
print('Accuracy =' + repr(accuracy) + '%')

Accuracy =69.12%


# Verifying the result with libraries such as sklearn

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [22]:
ds11 = ds1

In [23]:
#splitting dataset

X = ds11.iloc[:, 1:19]
Y = ds11.iloc[:, 19]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle = False, stratify = None, train_size = 0.74626866)

In [24]:
#scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

## Prediction for k = 5

In [25]:
classifier = KNeighborsClassifier(n_neighbors = 5, p = 2, metric = 'euclidean')

In [26]:
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [27]:
Y_pred = classifier.predict(X_test)

In [28]:
cm = confusion_matrix(Y_test, Y_pred)
print(cm)

[[ 50  61]
 [ 48 181]]


In [29]:
print(f1_score(Y_test, Y_pred))

0.7685774946921442


In [30]:
print(accuracy_score(Y_test, Y_pred)*100)

67.94117647058823


## Prediction for k = 25

In [31]:
classifier = KNeighborsClassifier(n_neighbors = 25, p = 2, metric = 'euclidean')
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(f1_score(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred)*100)

0.7672413793103448
68.23529411764706


## Prediction for k = 51

In [32]:
classifier = KNeighborsClassifier(n_neighbors = 51, p = 2, metric = 'euclidean')
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(f1_score(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred)*100)

0.7695652173913043
68.82352941176471


As we can see the accuracy of the Knn built from scrath and the accuracy of the Knn model used by sklearn library is very close.

From the above obtained accuracy score for the Knn built from scratch, we can say that k = 25 has the maximum accuracy, followed by k = 51 and the least accuracy is for k = 5