In [1]:
import pandas as pd
import numpy as np
import csv
import math
import seaborn as sns

In [2]:
data = pd.read_csv("nba_2013.csv")

In [3]:
pd.options.display.max_columns = None

In [4]:
data.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,4,15,0.266667,62,126,0.492063,0.482,35,53,0.66,72,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,0,0,,93,185,0.502703,0.503,79,136,0.581,142,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,0,0,,143,275,0.52,0.52,76,119,0.639,102,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,128,300,0.426667,336,711,0.472574,0.522,274,336,0.815,32,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,0,1,0.0,136,248,0.548387,0.546,56,67,0.836,94,183,277,40,23,46,63,187,328,2013-2014,2013


## Some important columns
pos -- the position of the player

g -- number of games the player was in

gs -- number of games the player started

pts -- total points the player scored

In [5]:
data[data['pts'] == data['pts'].max()] 

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
133,Kevin Durant,SF,25,OKC,81,81,3122,849,1688,0.503,192,491,0.391039,657,1197,0.548872,0.56,703,805,0.873,58,540,598,445,103,59,285,174,2593,2013-2014,2013


In [7]:
data[data['player'] == 'LeBron James']

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
225,LeBron James,PF,29,MIA,77,77,2902,767,1353,0.567,116,306,0.379085,651,1047,0.621777,0.61,439,585,0.75,81,452,533,488,121,26,270,126,2089,2013-2014,2013


## Players who scored over 2,000 points during the season

In [8]:
data[data['pts'] > 2000]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
17,Carmelo Anthony,PF,29,NYK,77,77,2982,743,1643,0.452,167,415,0.40241,576,1228,0.469055,0.503,459,541,0.848,145,477,622,242,95,51,198,224,2112,2013-2014,2013
133,Kevin Durant,SF,25,OKC,81,81,3122,849,1688,0.503,192,491,0.391039,657,1197,0.548872,0.56,703,805,0.873,58,540,598,445,103,59,285,174,2593,2013-2014,2013
225,LeBron James,PF,29,MIA,77,77,2902,767,1353,0.567,116,306,0.379085,651,1047,0.621777,0.61,439,585,0.75,81,452,533,488,121,26,270,126,2089,2013-2014,2013
277,Kevin Love,PF,25,MIN,77,77,2797,650,1421,0.457,190,505,0.376238,460,916,0.502183,0.524,520,633,0.821,224,739,963,341,59,35,196,136,2010,2013-2014,2013


### Let's see if LeBron James would be similar to anybody in the above neighbourhood.

In [10]:
data.shape

(481, 31)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
player          481 non-null object
pos             481 non-null object
age             481 non-null int64
bref_team_id    481 non-null object
g               481 non-null int64
gs              481 non-null int64
mp              481 non-null int64
fg              481 non-null int64
fga             481 non-null int64
fg.             479 non-null float64
x3p             481 non-null int64
x3pa            481 non-null int64
x3p.            414 non-null float64
x2p             481 non-null int64
x2pa            481 non-null int64
x2p.            478 non-null float64
efg.            479 non-null float64
ft              481 non-null int64
fta             481 non-null int64
ft.             461 non-null float64
orb             481 non-null int64
drb             481 non-null int64
trb             481 non-null int64
ast             481 non-null int64
stl             481 non-null int64
blk    

In [12]:
data.columns

Index(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg', 'fga',
       'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
       'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'season', 'season_end'],
      dtype='object')

In [13]:
# Let's select Lebron James from our dataset and 
selected_player = data[data["player"] == "LeBron James"].iloc[0]
# Let's choose only the numeric columns to compute euclidean distance in the KNN.
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

## Defining the distance function

In [14]:
def euclidean_distance(row): 
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

# Find the distance from each player in the dataset to LeBron.
lebron_distance = data.apply(euclidean_distance, axis=1)

## Normalizing columns

In [15]:
# Select only the numeric columns from the NBA dataset
nba_numeric = data[distance_columns]

# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

In [16]:
nba_normalized.head(1)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,-0.619931,-0.640207,0.240468,0.012541,-0.542173,-0.515408,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621


## Finding the nearest neighbor

In [62]:
from scipy.spatial import distance

# Fill in NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
lebron_normalized = nba_normalized[data["player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances.
distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
# Find the most similar player to lebron (the lowest distance to lebron is lebron, the second smallest is the most similar non-lebron player)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = data.loc[int(second_smallest)]["player"]

In [89]:
print(most_similar_to_lebron) # Could you guess it in advance?

Carmelo Anthony


#### Interestingly enough, Carmelo Anthony was one of the four players who scored over 2,000 points in the season!
#### He is also same years old and played same number of games in the season. So we can say that kNN algoritm works good!

## Generating training and testing sets

In [17]:
from sklearn.model_selection import train_test_split

In [25]:
x_train, x_test, y_train, y_test = train_test_split(nba_normalized, nba_normalized['pts'], test_size = 0.20)

## Using sklearn for k = 5 nearest neighbors

There's a regressor and a classifier available, but we'll be using the regressor, 
as we have continuous values to predict on.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)

knn.fit(x_train, y_train)

In [None]:
predictions = knn.predict(x_test)

## Computing error (MSE) & R^2 score

In [24]:
from sklearn.metrics import r2_score, mean_squared_error

In [87]:
print(mean_squared_error(y_test, predictions))

0.033756036178455844


In [88]:
print(r2_score(y_test, predictions))

0.966754232993784
