In [2]:
# See mission 6.5 ML Intermediate for more on the NBA players 

In [42]:
import pandas as pd
nba = pd.read_csv("nba_2013.csv")

# The names of the columns in the data
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [43]:
# Finding Similar Rows With Euclidean Distance

selected_player = nba[nba["player"] == "LeBron James"].iloc[0]
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

import math
def euclidean_distance(row):
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

lebron_distance = nba.apply(euclidean_distance, axis=1)

In [44]:
# Normalizing Columns

nba_numeric = nba[distance_columns]
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

In [45]:
# Finding the Nearest Neighbor
import pandas

from scipy.spatial import distance

# Fill in the NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for Lebron James
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

# Find the distance between Lebron James and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]

print(most_similar_to_lebron)

Carmelo Anthony


In [46]:
# Generating Training and Testing Sets

import random
from numpy.random import permutation

# Randomly shuffle the index of nba
random_indices = permutation(nba.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba)/3)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices
test = nba.loc[random_indices[1:test_cutoff]]
# Generate the train set with the rest of the data
train = nba.loc[random_indices[test_cutoff:]]

Instead of having to do it all ourselves, we can use the kNN implementation in scikit-learn. While scikit-learn (Sklearn for short) makes a regressor and a classifier available, we'll be using the regressor, as we have continuous values to predict on.

Sklearn performs the normalization and distance finding automatically, and lets us specify how many neighbors we want to look at

In [47]:
import numpy as np

col_mask=nba.isnull().any(axis=0) 

In [48]:
row_mask=nba.isnull().any(axis=1)

In [49]:
nba.loc[row_mask,col_mask]

Unnamed: 0,fg.,x3p.,x2p.,efg.,ft.
1,0.503,,0.502703,0.503,0.581
2,0.520,,0.520000,0.520,0.639
5,0.541,,0.540984,0.541,0.867
11,0.500,,0.500000,0.500,0.250
18,0.375,,0.375000,0.375,0.500
...,...,...,...,...,...
456,0.500,,0.500000,0.500,
460,0.000,,0.000000,0.000,
461,0.000,,0.000000,0.000,
468,0.556,0.333333,0.666667,0.611,


In [50]:
# The columns that we'll be using to make predictions
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column we want to predict
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Create the kNN model
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data
knn.fit(train[x_columns], train[y_column])
# Make predictions on the test set using the fit model
predictions = knn.predict(test[x_columns])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [39]:
predictions = np.array([[5.1900e+02],
       [1.3560e+02],
       [2.9720e+02],
       [3.4460e+02],
       [2.2800e+01],
       [2.8240e+02],
       [7.1400e+01],
       [4.3040e+02],
       [4.9380e+02],
       [3.5340e+02],
       [9.7440e+02],
       [1.6920e+02],
       [4.3780e+02],
       [5.6460e+02],
       [1.3014e+03],
       [4.3520e+02],
       [8.4320e+02],
       [9.6400e+01],
       [7.8300e+02],
       [2.7600e+02],
       [2.0760e+02],
       [1.5620e+02],
       [1.0200e+01],
       [2.5100e+02],
       [6.3580e+02],
       [1.5102e+03],
       [7.9200e+02],
       [3.3940e+02],
       [7.9800e+01],
       [1.4612e+03],
       [9.6400e+01],
       [8.6600e+02],
       [8.0440e+02],
       [1.1048e+03],
       [3.3680e+02],
       [1.0678e+03],
       [2.7180e+02],
       [1.8170e+03],
       [1.1434e+03],
       [1.0542e+03],
       [1.8000e+02],
       [8.9600e+01],
       [1.1160e+02],
       [8.7800e+01],
       [7.8160e+02],
       [1.2780e+02],
       [1.4100e+02],
       [1.7886e+03],
       [1.6500e+02],
       [1.8140e+02],
       [1.4238e+03],
       [1.6020e+02],
       [2.2200e+01],
       [1.5600e+01],
       [9.9460e+02],
       [3.3320e+02],
       [3.3100e+02],
       [2.1880e+02],
       [6.8200e+01],
       [1.3292e+03],
       [1.2000e+01],
       [3.0000e+00],
       [3.5340e+02],
       [3.8120e+02],
       [1.3292e+03],
       [1.3020e+02],
       [6.0300e+02],
       [6.0200e+01],
       [1.3828e+03],
       [9.3100e+02],
       [1.2314e+03],
       [6.6660e+02],
       [2.3440e+02],
       [9.7060e+02],
       [2.8460e+02],
       [7.7020e+02],
       [1.7886e+03],
       [4.2000e+00],
       [4.8980e+02],
       [9.2360e+02],
       [1.2200e+01],
       [3.1380e+02],
       [2.0040e+02],
       [8.2400e+01],
       [1.4704e+03],
       [1.2200e+01],
       [4.1000e+01],
       [6.3800e+01],
       [4.0200e+02],
       [6.0000e-01],
       [1.0520e+03],
       [1.7760e+02],
       [1.6000e+00],
       [1.1056e+03],
       [1.7840e+02],
       [3.8680e+02],
       [2.4960e+02],
       [3.1020e+02],
       [3.3860e+02],
       [1.1640e+02],
       [7.3220e+02],
       [1.8440e+02],
       [1.0720e+03],
       [1.2300e+02],
       [3.4800e+01],
       [4.1620e+02],
       [2.5200e+01],
       [1.5960e+02],
       [5.9000e+02],
       [6.0260e+02],
       [1.6842e+03],
       [2.7160e+02],
       [2.2600e+02],
       [1.0036e+03],
       [8.6000e+01],
       [8.9200e+02],
       [9.8000e+00],
       [1.2000e+02],
       [1.2000e+00],
       [5.2400e+01],
       [1.2828e+03],
       [1.0160e+02],
       [5.1420e+02],
       [1.2456e+03],
       [9.7200e+01],
       [9.6400e+01],
       [1.4400e+03],
       [1.0976e+03],
       [7.9560e+02],
       [8.9600e+01],
       [3.3200e+01],
       [1.2736e+03],
       [3.7880e+02],
       [1.2086e+03],
       [1.4400e+01],
       [1.3078e+03],
       [7.3000e+01],
       [5.9980e+02],
       [1.1052e+03],
       [2.8440e+02],
       [5.3820e+02],
       [4.6120e+02],
       [7.1400e+01],
       [1.7840e+02],
       [1.0586e+03],
       [1.3292e+03],
       [9.9400e+01],
       [1.5272e+03],
       [3.4400e+02],
       [2.9440e+02],
       [5.9380e+02],
       [1.0800e+02],
       [6.0900e+02],
       [6.8220e+02],
       [3.5600e+01],
       [2.9860e+02],
       [9.4200e+01],
       [9.8500e+02],
       [1.0618e+03]])

actual = test[y_column]
mse = (((predictions - actual) ** 2).sum()) / len(predictions)

print(mse)

pts    447605.646541
dtype: float64


In [53]:
# Splitting the Data into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(nba["pts"], test_size=0.2, random_state=1)

ValueError: not enough values to unpack (expected 4, got 2)

In [51]:
# making predictions with fit

from sklearn.linear_model import LinearRegression

clf = LinearRegression()
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

NameError: name 'X_train' is not defined