In [10]:
# imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [11]:
# Load the CSV at data/cleaned_nba_hof_rookies.csv and preview its first five rows.
rookies_df = pd.read_csv(filepath_or_buffer='data/cleaned_nba_hof_rookies.csv')
rookies_df.head(n=5)

Unnamed: 0,hof_class,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,0.0,82,18.8,6.4,2.5,4.7,53.9,0.0,0.1,1.2,2.0,61.1,2.0,2.7,4.6,0.7,0.6,0.6,1.2,8.7
1,0.0,70,20.6,8.8,3.6,8.2,43.9,0.0,0.4,1.6,2.2,70.7,0.9,1.2,2.1,2.5,0.4,0.2,1.6,7.1
2,0.0,56,15.4,7.3,2.6,6.3,41.0,1.1,3.1,1.1,1.3,80.0,0.2,1.4,1.6,1.9,0.6,0.1,1.1,6.4
3,0.0,51,11.9,5.2,1.8,4.4,41.3,0.7,2.1,0.8,1.0,85.7,0.3,0.8,1.1,1.7,0.3,0.1,0.9,4.7
4,0.0,47,17.0,5.2,2.0,4.4,44.7,0.3,1.0,0.9,1.3,72.1,1.0,2.6,3.6,0.7,0.5,0.5,0.8,6.8


In [12]:
# Standardize the data: StandardScaler is fitted on, and applied to, every
# column of rookies_df; the result is a NumPy array (column names are dropped).
# NOTE(review): this includes the hof_class column shown in the preview above,
# so it ends up as an input to the PCA/clustering below - confirm that is intended.
scaler = StandardScaler()
rookies_scaled = scaler.fit_transform(rookies_df)
print(rookies_scaled[:5])

[[-0.16965198  1.25063934  0.15528784 -0.07237608 -0.05308938 -0.31247592
   1.58324823 -0.65882163 -0.65008069 -0.07834804  0.15807461 -0.86357655
   1.27279594  0.47323033  0.74597861 -0.5584941  -0.00551556  0.45185066
   0.02900601  0.29386378]
 [-0.16965198  0.56384151  0.37700397  0.49063719  0.6162485   0.6892906
  -0.04398941 -0.65882163 -0.37497771  0.33334747  0.31107251  0.0314137
  -0.14770449 -0.63217244 -0.47434699  0.70531236 -0.50418662 -0.40502098
   0.5933381  -0.05759513]
 [-0.16965198 -0.23742263 -0.26350929  0.13875389  0.00775952  0.14547449
  -0.51588833  2.13073441  2.10094909 -0.18127192 -0.37741805  0.89843551
  -1.0516593  -0.4847854  -0.71841212  0.28404354 -0.00551556 -0.61923889
  -0.11207701 -0.2113584 ]
 [-0.16965198 -0.52358839 -0.69462398 -0.35388272 -0.47903166 -0.39834162
  -0.4670712   1.1163504   1.18393916 -0.49004356 -0.60691491  1.42983597
  -0.9225229  -0.92694651 -0.96247724  0.1436206  -0.75352215 -0.61923889
  -0.39424306 -0.5847835 ]
 [-0.1

In [13]:
# Build the (still unfitted) PCA model; it keeps two components.
n_components = 2
pca = PCA(n_components=n_components)

In [14]:
# Fit the PCA model on the standardized array and, in the same call,
# project that array onto the fitted components.
rookies_pca = pca.fit_transform(rookies_scaled)

In [15]:
# Wrap the PCA output array in a DataFrame and preview it.
# NOTE: the column names are misspelled ("componet", "prinicpal") but the
# cluster-plot cell refers to them by these exact strings, so they are kept.
pca_column_names = ['principal_componet_1', 'prinicpal_componet_2']
rookies_pca_df = pd.DataFrame(rookies_pca, columns=pca_column_names)
rookies_pca_df.head()

Unnamed: 0,principal_componet_1,prinicpal_componet_2
0,0.822433,-2.320291
1,0.599017,0.38472
2,-0.568228,3.103391
3,-1.96988,2.189917
4,-0.807715,-0.42147


In [16]:
# Show the explained-variance ratio of each fitted component.
# Per the recorded output, the two components together account for roughly
# 70% of the variance (0.524 + 0.177).
pca.explained_variance_ratio_

array([0.52405856, 0.17739363])

In [17]:
# Elbow curve: fit KMeans for k = 1..10 and plot inertia against k to help
# choose the number of clusters.

# Select the two component columns explicitly. A later cell adds a 'class'
# column to rookies_pca_df in place; without this selection, re-running this
# cell afterwards would feed the cluster labels into KMeans as a feature.
pc_columns = ['principal_componet_1', 'prinicpal_componet_2']
elbow_features = rookies_pca_df[pc_columns]

# candidate cluster counts
k = list(range(1, 11))

# inertia_ of a fitted model for each candidate k (fit() returns the estimator)
inertia = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(elbow_features).inertia_
    for n_clusters in k
]

# generate elbow curve
elbow_data = {'k': k, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)

# plot (last expression, so the chart is rendered as the cell output)
df_elbow.hvplot.line(x='k', y='inertia', xticks=k, title='Elbow Curve')

  f"KMeans is known to have a memory leak on Windows "


In [18]:
# Fit the final k-means model (5 clusters) and attach each row's cluster label.
# NOTE(review): the elbow-curve cell seeds KMeans with random_state=42 while
# this cell uses 0 - confirm the difference is intentional.
model = KMeans(n_clusters=5, random_state=0)

# Cluster on the two component columns only. This cell adds 'class' to
# rookies_pca_df below, so fitting on the whole frame would include the
# previous labels as a feature whenever the cell is run a second time.
pc_columns = ['principal_componet_1', 'prinicpal_componet_2']
cluster_features = rookies_pca_df[pc_columns]

# fit model
model.fit(cluster_features)

# predictions for the same rows the model was fitted on
predictions = model.predict(cluster_features)

# add predicted class column (in place: the plotting cell reads
# rookies_pca_df['class'])
rookies_pca_df['class'] = model.labels_
rookies_pca_df.head()

Unnamed: 0,principal_componet_1,prinicpal_componet_2,class
0,0.822433,-2.320291,0
1,0.599017,0.38472,4
2,-0.568228,3.103391,4
3,-1.96988,2.189917,4
4,-0.807715,-0.42147,2


In [19]:
# Scatter plot of the two component columns, grouped by the 'class' column
# (the k-means labels), with 'class' also shown on hover.
rookies_pca_df.hvplot.scatter(
    x='principal_componet_1',
    y='prinicpal_componet_2',
    by='class',
    hover_cols=['class'],
)