In [9]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [10]:
# Location of the merged NBA CSV that the rest of the notebook loads.
# The default is the original hard-coded location (it looks like a Colab-mounted
# Google Drive path -- confirm the Drive is mounted before running there).
# Set the NBA_DATA_PATH environment variable to point at a different copy of
# the file without having to edit this cell.
data_path = os.environ.get(
    "NBA_DATA_PATH",
    "/content/drive/MyDrive/Data Intensive Computing project/data/NBA_data_merged_v2.csv",
)

In [11]:
# Read the CSV found at `data_path` into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=data_path)

In [12]:
# Preview the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,year,tm,player,age,g,per,ts.,x3par,ftr,orb.,drb.,trb.,ast.,stl.,blk.,tov.,usg.,ws,ws.48,obpm,dbpm,bpm,vorp.gm,offense,defense,mpg.1,tm.ortg,tm.drtg,truesalary,bbref.pos,yrs.experience,height,weight,rounded.position,Selected
0,2000,DAL,Michael Finley,26,82,19.2,0.522,0.151,0.194,3.8,12.2,8.0,21.3,1.6,0.6,9.9,25.1,8.3,0.114,2.9,-0.7,2.2,3.8,-0.42,-0.61,42.2,2.8,-3.0,16300000.0,3.0,4.0,79.0,215.0,2.0,0.0
1,2000,SEA,Gary Payton,31,82,23.6,0.535,0.312,0.254,3.2,13.9,8.5,37.7,2.3,0.4,10.8,26.8,13.9,0.195,6.6,-0.2,6.4,7.6,0.94,0.17,41.8,1.5,-0.4,24500000.0,1.0,9.0,76.0,180.0,1.0,0.0
2,2000,NYK,Latrell Sprewell,29,82,15.7,0.515,0.097,0.304,1.8,10.5,6.3,18.2,1.8,0.5,13.2,24.7,6.6,0.097,0.3,0.8,1.0,2.5,-1.41,-0.41,40.0,-1.9,3.3,13500000.0,3.0,7.0,77.0,190.0,2.0,0.0
3,2000,MIN,Kevin Garnett,23,81,23.6,0.545,0.053,0.265,7.9,26.6,17.1,21.5,1.9,2.8,13.6,27.4,11.6,0.172,3.1,2.6,5.7,6.6,-0.7,0.94,40.0,2.2,0.8,23000000.0,4.0,4.0,83.0,220.0,4.0,0.0
4,2000,VAN,Shareef Abdur-Rahim,23,82,20.2,0.547,0.075,0.431,8.0,22.7,15.3,15.5,1.5,1.9,14.1,25.0,8.8,0.132,1.7,0.3,2.0,3.5,-0.29,-0.04,39.3,-1.1,-4.5,14100000.0,3.0,3.0,81.0,225.0,4.0,0.0


In [13]:
# Columns that are excluded from the feature matrix: three identifier-like
# columns plus 'Selected', which is kept separately as the target.
non_stat_cols = ['year', 'tm', 'player', 'Selected']

# Target vector: the 'Selected' column of the raw frame.
y = df['Selected']

# Feature matrix: every remaining column once the non-stat columns are removed.
X = df.drop(labels=non_stat_cols, axis='columns')

In [14]:
# Check for missing values: info() prints each column's non-null count and dtype,
# so any column whose count is below the total number of entries contains NaNs.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8582 entries, 0 to 8581
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               8582 non-null   int64  
 1   g                 8582 non-null   int64  
 2   per               8582 non-null   float64
 3   ts.               8545 non-null   float64
 4   x3par             8542 non-null   float64
 5   ftr               8542 non-null   float64
 6   orb.              8582 non-null   float64
 7   drb.              8582 non-null   float64
 8   trb.              8582 non-null   float64
 9   ast.              8582 non-null   float64
 10  stl.              8582 non-null   float64
 11  blk.              8582 non-null   float64
 12  tov.              8554 non-null   float64
 13  usg.              8582 non-null   float64
 14  ws                8582 non-null   float64
 15  ws.48             8582 non-null   float64
 16  obpm              8582 non-null   float64


In [15]:
# Standardize every feature column with StandardScaler's default settings.
# The scaler is fitted on X and then applied to that same frame.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# The transform hands back an ndarray, so rebuild a DataFrame that carries
# the original column labels.
X = pd.DataFrame(data=X_scaled, columns=X.columns)

In [16]:
# Fill the NaN entries of X with a KNN imputer: 5 neighbours per missing value,
# weighted by distance (weights='distance').
imputer = KNNImputer(weights='distance', n_neighbors=5)
X_imputed = imputer.fit(X).transform(X)

# The imputer returns an ndarray; wrap it in a DataFrame again and keep the
# column labels.
X = pd.DataFrame(data=X_imputed, columns=X.columns)

In [17]:
# Fit a PCA on X without limiting the number of components, then report how
# much variance is explained as components are added one at a time.
pca = PCA(random_state=42).fit(X)

# Share of the total variance attributed to each principal component.
component_importances = pca.explained_variance_ratio_

# Running totals; both names keep their final values after the loop finishes.
variance_covered = 0
components_included = 0
for component_share in component_importances:
    variance_covered += component_share
    components_included += 1

    # sep='' joins the pieces exactly as the original string concatenation did.
    print('With ', str(components_included), ' PCA_components ', str(variance_covered*100),
          ' % of the variance is explained', sep='')
    

With 1 PCA_components 24.106119351702386 % of the variance is explained
With 2 PCA_components 45.65358202546834 % of the variance is explained
With 3 PCA_components 53.52765148250539 % of the variance is explained
With 4 PCA_components 60.2545099267495 % of the variance is explained
With 5 PCA_components 66.68337392422416 % of the variance is explained
With 6 PCA_components 71.1251277581909 % of the variance is explained
With 7 PCA_components 74.90208009835065 % of the variance is explained
With 8 PCA_components 78.55423143795811 % of the variance is explained
With 9 PCA_components 81.53068666668824 % of the variance is explained
With 10 PCA_components 84.22614819920553 % of the variance is explained
With 11 PCA_components 86.60250061668089 % of the variance is explained
With 12 PCA_components 88.87615545199336 % of the variance is explained
With 13 PCA_components 90.95795657129545 % of the variance is explained
With 14 PCA_components 92.8308517846278 % of the variance is explained
Wit