In [3]:
# Dependencies

# Data Manip
import pandas as pd

# Machine Learning
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Linear Algebra
import numpy as np

# Utils

# Visualization
import plotly.graph_objects as go

# Seed value reused below (NumPy global RNG here, `random_state` of the train/test split).
seed = 42
# Seeds NumPy's legacy global random state.
np.random.seed(seed)

In [4]:
# Data preprocessing

# Read the CSV, then keep only the columns whose name does not contain "capped".
data_path = "data/nba_filtered_capped.csv"
df = pd.read_csv(data_path).loc[:, lambda frame: ~frame.columns.str.contains("capped")]

In [5]:
# Train / Test split
target = "TARGET_5Yrs"

# Label vector is the target column; the feature matrix is every other column.
y = df[target].values
X = df.drop(columns=[target]).values

# Shuffled hold-out split, stratified on the label so both splits keep the
# class balance; `test_size` is the fraction of rows sent to the test set.
test_size=0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    shuffle=True,
    stratify=y,
    random_state=seed,
)


In [6]:
# PCA
# The scaler is fitted on the training split only, then applied to that split.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_train)

# PCA() is created without n_components; the number of components to keep is
# chosen afterwards from the cumulative explained variance.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

threshold = 0.99 # keep 99% of variance in data

# Bar chart of the explained-variance ratio of every principal component.
# One trace is added per component so that each bar keeps its own trace name.
fig_pca = go.Figure()
explained_var = pca.explained_variance_ratio_
for i, ratio in enumerate(explained_var):
    fig_pca.add_trace(go.Bar(
        x=[f"PC{i+1}"],
        y=[ratio],
        name=f"PC{i+1}"
    ))
fig_pca.update_layout(
    title="PCA Axis Contribution (Explained Variance Ratio)",
    xaxis_title="Principal Component",
    yaxis_title="Explained Variance Ratio",
    showlegend=False
)
fig_pca.show()

# Smallest number of leading components whose cumulative explained variance
# reaches `threshold`. `cum_var` is non-decreasing (it is a cumulative sum of
# non-negative ratios), so searchsorted finds the first index at or above the
# threshold. The result is clamped to the number of components: if rounding
# keeps the cumulative sum just below the threshold (e.g. threshold = 1.0),
# all components are kept instead of silently falling back to a single one.
cum_var = np.cumsum(explained_var)
n_components_needed = min(int(np.searchsorted(cum_var, threshold)) + 1, len(cum_var))
print(f"Number of components to reach {threshold*100:.1f}% variance:", n_components_needed)

Number of components to reach 99.0% variance: 12


In [9]:
# PCA Visu

# Row labels for the loading table: every column of df except "Name" and the target.
feature_names = [col for col in df.columns if col not in ("Name", target)]

# Squared loadings of the retained components (rows: features, columns: PCs).
prop_var = pd.DataFrame(
    np.square(pca.components_[:n_components_needed].T),
    index=feature_names,
    columns=[f"PC{k}" for k in range(1, n_components_needed + 1)],
)

# prop_var = prop_var.drop(columns="PC4") # focus on certain axis (link with L1 regularization in Logistic Regression)

# Per-feature average of the squared loadings over the retained components.
# NOTE(review): this average is not weighted by each component's explained
# variance ratio — confirm that "proportion of variance" is the intended reading.
sum_prop_var = prop_var.sum(axis=1).div(n_components_needed)

# Single teal bar series, one bar per feature.
fig_sum_prop_var = go.Figure(
    data=[
        go.Bar(
            x=sum_prop_var.index,
            y=sum_prop_var.values,
            marker_color="teal",
        )
    ]
)
fig_sum_prop_var.update_layout(
    title=f"Proportion of Variance Associated with Each Feature on First {n_components_needed} PCs",
    xaxis_title="Feature",
    yaxis_title="Proportion of Variance",
    xaxis_tickangle=-45,
    width=1200,
    height=500,
)
fig_sum_prop_var.show()

Features with a **high value** account for the most variance in the dataset (around 8% for GP, FG%, FT%, ...) and reflect what seems to **distinguish NBA players** from one another (in this dataset), which makes them important criteria for selecting players. <br>