# 🎯 Player Clustering — Discovering Archetypes

## Key Findings
- K-means reveals **3 distinct player archetypes**: High Scorers, Mid-Range Contributors, and Low-Volume players
- The **High Scorer cluster** averages 2-3× the PPG of the Mid-Range group
- Shot efficiency correlates with volume — the best scorers are also the most efficient
- Clustering is useful for identifying players who "punch above their weight" in efficiency

---

In [None]:
%matplotlib inline
import sqlite3, pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings; warnings.filterwarnings('ignore')

# Plot styling applied to the figures drawn after this point.
sns.set_theme(style="whitegrid")
plt.rcParams['figure.dpi'] = 120

# Aggregate every player's totals across all of their player_stats rows and
# keep only players with at least 5 games played in total.
conn = sqlite3.connect("../data/playhq.db")
try:
    df = pd.read_sql("""
    SELECT p.id, p.first_name || ' ' || p.last_name as name,
        SUM(ps.games_played) as gp, SUM(ps.total_points) as pts,
        SUM(ps.one_point) as ft, SUM(ps.two_point) as fg2,
        SUM(ps.three_point) as fg3, SUM(ps.total_fouls) as fouls
    FROM player_stats ps JOIN players p ON p.id = ps.player_id
    GROUP BY p.id HAVING SUM(ps.games_played) >= 5
""", conn)
finally:
    # Close the handle even when the query raises (missing table, bad path, ...)
    # so a failed run does not leave the database connection open.
    conn.close()

# Per-game rates; gp >= 5 is guaranteed by the HAVING clause, so no zero division.
df["ppg"] = df["pts"] / df["gp"]
df["fpg"] = df["fouls"] / df["gp"]

# Points per scoring event. Presumably one_point/two_point/three_point are
# counts of made shots -- TODO confirm against the schema.
makes = df["ft"] + df["fg2"] + df["fg3"]
# Players with no recorded makes get an efficiency of 0 instead of inf/NaN.
df["efficiency"] = np.where(makes > 0, df["pts"] / makes, 0)
print(f"Players with 5+ games: {len(df):,}")

## Elbow Method — Finding Optimal K

In [None]:
# Clustering inputs, standardized with StandardScaler before fitting.
features = ["ppg", "fpg", "efficiency"]
X = StandardScaler().fit_transform(df[features])

# Fit one K-means model per candidate K (2 through 8) and record its inertia.
K_range = range(2, 9)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
    for k in K_range
]

# Elbow plot: inertia against K, with the chosen K=3 marked by a dashed line.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(K_range, inertias, "o-", color="#1976D2", lw=2)
ax.set_title("Elbow Method for K-Means", fontweight="bold")
ax.set(xlabel="Number of Clusters (K)", ylabel="Inertia")
ax.axvline(3, color="#D32F2F", ls="--", alpha=0.7, label="K=3 (chosen)")
ax.legend()
plt.tight_layout()
plt.show()

## K=3 Clustering

In [None]:
# Final model: K=3 on the standardized feature matrix X.
km = KMeans(n_clusters=3, random_state=42, n_init=10)
df["cluster"] = km.fit_predict(X)

# Label clusters by avg PPG.
# Clusters are ranked by their mean PPG and the archetype names are handed out
# in that order. Comparing each mean against fixed multiples of the overall
# median (the previous approach) could give two clusters the same name or leave
# an archetype with no members; ranking assigns every name exactly once.
# NOTE(review): the label strings look mis-encoded; they are left untouched
# here because other cells look archetypes up by these exact strings.
cluster_ppg = df.groupby("cluster")["ppg"].mean().sort_values()
archetype_order = ["üìâ Low Volume", "üìä Mid-Range", "‚≠ê High Scorers"]
labels = dict(zip(cluster_ppg.index, archetype_order))

df["archetype"] = df["cluster"].map(labels)

# Per-cluster summary: size and the mean of each per-game metric.
for c in sorted(labels):
    sub = df[df["cluster"] == c]
    print(f"\n{labels[c]} (n={len(sub):,})")
    print(f"  Avg PPG: {sub['ppg'].mean():.2f}  |  Avg FPG: {sub['fpg'].mean():.2f}  |  Avg Efficiency: {sub['efficiency'].mean():.2f}")
    print(f"  Avg Games: {sub['gp'].mean():.1f}")

In [None]:
# Archetype label -> marker color; iteration order also fixes the legend order.
colors = {"‚≠ê High Scorers": "#D32F2F", "üìä Mid-Range": "#1976D2", "üìâ Low Volume": "#9E9E9E"}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel:
# (y column, y-axis label, title, whether legend entries include member counts)
panels = [
    ("fpg", "Fouls Per Game", "Player Clusters: PPG vs FPG", True),
    ("efficiency", "Shot Efficiency", "Player Clusters: PPG vs Shot Efficiency", False),
]

for ax, (y_col, y_label, title, show_counts) in zip(axes, panels):
    # One scatter layer per archetype so each gets its own color and legend entry.
    for arch, color in colors.items():
        members = df[df["archetype"] == arch]
        legend_text = f"{arch} (n={len(members):,})" if show_counts else arch
        ax.scatter(members["ppg"], members[y_col], alpha=0.4, s=12, color=color, label=legend_text)
    ax.set_title(title, fontweight="bold")
    ax.set_xlabel("Points Per Game")
    ax.set_ylabel(y_label)
    ax.legend(fontsize=9)

plt.tight_layout()
plt.savefig("../assets/clustering.png", dpi=150, bbox_inches="tight")
plt.show()

## Notable Players by Cluster

High scorers who also maintain elite efficiency are the true standouts.

In [None]:
# Top 10 players by PPG for the two higher-scoring archetypes.
for arch in ["‚≠ê High Scorers", "üìä Mid-Range"]:
    print(f"\nTop 10 {arch} by PPG:")
    top = df[df["archetype"] == arch].nlargest(10, "ppg")[["name", "gp", "ppg", "fpg", "efficiency"]]
    # Number the rows 1..len(top). An archetype with fewer than 10 players
    # returns fewer rows, and assigning a fixed 10-element index would raise
    # a length-mismatch ValueError.
    top.index = range(1, len(top) + 1)
    display(top.round(2))