# 🏀 Player Comparison Tool

## The Analytics Behind the Matchup

*Every basketball player tells a story through their numbers. Some are snipers from beyond the arc, others bruise their way to the basket, and a rare few do it all. This notebook gives you the tools to decode those stories — comparing any two players head-to-head, breaking down their scoring DNA, and using machine learning to classify every player in the database into distinct archetypes.*

*Think of it as a scouting report generator — FiveThirtyEight meets your local basketball league.*

---

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import ipywidgets as widgets
from IPython.display import display, HTML, Markdown
import warnings
warnings.filterwarnings('ignore')

# Shared colour palette (FiveThirtyEight-inspired) used by every chart below.
COLORS = dict(
    primary='#FF6B35',
    secondary='#004E89',
    accent='#2EC4B6',
    dark='#2D3436',
    light='#DFE6E9',
    bg='#FAFAFA',
    # Cycled through when one colour per archetype is needed.
    archetypes=['#FF6B35', '#004E89', '#2EC4B6', '#E63946', '#457B9D', '#F4A261'],
)

# Layout keyword arguments shared by the Plotly figures in this notebook.
# Polar and pie figures drop 'plot_bgcolor' before unpacking these.
LAYOUT_DEFAULTS = {
    'font': {'family': 'Arial, sans-serif'},
    'plot_bgcolor': COLORS['bg'],
    'paper_bgcolor': 'white',
    'margin': {'t': 60, 'b': 40, 'l': 40, 'r': 40},
}

print('‚úÖ Dependencies loaded.')

## 1. Load & Prepare the Data

*We pull every player who's actually seen the court — at least 3 games played — and compute per-game averages. Raw totals lie; per-game numbers tell the truth.*

In [None]:
# Load one row per player/grade record (minimum 3 games) with per-game rates.
# The connection is closed in a `finally` so a failing query (missing table,
# locked file, ...) does not leave the database handle open.
conn = sqlite3.connect(r'../data/playhq.db')
try:
    df = pd.read_sql_query("""
    SELECT 
        p.id AS player_id,
        p.first_name || ' ' || p.last_name AS player_name,
        p.first_name,
        p.last_name,
        ps.team_name,
        g.name AS grade_name,
        g.season,
        c.name AS competition_name,
        ps.games_played,
        ps.total_points,
        ps.one_point,
        ps.two_point,
        ps.three_point,
        ps.total_fouls,
        ps.ranking,
        CAST(ps.total_points AS FLOAT) / MAX(ps.games_played, 1) AS ppg,
        CAST(ps.one_point AS FLOAT) / MAX(ps.games_played, 1) AS ft_pg,
        CAST(ps.two_point AS FLOAT) / MAX(ps.games_played, 1) AS twos_pg,
        CAST(ps.three_point AS FLOAT) / MAX(ps.games_played, 1) AS threes_pg,
        CAST(ps.total_fouls AS FLOAT) / MAX(ps.games_played, 1) AS fouls_pg
    FROM player_stats ps
    JOIN players p ON p.id = ps.player_id
    JOIN grades g ON g.grade_id = ps.grade_id
    JOIN competitions c ON c.competition_id = g.competition_id
    WHERE ps.games_played >= 3
""", conn)
finally:
    conn.close()

# Compute scoring distribution percentages.
# Despite its name, `total_shots` is the number of POINTS scored, weighting each
# made shot by its value (1/2/3); the pct_* columns are shares of those points.
total_shots = df['one_point'] + df['two_point'] * 2 + df['three_point'] * 3
# np.where keeps scoreless records at 0% instead of dividing by zero.
df['pct_1pt'] = np.where(total_shots > 0, df['one_point'] / total_shots * 100, 0)
df['pct_2pt'] = np.where(total_shots > 0, (df['two_point'] * 2) / total_shots * 100, 0)
df['pct_3pt'] = np.where(total_shots > 0, (df['three_point'] * 3) / total_shots * 100, 0)

print(f'üìä {len(df):,} player-season records loaded ({df["player_id"].nunique():,} unique players).')
print(f'   Avg PPG: {df["ppg"].mean():.1f} | Avg games: {df["games_played"].mean():.1f}')
df.head(3)

## 2. Interactive Player Selector

*Type a name, pick a player, and let the data do the talking. The dropdowns update dynamically — search by first or last name.*

In [None]:
# Build player lookup: deduplicate by taking best season (most points)
player_best = df.sort_values('total_points', ascending=False).drop_duplicates('player_id')
player_options = (
    player_best[['player_id', 'player_name', 'team_name', 'grade_name']]
    .assign(label=lambda x: x['player_name'] + ' (' + x['team_name'] + ' ‚Äî ' + x['grade_name'] + ')')
    .sort_values('player_name')
)
# (label, player_id) pairs, the option format the Dropdown widgets are given.
options_list = list(zip(player_options['label'], player_options['player_id']))

style = '<style>.widget-dropdown select { font-size: 13px; }</style>'
display(HTML(style))

# Player B defaults to the second entry (or the only entry when there is just
# one). With no qualifying players the list is empty and indexing it would
# raise IndexError, so fall back to "nothing selected".
default_player_b = options_list[min(1, len(options_list) - 1)][1] if options_list else None

player_a_dd = widgets.Dropdown(
    options=options_list, description='Player A:', 
    style={'description_width': '80px'}, layout=widgets.Layout(width='600px')
)
player_b_dd = widgets.Dropdown(
    options=options_list, value=default_player_b,
    description='Player B:',
    style={'description_width': '80px'}, layout=widgets.Layout(width='600px')
)

search_box = widgets.Text(placeholder='Type to filter players...', description='Search:',
                          layout=widgets.Layout(width='400px'))

def filter_players(change):
    """Restrict both player dropdowns to labels containing the search text.

    `change` is the change dict passed by the observed Text widget; only its
    'new' entry (the current text) is read. Matching is a case-insensitive
    substring test against each option label. When nothing matches, the
    dropdowns are left as they are.
    """
    needle = change['new'].lower()
    matches = [(label, player_id) for label, player_id in options_list
               if needle in label.lower()]
    if not matches:
        return
    player_a_dd.options = matches
    player_b_dd.options = matches


# Re-filter whenever the search text changes.
search_box.observe(filter_players, names='value')

display(widgets.VBox(children=[search_box, player_a_dd, player_b_dd]))
print('üëÜ Select two players above, then run the cells below.')

## 3. Head-to-Head Comparison Framework

*The tale of the tape. We line up the two selected players across every key metric and tell you who has the edge — and by how much.*

In [None]:
def get_player_data(pid):
    """Return the record used to represent player `pid`, or None if absent.

    No aggregation is performed: of all rows in `df` whose 'player_id' equals
    `pid`, the single row with the highest 'games_played' is returned.
    """
    records = df.loc[df['player_id'] == pid]
    if len(records) == 0:
        return None
    # Most-played record first; that row stands in for the player.
    ranked = records.sort_values(by='games_played', ascending=False)
    return ranked.iloc[0]

def _format_metric(value, fmt):
    """Format one table cell, tolerating missing values and float-typed integers.

    pandas stores an integer column that contains NULLs as float64 (with NaN),
    and the 'd' format code raises ValueError for floats. Missing values are
    shown as 'N/A'; whole-number metrics are coerced to int before formatting.
    """
    if pd.isna(value):
        return 'N/A'
    if fmt == 'd':
        value = int(round(value))
    return format(value, fmt)


# Records for the two players currently selected in the dropdowns.
a = get_player_data(player_a_dd.value)
b = get_player_data(player_b_dd.value)

if a is None or b is None:
    print('‚ö†Ô∏è Could not find one of the selected players.')
else:
    # (display label, column, format spec, True when a higher value is better)
    metrics = [
        ('Points Per Game', 'ppg', '.1f', True),
        ('Games Played', 'games_played', 'd', True),
        ('Total Points', 'total_points', 'd', True),
        ('Free Throws/Game', 'ft_pg', '.1f', True),
        ('2-Pointers/Game', 'twos_pg', '.1f', True),
        ('3-Pointers/Game', 'threes_pg', '.1f', True),
        ('Fouls/Game', 'fouls_pg', '.1f', False),  # lower is better
        ('Ranking', 'ranking', 'd', False),  # lower is better
    ]
    
    rows = []
    a_wins, b_wins = 0, 0
    for label, col, fmt, higher_better in metrics:
        va, vb = a[col], b[col]
        # The arrow points at the winning side; a comparison involving NaN is
        # False both ways and therefore scores as a tie.
        if higher_better:
            winner = '‚óÄ' if va > vb else ('‚ñ∂' if vb > va else '=')
        else:
            winner = '‚óÄ' if va < vb else ('‚ñ∂' if vb < va else '=')
        if winner == '‚óÄ':
            a_wins += 1
        elif winner == '‚ñ∂':
            b_wins += 1
        rows.append(f"| {_format_metric(va, fmt)} | {winner} **{label}** {winner} | {_format_metric(vb, fmt)} |")
    
    header = f"### ü•ä {a['player_name']}  vs  {b['player_name']}\n\n"
    header += f"*{a['team_name']} ({a['grade_name']}) vs {b['team_name']} ({b['grade_name']})*\n\n"
    header += f"| {a['player_name']} | Metric | {b['player_name']} |\n|---:|:---:|:---|\n"
    
    verdict = f"\n\n**Edge:** {a['player_name']} wins {a_wins} categories, {b['player_name']} wins {b_wins}."
    if a_wins > b_wins:
        verdict += f" **{a['player_name']} has the statistical edge.**"
    elif b_wins > a_wins:
        verdict += f" **{b['player_name']} has the statistical edge.**"
    else:
        verdict += " **Dead even ‚Äî this one's a coin flip.**"
    
    display(Markdown(header + '\n'.join(rows) + verdict))

## 4. Radar Chart — Per-Game Profile

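*Radar charts are the fingerprint of a player's game. Each axis is a stat converted to a percentile rank across every player-season record, so you can see who sits near the top of the population (outer ring) versus near the bottom (inner ring). Note that Fouls/G is ranked the same way: a point further out on that axis means more fouls, not better discipline.*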

In [None]:
# Radar axes as (dataframe column, axis label) pairs, in plotting order.
_RADAR_AXES = [
    ('ppg', 'Points/Game'),
    ('ft_pg', 'Free Throws/G'),
    ('twos_pg', '2-Pointers/G'),
    ('threes_pg', '3-Pointers/G'),
    ('fouls_pg', 'Fouls/G'),
    ('games_played', 'Games Played'),
]
radar_cols = [column for column, _ in _RADAR_AXES]
radar_labels = [axis_label for _, axis_label in _RADAR_AXES]

# Percentile rank (0-100) of every record within each radar column.
percentiles = df[radar_cols].rank(pct=True).mul(100)

def player_percentiles(pid):
    """Return the percentile row for player `pid`'s most-played record.

    Among the rows of `df` with this 'player_id', the one with the highest
    'games_played' is chosen and its row of `percentiles` is returned.
    Raises IndexError when `df` holds no row for `pid`.
    """
    player_rows = df.loc[df['player_id'] == pid]
    by_games = player_rows.sort_values(by='games_played', ascending=False)
    top_label = by_games.index[0]
    return percentiles.loc[top_label]

def _translucent(color, alpha=0.15):
    """Return `color` as an 'rgba(r,g,b,alpha)' string for use as a fill.

    Handles '#RRGGBB' hex strings and 'rgb(...)' strings; any other value is
    returned unchanged. An rgba() string is used instead of appending an
    alpha byte to the hex code because 8-digit hex colours are not a format
    Plotly's Python colour validation is documented to accept.
    """
    if color.startswith('#') and len(color) == 7:
        red, green, blue = (int(color[i:i + 2], 16) for i in (1, 3, 5))
        return f'rgba({red},{green},{blue},{alpha})'
    if color.startswith('rgb(') and color.endswith(')'):
        return f'rgba({color[4:-1]},{alpha})'
    return color


# Percentile rows for the two players currently selected in the dropdowns.
pa = player_percentiles(player_a_dd.value)
pb = player_percentiles(player_b_dd.value)

fig = go.Figure()
for pdata, name, color in [(pa, a['player_name'], COLORS['primary']), 
                            (pb, b['player_name'], COLORS['secondary'])]:
    vals = pdata[radar_cols].tolist()
    vals += [vals[0]]  # close the polygon
    fig.add_trace(go.Scatterpolar(
        r=vals, theta=radar_labels + [radar_labels[0]],
        fill='toself', name=name, 
        fillcolor=_translucent(color),
        line=dict(color=color, width=2.5),
    ))

fig.update_layout(
    title=dict(text='Per-Game Stat Percentiles', font=dict(size=18)),
    polar=dict(
        radialaxis=dict(visible=True, range=[0, 100], ticksuffix='th'),
        bgcolor=COLORS['bg']
    ),
    # 'plot_bgcolor' is dropped from the shared defaults; the polar background
    # is set through polar.bgcolor above.
    showlegend=True, **{k:v for k,v in LAYOUT_DEFAULTS.items() if k != 'plot_bgcolor'},
    height=520, width=650,
)
fig.show()

## 5. Scoring Style Breakdown

*Where do the points come from? A player's shot distribution reveals their identity — are they living at the free throw line, grinding in the paint, or pulling up from deep? We break it down side by side.*

In [None]:
# Slice labels for the two donut charts, in the same order as the pct_* values.
categories = ['1-Point (FT)', '2-Point', '3-Point']
slice_colors = [COLORS['accent'], COLORS['primary'], COLORS['secondary']]

# One donut per selected player, side by side.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'pie'}, {'type': 'pie'}]],
    subplot_titles=[a['player_name'], b['player_name']],
)

for column, player in enumerate((a, b), start=1):
    shares = [player['pct_1pt'], player['pct_2pt'], player['pct_3pt']]
    donut = go.Pie(
        labels=categories, values=shares, hole=0.45,
        marker_colors=slice_colors,
        textinfo='label+percent', textposition='outside',
        textfont_size=12,
    )
    fig.add_trace(donut, row=1, col=column)

# Shared layout defaults minus 'plot_bgcolor'.
shared_layout = {key: value for key, value in LAYOUT_DEFAULTS.items() if key != 'plot_bgcolor'}
fig.update_layout(
    title=dict(text='Scoring Distribution ‚Äî Where Do The Points Come From?', font=dict(size=16)),
    showlegend=False, height=400, width=750,
    **shared_layout,
)
fig.show()

# Narrative: name each player's largest scoring source (first one wins a tie).
for player in (a, b):
    sources = [
        ('free throw line', player['pct_1pt']),
        ('mid-range/paint', player['pct_2pt']),
        ('three-point land', player['pct_3pt']),
    ]
    source_name, share = max(sources, key=lambda pair: pair[1])
    print(f"üîπ {player['player_name']}: {share:.0f}% of points from {source_name}. "
          f"PPG: {player['ppg']:.1f} on {player['games_played']:.0f} games.")

## 6. Player Archetype Clustering (K-Means)

*Here's where it gets interesting. We feed every qualified player's per-game stats into a K-means clustering algorithm and let the data sort them into natural archetypes. No human bias — just math finding the patterns that emerge when thousands of players are plotted in statistical space.*

*The algorithm identifies distinct player types that the eye test has always known existed, but now we can quantify them.*

In [None]:
# Per-game features the clustering runs on.
cluster_cols = ['ppg', 'ft_pg', 'twos_pg', 'threes_pg', 'fouls_pg']
# Keep only records with at least one point (the >=3 games filter was applied
# when the data was loaded).
has_points = df['total_points'] > 0
cluster_df = df.loc[has_points].copy()

# Standardise the features before clustering.
scaler = StandardScaler()
X = cluster_df[cluster_cols].to_numpy()
X_scaled = scaler.fit_transform(X)

# The number of archetypes is fixed at 5; no elbow search is run in this cell.
N_CLUSTERS = 5
km = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
cluster_df['cluster'] = km.fit_predict(X_scaled)

# Cluster centres mapped back to original (per-game) units for labelling.
centres = pd.DataFrame(
    data=scaler.inverse_transform(km.cluster_centers_),
    columns=cluster_cols,
)
centres['cluster'] = range(N_CLUSTERS)

def label_archetype(row):
    """Pick an archetype name for one cluster centre.

    `row` is a row of `centres`. The stats are checked in a fixed priority
    order and the first one for which this centre holds the maximum across
    all centres decides the label; a centre that leads in none of them gets
    the balanced label.
    """
    priority = (
        ('threes_pg', 'üéØ Sharpshooter'),
        ('twos_pg', 'üí™ Inside Scorer'),
        ('ppg', '‚≠ê High Volume Scorer'),
        ('fouls_pg', 'üî• Physical/Aggressive'),
    )
    for stat, archetype_name in priority:
        if row[stat] == centres[stat].max():
            return archetype_name
    return '‚öñÔ∏è Balanced Contributor'

# Name each cluster centre, then carry the names over to the player records.
centres['archetype'] = centres.apply(label_archetype, axis=1)
archetype_map = {
    cluster_id: archetype_name
    for cluster_id, archetype_name in zip(centres['cluster'], centres['archetype'])
}
cluster_df['archetype'] = cluster_df['cluster'].map(archetype_map)

# Size of each archetype, largest first.
print('üè∑Ô∏è Archetype Distribution:\n')
archetype_counts = cluster_df['archetype'].value_counts()
for arch, count in archetype_counts.items():
    share = count / len(cluster_df) * 100
    print(f'   {arch}: {count:,} players ({share:.1f}%)')

print('\nüìä Cluster Centres (per-game averages):')
centre_table = centres[['archetype'] + cluster_cols]
display(centre_table.round(2))

In [None]:
# Visualise clusters in 2D (PPG vs 3-pointers/game, sized by games played)
fig = px.scatter(
    cluster_df, x='ppg', y='threes_pg', color='archetype',
    hover_data=['player_name', 'team_name', 'grade_name', 'games_played'],
    color_discrete_sequence=COLORS['archetypes'],
    size='games_played', size_max=12, opacity=0.6,
    labels={'ppg': 'Points Per Game', 'threes_pg': '3-Pointers Per Game'},
    title='Player Archetypes ‚Äî Every Player, Classified',
    height=550, width=850,
)

# Highlight our two selected players
for p, marker_sym, name in [(a, 'star', a['player_name']), (b, 'diamond', b['player_name'])]:
    fig.add_trace(go.Scatter(
        x=[p['ppg']], y=[p['threes_pg']], mode='markers+text',
        marker=dict(size=18, symbol=marker_sym, color='black', line=dict(width=2, color='white')),
        text=[name], textposition='top center', textfont=dict(size=11, color='black'),
        name=name, showlegend=True,
    ))

fig.update_layout(**LAYOUT_DEFAULTS)
fig.show()

# Report archetypes for selected players
for p in [a, b]:
    # A player can have several records that land in different clusters, so
    # prefer the exact record being compared (cluster_df keeps df's index
    # labels). Fall back to the player's first clustered record only when
    # that record was excluded from clustering.
    if p.name in cluster_df.index:
        arch = cluster_df.at[p.name, 'archetype']
    else:
        match = cluster_df[cluster_df['player_id'] == p['player_id']]
        arch = match.iloc[0]['archetype'] if not match.empty else None
    if arch is not None:
        print(f"üîπ {p['player_name']} is classified as: {arch}")

In [None]:
# Archetype radar: one polygon per cluster centre, each stat scaled by the
# largest value of that stat among the centres.
axis_labels = ['PPG', 'FT/G', '2PT/G', '3PT/G', 'Fouls/G']
closed_labels = axis_labels + axis_labels[:1]  # repeat the first axis to close the shape
column_maxima = centres[cluster_cols].max().tolist()
palette = COLORS['archetypes']

fig = go.Figure()
for position, (_, centre) in enumerate(centres.iterrows()):
    centre_values = centre[cluster_cols].tolist()
    # Normalise to 0-1; a non-positive maximum maps to 0 to avoid dividing by it.
    scaled = [
        value / ceiling if ceiling > 0 else 0
        for value, ceiling in zip(centre_values, column_maxima)
    ]
    scaled.append(scaled[0])
    fig.add_trace(go.Scatterpolar(
        r=scaled, theta=closed_labels, fill='toself',
        name=centre['archetype'],
        line=dict(color=palette[position % len(palette)]),
    ))

polar_layout = {key: value for key, value in LAYOUT_DEFAULTS.items() if key != 'plot_bgcolor'}
fig.update_layout(
    title=dict(text='Archetype DNA ‚Äî What Defines Each Type?', font=dict(size=16)),
    polar=dict(radialaxis=dict(visible=True, range=[0, 1.1])),
    height=520, width=700,
    **polar_layout,
)
fig.show()

## 7. Quick-Compare Widget

*Don't want to re-run cells every time? Use this all-in-one widget — pick two players and get the full comparison instantly.*

In [None]:
# Output area the comparison is rendered into.
output = widgets.Output()

# Player B defaults to the second option (or the only one). An empty option
# list cannot be indexed, so fall back to "nothing selected" instead of
# raising IndexError.
default_w_b = options_list[min(1, len(options_list) - 1)][1] if options_list else None

w_a = widgets.Dropdown(options=options_list, description='Player A:',
                       style={'description_width': '80px'}, layout=widgets.Layout(width='600px'))
w_b = widgets.Dropdown(options=options_list, value=default_w_b,
                       description='Player B:',
                       style={'description_width': '80px'}, layout=widgets.Layout(width='600px'))
w_search = widgets.Text(placeholder='Filter by name...', description='Search:',
                        layout=widgets.Layout(width='400px'))

def on_search(change):
    """Restrict the quick-compare dropdowns to labels containing the search text.

    Reads the new text from `change['new']` and matches it, case-insensitively,
    as a substring of each option label. The dropdowns keep their current
    options when nothing matches.
    """
    needle = change['new'].lower()
    matches = [(label, player_id) for label, player_id in options_list
               if needle in label.lower()]
    if not matches:
        return
    w_a.options = matches
    w_b.options = matches


# Re-filter whenever the search text changes.
w_search.observe(on_search, names='value')

# Button that triggers the comparison.
btn = widgets.Button(
    description='‚ö° Compare!',
    button_style='warning',
    layout=widgets.Layout(width='200px', height='36px'),
)

def run_comparison(_):
    """Render the full comparison for the two selected players into `output`.

    Button click handler; the argument (the clicked button) is ignored. Clears
    the output area, then draws a percentile radar, a grouped bar chart of the
    scoring split, and prints each player's archetype with PPG and games
    played. Prints a warning and stops if either player has no record.
    """
    output.clear_output()
    with output:
        pa_data = get_player_data(w_a.value)
        pb_data = get_player_data(w_b.value)
        if pa_data is None or pb_data is None:
            print('‚ö†Ô∏è Player not found.')
            return

        # Shared layout defaults minus 'plot_bgcolor', used for the polar figure.
        polar_layout = {k: v for k, v in LAYOUT_DEFAULTS.items() if k != 'plot_bgcolor'}

        # Radar
        fig = go.Figure()
        for pid, pdata, color in [
            (w_a.value, pa_data, COLORS['primary']),
            (w_b.value, pb_data, COLORS['secondary']),
        ]:
            pp = player_percentiles(pid)
            # Repeat the first value so the polygon closes.
            vals = pp[radar_cols].tolist() + [pp[radar_cols[0]]]
            fig.add_trace(go.Scatterpolar(
                r=vals, theta=radar_labels + [radar_labels[0]],
                fill='toself', name=pdata['player_name'],
                line=dict(color=color, width=2.5),
            ))
        fig.update_layout(
            title='Percentile Comparison', height=450, width=600,
            polar=dict(radialaxis=dict(range=[0, 100], ticksuffix='th')),
            **polar_layout,
        )
        fig.show()

        # Scoring bars
        cats = ['1PT', '2PT', '3PT']
        fig2 = go.Figure()
        for pdata, color in [(pa_data, COLORS['primary']), (pb_data, COLORS['secondary'])]:
            shares = [pdata['pct_1pt'], pdata['pct_2pt'], pdata['pct_3pt']]
            fig2.add_trace(go.Bar(
                x=cats, y=shares,
                name=pdata['player_name'], marker_color=color,
                text=[f"{v:.0f}%" for v in shares],
                textposition='outside',
            ))
        fig2.update_layout(
            title='Scoring Style', barmode='group', height=350, width=500,
            yaxis_title='% of Total Points', **LAYOUT_DEFAULTS,
        )
        fig2.show()

        # Archetypes
        for pdata in (pa_data, pb_data):
            # A player can have several records in different clusters, so use
            # the exact record being compared (cluster_df keeps df's index
            # labels); fall back to the player's first clustered record only
            # when that record was excluded from clustering.
            if pdata.name in cluster_df.index:
                arch = cluster_df.at[pdata.name, 'archetype']
            else:
                match = cluster_df[cluster_df['player_id'] == pdata['player_id']]
                arch = match.iloc[0]['archetype'] if not match.empty else 'Unknown'
            print(f"{pdata['player_name']}: {arch} | {pdata['ppg']:.1f} PPG | {pdata['games_played']:.0f} GP")


btn.on_click(run_comparison)
display(widgets.VBox([w_search, w_a, w_b, btn, output]))

---

*Built with PlayHQ data, Plotly, and scikit-learn. FiveThirtyEight-inspired analysis for community basketball.*

*© 2025 FullCourtVision*