# NBA Insights Design

Goal: Design simple, rule-based insights for the SportIQ NBA model using
the processed B2B feature table. These insights will power the
`/insights/{game_id}` endpoint and the frontend "Insights" panel.

## 1. Setup and Data Load

In [1]:
from pathlib import Path
import sys
import pandas as pd

# Assume you're in sportiq-app/notebooks
PROJECT_ROOT = Path.cwd().parents[0]
sys.path.append(str(PROJECT_ROOT))

from src.paths import PROCESSED_DIR

df = pd.read_parquet(PROCESSED_DIR / "processed_games_b2b_model.parquet")
df.head()

Unnamed: 0,date,start_et,away_team,home_team,attend,arena,season,source_file,home_win,home_win_pct_10,...,home_last_pd,away_win_pct_10,away_avg_pd_10,away_season_win_pct,away_recent_win_pct_20g,away_days_rest,away_last_pd,home_b2b,away_b2b,game_id
0,2015-10-29,7:00p,Memphis Grizzlies,Indiana Pacers,18165.0,Bankers Life Fieldhouse,2015-16_NBA,oct.xls,0,0.0,...,-1.0,0.0,-1.0,0.0,0.0,1.0,-1.0,0,0,0
1,2015-10-29,10:30p,Dallas Mavericks,Los Angeles Clippers,19218.0,STAPLES Center,2015-16_NBA,oct.xls,1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,1
2,2015-10-29,8:00p,Atlanta Hawks,New York Knicks,19812.0,Madison Square Garden (IV),2015-16_NBA,oct.xls,0,1.0,...,1.0,0.0,-1.0,0.0,0.0,2.0,-1.0,0,0,2
3,2015-10-30,8:00p,Charlotte Hornets,Atlanta Hawks,17024.0,Philips Arena,2015-16_NBA,oct.xls,1,0.5,...,1.0,0.0,-1.0,0.0,0.0,2.0,-1.0,1,0,3
4,2015-10-30,7:30p,Toronto Raptors,Boston Celtics,16898.0,TD Garden,2015-16_NBA,oct.xls,0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0,0,4


## 2. Inspect Data

In [2]:
df.columns.tolist()
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,11508.0,2020-01-28 16:54:18.394160640,2015-10-29 00:00:00,2017-11-24 00:00:00,2020-01-03 12:00:00,2022-03-23 00:00:00,2024-06-17 00:00:00,
attend,11324.0,16326.446044,0.0,16032.75,18064.0,19419.25,68323.0,5317.781076
home_win,11508.0,0.570125,0.0,0.0,1.0,1.0,1.0,0.49508
home_win_pct_10,11508.0,0.498765,0.0,0.4,0.5,0.6,1.0,0.203878
home_avg_pd_10,11508.0,-0.00247,-1.0,-0.2,0.0,0.2,1.0,0.407757
home_season_win_pct,11508.0,0.50721,0.0,0.388889,0.516129,0.630137,1.0,0.185033
home_recent_win_pct_20g,11508.0,0.499412,0.0,0.4,0.5,0.6,1.0,0.173212
home_days_rest,11508.0,4.051616,1.0,2.0,2.0,2.0,288.0,17.555269
home_last_pd,11508.0,-0.008863,-1.0,-1.0,-1.0,1.0,1.0,1.000004
away_win_pct_10,11508.0,0.503111,0.0,0.4,0.5,0.6,1.0,0.204203


## 3. Difference Feature 

In [3]:
df["season_wp_diff"] = df["home_season_win_pct"] - df["away_season_win_pct"]
df["recent_wp_diff"] = df["home_recent_win_pct_20g"] - df["away_recent_win_pct_20g"]
df["rest_diff"] = df["home_days_rest"] - df["away_days_rest"]
df["last_pd_diff"] = df["home_last_pd"] - df["away_last_pd"]

df[["season_wp_diff", "recent_wp_diff", "rest_diff", "last_pd_diff"]].head()

Unnamed: 0,season_wp_diff,recent_wp_diff,rest_diff,last_pd_diff
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,1.0,1.0,-1.0,2.0
3,0.5,0.5,-1.0,2.0
4,0.0,0.0,0.0,0.0


## 4. Build Insight Rule Engine

In [4]:
def build_insights_for_row(row):
    insights = []

    # ---- Season Strength ----
    if abs(row["season_wp_diff"]) > 0.15:
        if row["season_wp_diff"] > 0:
            insights.append(("season_strength",
                f"{row['home_team']} have a stronger season performance (+{row['season_wp_diff']:.1%})."))
        else:
            insights.append(("season_strength",
                f"{row['away_team']} have a stronger season performance (+{-row['season_wp_diff']:.1%})."))

    # ---- Recent Form ----
    if abs(row["recent_wp_diff"]) > 0.20:
        if row["recent_wp_diff"] > 0:
            insights.append(("recent_form",
                f"{row['home_team']} are in better recent form (last 20 games)."))
        else:
            insights.append(("recent_form",
                f"{row['away_team']} are in better recent form (last 20 games)."))

    # ---- Rest Difference ----
    if abs(row["rest_diff"]) >= 2 and row["home_days_rest"] < 20 and row["away_days_rest"] < 20:
        if row["rest_diff"] > 0:
            insights.append(("rest",
                f"{row['home_team']} are more rested (+{row['rest_diff']} days)."))
        else:
            insights.append(("rest",
                f"{row['away_team']} are more rested (+{-row['rest_diff']} days)."))

    # ---- Back-to-back ----
    if row["home_b2b"] == 1:
        insights.append(("fatigue", f"{row['home_team']} are on a back-to-back."))
    if row["away_b2b"] == 1:
        insights.append(("fatigue", f"{row['away_team']} are on a back-to-back."))

    # ---- Last-game performance ----
    if row["last_pd_diff"] > 0.5:
        insights.append(("momentum", f"{row['home_team']} had a strong last game performance."))
    elif row["last_pd_diff"] < -0.5:
        insights.append(("momentum", f"{row['away_team']} had a strong last game performance."))

    return insights

In [7]:
sample = df.sample(10, random_state=0)
for _, row in sample.iterrows():
    print(row["home_team"], "vs", row["away_team"])
    print(build_insights_for_row(row))
    print("---")

Sacramento Kings vs Utah Jazz
[('momentum', 'Sacramento Kings had a strong last game performance.')]
---
Los Angeles Lakers vs Indiana Pacers
[('season_strength', 'Los Angeles Lakers have a stronger season performance (+51.4%).'), ('momentum', 'Los Angeles Lakers had a strong last game performance.')]
---
Minnesota Timberwolves vs Los Angeles Clippers
[('momentum', 'Los Angeles Clippers had a strong last game performance.')]
---
Cleveland Cavaliers vs Los Angeles Clippers
[('season_strength', 'Los Angeles Clippers have a stronger season performance (+57.2%).'), ('recent_form', 'Los Angeles Clippers are in better recent form (last 20 games).'), ('rest', 'Cleveland Cavaliers are more rested (+3.0 days).'), ('fatigue', 'Los Angeles Clippers are on a back-to-back.')]
---
Miami Heat vs Chicago Bulls
[('season_strength', 'Miami Heat have a stronger season performance (+17.6%).'), ('recent_form', 'Miami Heat are in better recent form (last 20 games).'), ('momentum', 'Miami Heat had a strong l

## 5. Insight Coverage Check

Now that I have a rule-based `build_insights_for_row` function, I want to see:
- How often each insight type appears (season strength, recent form, rest, fatigue, momentum).
- How many total insights per game.
- Whether the thresholds create useful but not spammy output.

### Count Coverage by Insight Type

In [5]:
from collections import Counter 

type_counts = Counter()
insights_per_game = []

for _, row in df.iterrows():
    ins = build_insights_for_row(row)
    insights_per_game.append(len(ins))
    for t, _ in ins:
        type_counts[t] += 1

num_games = len(df)

type_counts, num_games

(Counter({'momentum': 6265,
          'season_strength': 6055,
          'recent_form': 4310,
          'fatigue': 3865,
          'rest': 1161}),
 11508)

In [7]:
print(f"Total games: {num_games}")
print()

for insight_type, count in type_counts.items():
    pct = count / num_games * 100
    print(f"{insight_type:15s}: {count:5d} games  ({pct:5.1f}%)")

avg_insights = sum(insights_per_game) / num_games
print()
print(f"Average insights per game (feature-based only): {avg_insights:.2f}")
print(f"Games with at least one feature insight: {(sum(1 for n in insights_per_game if n > 0) / num_games * 100):.1f}%")

Total games: 11508

season_strength:  6055 games  ( 52.6%)
recent_form    :  4310 games  ( 37.5%)
momentum       :  6265 games  ( 54.4%)
fatigue        :  3865 games  ( 33.6%)
rest           :  1161 games  ( 10.1%)

Average insights per game (feature-based only): 1.88
Games with at least one feature insight: 87.2%
