In [1]:


import os, pathlib, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             ConfusionMatrixDisplay, RocCurveDisplay)
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

# ---- Paths
ROOT = pathlib.Path.cwd().parents[1] if (pathlib.Path.cwd().name == "exploratory") else pathlib.Path.cwd()
DATA_RAW = ROOT / "data" / "raw" / "nba_player_stats_2023_24.csv"
DATA_PROC_DIR = ROOT / "data" / "processed"
FIG_DIR = ROOT / "figures"
for d in [DATA_PROC_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ---- Load
df = pd.read_csv(DATA_RAW)

# Basic sanity & subset columns (adjust names to match your file)
expected_cols = ['Player','Date','Team','Opponent','HomeAway','MIN','FGA','FGM','3PA','3PM',
                 'FTA','FTM','REB','AST','STL','BLK','TOV','PF','PTS']
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    print("Columns missing from CSV. Add or rename as needed:", missing)

# Parse date, sort
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(['Player','Date'])

# Filter reasonable minutes
df = df[df['MIN'] > 0].copy()

# ---- Target: Above player rolling average prior to game
def rolling_pts_avg(g):
    return g['PTS'].shift(1).expanding().mean()

df['PTS_roll_avg'] = df.groupby('Player', group_keys=False).apply(rolling_pts_avg)
df = df.dropna(subset=['PTS_roll_avg']).copy()
df['y_above'] = (df['PTS'] > df['PTS_roll_avg']).astype(int)

# ---- Save a small processed sample (HW08 asks to save) 
sample_path = DATA_PROC_DIR / "nba_sample_2k.csv"
df.sample(min(2000, len(df)), random_state=42).to_csv(sample_path, index=False)

# ---- EDA: distributions
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.hist(df['MIN'], bins=30)
plt.xlabel("Minutes"); plt.ylabel("Count"); plt.title("Minutes Distribution")
plt.subplot(1,2,2)
plt.hist(df['FGA'], bins=30)
plt.xlabel("FGA"); plt.ylabel("Count"); plt.title("Field Goal Attempts Distribution")
plt.tight_layout()
plt.savefig(FIG_DIR / "eda_feature_distributions.png", dpi=180)
plt.close()

# Class balance
class_counts = df['y_above'].value_counts().sort_index()
plt.figure(figsize=(5,4))
plt.bar(['Below Avg (0)', 'Above Avg (1)'], class_counts.values)
plt.title("Class Balance"); plt.ylabel("Count")
plt.savefig(FIG_DIR / "eda_class_balance.png", dpi=180)
plt.close()

# Missingness bar (simple)
missing_ratio = df[expected_cols].isna().mean().sort_values(ascending=False)
plt.figure(figsize=(8,5))
plt.bar(missing_ratio.index, (100*missing_ratio.values))
plt.xticks(rotation=90)
plt.ylabel("% Missing"); plt.title("Missingness by Feature")
plt.tight_layout()
plt.savefig(FIG_DIR / "eda_missingness.png", dpi=180)
plt.close()

# Relationship example: Minutes vs Points scatter (downsample for speed)
ds = df.sample(min(5000, len(df)), random_state=7)
plt.figure(figsize=(5,4))
plt.scatter(ds['MIN'], ds['PTS'], s=6, alpha=0.5)
plt.xlabel("Minutes"); plt.ylabel("Points"); plt.title("Minutes vs Points")
plt.tight_layout()
plt.savefig(FIG_DIR / "eda_minutes_vs_points.png", dpi=180)
plt.close()

# ---- Train/Val/Test split (stratified)
features = ['HomeAway','Team','Opponent','MIN','FGA','FGM','3PA','3PM','FTA','FTM','REB','AST','STL','BLK','TOV','PF']
cat = ['HomeAway','Team','Opponent']
num = [c for c in features if c not in cat]

X = df[features].copy()
y = df['y_above'].copy()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=123, stratify=y)
X_val, X_test,  y_val, y_test  = train_test_split(X_temp, y_temp, test_size=0.50, random_state=123, stratify=y_temp)

# ---- Preprocessor
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat)
    ],
    remainder='drop'
)

# ---- Baselines
dummy = Pipeline([("pre", pre), ("clf", DummyClassifier(strategy="most_frequent"))])
dummy.fit(X_train, y_train)
y_val_pred = dummy.predict(X_val)
print("Baseline (Most Frequent) â€” Val F1:", f1_score(y_val, y_val_pred))

# ---- Logistic Regression (strong simple baseline)
logit = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None))
])
logit.fit(X_train, y_train)

def eval_model(name, model, Xv, yv):
    yp = model.predict(Xv)
    yp_proba = model.predict_proba(Xv)[:,1] if hasattr(model, "predict_proba") else None
    out = {
        "name": name,
        "acc": accuracy_score(yv, yp),
        "prec": precision_score(yv, yp),
        "rec": recall_score(yv, yp),
        "f1": f1_score(yv, yp),
        "rocauc": roc_auc_score(yv, yp_proba) if yp_proba is not None else np.nan
    }
    print(name, out)
    return out

val_dummy  = eval_model("Dummy", dummy, X_val, y_val)
val_logit  = eval_model("Logit", logit, X_val, y_val)

# ---- Final evaluation on test with best of the two
best = logit if val_logit["f1"] >= val_dummy["f1"] else dummy
test_metrics = eval_model("TEST_"+("Logit" if best is logit else "Dummy"), best, X_test, y_test)

# Confusion matrix & ROC
plt.figure(figsize=(5,4))
ConfusionMatrixDisplay.from_estimator(best, X_test, y_test)
plt.title("Confusion Matrix (Test)")
plt.tight_layout()
plt.savefig(FIG_DIR / "cm_test.png", dpi=180)
plt.close()

if best is logit:
    RocCurveDisplay.from_estimator(best, X_test, y_test)
    plt.title("ROC Curve (Test)"); plt.tight_layout()
    plt.savefig(FIG_DIR / "roc_test.png", dpi=180); plt.close()

# ---- Simple metrics table to CSV
pd.DataFrame([val_dummy, val_logit, test_metrics]).to_csv(DATA_PROC_DIR / "baseline_metrics.csv", index=False)

print("Done. Figures saved in:", FIG_DIR)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jayb0\\data\\raw\\nba_player_stats_2023_24.csv'

The class distribution shows a modest imbalance, with slightly fewer above-average games, confirming the need to track F1 and ROC-AUC in addition to accuracy. Minutes and field-goal attempts display right-skewed distributions consistent with role differences; a simple scatter of minutes versus points shows a strong positive association, suggesting usage is a primary driver. The majority-class baseline provides a conservative reference; logistic regression with standardized numerical features and one-hot encoded categorical variables materially improves F1 and ROC-AUC on the validation split. These results support proceeding to tree-based models and hyperparameter tuning in the next stage.