# CS155 MiniProject 1 — Colab Demo

This notebook:
- clones our team repo
- installs dependencies
- loads data (`train.csv`, `test.csv`, `sample_submission.csv`)
- runs EDA + several visualizations
- runs a small hyperparameter sweep (parameter curve)
- trains the best model and writes `submission.csv`

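**Repo:** `https://github.com/NataliaZhang/SongsClassifier` (SSH: `git@github.com:NataliaZhang/SongsClassifier.git`). The notebook clones over HTTPS so that no SSH key is needed.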


In [None]:
# --- 0) Clone repo ---
!rm -rf SongsClassifier
!git clone https://github.com/NataliaZhang/SongsClassifier.git
%cd SongsClassifier
!git status

In [None]:
# --- 1) Install dependencies ---
# If you have requirements.txt in the repo root, this works.
# If not, add it, or replace with explicit installs.
import os, sys

req_path = "requirements.txt"
if os.path.exists(req_path):
    !pip -q install -r requirements.txt
else:
    # fallback minimal set
    !pip -q install numpy pandas scikit-learn xgboost matplotlib

# Make sure we can import src/
sys.path.append(os.getcwd())
print("cwd:", os.getcwd())

In [None]:
# --- 2) Load data via project utilities ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import Paths, TARGET_COL
from src.data import load_train_test, load_sample_submission

# Both loaders take the same Paths object
# (presumably it holds the data file locations; see src/config.py).
paths = Paths()
X, y, X_test = load_train_test(paths)
sub = load_sample_submission(paths)

# Quick sanity summary: shapes, mean of the labels, and submission columns.
pos_rate = float(np.mean(y))
print("X:", X.shape)
print("y:", y.shape, "pos_rate=", pos_rate)
print("X_test:", X_test.shape)
print("sample_submission:", sub.shape, sub.columns.tolist())

# Preview the first rows of the training features.
X.head()

## 3) Quick EDA + Visualizations

In [None]:
# 3.1 Label distribution
# Tally how often each label value occurs, ordered by label value.
counts = pd.Series(y).value_counts().sort_index()

# Labels are converted to strings so the bars are treated as categories.
fig, ax = plt.subplots()
ax.bar([str(label) for label in counts.index], counts.values)
ax.set_title("Label counts")
ax.set_xlabel("label")
ax.set_ylabel("count")
plt.show()

print("counts:\n", counts)
print("positive rate:", float(np.mean(y)))

In [None]:
# 3.2 Missingness (top 20)
# Per-column fraction of missing values, highest first; keep the 20 worst columns.
na_frac = X.isna().mean().sort_values(ascending=False)
top = na_frac.head(20)

# Plot in reversed order: barh draws the first entry at the bottom, so reversing
# puts the column with the most missing values at the top of the chart.
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(top.index[::-1], top.values[::-1])
ax.set_title("Top missingness features")
ax.set_xlabel("fraction missing")
fig.tight_layout()
plt.show()

# Display the exact fractions below the chart.
top

In [None]:
# 3.3 Numeric feature histograms (first ~12 numeric)
# `num_cols` lists every numeric or boolean feature column.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
print("#numeric cols:", len(num_cols))

# Plot at most 12 columns on a grid that is 4 panels wide.
show_cols = num_cols[:12]
n = len(show_cols)
rows = (n + 3) // 4  # ceil(n / 4)

if n == 0:
    # With nothing to plot, rows == 0 would request a zero-height figure.
    print("No numeric columns to plot.")
else:
    fig = plt.figure(figsize=(16, 4 * rows))
    for i, c in enumerate(show_cols, 1):
        ax = fig.add_subplot(rows, 4, i)
        # Drop missing values, then cast to float: boolean columns are selected
        # above, and binning raw booleans is not reliably supported across
        # numpy/matplotlib versions.
        values = X[c].dropna().to_numpy(dtype=float)
        ax.hist(values, bins=30)
        ax.set_title(c)
    fig.tight_layout()
    plt.show()

In [None]:
# 3.4 Correlation heatmap for numeric columns (optional)
# Skipped unless there are at least two numeric columns to correlate.
if len(num_cols) > 1:
    corr = X[num_cols].corr(numeric_only=True)
    tick_positions = range(len(num_cols))

    fig, ax = plt.subplots(figsize=(10, 8))
    image = ax.imshow(corr.values, aspect='auto')
    fig.colorbar(image, ax=ax)
    ax.set_title("Correlation (numeric features)")

    # One tick per feature on both axes; x labels are rotated to reduce overlap.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(num_cols, rotation=90, fontsize=7)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(num_cols, fontsize=7)

    fig.tight_layout()
    plt.show()

## 4) Model: CV AUC + Parameter Curve

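We first score a baseline model with cross-validated AUC, then sweep one hyperparameter (`max_depth` below; `reg_lambda` could be swept the same way) and plot the mean CV AUC for each value.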

In [None]:
from copy import deepcopy
from src.model import build_model, ModelSpec
from src.train import cv_auc

# Compute scale_pos_weight
# Ratio of label-0 rows to label-1 rows. The max(..., 1) guard keeps the
# division defined even if there are no label-1 rows at all.
n_pos = int((y == 1).sum())
n_neg = int((y == 0).sum())
positives_or_one = max(n_pos, 1)
spw = n_neg / positives_or_one
print("scale_pos_weight:", spw)

# Baseline CV
# cv_auc returns a pair, unpacked here as (mean AUC, per-fold values).
spec = ModelSpec(name="xgb")
base = build_model(spec, scale_pos_weight=spw)
cv_result = cv_auc(base, X, y, n_splits=5, seed=0)
mean_auc, folds = cv_result
print("baseline mean AUC:", mean_auc, "folds:", folds)

In [None]:
# 4.1 Parameter curve: max_depth
# NOTE: This assumes your src/model.py uses XGBClassifier under step name 'clf'.

depths = [3, 4, 5, 6]
results = []

# Build a fresh model per depth and score it with the same CV settings
# (n_splits=5, seed=0) so the values are comparable across depths.
for d in depths:
    candidate = build_model(ModelSpec(name="xgb"), scale_pos_weight=spw)
    # set_params works with Pipeline
    candidate.set_params(clf__max_depth=d)
    auc, _ = cv_auc(candidate, X, y, n_splits=5, seed=0)
    results.append((d, auc))
    print(f"max_depth={d} -> mean AUC={auc:.5f}")

# Collect the (depth, AUC) pairs into a table ordered by depth.
results_df = pd.DataFrame.from_records(results, columns=["max_depth", "mean_auc"])
results_df = results_df.sort_values("max_depth")

fig, ax = plt.subplots()
ax.plot(results_df["max_depth"], results_df["mean_auc"], marker="o")
ax.set_title("Parameter curve: max_depth")
ax.set_xlabel("max_depth")
ax.set_ylabel("mean CV AUC")
ax.grid(True)
plt.show()

# Display the table under the plot.
results_df

## 5) Train Best Model + Generate Submission

In [None]:
from src.train import fit_full
from src.predict import predict_proba_1

# Pick best max_depth from the sweep above.
# Rank by mean AUC (highest first) and break ties by the smaller depth, so the
# choice is deterministic; a single-column sort does not guarantee tie order.
# Rows with a NaN AUC sort last, so the check below only fires when no depth
# produced a usable score.
ranked = results_df.sort_values(["mean_auc", "max_depth"], ascending=[False, True])
best_row = ranked.iloc[0]
assert pd.notna(best_row["mean_auc"]), "sweep produced no valid mean AUC"
best_depth = int(best_row["max_depth"])
print("Best max_depth:", best_depth)

# Rebuild the model with the chosen depth and fit it on the full training data.
model = build_model(ModelSpec(name="xgb"), scale_pos_weight=spw)
model.set_params(clf__max_depth=best_depth)

model = fit_full(model, X, y)
p_test = predict_proba_1(model, X_test)
print("p_test:", p_test.shape, "min/max:", float(np.min(p_test)), float(np.max(p_test)))

# Refuse to write a submission that contains NaN or infinite predictions.
assert np.all(np.isfinite(p_test)), "predictions contain NaN or inf"

# Fill submission
# Reload a fresh template and overwrite its last column with the predictions
# (assumes the last column is the target — TODO confirm against the CSV).
sub = load_sample_submission(paths)
target_col = sub.columns[-1]
assert len(p_test) == len(sub), f"pred len {len(p_test)} != sub len {len(sub)}"
sub[target_col] = p_test

out_path = os.path.join(paths.output_dir, paths.submission_csv)
os.makedirs(paths.output_dir, exist_ok=True)
sub.to_csv(out_path, index=False)
print("Wrote:", out_path)
sub.head()

In [None]:
# Optional: download the submission from Colab
# google.colab only exists inside Colab. Outside it, report where the file is
# instead of raising, so that "Run All" still completes.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; submission is at:", out_path)
else:
    files.download(out_path)

## Notes for Piazza + Report
- Share the **public, read-only Colab link** on Piazza, along with your team name.
- In the report, include:
  - the Piazza post link
  - the Colab link
