
# Baseline Classifier with Cross-Validated Grid Search (STFT)

This baseline uses simple, strong features and a proper **CV grid search**:

**Features per clip (T=1000, F=257):**
- Per-frequency mean & std over time (2F)
- Per-frequency mean & std of the first difference along time (2F)
- → **4F = 1028** features per clip

**Models searched (with StandardScaler in pipeline):**
- `LinearSVC` (hinge / squared_hinge)
- `SVC` (RBF)
- `LogisticRegression` (L2)

We do a stratified train/test split and run **GridSearchCV on the training split only**,
then evaluate the best model on the held-out test set.


In [1]:

# If imports fail, uncomment the pip line below.
# %pip install -q numpy scikit-learn joblib tqdm
import os, json, numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
print("Imports OK")


Imports OK


## Config & data loading

In [2]:

# Paths: everything is resolved relative to BASE_DIR.
BASE_DIR = '.'
META_PATH = os.path.join(BASE_DIR, 'stft_np', 'meta.json')

# Experiment knobs.
N_PER_CLASS = 800     # raise/lower depending on compute
TEST_SIZE = 0.2
RANDOM_STATE = 42

# meta.json supplies the memmap file names, their shapes and (optionally) the dtype.
with open(META_PATH, 'r') as f:
    meta = json.load(f)

# The dtype falls back to 'float16' when absent. If it was written as the repr
# of a numpy class (e.g. "<class 'numpy.float16'>"), keep only the bare name.
dtype_str = meta.get('dtype', 'float16')
if isinstance(dtype_str, str) and dtype_str.startswith("<class 'numpy."):
    dtype_str = dtype_str.rsplit('.', 1)[-1].split("'", 1)[0]

# Open both arrays read-only as memory maps.
A = np.memmap(
    os.path.join(BASE_DIR, 'stft_np', meta['A_memmap']),
    dtype=dtype_str,
    mode='r',
    shape=tuple(meta['A_shape']),
)
B = np.memmap(
    os.path.join(BASE_DIR, 'stft_np', meta['B_memmap']),
    dtype=dtype_str,
    mode='r',
    shape=tuple(meta['B_shape']),
)
print("A:", A.shape, "B:", B.shape)


A: (940, 1000, 257) B: (973, 1000, 257)


## Feature extraction (time summaries)

In [3]:

def spec_to_features(db_spec):
    """Summarize one spectrogram over time into a fixed-length feature vector.

    Parameters
    ----------
    db_spec : array-like, shape (T, F)
        Spectrogram in dB, expected in [-80, 0]. Values outside that range
        are clipped after rescaling.

    Returns
    -------
    np.ndarray, shape (4F,), dtype float32
        Concatenation of, per frequency bin: mean over time, std over time,
        mean of the first difference along time, std of the first difference.

    Notes
    -----
    With fewer than two frames there is no first difference; the two
    difference statistics are then returned as zeros instead of NaN.
    """
    # Map dB in [-80, 0] to [0, 1]; np.asarray also accepts plain array-likes.
    spec = np.asarray(db_spec, dtype=np.float32)
    Y = np.clip((spec + 80.0) / 80.0, 0.0, 1.0)  # (T,F)

    mu = Y.mean(axis=0)  # (F,)
    sd = Y.std(axis=0)

    if Y.shape[0] >= 2:
        dY = np.diff(Y, axis=0)  # (T-1,F)
        dmu = dY.mean(axis=0)
        dsd = dY.std(axis=0)
    else:
        # np.diff would be empty here and its mean/std would be NaN.
        dmu = np.zeros_like(mu)
        dsd = np.zeros_like(sd)

    return np.concatenate([mu, sd, dmu, dsd], axis=0).astype(np.float32)  # (4F,)

def build_dataset(A, B, n_per_class=800, seed=42, n_jobs=-1):
    """Sample clips from both classes and convert them to a feature matrix.

    Up to ``n_per_class`` clips are drawn without replacement from each of
    ``A`` and ``B`` using one generator seeded with ``seed`` (A is sampled
    first, then B). Every sampled clip is passed through ``spec_to_features``
    via joblib with ``prefer='threads'``.

    Returns
    -------
    X : np.ndarray, float32
        Feature rows for the A samples stacked above those for the B samples.
    y : np.ndarray of int
        0 for rows that came from A, 1 for rows that came from B.
    """
    rng = np.random.default_rng(seed)

    # Sampling order (A before B) determines which indices each class gets.
    picks = []
    for source in (A, B):
        total = source.shape[0]
        count = min(n_per_class, total)
        picks.append(rng.choice(total, size=count, replace=False))
    idxA, idxB = picks

    def featurize(source, row):
        # Copy the selected clip out of the source array as float32 first.
        clip = np.array(source[row], dtype=np.float32)
        return spec_to_features(clip)

    def run(source, rows, label):
        jobs = (delayed(featurize)(source, r) for r in tqdm(rows, desc=label))
        return Parallel(n_jobs=n_jobs, prefer='threads')(jobs)

    XA = run(A, idxA, "A feats")
    XB = run(B, idxB, "B feats")

    X = np.vstack([XA, XB]).astype(np.float32)
    y = np.concatenate([
        np.zeros(len(idxA), dtype=int),
        np.ones(len(idxB), dtype=int),
    ])
    return X, y

# Build the feature matrix and labels from the two memmapped arrays,
# using the sample size and seed from the config cell.
X, y = build_dataset(
    A,
    B,
    n_per_class=N_PER_CLASS,
    seed=RANDOM_STATE,
)
print("Feature matrix:", X.shape, "Labels:", y.shape)


A feats: 100%|██████████| 800/800 [00:01<00:00, 476.04it/s]
B feats: 100%|██████████| 800/800 [00:01<00:00, 767.32it/s]

Feature matrix: (1600, 1028) Labels: (1600,)





## Train/test split and CV grid search

In [5]:

# Holdout split: one stratified shuffle, seeded so the same rows land in the test set on every run.
sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=RANDOM_STATE)
train_idx, test_idx = next(sss.split(X, y))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# One pipeline with a placeholder classifier, then search multiple model families.
# The scaler is a pipeline step, so it is fit together with the classifier
# on whatever data the pipeline is fit on (never on the held-out test set).
pipe = Pipeline([('scaler', StandardScaler(with_mean=True, with_std=True)),
                 ('clf', SVC())])

param_grid = [
    # Linear SVM
    {
        # random_state seeds the data shuffling of the dual solver; without it
        # the LinearSVC candidates are not reproducible between runs.
        'clf': [LinearSVC(max_iter=10000, dual=True, random_state=RANDOM_STATE)],
        'clf__C': [0.1, 0.5, 1, 2, 5, 10],
        'clf__loss': ['hinge', 'squared_hinge']
    },
    # RBF SVM
    {
        'clf': [SVC(kernel='rbf')],
        'clf__C': [0.5, 1, 2, 5, 10],
        'clf__gamma': ['scale', 1e-3, 5e-4, 1e-4]
    },
    # Logistic Regression
    {
        'clf': [LogisticRegression(max_iter=2000, solver='lbfgs')],
        'clf__C': [0.1, 0.5, 1, 2, 5, 10],
        'clf__penalty': ['l2']
    },
]

# 5-fold stratified CV on the training split only; refit=True retrains the
# best configuration on all of X_train so `gs` can be used directly to predict.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, n_jobs=-1, scoring='accuracy', refit=True, verbose=0)

gs.fit(X_train, y_train)
print("Best CV params:", gs.best_params_)
print("Best CV accuracy:", gs.best_score_)

# Evaluate on held-out test
y_pred = gs.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1w = f1_score(y_test, y_pred, average='weighted')
print(f"Test Accuracy: {acc:.4f}")
print(f"Test F1 (weighted): {f1w:.4f}")
print()
print(classification_report(y_test, y_pred, target_names=['A','B']))




Best CV params: {'clf': SVC(), 'clf__C': 10, 'clf__gamma': 0.0005}
Best CV accuracy: 0.84375
Test Accuracy: 0.8500
Test F1 (weighted): 0.8500

              precision    recall  f1-score   support

           A       0.85      0.86      0.85       160
           B       0.85      0.84      0.85       160

    accuracy                           0.85       320
   macro avg       0.85      0.85      0.85       320
weighted avg       0.85      0.85      0.85       320



## (Optional) Save fitted model & features

In [None]:

import os
import joblib

# Write under BASE_DIR, next to the input data, so the output location follows
# the config cell instead of depending on the current working directory.
out_dir = os.path.join(BASE_DIR, 'stft_np', 'baseline_features')
os.makedirs(out_dir, exist_ok=True)

# Persist the feature matrix, the labels, and the refit best pipeline.
np.save(os.path.join(out_dir, 'X.npy'), X)
np.save(os.path.join(out_dir, 'y.npy'), y)
joblib.dump(gs.best_estimator_, os.path.join(out_dir, 'baseline_cv_model.joblib'))
print(f"Saved X/y and the best model to {out_dir}/")
