# Scholar — Hard-Coded Runner

Runs literature-backed pipelines with hard-coded parameters (no argparse).

In [None]:
# Hard-coded run settings: edit these values instead of passing CLI flags.
SEED = 42                          # random seed for reproducible runs
MODE = "eval"                      # one of: 'eval' | 'grid' | 'export'
DATA_PATH = "XY_Phonemes.csv"      # input CSV that gets loaded
OUTPUT_PATH = "scholar_sub.csv"    # CSV written when MODE is 'export'

In [None]:
# Imports from scholar.py (transformers, pipelines, helpers)
import os, numpy as np, pandas as pd
from scholar import (
    build_pipelines, grid_candidates, evaluate_cv,
    PerSpeakerStandardizer, MovingAverageSmoother, BandAverager, Cepstral, diff12_tf, SpectralDescriptors,
    HAS_SGF, StratifiedGroupKFold, GroupKFold
)
from sklearn.model_selection import GroupKFold

# Fail fast with an explicit exception: unlike ``assert``, this check is not
# stripped when Python runs with -O.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f'{DATA_PATH} not found')
df = pd.read_csv(DATA_PATH, low_memory=False)

# Rows with a non-null label in column 'g' form the training set; the
# remaining (unlabeled) rows form the test set.
is_train = df['g'].notna()
train = df[is_train].copy()
test = df[~is_train].copy()

# Feature columns are those whose name starts with 'x.'.
X_cols = [c for c in df.columns if c.startswith('x.')]
X = train[X_cols].values.astype(float)
y = train['g'].values
# Grouping is optional: it is only available when a 'speaker' column exists.
groups = train['speaker'].values if 'speaker' in train.columns else None
X_test = test[X_cols].values.astype(float)

# Pick the cross-validation scheme:
#   * no groups               -> plain integer 5 (number of folds)
#   * groups and HAS_SGF true -> shuffled, seeded StratifiedGroupKFold
#   * groups otherwise        -> GroupKFold
if groups is None:
    cv = 5
elif HAS_SGF:
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
else:
    cv = GroupKFold(n_splits=5)

pipes = build_pipelines()
print('MODE =', MODE)
print('Pipelines:', list(pipes.keys()))

In [None]:
# Execute requested mode with hard-coded params


def _pss_fit_params(pipe, groups):
    """Build fit kwargs that forward ``groups`` to the pipeline's 'pss' step.

    Returns an empty dict when no groups are available or when ``pipe`` has
    no step named 'pss', so a ``pss__groups`` kwarg is never sent to a
    pipeline that has no such step.
    """
    if 'pss' in pipe.named_steps and groups is not None:
        return {'pss__groups': groups}
    return {}


if MODE == 'eval':
    # Cross-validate every pipeline and report mean/std accuracy.
    for name, pipe in pipes.items():
        mean, std = evaluate_cv(pipe, X, y, groups, cv)
        print(f'{name}: acc={mean:.4f} ± {std:.4f}')

elif MODE == 'grid':
    # Grid-search each candidate pipeline and tabulate the best CV scores.
    from sklearn.model_selection import GridSearchCV
    grids = grid_candidates()
    results = []
    for name, (pipe, grid) in grids.items():
        gs = GridSearchCV(pipe, grid, cv=cv, scoring='accuracy', n_jobs=-1, refit=True, verbose=0)
        # ``groups=`` drives the CV splitter; the extra fit kwargs (if any)
        # are passed through to the pipeline's 'pss' step.
        gs.fit(X, y, groups=groups, **_pss_fit_params(pipe, groups))
        print(f"{name}: best_acc={gs.best_score_:.4f} params={gs.best_params_}")
        results.append((name, float(gs.best_score_), gs.best_params_))
    # ``display`` is provided by the IPython/Jupyter runtime.
    display(pd.DataFrame(results, columns=['pipeline', 'cv_acc', 'best_params']).sort_values('cv_acc', ascending=False))

elif MODE == 'export':
    # Fit the pipeline registered as 'Best' on all training rows, predict the
    # unlabeled rows, and write the predictions to OUTPUT_PATH.
    pipe = pipes['Best']
    # Same guard as the grid branch: only route groups when a 'pss' step exists.
    pipe.fit(X, y, **_pss_fit_params(pipe, groups))
    # NOTE(review): predict receives no speaker groups for the test rows;
    # confirm the 'pss' step (if present) handles that as intended.
    pred = pipe.predict(X_test)
    # Output ids are 1-based and written under the 'id' column.
    sub = pd.DataFrame({'g': pred}, index=range(1, len(pred)+1))
    sub.to_csv(OUTPUT_PATH, index_label='id')
    print('Saved', os.path.abspath(OUTPUT_PATH))
    display(sub['g'].value_counts(sort=False))

else:
    raise ValueError('Unknown MODE; use eval | grid | export')