# Phonemes — Improved Local Solution

Adds StratifiedGroupKFold CV, per-speaker standardization, PCA (with optional whitening), first/second-difference and band-mean features, and tighter LDA/LR hyperparameter grids. No Google/Kaggle API — reads local `XY_Phonemes.csv` and writes `solution.csv` (plus `solution_ensemble.csv` if soft-vote is built).

In [40]:
# Imports and setup
import os, sys, time
import numpy as np, pandas as pd
from sklearn.model_selection import GroupKFold, GridSearchCV
try:
    from sklearn.model_selection import StratifiedGroupKFold
    HAS_SGF = True
except Exception:
    StratifiedGroupKFold = None
    HAS_SGF = False
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression

SEED = 42
np.random.seed(SEED)

DATA_PATH = 'XY_Phonemes.csv'
assert os.path.exists(DATA_PATH), 'XY_Phonemes.csv not found in current directory'

print('Loading data...')
df = pd.read_csv(DATA_PATH, low_memory=False)
print(df.shape)
df.head(2)

Loading data...
(60000, 258)


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x249,x250,x251,x252,x253,x254,x255,x256,SpkrID,g
0,11.14,15.17,19.12,18.78,13.79,18.98,20.09,19.16,17.32,19.76,...,7.79,10.95,10.05,8.67,10.84,9.5,7.96,9.04,0,
1,10.84,15.2,19.8,17.82,14.1,17.93,19.54,20.97,16.65,22.41,...,12.95,13.05,11.95,11.07,10.81,9.71,11.57,12.22,0,


In [41]:
# Train/Test split by label presence; features and groups
is_train = df['g'].notna()
train = df[is_train].copy()
test  = df[~is_train].copy()

# Feature columns (x.1 ... x.256)
X_cols = [c for c in df.columns if c.startswith('x')]
group_col = 'speaker' if 'speaker' in df.columns else None

X = train[X_cols].values
y = train['g'].values
groups = train[group_col].values if group_col else None
X_test = test[X_cols].values

print('Train shape:', X.shape, ' Test shape:', X_test.shape)
print('Classes:', pd.Series(y).value_counts().to_dict())
print('Speakers (train):', len(pd.Series(groups).unique()) if groups is not None else 'N/A')

Train shape: (50000, 256)  Test shape: (10000, 256)
Classes: {'iy': 13154, 'ao': 11197, 'sh': 9674, 'dcl': 8340, 'aa': 7635}
Speakers (train): N/A


## Per-speaker standardization and engineered features

In [42]:
# Transformer: per-speaker standardization (fits mean/std per speaker within CV folds)
class PerSpeakerStandardizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.global_mean_ = None
        self.global_std_ = None
        self.spk_mean_ = {}
        self.spk_std_ = {}
    def fit(self, X, y=None, groups=None):
        X = np.asarray(X)
        self.global_mean_ = X.mean(axis=0)
        self.global_std_ = X.std(axis=0) + 1e-8
        self.spk_mean_.clear(); self.spk_std_.clear()
        if groups is not None:
            for spk in np.unique(groups):
                idx = (groups == spk)
                self.spk_mean_[spk] = X[idx].mean(axis=0)
                self.spk_std_[spk]  = X[idx].std(axis=0) + 1e-8
        return self
    def transform(self, X, y=None, groups=None):
        X = np.asarray(X)
        if groups is None:
            return (X - self.global_mean_) / self.global_std_
        out = np.empty_like(X, dtype=float)
        for i, spk in enumerate(groups):
            m = self.spk_mean_.get(spk, self.global_mean_)
            s = self.spk_std_.get(spk, self.global_std_)
            out[i] = (X[i] - m) / s
        return out

# First and second difference features across frequency bins
def add_diff12(X):
    X = np.asarray(X)
    D1 = np.diff(X, axis=1)
    D1 = np.pad(D1, ((0,0),(1,0)))
    D2 = np.diff(D1, axis=1)
    D2 = np.pad(D2, ((0,0),(1,0)))
    return np.hstack([X, D1, D2])

diff12_tf = FunctionTransformer(add_diff12, validate=False)

# Band-mean features (append coarse averages across adjacent frequency bins)
class BandAverager(BaseEstimator, TransformerMixin):
    def __init__(self, n_bands=16):
        self.n_bands = n_bands
        self.idx_ = None
    def fit(self, X, y=None):
        X = np.asarray(X)
        n = X.shape[1]
        # create band index mapping once
        edges = np.linspace(0, n, self.n_bands+1, dtype=int)
        self.idx_ = [(edges[i], edges[i+1]) for i in range(self.n_bands)]
        return self
    def transform(self, X):
        X = np.asarray(X)
        bands = []
        for a,b in self.idx_:
            bands.append(X[:, a:b].mean(axis=1, keepdims=True))
        B = np.hstack(bands)
        return np.hstack([X, B])

## Candidate pipelines and expanded grids

In [43]:
# Stratified grouped CV if available, else GroupKFold
if HAS_SGF and (groups is not None):
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
else:
    cv = GroupKFold(n_splits=5) if groups is not None else 5

scoring = 'accuracy'
n_jobs = -1

candidates = []

# 1) LDA + PCA with shrinkage and optional whitening; per-speaker standardization first
pipe_lda = Pipeline([
    ('pss', PerSpeakerStandardizer()),
    ('pca', PCA(random_state=SEED)),
    ('clf', LDA())
])
grid_lda = {
    'pca__n_components': [24, 32, 40, 60, 80, 100, 0.97, 0.98],
    'pca__whiten': [False, True],
    'clf__solver': ['lsqr', 'eigen'],
    'clf__shrinkage': [0.0, 0.02, 0.05, 0.1, 0.15, 'auto']
}
candidates.append(('LDA+PCA+PSS', pipe_lda, grid_lda))

# 2) LDA + (1st+2nd) differences + PCA
pipe_lda_diff = Pipeline([
    ('pss', PerSpeakerStandardizer()),
    ('diff', diff12_tf),
    ('pca', PCA(random_state=SEED)),
    ('clf', LDA())
])
grid_lda_diff = {
    'pca__n_components': [40, 60, 80, 100, 0.98],
    'pca__whiten': [False, True],
    'clf__solver': ['lsqr', 'eigen'],
    'clf__shrinkage': [0.0, 0.05, 'auto']
}
candidates.append(('LDA+DIFF12+PCA+PSS', pipe_lda_diff, grid_lda_diff))

# 3) LDA + band means + PCA
pipe_lda_bands = Pipeline([
    ('pss', PerSpeakerStandardizer()),
    ('bands', BandAverager()),
    ('pca', PCA(random_state=SEED)),
    ('clf', LDA())
])
grid_lda_bands = {
    'bands__n_bands': [16, 32],
    'pca__n_components': [40, 60, 80, 100, 0.98],
    'pca__whiten': [False, True],
    'clf__solver': ['lsqr', 'eigen'],
    'clf__shrinkage': [0.0, 0.05, 'auto']
}
candidates.append(('LDA+BANDS+PCA+PSS', pipe_lda_bands, grid_lda_bands))

# 4) Logistic Regression on PCs (expanded grid)
pipe_lr = Pipeline([
    ('pss', PerSpeakerStandardizer()),
    ('pca', PCA(random_state=SEED)),
    ('clf', LogisticRegression(max_iter=800, multi_class='multinomial', solver='lbfgs'))
])
grid_lr = {
    'pca__n_components': [40, 60, 80, 100, 0.97, 0.98],
    'pca__whiten': [False, True],
    'clf__C': [0.2, 0.5, 1.0, 2.0, 3.0],
    'clf__class_weight': [None, 'balanced']
}
candidates.append(('LR+PCA+PSS', pipe_lr, grid_lr))

results = []
best_models = {}
for name, pipe, grid in candidates:
    print(f'\n==> Tuning {name}')
    gs = GridSearchCV(pipe, grid, scoring=scoring, cv=cv, n_jobs=n_jobs, refit=True, verbose=0)
    fit_params = {}
    if ('pss' in pipe.named_steps) and (groups is not None):
        fit_params['pss__groups'] = groups
    if HAS_SGF and (groups is not None):
        gs.fit(X, y, groups=groups, **fit_params)
    else:
        # If no groups, still pass fit params; if groups present, GroupKFold uses them internally
        gs.fit(X, y, groups=groups, **fit_params)
    results.append((name, gs.best_score_, gs.best_params_))
    best_models[name] = gs.best_estimator_
    print(f'Best CV acc: {gs.best_score_:.4f} with {gs.best_params_}')

results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
pd.DataFrame(results_sorted, columns=['model','cv_acc','best_params'])


==> Tuning LDA+PCA+PSS
Best CV acc: 0.9270 with {'clf__shrinkage': 0.15, 'clf__solver': 'lsqr', 'pca__n_components': 0.97, 'pca__whiten': False}

==> Tuning LDA+DIFF12+PCA+PSS
Best CV acc: 0.9212 with {'clf__shrinkage': 0.05, 'clf__solver': 'lsqr', 'pca__n_components': 0.98, 'pca__whiten': False}

==> Tuning LDA+BANDS+PCA+PSS
Best CV acc: 0.9259 with {'bands__n_bands': 32, 'clf__shrinkage': 0.05, 'clf__solver': 'lsqr', 'pca__n_components': 100, 'pca__whiten': False}

==> Tuning LR+PCA+PSS
Best CV acc: 0.9272 with {'clf__C': 0.2, 'clf__class_weight': None, 'pca__n_components': 60, 'pca__whiten': True}


Unnamed: 0,model,cv_acc,best_params
0,LR+PCA+PSS,0.92716,"{'clf__C': 0.2, 'clf__class_weight': None, 'pc..."
1,LDA+PCA+PSS,0.92702,"{'clf__shrinkage': 0.15, 'clf__solver': 'lsqr'..."
2,LDA+BANDS+PCA+PSS,0.92586,"{'bands__n_bands': 32, 'clf__shrinkage': 0.05,..."
3,LDA+DIFF12+PCA+PSS,0.92118,"{'clf__shrinkage': 0.05, 'clf__solver': 'lsqr'..."


## Fit best model; optional LDA+LR soft-vote

In [44]:
# Best single model
best_name, best_cv, _ = results_sorted[0]
best_est = best_models[best_name]
print('Best overall:', best_name, f'(CV acc={best_cv:.4f})')
# Fit on all data (pass groups to PSS if needed)
fit_params_full = {}
if ('pss' in best_est.named_steps) and (groups is not None):
    fit_params_full['pss__groups'] = groups
best_est.fit(X, y, **fit_params_full)

# Predict on test
pred_single = best_est.predict(X_test)
subm = pd.DataFrame({'g': pred_single}, index=range(1, len(pred_single)+1))
out_path = 'solution.csv'
subm.to_csv(out_path, index_label='id')
print('Saved single-model to', os.path.abspath(out_path))
display(subm['g'].value_counts(sort=False))

# Optional: soft-vote between best LDA-like and LR models if both present
ens_written = False
if ('LDA+PCA+PSS' in best_models) and ('LR+PCA+PSS' in best_models):
    lda_est = best_models['LDA+PCA+PSS']
    lr_est  = best_models['LR+PCA+PSS']
    # Fit both on full data
    fp_lda = {'pss__groups': groups} if 'pss' in lda_est.named_steps and groups is not None else {}
    fp_lr  = {'pss__groups': groups} if 'pss' in lr_est.named_steps and groups is not None else {}
    lda_est.fit(X, y, **fp_lda)
    lr_est.fit(X, y, **fp_lr)
    p_lda = lda_est.predict_proba(X_test)
    p_lr  = lr_est.predict_proba(X_test)
    # Weights from their CV accuracies if available
    w_lda = [r[1] for r in results if r[0]=='LDA+PCA+PSS']
    w_lr  = [r[1] for r in results if r[0]=='LR+PCA+PSS']
    w_lda = w_lda[0] if len(w_lda)>0 else 1.0
    w_lr  = w_lr[0] if len(w_lr)>0 else 1.0
    P = (w_lda*p_lda + w_lr*p_lr) / (w_lda + w_lr)
    classes = lda_est.classes_
    pred_ens = classes[P.argmax(axis=1)]
    sub_ens = pd.DataFrame({'g': pred_ens}, index=range(1, len(pred_ens)+1))
    out_ens = 'solution_ensemble.csv'
    sub_ens.to_csv(out_ens, index_label='id')
    print('Saved ensemble to', os.path.abspath(out_ens))
    display(sub_ens['g'].value_counts(sort=False))
    ens_written = True

print('Done. Ensemble written?', ens_written)

Best overall: LR+PCA+PSS (CV acc=0.9272)
Saved single-model to /Users/ianovosad/code/HSE/ML/Kaggle/Phonemes/solution.csv


g
ao     2323
iy     2610
dcl    1660
sh     1949
aa     1458
Name: count, dtype: int64

Saved ensemble to /Users/ianovosad/code/HSE/ML/Kaggle/Phonemes/solution_ensemble.csv


g
ao     2341
iy     2634
dcl    1640
sh     1943
aa     1442
Name: count, dtype: int64

Done. Ensemble written? True
