# Submission — Ensemble (LR + LDA with PSS + PCA)

Clean reproduction of the final ensemble (PSS = per-speaker standardization), using the fixed best hyperparameters found in solution.ipynb. No grid search is run here. Writes the predictions to solution_ensemble.csv.

In [2]:
import os
import numpy as np, pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression

# Seed shared by the stochastic steps (used as random_state of the PCA steps).
SEED = 42
# Single CSV holding both the labelled (train) and unlabelled (test) rows.
DATA_PATH = 'XY_Phonemes.csv'
# Fail fast with a real exception: an `assert` would be stripped under
# `python -O` and the missing file would only surface later in read_csv.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f'{DATA_PATH} not found')

# Per-speaker standardizer
class PerSpeakerStandardizer(BaseEstimator, TransformerMixin):
    """Z-score features per speaker, falling back to global statistics.

    ``fit`` always learns a global per-feature mean and standard deviation.
    When ``groups`` (one speaker id per row) is supplied it additionally
    learns a mean/std per speaker. ``transform`` standardizes every row with
    its speaker's statistics when ``groups`` is supplied and the speaker was
    seen during ``fit``; all other rows use the global statistics.

    A constant ``1e-8`` is added to every standard deviation so that constant
    features do not cause a division by zero.

    NOTE(review): ``TransformerMixin.fit_transform`` forwards extra keyword
    arguments to ``fit`` only and then calls ``transform(X)`` without
    ``groups``. When this class is used as a Pipeline step with
    ``pss__groups=...``, the per-speaker statistics are therefore learned but
    the *global* statistics are what gets applied, both at fit and at predict
    time. Left as-is on purpose so existing results stay reproducible;
    confirm against the installed scikit-learn version before relying on
    per-speaker scaling inside a Pipeline.

    Attributes
    ----------
    global_mean_, global_std_ : ndarray or None
        Per-feature statistics over all rows seen in ``fit``.
    spk_mean_, spk_std_ : dict
        Speaker id -> per-feature statistics; empty when ``fit`` was called
        without ``groups``.
    """

    def __init__(self):
        # Pre-declared so the attributes exist even before fit() is called.
        self.global_mean_ = None
        self.global_std_ = None
        self.spk_mean_ = {}
        self.spk_std_ = {}

    @staticmethod
    def _as_groups(groups, n_rows):
        """Return ``groups`` as a 1-D array holding one speaker id per row.

        Raises
        ------
        ValueError
            If ``groups`` is not one-dimensional or its length differs from
            ``n_rows``.
        """
        # A plain list compared with ``==`` yields a single bool rather than
        # an element-wise mask, so always work on an ndarray.
        groups = np.asarray(groups)
        if groups.ndim != 1 or groups.shape[0] != n_rows:
            raise ValueError(
                f'groups must be 1-D with one entry per row of X '
                f'(expected {n_rows}, got shape {groups.shape})'
            )
        return groups

    def fit(self, X, y=None, groups=None):
        """Learn global and, if ``groups`` is given, per-speaker statistics.

        Parameters
        ----------
        X : array-like
            Feature matrix; statistics are taken over axis 0.
        y : ignored
            Present for scikit-learn API compatibility.
        groups : array-like or None
            Speaker id of every row of ``X``.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        self.global_mean_ = X.mean(axis=0)
        self.global_std_ = X.std(axis=0) + 1e-8
        # Rebind instead of clearing in place so that dicts handed out by a
        # previous fit (or shared through a shallow copy) are not mutated.
        self.spk_mean_ = {}
        self.spk_std_ = {}
        if groups is not None:
            groups = self._as_groups(groups, X.shape[0])
            for spk in np.unique(groups):
                rows = groups == spk
                # Ids that match no row (e.g. NaN, which never equals itself)
                # would only yield NaN statistics; such rows use the global
                # fallback in transform() instead.
                if not rows.any():
                    continue
                self.spk_mean_[spk] = X[rows].mean(axis=0)
                self.spk_std_[spk] = X[rows].std(axis=0) + 1e-8
        return self

    def transform(self, X, y=None, groups=None):
        """Standardize ``X`` with per-speaker or global statistics.

        Parameters
        ----------
        X : array-like
            Feature matrix with the same number of features as seen in fit.
        y : ignored
            Present for scikit-learn API compatibility.
        groups : array-like or None
            Speaker id of every row of ``X``. When ``None`` the global
            statistics are applied to all rows.

        Returns
        -------
        ndarray
            Standardized copy of ``X``.
        """
        X = np.asarray(X)
        if groups is None:
            return (X - self.global_mean_) / self.global_std_
        groups = self._as_groups(groups, X.shape[0])
        # Start from the global fallback (covers unseen speakers), then
        # overwrite the rows of every speaker known from fit() in one
        # vectorized step per speaker.
        out = ((X - self.global_mean_) / self.global_std_).astype(float)
        for spk, mean in self.spk_mean_.items():
            rows = np.asarray(groups == spk)
            # A 0-d result means the comparison was not element-wise
            # (incompatible id types), i.e. no row belongs to this speaker.
            if rows.ndim and rows.any():
                out[rows] = (X[rows] - mean) / self.spk_std_[spk]
        return out

# Load data. Train and test rows live in the same CSV: a row belongs to the
# training set when its label column 'g' is present, otherwise to the test set.
df = pd.read_csv(DATA_PATH, low_memory=False)
is_train = df['g'].notna()
train = df.loc[is_train].copy()
test = df.loc[~is_train].copy()

# Every column whose name starts with 'x' is used as a feature.
X_cols = [col for col in df.columns if col.startswith('x')]
X, X_test = train[X_cols].values, test[X_cols].values
y = train['g'].values

# Speaker ids are optional; they stay None when the column is absent.
groups = None
if 'speaker' in train.columns:
    groups = train['speaker'].values

print('Train:', X.shape, ' Test:', X_test.shape)

# Both models share the same front end (per-speaker standardizer -> PCA) and
# differ only in the PCA size and the final classifier. The hyperparameters
# are the fixed best values carried over from solution.ipynb; nothing is
# tuned in this notebook.
def _pss_pca_pipeline(n_components, clf):
    """Return a 'pss' -> 'pca' -> 'clf' Pipeline for the given PCA size and classifier."""
    steps = [
        ('pss', PerSpeakerStandardizer()),
        ('pca', PCA(n_components=n_components, random_state=SEED)),
        ('clf', clf),
    ]
    return Pipeline(steps)


# LDA+PCA+PSS: shrinkage=0.1, n_components=100 (CV acc=0.9267)
lda = _pss_pca_pipeline(100, LDA(solver='lsqr', shrinkage=0.1))
# LR+PCA+PSS: C=0.3, n_components=60 (CV acc=0.9270)
lr = _pss_pca_pipeline(60, LogisticRegression(max_iter=600, solver='lbfgs', C=0.3))

# Train both pipelines on the complete training set. Speaker ids, when
# available, are routed to the 'pss' step's fit() via the step__param syntax.
fit_params = {}
if groups is not None:
    fit_params['pss__groups'] = groups
for model in (lda, lr):
    model.fit(X, y, **fit_params)

# Soft vote: average the class-probability matrices, weighting each model by
# its reported CV accuracy.
w_lda, w_lr = 0.9267, 0.9270
p_lda, p_lr = lda.predict_proba(X_test), lr.predict_proba(X_test)
weighted_sum = w_lda * p_lda + w_lr * p_lr
P = weighted_sum / (w_lda + w_lr)

# Map each row's most probable column back to its class label.
classes = lda.classes_
pred = classes[np.argmax(P, axis=1)]

# Submission file: 1-based row ids in column 'id', predicted label in 'g'.
submission_ids = range(1, len(pred) + 1)
sub = pd.DataFrame({'g': pred}, index=submission_ids)
out_path = 'solution_ensemble.csv'
sub.to_csv(out_path, index_label='id')
print('Saved ensemble:', os.path.abspath(out_path))

# Last expression of the cell: shows the predicted class counts.
sub['g'].value_counts(sort=False)

Train: (50000, 256)  Test: (10000, 256)
Saved ensemble: /Users/ianovosad/code/HSE/ML/Kaggle/Phonemes/solution_ensemble.csv


g
ao     2339
iy     2629
dcl    1644
sh     1944
aa     1444
Name: count, dtype: int64