# Age Regression Notebook

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer, cohen_kappa_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from mord import OrdinalRidge

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locations of the two input tables, relative to the notebook's working directory.
voices_path = "../data/voices.csv"    # per-clip voice metadata
feature_path = "../data/features.csv"  # per-clip feature values

In [None]:
# Load both tables with clip_id as the index so they can be aligned on it later.
voices = pd.read_csv(voices_path, index_col="clip_id")
features = pd.read_csv(feature_path, index_col="clip_id")

In [None]:
# Attach the feature columns to the voice metadata; the inner join on the index
# keeps only rows whose index value is present in both tables.
data = voices.merge(features, how="inner", left_index=True, right_index=True)

# Fold the 90 group into the 80 group so the top bucket means ">= 80".
is_ninety = data["voice_age_group"] == 90
data.loc[is_ninety, "voice_age_group"] = 80

# Show how many rows each age group has after the recode.
data["voice_age_group"].value_counts()

In [None]:
# Balance the classes: draw exactly 2000 rows per age group, sampling with
# replacement so that groups with fewer than 2000 rows are oversampled
# (their rows get duplicated), then shuffle so rows are not ordered by group.
#
# GroupBy.sample is used instead of groupby(...).apply(lambda g: g.sample(...)):
# apply() operating on the grouping column is deprecated since pandas 2.2, and
# once that column is excluded the result would no longer contain
# voice_age_group, breaking every later cell that reads it.
#
# NOTE(review): duplicated rows keep their original index, so a row-level
# train/test split performed after this step can place copies of the same clip
# on both sides. Split by index (or resample only the training part) to avoid
# leaking test rows into training.
data = (
    data.groupby("voice_age_group")
    .sample(n=2000, replace=True, random_state=42)
    .sample(frac=1, random_state=42)
)
data.voice_age_group.value_counts()

In [None]:
# Preview the first five rows of the balanced, shuffled table.
data.head(n=5)

In [None]:
# Model inputs are exactly the columns that came from the features table;
# the target is the age-group column.
X = data.loc[:, features.columns]
y = data["voice_age_group"]

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# Lower edges of the age buckets, in increasing order.
age_bins = [20, 30, 40, 50, 60, 70, 80]

# Convert each age value into a zero-based bucket index: digitize returns the
# 1-based position of the bucket whose lower edge is <= the value, so shifting
# by one gives 20 -> 0, 30 -> 1, ..., 80 and above -> 6.
# Any value below 20 would end up as -1.
y_ordinal = np.digitize(y, bins=age_bins, right=False)
y_ordinal -= 1

y_ordinal

In [None]:
# Hold out roughly 20% of the data for testing, splitting by clip, not by row.
#
# The balancing step samples rows with replacement, so the same clip_id (the
# index of X) can appear many times. A plain row-wise train_test_split would put
# copies of one clip on both sides and inflate the test scores. Grouping on the
# index keeps every copy of a clip on the same side of the split.
#
# test_size is the fraction of distinct clips, so the share of rows in the test
# set is only approximately 20%. The fixed seed makes the split reproducible,
# matching the seeding used by the other random steps in this notebook.
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(
    splitter.split(X, y_ordinal, groups=X.index.to_numpy())
)

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y_ordinal[train_idx], y_ordinal[test_idx]

In [None]:
def evaluate_model_classification(model, X_train, X_test, y_train, y_test):
    """Print train/test accuracy and show the test-set confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Inputs the model was trained on.
        X_test: Held-out inputs.
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        None. Results are printed and the heatmap is displayed with
        ``plt.show()``.

    The heatmap axes are labelled with the module-level ``age_bins``.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Report accuracy for both splits, train first.
    for caption, truth, predicted in (
        ("Train Accuracy:", y_train, train_predictions),
        ("Test Accuracy:", y_test, test_predictions),
    ):
        print(caption, accuracy_score(truth, predicted))

    # Confusion matrix is computed on the held-out split only.
    conf = confusion_matrix(y_test, test_predictions)
    print("Confusion Matrix:\n", conf)

    heatmap_options = {
        "annot": True,          # write the count inside every cell
        "fmt": "d",             # counts are integers
        "cmap": "Blues",
        "xticklabels": age_bins,
        "yticklabels": age_bins,
    }
    sns.heatmap(conf, **heatmap_options)
    plt.xlabel("Predicted Age Group")
    plt.ylabel("True Age Group")
    plt.title("Confusion Matrix")
    plt.show()

# Ordinal Model

In [None]:
# Robustly scale the features (median / IQR), then fit mord's OrdinalRidge.
pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('ordinalridge', OrdinalRidge())
])

# Only hyperparameters that change the fitted model are searched.
# OrdinalRidge is built on scikit-learn's Ridge, whose default solver for dense
# input is a closed-form one that ignores `tol` and `max_iter`. Searching over
# those two therefore only repeated identical fits (9x the work for the same
# 12 distinct models). Add them back only together with an iterative solver.
param_grid = {
    'ordinalridge__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization strength
    'ordinalridge__fit_intercept': [True, False],
}

# Stratified folds keep the age-group proportions the same in every fold;
# shuffling with a fixed seed makes the folds reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Metrics recorded for every candidate. Quadratic-weighted kappa penalises a
# prediction more the further it is from the true ordinal label.
scoring = {
    'accuracy': 'accuracy',
    'balanced_accuracy': 'balanced_accuracy',
    'mae': 'neg_mean_absolute_error',
    'kappa': make_scorer(cohen_kappa_score, weights='quadratic')
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    refit='kappa',  # The best candidate by kappa is refit on all of X_train
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
evaluate_model_classification(grid_search.best_estimator_, X_train, X_test, y_train, y_test)

# All-Threshold (AT) Ordinal Logistic Regression

In [None]:
from mord import LogisticAT

# All-threshold ordinal logistic regression with a fixed penalty (alpha=0.1).
# The features go through the same RobustScaler that the OrdinalRidge pipeline
# uses, so both linear models see identically scaled inputs. Without scaling,
# the L2 penalty weighs features unevenly depending on their units, and the
# numerical optimiser behind the fit is sensitive to badly scaled inputs.
model = Pipeline([
    ('scaler', RobustScaler()),
    ('logisticat', LogisticAT(alpha=0.1)),
])
model.fit(X_train, y_train)
evaluate_model_classification(model, X_train, X_test, y_train, y_test)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Quadratic-weighted Cohen's kappa: disagreements are penalised more the
# further the predicted group is from the true one.
ordinal_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# Seeded forest with class weights inversely proportional to class frequency.
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Result recorded from an earlier run of this search:
# Best parameters found:  {'max_depth': 10, 'max_features': 0.3, 'min_samples_leaf': 3, 'n_estimators': 600}
param_grid = dict(
    n_estimators=[200, 400, 600, 1000],  # more trees help, with diminishing returns
    max_depth=[3, 5, 7, 10],             # shallower trees limit overfitting
    min_samples_leaf=[3, 5],             # larger leaves smooth the predictions
    max_features=['sqrt', 0.3],          # fewer features per split decorrelates trees
)

# Exhaustive search with 5-fold cross-validation, ranked by ordinal agreement.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring=ordinal_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
evaluate_model_classification(
    grid_search.best_estimator_, X_train, X_test, y_train, y_test
)