# Age Regression Notebook

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locations of the two input tables, relative to the notebook's working directory.
voices_path = "../data/voices.csv"
feature_path = "../data/features.csv"

In [None]:
# Read both tables with clip_id as the index so they can later be aligned on it.
features, voices = (
    pd.read_csv(csv_path, index_col="clip_id")
    for csv_path in (feature_path, voices_path)
)

In [None]:
# Keep only the clips present in both tables, aligned on their shared index.
data = voices.merge(features, how='inner', left_index=True, right_index=True)
# Fold the 90 group into the 80 group (so 80 stands for ">= 80"), then rescale
# linearly so that 20 maps to 0.0 and 80 maps to 1.0.
data["voice_age_group"] = (
    data["voice_age_group"].mask(data["voice_age_group"] == 90, 80) - 20
) / 60
# Show how many rows fall into each rescaled group.
data["voice_age_group"].value_counts()

In [None]:
# Balance the age groups: draw exactly 2000 rows per group (with replacement,
# so small groups are oversampled), then shuffle the combined rows.
# GroupBy.sample keeps the grouping column in its result; the equivalent
# groupby(...).apply(lambda ...) form depends on apply() passing the grouping
# column through, which pandas 2.2 deprecates.
# NOTE: rows are drawn with replacement, so one clip_id can occur several times
# in `data`; any train/test or CV split should keep such copies together.
data = (
    data.groupby('voice_age_group')
    .sample(n=2000, replace=True, random_state=42)
    .sample(frac=1, random_state=42)
)
data.voice_age_group.value_counts()

In [None]:
# Peek at the first five rows of the resampled table.
data.head(n=5)

In [None]:
# The regression target is the rescaled age group; the predictors are exactly
# the columns that came from the features table.
y = data.loc[:, "voice_age_group"]
X = data.loc[:, features.columns]

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out about 20% of the clips for testing. The split is made per clip_id
# (the row index) rather than per row: the table was resampled with
# replacement, so a plain random row split can place copies of one clip in both
# train and test and inflate the test scores. test_size is therefore a fraction
# of distinct clips, and the fixed seed makes the split reproducible.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(splitter.split(X, y, groups=X.index.to_numpy()))
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and variance from the training rows only, then
# apply that same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

In [None]:

def evaluate_model_regression(model, X_train, X_test, y_train, y_test, *,
                              round_step=1.0, show_residuals=False):
    """Print regression metrics for a fitted model and plot true vs. predicted.

    Prints MAE, MSE and R² for the train and test sets, plus the test MAE of
    predictions snapped to a grid, then draws a scatter plot of test targets
    against test predictions with the identity line for reference.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Feature matrix the metrics labelled "Train" are computed on.
        X_test: Feature matrix the metrics labelled "Test" are computed on.
        y_train: True targets matching ``X_train``.
        y_test: True targets matching ``X_test``.
        round_step: Grid spacing for the "Rounded Test MAE" line; each test
            prediction is snapped to the nearest multiple of this value. The
            default of 1.0 rounds to the nearest integer. Pass the spacing of
            the label grid when targets are not integers (e.g. ``1 / 6`` for
            labels of the form k/6).
        show_residuals: When True, additionally plot residuals
            (true - predicted) against the predictions. Off by default.

    Returns:
        None. All results are printed or plotted.
    """
    # Predict once per split and reuse the arrays for every metric and plot.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    y_pred_test_rounded = np.round(y_pred_test / round_step) * round_step

    # Regression metrics
    print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
    print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
    print("Rounded Test MAE:", mean_absolute_error(y_test, y_pred_test_rounded))
    print("\nTrain MSE:", mean_squared_error(y_train, y_pred_train))
    print("Test MSE:", mean_squared_error(y_test, y_pred_test))
    print("\nTrain R²:", r2_score(y_train, y_pred_train))
    print("Test R²:", r2_score(y_test, y_pred_test))

    # Scatter plot of true vs predicted values
    lowest, highest = min(y_test), max(y_test)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_test, y=y_pred_test, alpha=0.6)
    # Identity line: a perfect model would put every point on it.
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.xlabel("True Age")
    plt.ylabel("Predicted Age")
    plt.title("True vs Predicted Age (Test Set)")
    plt.show()

    # Residual plot (helps check for bias); opt-in so default output is unchanged.
    if show_residuals:
        residuals = y_test - y_pred_test
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=y_pred_test, y=residuals, alpha=0.6)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel("Predicted Age")
        plt.ylabel("Residuals (True - Predicted)")
        plt.title("Residual Plot")
        plt.show()

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, trained on the standardized features.
model = LinearRegression()
model.fit(X=X_train_s, y=y_train)

In [None]:
# Score the linear model on the same standardized features it was trained on.
evaluate_model_regression(
    model=model,
    X_train=X_train_s,
    X_test=X_test_s,
    y_train=y_train,
    y_test=y_test,
)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest with a fixed seed, fitted on the unscaled features.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the forest on the unscaled train/test features.
evaluate_model_regression(
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# XGBoost

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, GroupKFold

# Define the parameter grid
param_grid = {
    'n_estimators': [300, 500, 1000],       # Number of trees
    'max_depth': [3, 5, 7],                # Tree depth (lower = less overfitting)
}

# Initialize the model
model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Set up GridSearchCV.
# Folds are built per clip_id (the row index) instead of per row: the training
# rows were resampled with replacement, so plain K-fold can put copies of one
# clip in both the fitting and the validation part of a fold, which makes the
# CV score optimistic and biases the search towards overfit settings.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Focus on MAE (aligns with your metric)
    cv=GroupKFold(n_splits=3),          # 3-fold cross-validation, grouped by clip
    n_jobs=-1,                          # Use all CPU cores
    verbose=2,
)

# Run the grid search; `groups` is consumed by the CV splitter only.
grid_search.fit(X_train, y_train, groups=X_train.index.to_numpy())

# Best parameters and results
print("Best Parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)
evaluate_model_regression(grid_search.best_estimator_, X_train, X_test, y_train, y_test)