# Gender Classification
Train a simple gender classification model.
The model predicts either male or female from the given 88 GeMAPS features.

In [None]:
# Imports
# Python packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import shap

import IPython.display as ipd
import seaborn as sns

In [None]:
# Seed passed as random_state to the train/test split and to the random forest
# so that repeated runs of the notebook give the same results.
rand_state: int = 10

# Load the data
First we need the dataset we are going to work on.
Load the dataframes from the local CSV files and merge them into a single DataFrame that we can use for machine learning.

In [None]:
# CSV files with the per-clip voice metadata and the extracted features,
# given relative to the notebook's working directory.
voices_path = "../data/csv_saves/gemaps/voices.csv"
features_path = "../data/csv_saves/gemaps/features.csv"
# Directory with the raw audio clips. The default is the original,
# machine-specific location; set the CV_CLIPS_DIR environment variable to
# point the notebook at another copy of the corpus without editing this cell.
audio_path = os.environ.get(
    "CV_CLIPS_DIR",
    "/home/chr1s/Downloads/cv-corpus-21.0-2025-03-14/en/clips/",
)

In [None]:
# Read both CSV exports into DataFrames in one pass.
voices, features = (
    pd.read_csv(csv_path) for csv_path in (voices_path, features_path)
)
# Preview the first rows of the voice metadata.
voices.head()

In [None]:
# Attach each clip's gender label to its feature row by joining on "clip_id",
# then use the clip id as the row index.
data = (
    voices[["clip_id", "voice_gender"]]
    .merge(features, on="clip_id")
    .set_index("clip_id")
)
# The voices table is not needed beyond this point; drop the reference.
del voices

In [None]:
# Preview the first five rows of the merged frame.
data.head(n=5)

In [None]:
# Turn the gender label into two binary indicator columns.
# The comparison is vectorized instead of calling a Python lambda per row.
# NOTE: any label other than "m"/"f" (including missing values) becomes 0 in
# both columns, so such rows count as "not male" in the target used below.
data["voice_gender_m"] = (data["voice_gender"] == "m").astype(int)
data["voice_gender_f"] = (data["voice_gender"] == "f").astype(int)
data.drop(columns="voice_gender", inplace=True)
# Histogram of the male indicator as a quick check of the class balance.
data["voice_gender_m"].hist()

# Understand data
Next we need to get an even better understanding of the data...

In [None]:
# Print the column overview (dtypes and non-null counts) of the merged frame.
data.info()

In [None]:
# Summary statistics of the columns in the merged frame.
data.describe()

#### Correlation
Let's correlate the features with the gender.
This gives us some basic information about which features could be relevant for gender detection.

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. When enabled it would:
#   1. compute the correlation matrix of all columns in `data`,
#   2. for each of the two gender indicator columns, pick the 10 features with
#      the largest absolute correlation (excluding the indicators themselves),
#   3. draw an annotated heatmap of the correlations among those features and
#      the two indicators.
"""
corr = data.corr()
target_cols = ["voice_gender_m", "voice_gender_f"]
corr_with_targets = corr[target_cols]
top_features = set()
for col in target_cols:
    abs_corr = corr_with_targets[col].abs().drop(target_cols, errors='ignore')
    abs_corr = abs_corr[abs_corr < 1.0]
    top_features.update(abs_corr.nlargest(10).index.tolist())
top_features.update(target_cols)
top_features_list = list(top_features)
subset_corr = corr.loc[top_features_list, top_features_list]
plt.figure(figsize=(12, 10))
sns.heatmap(subset_corr, annot=True, fmt=".2f", cmap="coolwarm", 
            vmin=-1, vmax=1, center=0)
plt.title("Top Correlations with Voice Gender Features")
plt.tight_layout()
plt.show()
"""

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. When enabled it would render the selected correlations as a
# styled table sorted by absolute correlation.
# NOTE: it relies on `corr_with_targets` and `top_features_list`, which are
# only defined inside the (also disabled) heatmap cell, and on `display`,
# which is not imported by name in this file (presumably the notebook's
# built-in display; verify before enabling).
"""
corr_table = corr_with_targets.loc[top_features_list].stack().reset_index()
corr_table.columns = ['Feature', 'Gender', 'Correlation']
corr_table['Abs_Correlation'] = corr_table['Correlation'].abs()
corr_table = corr_table.sort_values('Abs_Correlation', ascending=False)
display(corr_table[['Feature', 'Gender', 'Correlation']].style
        .background_gradient(cmap='coolwarm', subset=['Correlation'])
        .format({'Correlation': '{:.2f}'})
        .set_caption('Top Correlations with Voice Gender Features'))
"""

#### Random Forest Classifier
Let's create a random forest model and try to classify the gender from the audio features.

In [None]:
# Separate the frame into the feature matrix X (everything except the two
# gender indicators) and the binary target y (1 = male).
target_cols = ["voice_gender_m", "voice_gender_f"]
X = data.drop(labels=target_cols, axis="columns")
y = data.loc[:, "voice_gender_m"]

In [None]:
# Candidate hyperparameter values (number of trees, maximum tree depth) for
# the random-forest grid search.
forest_feature_grid = {"n_estimators": [10, 100, 1000], "max_depth": [5, 10, 15]}

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. It runs a grid search over `forest_feature_grid` on the full data.
# The snippet previously used cv=1, which scikit-learn rejects (k-fold
# cross-validation needs at least 2 splits); it now uses 5 folds so that it
# works when the cell is enabled.
"""
# grid search the best parameters
forest = RandomForestClassifier(random_state=rand_state)
forest_grid = GridSearchCV(forest, forest_feature_grid, verbose=2, cv=5)
forest_grid.fit(X, y)
"""

#### Train a Random Forest with fitting parameters
If I have time, I promise to use GridSearch for this ^^"

In [None]:
# Rebuild the feature matrix and the binary target (1 = male) from `data`,
# so this section does not depend on the grid-search cells having been run.
target_cols = ["voice_gender_m", "voice_gender_f"]
X, y = data.drop(columns=target_cols), data["voice_gender_m"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rand_state,
)

In [None]:
# Random forest with 500 shallow trees (depth limited to 5), seeded so the
# fitted model is reproducible.
forest = RandomForestClassifier(random_state=rand_state, max_depth=5, n_estimators=500)
# Fit on the training split; as the last expression, the fitted estimator is
# echoed by the notebook.
forest.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for both splits.
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# Predicted probability taken from the second predict_proba column
# (presumably class 1 = male, assuming both classes occur in y_train).
y_train_pred_proba = forest.predict_proba(X_train)[:, 1]
y_test_pred_proba = forest.predict_proba(X_test)[:, 1]

# Report accuracy on both splits to expose over- or under-fitting.
print(
    f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}",
    f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}",
    sep="\n",
)

In [None]:
# Distribution of the absolute gap between the true label (0/1) and the
# predicted probability on the test split.
y_test.sub(y_test_pred_proba).abs().describe()

In [None]:
# One row per feature with the forest's feature_importances_ value,
# ordered from most to least important.
feature_importances = pd.DataFrame(
    {"feature_importance": forest.feature_importances_},
    index=X_train.columns,
).sort_values(by="feature_importance", ascending=False)
feature_importances

# Time for SHAP!
Let's see what the model thinks about the features!

In [None]:
# Pick one held-out clip by position and show its features and prediction.
i = 10
clip_id = X_test.index[i]
print(clip_id)
# Positional lookup, consistent with y_test.iloc[i] below and with the
# prediction arrays, which are also indexed by position. A label lookup via
# .loc[clip_id] would return several rows if a clip id occurred more than once.
print(X_test.iloc[i], "\n")
print(f"Model prediction: {y_test_pred[i]}, probability: {y_test_pred_proba[i]:.4f}, true label: {y_test.iloc[i]}")

In [None]:
# Tree explainer built on the fitted forest.
explainer = shap.TreeExplainer(forest)
# SHAP values for the single test row at position i.
shap_values = explainer.shap_values(X_test.iloc[i, :])

In [None]:
# Force plot for the single clip, using index 1 of both the expected value and
# the SHAP values (presumably the "male" class; verify against forest.classes_).
# NOTE: expects `explainer` and `shap_values` from the TreeExplainer cell; both
# names are rebound by the later shap.Explainer cell.
shap.force_plot(
    explainer.expected_value[1],
    shap_values[:, 1],
    feature_names=X_test.columns,
    matplotlib=True,
)

In [None]:
# Generic SHAP explainer for the forest, with the training split passed as its
# second argument.
# NOTE: this rebinds `explainer` and `shap_values`, replacing the objects
# created for the single-clip force plot.
explainer = shap.Explainer(forest, X_train)
# Explain the first 1000 test rows, with the additivity check switched off.
shap_values = explainer(X_test.iloc[:1000], check_additivity=False)

In [None]:
# Beeswarm summary using index 1 on the last axis of the explanation
# (presumably the "male" class output; verify against forest.classes_).
male_class_shap = shap_values[:, :, 1]
shap.plots.beeswarm(male_class_shap)