# Gender Classification
Train a simple gender classification model.
The model predicts either male or female from the given 88 GeMAPS features.

In [None]:
# Imports
# Python packages
import os
import numpy as np
import matplotlib.pyplot as plt
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# play audio
import IPython.display as ipd

# Custom package
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent / "src"))
from age_by_voice.dataset_prep.dataset_preparator import Dataset_Perparator

In [None]:
# Locations of the prepared CSV files and the raw audio clips.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
_csv_dir = "/home/chr1s/Dokumente/github/Programming/Python/Age_by_Voice/data/csv_saves"
voices_path = f"{_csv_dir}/save_voices_6000.csv"      # voices table (CSV)
features_path = f"{_csv_dir}/save_features_6000.csv"  # features table (CSV)
# Directory that the audio file names from the voices table are joined onto.
audio_path = "/home/chr1s/Downloads/cv-corpus-21.0-2025-03-14/en/clips/"

In [None]:
# Build the dataset helper from the voices and features CSV paths. Later cells
# read its `voices` and `features` tables and call `prepare_gender_dataset`.
data_prep = Dataset_Perparator(voices_path, features_path)

In [None]:
# Draw one audio file name at random from the voices table and resolve it
# against the clips directory.
file = data_prep.voices["audio_file_name"].sample(n=1).values[0]
file_path = os.path.join(audio_path, file)

# Keep this as the last expression of the cell so the notebook renders the
# audio player widget.
ipd.Audio(file_path)

In [None]:
# Bar chart of how many rows the voices table holds per `voice_gender` value.
data_prep.voices["voice_gender"].value_counts().plot(kind="bar")

In [None]:
# Split the data into train and test parts.
# NOTE(review): test_size=0.001 presumably holds out only 0.1% of the rows
# (assuming it is a fraction, as in scikit-learn's train_test_split), so the
# score printed later rests on very few samples -- confirm this is intended.
X_train, X_test, y_train, y_test = data_prep.prepare_gender_dataset(test_size=0.001)

# Keep only the "male" column of the label frames as the training target.
y_train, y_test = y_train["male"], y_test["male"]

In [None]:
# Sanity check: report whether every clip_id occurs only once in the voices
# table and in the features table.
print(f"Voices clip_id unique: {data_prep.voices['clip_id'].is_unique}")
print(f"Features clip_id unique: {data_prep.features['clip_id'].is_unique}")

In [None]:
# Plot the class balance of the training labels. y_train is the single "male"
# column selected above, so the bars count its distinct values.
y_train.value_counts().plot(kind='bar')


In [None]:
# Logistic-regression classifier with the iteration cap raised far above
# scikit-learn's default of 100.
# NOTE(review): presumably needed for the solver to converge on these
# features -- confirm; standardizing the inputs may make it unnecessary.
model = LogisticRegression(max_iter=9000)

In [None]:
# Fit on the training split and report the mean accuracy on the test split.
model.fit(X_train, y_train)
print("Model score:", model.score(X_test, y_test))

# predict_proba returns one column per class, ordered like model.classes_
# (the sorted label values of the "male" target), not in a fixed
# "male, female" order. Print the class order so each column can be read
# unambiguously.
y_pred_proba = model.predict_proba(X_test)
print("Class order of the probability columns ('male' label values):", model.classes_)
print("Predicted probabilities per class:", y_pred_proba)