# Gender Classification
Train a simple gender classification model.
The model predicts either male or female from the given 88 GeMAPS features.

In [None]:
# Imports
# Python packages
import os
import numpy as np
import matplotlib.pyplot as plt
# scikit-learn (logistic regression, metrics, data splitting, decision trees)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree, DecisionTreeClassifier
# play audio
import IPython.display as ipd
import seaborn as sns

# Custom package
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent / "src"))
from age_by_voice.dataset_prep.dataset_preparator import Dataset_Perparator

In [None]:
# Seed shared by the train/test split and the decision tree so runs are reproducible
rand_state = 420

In [None]:
# Input locations: the voices and features CSV files handed to the dataset
# helper, and the directory that audio file names are joined onto for playback.
# NOTE(review): these are absolute paths on one machine — adjust them (or derive
# them from the repository root) before running the notebook elsewhere.
voices_path = "/home/chr1s/Dokumente/github/Programming/Python/Age_by_Voice/data/csv_saves/custom_features/save_voices_4000.csv"
features_path = "/home/chr1s/Dokumente/github/Programming/Python/Age_by_Voice/data/csv_saves/custom_features/save_features_4000.csv"
audio_path = "/home/chr1s/Downloads/cv-corpus-21.0-2025-03-14/en/clips/"

In [None]:
# Build the dataset helper from the voices and features CSV paths
data_prep = Dataset_Perparator(voices_path, features_path)
# NOTE(review): presumably reports how the "gender" classes are distributed —
# confirm against Dataset_Perparator.check_balance
data_prep.check_balance(feature="gender")

In [None]:
# Listen to one randomly chosen clip (a different one on every run)
file = data_prep.voices["audio_file_name"].sample(n=1).iloc[0]
file_path = os.path.join(audio_path, file)

# Leaving the widget as the cell's last expression renders the inline player
ipd.Audio(file_path)

In [None]:
# Analyze the data
# Assemble one frame holding the features plus a numeric gender label.
data = data_prep.features.copy()
data = data.drop(columns=["clip_id"])
# Encode the label: male -> 1, female -> 0. The assignment aligns rows on the
# DataFrame index.
# NOTE(review): this assumes features and voices share the same index / row
# order — confirm, or join the two tables on a shared key instead.
data["gender"] = data_prep.voices["voice_gender"].map({'m': 1, 'f': 0})
# Any label other than 'm'/'f' (and any feature row without a matching voice
# row) comes out of `map` as NaN, which would make the classifier's `fit` fail
# later on. Drop those rows and restore the integer dtype of the label.
data = data.dropna(subset=["gender"]).astype({"gender": "int64"})

In [None]:
# Pairwise correlation of every feature and the gender label.
# The colour scale is pinned to the full correlation range [-1, 1] so the
# midpoint of the diverging "coolwarm" palette corresponds to zero correlation
# instead of to whatever the middle of the observed values happens to be.
# `fmt` is not passed: it only formats cell annotations, which are not drawn.
sns.heatmap(data.corr(), vmin=-1, vmax=1, cmap="coolwarm")

In [None]:
# Print the column overview (dtypes, non-null counts), then show the summary
# statistics as the cell output
data.info()
data.describe()

In [None]:
# Bar chart of how many voices carry each gender label
gender_counts = data_prep.voices["voice_gender"].value_counts()
gender_counts.plot(kind='bar')

In [None]:
# Separate the target label from the feature matrix
y = data["gender"]
X = data.drop(columns=["gender"])

In [None]:
# Hold out 5% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=rand_state,
)

In [None]:
# Sanity check: (rows, columns) of the training feature matrix
X_train.shape

In [None]:
# Decision tree capped at a depth of 20; the fixed seed keeps training repeatable.
# NOTE(review): a depth of 20 may be enough to overfit — compare the train and
# test confusion matrices.
model = DecisionTreeClassifier(max_depth=20, random_state=rand_state)

In [None]:
# Fit the classifier on the training split
model.fit(X_train, y_train)

In [None]:
# Confusion matrix on the training split, scaled by the number of training
# samples so that each cell is a fraction of the whole split
y_train_pred = model.predict(X_train)
train_counts = confusion_matrix(y_train, y_train_pred)
train_counts / len(y_train)

In [None]:
# Confusion matrix on the held-out split, scaled by the number of test
# samples so that each cell is a fraction of the whole split
y_test_pred = model.predict(X_test)
test_counts = confusion_matrix(y_test, y_test_pred)
test_counts / len(y_test)

In [None]:
# Draw the fitted decision tree on a wide canvas.
fig = plt.figure(figsize=(20, 10))
plot_tree(
    model,
    # A plain list is accepted by every scikit-learn release, whereas some
    # releases validate this argument as a list and reject a pandas Index.
    feature_names=list(X.columns),
    # Class names are given in ascending label order: 0 -> "f", 1 -> "m".
    class_names=["f", "m"],
    filled=True,
)

plt.show()