# Gender Classification
Train a simple gender classification model.
The model predicts either male or female from the given 88 GeMAPS features.

In [None]:
# Imports
# Python packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree, DecisionTreeClassifier

import tensorflow as tf
from keras import Sequential, layers
import shap

import IPython.display as ipd
import seaborn as sns

# Custom package
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent / "src"))
from age_by_voice.dataset_prep.dataset_preparator import Dataset_Perparator

In [None]:
# Shared seed passed as random_state to the train/test splits and the
# scikit-learn models below, so reruns give the same results.
rand_state = 420

In [None]:
# Input locations: the two prepared CSV exports share one directory, the
# audio clips live in a separate download folder.
csv_dir = "/home/chr1s/Dokumente/github/Programming/Python/Age_by_Voice/data/csv_saves/compare"
voices_path = f"{csv_dir}/save_voices_14000.csv"
features_path = f"{csv_dir}/save_features_14000.csv"
audio_path = "/home/chr1s/Downloads/cv-corpus-21.0-2025-03-14/en/clips/"

In [None]:
# Build the project's dataset preparator from the two CSV paths; its `voices`
# and `features` tables are used by the cells below.
# normalize is not passed here; the DNN section further down re-creates the
# preparator with normalize=True.
data_prep = Dataset_Perparator(voices_path, features_path)
# NOTE(review): check_balance is a project helper; presumably it reports the
# class balance of the gender labels — confirm against its implementation.
data_prep.check_balance(feature="gender")

In [None]:
# Pick one clip file name at random and play it inline
sampled_name = data_prep.voices["audio_file_name"].sample(1)
file = sampled_name.values[0]
file_path = os.path.join(audio_path, file)

# Last expression of the cell, so the audio widget is rendered as its output
ipd.Audio(file_path)

In [None]:
# Assemble the analysis table: every feature column except the clip id,
# plus a numeric gender label ('m' -> 1, 'f' -> 0)
gender_codes = data_prep.voices["voice_gender"].map({'m': 1, 'f': 0})
data = data_prep.features.drop(columns=["clip_id"]).assign(gender=gender_codes)

In [None]:
# Quick sanity check of the table: info() prints the column overview,
# describe() is the last expression and is shown as the cell output.
data.info()
data.describe()

In [None]:
# Bar chart of how many voices carry each gender label
gender_counts = data_prep.voices["voice_gender"].value_counts()
gender_counts.plot.bar()

In [None]:
# Separate the label column (y) from the feature columns (X)
y = data["gender"]
X = data.drop(columns="gender")

In [None]:
# Hold out 10% of the rows as a test set, seeded for reproducibility
split = train_test_split(X, y, test_size=0.1, random_state=rand_state)
X_train, X_test, y_train, y_test = split

In [None]:
# Show (rows, columns) of the training features as the cell output
X_train.shape

In [None]:
# Decision-tree baseline: depth capped at 15, seeded with the shared random state
tree = DecisionTreeClassifier(max_depth=15, random_state=rand_state)

In [None]:
# Fit the tree on the training split
tree.fit(X_train, y_train)

In [None]:
# Decision tree: accuracy on the training and the held-out split
tree_train_score = tree.score(X_train, y_train)
tree_test_score = tree.score(X_test, y_test)
print(f"Train score: {tree_train_score}")
print(f"Test score: {tree_test_score}")

In [None]:
# Decision tree, training split: confusion matrix as fractions of all samples
y_train_pred = tree.predict(X_train)
confusion_matrix(y_train, y_train_pred, normalize="all")

In [None]:
# Decision tree, test split: confusion matrix as fractions of all samples
y_test_pred = tree.predict(X_test)
confusion_matrix(y_test, y_test_pred, normalize="all")

In [None]:
# Logistic-regression baseline.
# The features used in this section are not normalized (the preparator above
# was created without normalize=True), so the solver can run out of its
# default iteration budget before converging; give it a larger limit.
logreg = LogisticRegression(max_iter=1000, random_state=rand_state)
logreg.fit(X_train, y_train)

In [None]:
# Logistic regression, training split: confusion matrix as fractions of all samples
y_train_pred = logreg.predict(X_train)
confusion_matrix(y_train, y_train_pred, normalize="all")

In [None]:
# Logistic regression, test split: confusion matrix as fractions of all samples
y_test_pred = logreg.predict(X_test)
confusion_matrix(y_test, y_test_pred, normalize="all")

In [None]:
# Logistic regression: accuracy on the training and the held-out split
logreg_train_score = logreg.score(X_train, y_train)
logreg_test_score = logreg.score(X_test, y_test)
print(f"Train score: {logreg_train_score}")
print(f"Test score: {logreg_test_score}")

In [None]:
# Re-create the preparator with normalize=True for the neural network and
# rebuild the feature table from it.
data_prep = Dataset_Perparator(voices_path, features_path, normalize=True)
data_prep.check_balance(feature="gender")

gender_codes = data_prep.voices["voice_gender"].map({'m': 1, 'f': 0})
data = data_prep.features.drop(columns=["clip_id"]).assign(gender=gender_codes)
X = data.drop(columns=["gender"])

# Two-column target: 'm' holds the gender code (1 for male), 'f' its complement
y = pd.DataFrame({'m': data['gender'], 'f': 1 - data['gender']})

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=rand_state
)

In [None]:
# DNN Model
# Fully connected network whose dense layers narrow from 2048 down to 16
# units, each followed by 20% dropout.
model = Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(2048, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.2),
    # The target has two mutually exclusive one-hot columns ('m', 'f'), so the
    # output is a softmax over both classes instead of two independent
    # sigmoids, which could rate both classes high or both low at once.
    layers.Dense(2, activation='softmax'),
])
# categorical_crossentropy is the loss that matches one-hot targets with a
# softmax output; 'accuracy' then counts a sample as correct when the
# highest-scoring class is the true one.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in batches of 50, holding back 20% of the training
# data for validation; the returned history object is kept in `history`.
history = model.fit(X_train, y_train, epochs=100, batch_size=50, validation_split=0.2)

In [None]:
# DNN: report the metric at index 1 of evaluate()'s result (index 0 is the loss).
# Each split is evaluated and printed in turn so the output order is unchanged.
train_metrics = model.evaluate(X_train, y_train)
print(f"Train score: {train_metrics[1]}")
test_metrics = model.evaluate(X_test, y_test)
print(f"Test score: {test_metrics[1]}")

In [None]:
# Draw one random voice entry, report its id and gender label, then play it
rand_voice = data_prep.voices.sample(1)
clip_id = rand_voice["clip_id"].values[0]
gender_label = rand_voice["voice_gender"].values[0]
print(f"Clip ID: {clip_id}, Gender:  {gender_label}")

clip_name = rand_voice["audio_file_name"].values[0]
file_path = os.path.join(audio_path, clip_name)

# Last expression of the cell, so the audio widget is rendered as its output
ipd.Audio(file_path)

