# Data Preparation Notebook

In [1]:
!rm -rf ~/.cache/huggingface/datasets

In [2]:
from datasets import load_dataset

female_dataset = load_dataset("Nourhann/Arabic-Diacritized-TTS", cache_dir="./cache")
male_dataset = load_dataset("MBZUAI/ClArTTS", cache_dir="./cache")

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [3]:
print(male_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'file', 'audio', 'sampling_rate', 'duration'],
        num_rows: 9500
    })
    test: Dataset({
        features: ['text', 'file', 'audio', 'sampling_rate', 'duration'],
        num_rows: 205
    })
})


In [4]:
print(female_dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 8874
    })
})


In [5]:
import IPython.display as ipd

# Access the first sample in the train split
sampleM = male_dataset['train'][1000]
audio_array = sampleM['audio']
sampling_rate = sampleM['sampling_rate']
text = sampleM['text']
# Display the text
print(f"Text: {text}")

# Play the audio
ipd.Audio(audio_array, rate=sampling_rate)

Text: .عِيٌّ .وَإِكْثَارُ .يَجِب اخْتِيَارُ الْكَلَامِ


In [6]:
sampleM

{'text': '.عِيٌّ .وَإِكْثَارُ .يَجِب اخْتِيَارُ الْكَلَامِ',
 'file': 'ch_16_arabic_tts_dataset_367.wav',
 'audio': [0.001373291015625,
  0.001190185546875,
  0.0018310546875,
  0.00201416015625,
  0.001312255859375,
  0.000762939453125,
  0.00103759765625,
  0.0010986328125,
  0.00091552734375,
  0.000823974609375,
  0.001129150390625,
  0.0009765625,
  0.00152587890625,
  0.001861572265625,
  0.00164794921875,
  0.000885009765625,
  0.0003662109375,
  0.0008544921875,
  0.000762939453125,
  0.000732421875,
  0.0003662109375,
  0.000335693359375,
  0.000518798828125,
  0.000762939453125,
  0.000823974609375,
  0.000946044921875,
  0.0010986328125,
  0.000885009765625,
  0.00067138671875,
  0.00079345703125,
  0.000885009765625,
  0.000396728515625,
  0.000274658203125,
  0.0006103515625,
  0.000885009765625,
  0.000457763671875,
  0.00042724609375,
  0.000732421875,
  0.000457763671875,
  9.1552734375e-05,
  0.000335693359375,
  0.00067138671875,
  0.000579833984375,
  0.0003662109375

In [7]:
import IPython.display as ipd

# Access the first sample in the train split of female_dataset
sampleF = female_dataset['train'][1000]
audio_array = sampleF['audio']['array']
sampling_rate = sampleF['audio']['sampling_rate']
text = sampleF['transcription']
# Display the text
print(f"Text: {text}")

# Play the audio
ipd.Audio(audio_array, rate=sampling_rate)

Text:  كَانَتْ الْاِبْنَة الأولى، والْحَفِيدَةُ الأولى أيضًا


In [8]:
sampleF

{'audio': {'path': None,
  'array': array([-3.05175781e-05,  0.00000000e+00, -3.05175781e-05, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00], shape=(76288,)),
  'sampling_rate': 22100},
 'transcription': ' كَانَتْ الْاِبْنَة الأولى، والْحَفِيدَةُ الأولى أيضًا'}

## Dataset Alignment (Male + Female)

### Female metadata extraction

In [9]:
import pandas as pd
from tqdm import tqdm

# Combine splits into a single DataFrame with a 'split' column
female_records = []
for split in female_dataset.keys():
    for idx, sample in enumerate(tqdm(female_dataset[split], desc=f"Processing {split} split")):
        file_path = sample["audio"].get("path")
        if file_path is None:
            file_path = f"female_{idx+1:04d}"
        female_records.append({
            "text": sample["transcription"],
            "file": file_path,
            "sampling_rate": sample["audio"]["sampling_rate"],
            "duration": round(len(sample["audio"]["array"]) / sample["audio"]["sampling_rate"], 2),
            "split": split
        })

female_df = pd.DataFrame(female_records)
female_df.head()

Processing train split: 100%|██████████| 8874/8874 [00:39<00:00, 222.51it/s] 


Unnamed: 0,text,file,sampling_rate,duration,split
0,اَللّـهُمَّ ما قُلْتُ فى جُمُعَتى هذِهِ مِنْ ...,female_0001,22100,27.97,train
1,اَللّـهُمَّ ما قُلْتُ فى جُمُعَتى هذِهِ مِنْ ...,female_0002,22100,27.97,train
2,اَللّـهُمَّ ما قُلْتُ فى جُمُعَتى هذِهِ مِنْ ...,female_0003,22100,27.42,train
3,اَللّـهُمَّ اِنّى تَعَمَّدْتُ اِلَيْكَ بِحاجَ...,female_0004,22100,44.25,train
4,لا اِلـهَ إلاّ اللهُ وَاللهُ اَكْبَرُ وَسُبْح...,female_0005,22100,59.58,train


In [10]:
# Save female metadata to CSV
female_df.to_csv("female_metadata.csv", index=False)

### Male metadata extraction

In [11]:
import pandas as pd
from tqdm import tqdm

male_dfs = []
total = sum(len(male_dataset[split]) for split in male_dataset.keys())
with tqdm(total=total, desc="Processing all male samples") as pbar:
    for split in male_dataset.keys():
        df = male_dataset[split].to_pandas()
        df["original_file"] = df["file"].str.split('.').str[0]
        df["file"] = [f"male_{i+1:04d}" for i in range(len(df))]
        df["split"] = split
        male_dfs.append(df[["text", "file", "original_file", "sampling_rate", "duration", "split"]])
        pbar.update(len(df))

male_df = pd.concat(male_dfs, ignore_index=True)
male_df.head()

Processing all male samples: 100%|██████████| 9705/9705 [00:33<00:00, 290.93it/s]


Unnamed: 0,text,file,original_file,sampling_rate,duration,split
0,.لِأَنَّهُ لَا يَرَى أَنَّهُ عَلَى السَّفَهِ ....,male_0001,ch_20_arabic_tts_dataset_48,40100,5.29,train
1,.الْعُمْرُ .يَنْقُصُ وَالذُّنُوبُ تَزِيدُ,male_0002,ch_05_arabic_tts_dataset_436,40100,3.06,train
2,.قَلِيلَةً بَيْنَ الْمُكْثِرِينَ .فَإِنَّ النَ...,male_0003,ch_20_arabic_tts_dataset_20,40100,3.32,train
3,.إلَّا تَفَضُّلَا .وَمِنْهَا: الْعَقْلُ .الَّذ...,male_0004,ch_16_arabic_tts_dataset_112,40100,5.08,train
4,.إلَّا مَثَلٌ مَرْذُولٌ .وَتَشْبِيهٌ .مَعْلُولٌ,male_0005,ch_16_arabic_tts_dataset_554,40100,3.48,train


In [12]:
# Save male metadata to CSV
male_df.to_csv("male_metadata.csv", index=False)

## Saving the wav files for StyleTTS2

In [13]:
import os

output_dir = "wav_data"
os.makedirs(output_dir, exist_ok=True)
target_sr = 24000

### Female WAV files

In [32]:
import numpy as np
import soundfile as sf
from scipy.signal import resample
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


def save_resampled_female_wav(row):
    idx = row.name  # row.name gives the DataFrame index
    split = row["split"]
    sample = female_dataset[split][idx]
    audio_array = sample["audio"]["array"]
    orig_sr = sample["audio"]["sampling_rate"]

    # Resample if needed
    if orig_sr != target_sr:
        num_samples = int(len(audio_array) * target_sr / orig_sr)
        audio_array = resample(audio_array, num_samples)
    
    out_path = os.path.join(output_dir, f"{row['file']}.wav")
    sf.write(out_path, audio_array, target_sr)
    return out_path

max_workers = os.cpu_count() or 4

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    for idx, row in female_df.iterrows():
        futures.append(executor.submit(save_resampled_female_wav, row))
    for _ in tqdm(as_completed(futures), total=len(futures), desc="Saving resampled female wavs"):
        pass

Saving resampled female wavs: 100%|██████████| 8874/8874 [04:17<00:00, 34.43it/s] 


### Male WAV files

In [11]:
import numpy as np
import soundfile as sf
from scipy.signal import resample
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm



def save_resampled_male_wav(row):
    split = row["split"]
    sample_idx = int(row["file"].split("_")[1]) - 1  # e.g., "male_0001" -> 0
    sample = male_dataset[split][sample_idx]
    audio_array = sample["audio"]
    orig_sr = sample["sampling_rate"]

    # Resample if needed
    if orig_sr != target_sr:
        num_samples = int(len(audio_array) * target_sr / orig_sr)
        audio_array = resample(audio_array, num_samples)
    
    # Save using the original_file column as filename
    out_path = os.path.join(output_dir, f"{row['original_file']}.wav")
    sf.write(out_path, audio_array, target_sr)
    return out_path

max_workers = os.cpu_count() or 4

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    for idx, row in male_df.iterrows():
        futures.append(executor.submit(save_resampled_male_wav, row))
    for _ in tqdm(as_completed(futures), total=len(futures), desc="Saving resampled male wavs"):
        pass

Saving resampled male wavs: 100%|██████████| 9705/9705 [17:03<00:00,  9.48it/s]  


## Gender Recognition

In [13]:
import requests
import os
import pandas as pd

# API endpoint and key
api_url = "https://api.genderrecognition.com/v1/voice-gender-recognition/api"
api_key = '78765665efa44c5d138a8d1d1b7bace07206f93442e8da92395d32c3775e2ea4'

# Collect all female wav files
female_wav_files = sorted(
    [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.startswith("female_") and f.endswith(".wav")]
)

# Try to load existing results to resume
csv_path = "female_gender_api_results.csv"
if os.path.exists(csv_path):
    existing_df = pd.read_csv(csv_path)
    done_files = set(existing_df['file'])
    results = existing_df.to_dict('records')
    print(f"Resuming: {len(done_files)} files already processed.")
else:
    done_files = set()
    results = []

batch_size = 50  # Save after every 50 files

for idx, file_path in enumerate(female_wav_files):
    file_name = os.path.basename(file_path)
    if file_name in done_files:
        continue  # Skip already processed files

    try:
        with open(file_path, 'rb') as file:
            files = {"file": ("recording.wav", file, "audio/wav")}
            headers = {"apiKey": api_key}
            response = requests.post(api_url, files=files, headers=headers)

        if response.status_code == 200:
            result = response.json()
            print(f"{file_name}: {result}")
            results.append({"file": file_name, "gender": result.get("gender", None)})
        else:
            print(f"{file_name}: Request failed with status code {response.status_code}: {response.text}")
            results.append({"file": file_name, "gender": None, "error": response.text})
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        results.append({"file": file_name, "gender": None, "error": "File not found"})
    except requests.RequestException as e:
        print(f"Error making request for {file_path}: {e}")
        results.append({"file": file_name, "gender": None, "error": str(e)})

    # Save progress every batch_size files
    if (len(results) % batch_size == 0) or (idx == len(female_wav_files) - 1):
        pd.DataFrame(results).to_csv(csv_path, index=False)
        print(f"Progress saved after {len(results)} files.")

print("Processing complete.")

female_0001.wav: {'gender': 'female'}
female_0002.wav: {'gender': 'female'}
female_0003.wav: {'gender': 'female'}
female_0004.wav: {'gender': 'female'}
female_0005.wav: {'gender': 'female'}
female_0006.wav: {'gender': 'female'}
female_0007.wav: {'gender': 'female'}
female_0008.wav: {'gender': 'female'}
female_0009.wav: {'gender': 'female'}
female_0010.wav: {'gender': 'female'}
female_0011.wav: {'gender': 'female'}
female_0012.wav: {'gender': 'female'}
female_0013.wav: {'gender': 'female'}
female_0014.wav: {'gender': 'female'}
female_0015.wav: {'gender': 'female'}
female_0016.wav: {'gender': 'female'}
female_0017.wav: {'gender': 'female'}
female_0018.wav: {'gender': 'female'}
female_0019.wav: {'gender': 'female'}
female_0020.wav: {'gender': 'female'}
female_0021.wav: {'gender': 'female'}
female_0022.wav: {'gender': 'female'}
female_0023.wav: {'gender': 'female'}
female_0024.wav: {'gender': 'female'}
female_0025.wav: {'gender': 'female'}
female_0026.wav: {'gender': 'female'}
female_0027.

In [14]:
import os
import librosa
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model, Wav2Vec2PreTrainedModel

# ----------------------
# SETUP CONFIGURATION
# ----------------------
MODEL_PATH = "audeering/wav2vec2-large-robust-6-ft-age-gender"
SAMPLE_RATE = 16000
AUDIO_DIR = "wav_data"
OUTPUT_CSV = "gender_predictions.csv"
BATCH_SIZE = 8  # GPU is optimized for batch inference
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ----------------------
# MODEL DEFINITION
# ----------------------


class ModelHead(torch.nn.Module):
    """Classification Head for Gender and Age Prediction."""
    def __init__(self, config, num_labels):
        super().__init__()
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = torch.nn.Dropout(config.final_dropout)
        self.out_proj = torch.nn.Linear(config.hidden_size, num_labels)

    def forward(self, features):
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class AgeGenderModel(Wav2Vec2PreTrainedModel):
    """Custom Model for Age & Gender Classification."""
    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.gender = ModelHead(config, 3)
        self.init_weights()

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        hidden_states = torch.mean(outputs[0], dim=1)

        logits_age = self.age(hidden_states)
        logits_gender = torch.softmax(self.gender(hidden_states), dim=1)

        return hidden_states, logits_age, logits_gender

class AudeeringModel:
    """Wrapper for Loading & Running Inference with Audeering Gender Model."""
    def __init__(self, model_path: str, device=DEVICE):
        self.device = device
        self.processor = Wav2Vec2Processor.from_pretrained(model_path)
        self.model = AgeGenderModel.from_pretrained(model_path).to(self.device)
        self.model.eval()

    def predict_batch(self, signals, sample_rate: int) -> list:
        # Convert signals tensor to list of numpy arrays
        if isinstance(signals, torch.Tensor):
            signals = [s.cpu().numpy() for s in signals]
        processed_batch = self.processor(signals, sampling_rate=sample_rate, return_tensors="pt", padding=True)
        input_values = processed_batch["input_values"].to(self.device)

        # Run batched inference
        _, _, gender_logits = self.model(input_values)
        gender_scores = gender_logits.detach().cpu().numpy()

        results = []
        for scores in gender_scores:
            results.append({
                "female": round(scores[0], 4),
                "male": round(scores[1], 4),
                "child": round(scores[2], 4),
            })

        return results

# ----------------------
# CUSTOM DATASET + DATALOADER
# ----------------------

def collate_fn(batch):
    file_paths, signals = zip(*batch)
    max_len = max([len(s) for s in signals])
    padded_signals = []
    for s in signals:
        if len(s) < max_len:
            padded = np.pad(s, (0, max_len - len(s)), mode='constant')
        else:
            padded = s
        padded_signals.append(padded)
    signals_tensor = torch.tensor(padded_signals, dtype=torch.float32)  # shape: [batch, max_len]
    return list(file_paths), signals_tensor

class AudioDataset(Dataset):
    """Custom PyTorch Dataset to Load & Process Audio Files."""
    def __init__(self, file_paths, sample_rate):
        self.file_paths = file_paths
        self.sample_rate = sample_rate

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        signal, _ = librosa.load(file_path, sr=self.sample_rate)
        return file_path, signal

# ----------------------
# BATCH INFERENCE EXECUTION
# ----------------------

def process_all_files(audio_dir, model, output_csv):
    """Run batched gender classification on 'female_df' WAV files."""
    files_to_process = sorted(
        [os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.startswith("female_") and f.endswith(".wav")]
    )

    # Load dataset & create DataLoader
    dataset = AudioDataset(files_to_process, SAMPLE_RATE)
    data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    results = []
    
    for batch in tqdm(data_loader, desc="Processing batches"):
        file_paths, signals = batch
        try:
            # Try on GPU
            gender_probs = model.predict_batch(signals, SAMPLE_RATE)
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"CUDA OOM on batch with files: {[os.path.basename(f) for f in file_paths]}")
                torch.cuda.empty_cache()
                # Move model to CPU
                model.model = model.model.cpu()
                model.device = torch.device("cpu")
                # Re-run on CPU
                gender_probs = model.predict_batch(signals, SAMPLE_RATE)
                # Move model back to GPU for next batch
                model.model = model.model.to(DEVICE)
                model.device = DEVICE
            else:
                raise  # re-raise other errors

        for file_path, probs in zip(file_paths, gender_probs):
            results.append({"file": os.path.basename(file_path), **probs})

    # Save all results to CSV
    pd.DataFrame(results).to_csv(output_csv, index=False)
    print(f"Processing complete. Results saved to {output_csv}")

# ----------------------
# EXECUTE IN JUPYTER NOTEBOOK
# ----------------------

# Initialize model
print("Loading Model...")
model = AudeeringModel(MODEL_PATH, device=DEVICE)

# Run batch processing in Jupyter
print("Processing WAV files in batches...")
process_all_files(AUDIO_DIR, model, OUTPUT_CSV)

Loading Model...


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/363M [00:00<?, ?B/s]

Processing WAV files in batches...


Processing batches:   0%|          | 0/1110 [00:00<?, ?it/s]

  signals_tensor = torch.tensor(padded_signals, dtype=torch.float32)  # shape: [batch, max_len]


CUDA OOM on batch with files: ['female_0001.wav', 'female_0002.wav', 'female_0003.wav', 'female_0004.wav', 'female_0005.wav', 'female_0006.wav', 'female_0007.wav', 'female_0008.wav']


: 

In [14]:
# Define AudeeringModel (gender part only)
import numpy as np
import torch
import torch.nn as nn
import librosa
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model, Wav2Vec2PreTrainedModel

class ModelHead(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class AgeGenderModel(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.gender = ModelHead(config, 3)
        self.init_weights()

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits_age = self.age.forward(hidden_states)
        logits_gender = torch.softmax(self.gender.forward(hidden_states), dim=1)
        return hidden_states, logits_age, logits_gender

class AudeeringModel(object):
    def __init__(self, model_path: str):
        self.model_path = model_path
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        processor = Wav2Vec2Processor.from_pretrained(self.model_path)
        model = AgeGenderModel.from_pretrained(self.model_path).to(device)
        model.eval()
        self.device = device
        self.processor = processor
        self.model = model

    def predict(self, signal: np.ndarray, sample_rate: int) -> dict:
        y = self.processor(signal, sampling_rate=sample_rate)
        y = y["input_values"][0]
        y = y.reshape(1, -1)
        y = torch.from_numpy(y).to(self.device)
        _, _, gender = self.model.forward(y)
        gender = gender.detach().cpu().numpy().tolist()[0]
        result = {
            "female": round(gender[0], 4),
            "male": round(gender[1], 4),
            "child": round(gender[2], 4),
        }
        return result

    def __call__(self, *args, **kwargs):
        return self.predict(*args, **kwargs)

In [20]:
import os
import json
import time
import librosa
import pandas as pd

# Model setup
model_path = "audeering/wav2vec2-large-robust-6-ft-age-gender"
sample_rate = 16000
infer_engine = AudeeringModel(model_path=model_path)

output_dir = "wav_data"
output_csv = "female_gender_probs_audeering.csv"

# Set this to a file path to test a single file, or None to process all
test_file = None
# Example: 
# test_file = "wav_data/female_4806.wav"

if test_file is not None:
    files_to_process = [test_file]
else:
    files_to_process = sorted(
        [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.startswith("female_") and f.endswith(".wav")]
    )

results = []
batch_size = 20

for idx, file_path in enumerate(files_to_process):
    try:
        # Load and resample audio
        signal, _ = librosa.load(file_path, sr=sample_rate)
        # Only gender part
        gender_probs = infer_engine(signal, sample_rate)
        gender_only = {
            "female": gender_probs["female"],
            "male": gender_probs["male"],
            "child": gender_probs["child"],
        }
        print(f"{os.path.basename(file_path)}: {gender_only}")
        results.append({"file": os.path.basename(file_path), **gender_only})
    except Exception as e:
        print(f"Error for {file_path}: {e}")
        results.append({"file": os.path.basename(file_path), "female": None, "male": None, "child": None, "error": str(e)})

    # Save progress every batch_size files or at the end
    if (len(results) % batch_size == 0) or (idx == len(files_to_process) - 1):
        pd.DataFrame(results).to_csv(output_csv, index=False)
        print(f"Progress saved after {len(results)} files.")

print("Processing complete.")

female_0001.wav: {'female': 0.9939, 'male': 0.0059, 'child': 0.0002}
female_0002.wav: {'female': 0.9939, 'male': 0.0059, 'child': 0.0002}
female_0003.wav: {'female': 0.9809, 'male': 0.0189, 'child': 0.0001}
female_0004.wav: {'female': 0.986, 'male': 0.0138, 'child': 0.0001}
female_0005.wav: {'female': 0.9953, 'male': 0.0047, 'child': 0.0}
female_0006.wav: {'female': 0.9898, 'male': 0.0101, 'child': 0.0001}
female_0007.wav: {'female': 0.9932, 'male': 0.0067, 'child': 0.0001}
female_0008.wav: {'female': 0.9933, 'male': 0.0066, 'child': 0.0001}
female_0009.wav: {'female': 0.9904, 'male': 0.0095, 'child': 0.0001}
female_0010.wav: {'female': 0.9924, 'male': 0.0075, 'child': 0.0001}
female_0011.wav: {'female': 0.9832, 'male': 0.0167, 'child': 0.0001}
female_0012.wav: {'female': 0.9413, 'male': 0.0582, 'child': 0.0005}
female_0013.wav: {'female': 0.9922, 'male': 0.0077, 'child': 0.0001}
female_0014.wav: {'female': 0.9919, 'male': 0.0079, 'child': 0.0002}
female_0015.wav: {'female': 0.9873, 'm

In [19]:
import pandas as pd

df = pd.read_csv('female_gender_probs_audeering.csv')

# Add predicted column using all three probability columns
df['predicted'] = df[['female', 'male', 'child']].idxmax(axis=1)

print(df['predicted'].value_counts())

predicted
female    4960
male      3914
Name: count, dtype: int64
