<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/speech-privacy/gender-neural-network-62-50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
################################################################################
# 1. IMPORTS
################################################################################
import torch
import torchaudio
import numpy as np
import random
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2Model
)
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
from typing import Any, Dict

################################################################################
# 2. LOAD THE DATASET
################################################################################
dataset = load_dataset("HamdanXI/speech-accent-archive-v2")["train"]
print("Full dataset size:", len(dataset))
print("Columns:", dataset.column_names)
print("Unique speakerid:", len(set(dataset["speakerid"])))

################################################################################
# 3. CHOOSE (A) "TRAIN" + (B) "TEST" SPEAKERS
################################################################################
A = 50
B = 20

# We'll focus on English native speakers only, for male/female
male_speakers_all = list({
    ex["speakerid"] for ex in dataset
    if ex["sex"] == "male" and ex["native_language"].lower() == "english"
})
female_speakers_all = list({
    ex["speakerid"] for ex in dataset
    if ex["sex"] == "female" and ex["native_language"].lower() == "english"
})

# Pick train/test subsets of speakers
random.seed(42)
chosen_male_train = random.sample(male_speakers_all, A)
chosen_female_train = random.sample(female_speakers_all, A)

# Remove them so we don't re-pick them for test
for spk in chosen_male_train:
    male_speakers_all.remove(spk)
for spk in chosen_female_train:
    female_speakers_all.remove(spk)

chosen_male_test = random.sample(male_speakers_all, B)
chosen_female_test = random.sample(female_speakers_all, B)

chosen_speakers_train = chosen_male_train + chosen_female_train
chosen_speakers_test  = chosen_male_test + chosen_female_test

print(f"Chosen training male:   {chosen_male_train}")
print(f"Chosen training female: {chosen_female_train}")
print(f"Chosen test male:       {chosen_male_test}")
print(f"Chosen test female:     {chosen_female_test}")

# Filter dataset into train / test based on speaker IDs
train_ds = dataset.filter(lambda x: x["speakerid"] in chosen_speakers_train)
test_ds  = dataset.filter(lambda x: x["speakerid"] in chosen_speakers_test)

################################################################################
# 4. LOAD Wav2Vec2 PROCESSOR & MODEL (FOR EMBEDDING EXTRACTION)
################################################################################
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# Use Wav2Vec2Model (no CTC head) for speaker-embedding extraction
wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Optionally freeze the entire model if you do NOT want to fine-tune it:
for param in wav2vec2_model.parameters():
    param.requires_grad = False

wav2vec2_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wav2vec2_model.to(device)

################################################################################
# 5. EMBEDDING EXTRACTION FUNCTION
################################################################################
def extract_wav2vec2_embedding(batch):
    """
    1) Resample audio to 16k if needed.
    2) Pass through Wav2Vec2 processor and model to get hidden states.
    3) Mean-pool over time dimension to get a single speaker embedding.
    4) Return that embedding plus the label (0=male, 1=female).
    """
    sr = batch["audio"]["sampling_rate"]
    audio_array = batch["audio"]["array"]
    wave_tensor = torch.tensor(audio_array, dtype=torch.float32)

    if sr != 16000:
        wave_tensor = torchaudio.functional.resample(wave_tensor, sr, 16000)

    # Process into model input format
    inputs = processor(wave_tensor, sampling_rate=16000, return_tensors="pt")

    # Move to GPU if available
    input_values = inputs["input_values"].to(device)
    attention_mask = inputs["attention_mask"].to(device) if "attention_mask" in inputs else None

    with torch.no_grad():
        outputs = wav2vec2_model(input_values, attention_mask=attention_mask)
        # outputs.last_hidden_state shape: (batch_size=1, time_steps, hidden_dim)
        hidden_states = outputs.last_hidden_state[0]  # shape: (time_steps, hidden_dim)

    # Mean pooling over time dimension -> single (hidden_dim,) embedding
    embedding = hidden_states.mean(dim=0).cpu().numpy()  # shape: (hidden_dim,)

    # Convert sex to label: 0 = male, 1 = female
    label = 0 if batch["sex"] == "male" else 1

    return {
        "speakerid": batch["speakerid"],
        "embedding": embedding,
        "label":     label
    }

################################################################################
# 6. BUILD NEW DATASET WITH EMBEDDINGS
################################################################################
# We map the train_ds and test_ds to new columns: "embedding", "label"
# CAUTION: Doing map over large sets might be slow in pure Python. For demonstration it's fine.
train_with_emb = train_ds.map(extract_wav2vec2_embedding)
test_with_emb  = test_ds.map(extract_wav2vec2_embedding)

# Keep only "embedding" and "label" columns
train_with_emb = train_with_emb.remove_columns(
    [col for col in train_with_emb.column_names if col not in ["embedding", "label"]]
)
test_with_emb = test_with_emb.remove_columns(
    [col for col in test_with_emb.column_names if col not in ["embedding", "label"]]
)

print(f"train_with_emb length: {len(train_with_emb)}")
print(f"test_with_emb length:  {len(test_with_emb)}")

################################################################################
# 7. CONVERT EMBEDDINGS TO NUMPY/torch FOR CLASSIFICATION
################################################################################
# Turn the datasets into lists of (embedding, label)
X_train = np.array([example["embedding"] for example in train_with_emb], dtype=np.float32)
y_train = np.array([example["label"] for example in train_with_emb], dtype=np.int64)

X_test = np.array([example["embedding"] for example in test_with_emb], dtype=np.float32)
y_test = np.array([example["label"] for example in test_with_emb], dtype=np.int64)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:",  X_test.shape)
print("y_test shape:",  y_test.shape)

################################################################################
# 8. TRAIN A SMALL MLP CLASSIFIER ON THE EMBEDDINGS
################################################################################
# If the base Wav2Vec2 model is "base-960h", the hidden dimension is 768.
input_dim = X_train.shape[1]

class SexNet(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=128):
        super(SexNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 2)  # 2 outputs (male vs. female)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Create torch datasets
X_train_tensor = torch.from_numpy(X_train)
y_train_tensor = torch.from_numpy(y_train)

X_test_tensor = torch.from_numpy(X_test)
y_test_tensor = torch.from_numpy(y_test)

train_dataset_torch = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
test_dataset_torch  = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

train_loader = torch.utils.data.DataLoader(train_dataset_torch, batch_size=8, shuffle=True)
test_loader  = torch.utils.data.DataLoader(test_dataset_torch, batch_size=8, shuffle=False)

# Instantiate the model
model_nn = SexNet(input_dim=input_dim, hidden_dim=128).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_nn.parameters(), lr=1e-3)
epochs = 10

model_nn.train()
for epoch in range(epochs):
    running_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model_nn(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

################################################################################
# 9. EVALUATE ON TEST SET
################################################################################
model_nn.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        logits = model_nn(batch_x)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch_y.numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy on speaker-level utterances: {accuracy*100:.2f}%")

################################################################################
# DONE
################################################################################
print("Finished speaker-attribute classification with Wav2Vec2 embeddings.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/966 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/21 [00:00<?, ?files/s]

(…)-00000-of-00021-818c2809e92a8447.parquet:   0%|          | 0.00/595M [00:00<?, ?B/s]

(…)-00001-of-00021-f17c4b6634e4ec1b.parquet:   0%|          | 0.00/552M [00:00<?, ?B/s]

(…)-00002-of-00021-434d1d2f1da6c52c.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

(…)-00003-of-00021-e9a66d1771a1620d.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00004-of-00021-89e8cc05a18a2ae8.parquet:   0%|          | 0.00/396M [00:00<?, ?B/s]

(…)-00005-of-00021-914e6e5c717ae7b5.parquet:   0%|          | 0.00/397M [00:00<?, ?B/s]

(…)-00006-of-00021-49cf9d4bacc67e9c.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

(…)-00007-of-00021-f9d15796b5839c46.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00008-of-00021-8746a6103caa7ca5.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00009-of-00021-09ac43cb4d900e19.parquet:   0%|          | 0.00/493M [00:00<?, ?B/s]

(…)-00010-of-00021-162759e35d0a2e5f.parquet:   0%|          | 0.00/493M [00:00<?, ?B/s]

(…)-00011-of-00021-0ed5f1616afcab2e.parquet:   0%|          | 0.00/507M [00:00<?, ?B/s]

(…)-00012-of-00021-580e490b16c4052b.parquet:   0%|          | 0.00/550M [00:00<?, ?B/s]

(…)-00013-of-00021-6c8afabe7f6a42cd.parquet:   0%|          | 0.00/533M [00:00<?, ?B/s]

(…)-00014-of-00021-7c3deececfbb0135.parquet:   0%|          | 0.00/575M [00:00<?, ?B/s]

(…)-00015-of-00021-0da9bdf727ff3db2.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

(…)-00016-of-00021-3430eeebcc9e3fe2.parquet:   0%|          | 0.00/513M [00:00<?, ?B/s]

(…)-00017-of-00021-1d2dba05aaaf9b14.parquet:   0%|          | 0.00/552M [00:00<?, ?B/s]

(…)-00018-of-00021-e6584fba93998feb.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

(…)-00019-of-00021-97254a2a3e23753a.parquet:   0%|          | 0.00/521M [00:00<?, ?B/s]

(…)-00020-of-00021-24f45d1c8f67cd37.parquet:   0%|          | 0.00/538M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2138 [00:00<?, ? examples/s]

Full dataset size: 2138
Columns: ['age', 'age_onset', 'birthplace', 'filename', 'native_language', 'sex', 'speakerid', 'country', 'file_missing?', 'file_exists', '__index_level_0__', 'audio']
Unique speakerid: 2138
Chosen training male:   [137, 71, 673, 584, 547, 161, 132, 1642, 2172, 1962, 1226, 76, 2121, 127, 536, 555, 1469, 2046, 73, 1709, 521, 1222, 538, 1295, 1895, 695, 2077, 443, 876, 427, 535, 869, 1075, 129, 937, 889, 637, 86, 1312, 1635, 148, 1072, 2165, 1661, 767, 951, 1786, 517, 112, 92]
Chosen training female: [1209, 1799, 1093, 1214, 87, 1371, 1277, 1951, 1358, 121, 1362, 1348, 164, 739, 69, 793, 1319, 78, 1493, 2029, 94, 75, 702, 1688, 1648, 1957, 1320, 1341, 503, 1113, 1339, 490, 1416, 1373, 1884, 540, 606, 1412, 1552, 1395, 119, 1694, 111, 542, 90, 774, 1351, 921, 1428, 817]
Chosen test male:       [1995, 1340, 1640, 497, 835, 492, 791, 825, 1530, 1352, 1259, 664, 489, 1878, 142, 106, 162, 515, 519, 1410]
Chosen test female:     [1873, 1571, 1215, 1212, 1280, 1323, 636,

Filter:   0%|          | 0/2138 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2138 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

train_with_emb length: 100
test_with_emb length:  40
X_train shape: (100, 768)
y_train shape: (100,)
X_test shape: (40, 768)
y_test shape: (40,)
Epoch 1/10 - Loss: 0.7211
Epoch 2/10 - Loss: 0.6940
Epoch 3/10 - Loss: 0.6993
Epoch 4/10 - Loss: 0.6907
Epoch 5/10 - Loss: 0.6848
Epoch 6/10 - Loss: 0.6850
Epoch 7/10 - Loss: 0.6782
Epoch 8/10 - Loss: 0.6719
Epoch 9/10 - Loss: 0.6746
Epoch 10/10 - Loss: 0.6715

Test Accuracy on speaker-level utterances: 62.50%
Finished speaker-attribute classification with Wav2Vec2 embeddings.
