In [5]:
import numpy as np
import librosa
import soundfile as sf
import sounddevice as sd
import soundfile
import numpy as np
import wave
import os
from numpy.fft import fft, ifft
from IPython.display import Audio, Markdown
from collections import defaultdict
import speech_recognition
from io import BytesIO
import jiwer
from tqdm import tqdm

In [6]:
def extract_mfcc(file_path, n_mfcc=13):
    """
    Extract MFCC features from an audio file.
    
    Parameters:
    file_path (str): Path to the audio file.
    n_mfcc (int): Number of MFCC features to extract.
    
    Returns:
    np.ndarray: Extracted MFCC features.
    """
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)  # Average across time frames

def aggregate_speaker_features(file_paths):
    """
    Aggregate MFCC features for all recordings of a speaker.
    
    Parameters:
    file_paths (list): List of file paths for a single speaker's recordings.
    
    Returns:
    np.ndarray: Aggregated feature vector for the speaker.
    """
    features = [extract_mfcc(file) for file in file_paths]
    return np.mean(features, axis=0)

def get_speaker_files(base_path, split_token='-'):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with speaker IDs as keys and lists of file paths as values.
    """
    speaker_files = defaultdict(list)
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            speaker_id = file.split(split_token)[0]
            speaker_files[speaker_id].append(os.path.join(base_path, file))
    return speaker_files

def calculate_similarity_matrix(utterances):
    """
    Compute the voice similarity matrix for given utterances.
    
    Parameters:
    utterances (list): List of feature vectors representing each speaker.
    
    Returns:
    np.ndarray: Similarity matrix.
    """
    num_speakers = len(utterances)
    similarity_matrix = np.zeros((num_speakers, num_speakers))
    
    for i in range(num_speakers):
        for j in range(num_speakers):
            similarity_matrix[i, j] = np.dot(utterances[i], utterances[j]) / (np.linalg.norm(utterances[i]) * np.linalg.norm(utterances[j]))
    
    return similarity_matrix

def calculate_diagonal_dominance(matrix):
    """
    Calculate the diagonal dominance of a given similarity matrix.
    
    Parameters:
    matrix (np.ndarray): Similarity matrix.
    
    Returns:
    float: Diagonal dominance value.
    """
    N = matrix.shape[0]
    diag_avg = np.mean(np.diag(matrix))
    off_diag_mask = np.ones(matrix.shape, dtype=bool)
    np.fill_diagonal(off_diag_mask, 0)
    off_diag_avg = np.mean(matrix[off_diag_mask])
    
    return abs(diag_avg - off_diag_avg)

def calculate_deid(matrix_oo, matrix_op):
    """
    Calculate the DeID metric given the original and original-protected similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_op (np.ndarray): Original-protected similarity matrix.
    
    Returns:
    float: DeID metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_OP = calculate_diagonal_dominance(matrix_op)
    
    DeID = 1 - (D_diag_OP / D_diag_OO)
    return DeID

def calculate_gvd(matrix_oo, matrix_pp):
    """
    Calculate the GVD metric given the original and pseudonymised similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_pp (np.ndarray): Pseudonymised similarity matrix.
    
    Returns:
    float: GVD metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_PP = calculate_diagonal_dominance(matrix_pp)
    
    GVD = 10 * np.log10(D_diag_PP / D_diag_OO)
    return GVD

In [7]:
def get_wav_files(base_path, split_token='.'):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with audio IDs as keys and file path as values.
    """
    wav_files = {}
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            audio_id = file.split(split_token)[0]
            wav_files[audio_id] = os.path.join(base_path, file)
    return wav_files




def transcribe_audio(file_path, recognizer, model):
    # print(file_path)
    
    audio_file = speech_recognition.AudioFile(file_path)
    with audio_file as source:
            audio_data = recognizer.record(source)
    
    text = recognizer.recognize_whisper(audio_data, model)
    if text == "":
        text = "empty"
    return text  

def transcribe_audio2(file_path, recognizer, model):
    y, sr = librosa.load(file_path, sr=None)
    # print(file_path)

    wav_io = BytesIO()
    sf.write(wav_io, y, 16000, format='WAV')
    wav_io.seek(0)

    audio_file = speech_recognition.AudioFile(wav_io)
    with audio_file as source:
            audio_data = recognizer.record(source)

    text = recognizer.recognize_whisper(audio_data, 'tiny')
    wav_io.close()
    if text == "":
        text = "empty"
    return text  

# Gender mapping
spk2gender = {
    '1272': 'm', '1462': 'f', '1673': 'f', '174': 'm', '1919': 'f', '1988': 'f', 
    '1993': 'f', '2035': 'f', '2078': 'm', '2086': 'm', '2277': 'f', '2412': 'f', 
    '2428': 'm', '251': 'm', '2803': 'm', '2902': 'm', '3000': 'm', '3081': 'f', 
    '3170': 'm', '3536': 'f', '3576': 'f', '3752': 'm', '3853': 'f', '422': 'm', 
    '5338': 'f', '5536': 'm', '5694': 'm', '5895': 'f', '6241': 'm', '6295': 'm', 
    '6313': 'f', '6319': 'f', '6345': 'f', '652': 'm', '777': 'm', '7850': 'f', 
    '7976': 'm', '8297': 'm', '84': 'f', '8842': 'f'
}

# Libri-dev


voice privacy challange - asrbn

In [8]:
# Get speaker files and aggregate features
base_dir = 'data/dev-clean-audio'
speaker_files = get_speaker_files(base_dir)

# Separating male and female speakers
male_speakers = {spk: paths for spk, paths in speaker_files.items() if spk2gender[spk] == 'm'}
female_speakers = {spk: paths for spk, paths in speaker_files.items() if spk2gender[spk] == 'f'}

utterances_male = [aggregate_speaker_features(paths) for spk, paths in male_speakers.items()]
utterances_female = [aggregate_speaker_features(paths) for spk, paths in female_speakers.items()]

# Calculating similarity matrices for male and female
matrix_oo_male = calculate_similarity_matrix(utterances_male)
matrix_oo_female = calculate_similarity_matrix(utterances_female)

# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/libri_dev/asrbn_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Separating male and female speakers
pseudo_male_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'm'}
pseudo_female_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'f'}

utterances_pseudo_male = [aggregate_speaker_features(paths) for spk, paths in pseudo_male_speakers.items()]
utterances_pseudo_female = [aggregate_speaker_features(paths) for spk, paths in pseudo_female_speakers.items()]

# Male DeID and GVD
matrix_op_male = calculate_similarity_matrix(utterances_male + utterances_pseudo_male)[len(utterances_male):, :len(utterances_male)]
matrix_pp_male = calculate_similarity_matrix(utterances_pseudo_male)

deid_male = calculate_deid(matrix_oo_male, matrix_op_male)
gvd_male = calculate_gvd(matrix_oo_male, matrix_pp_male)

# Female DeID and GVD
matrix_op_female = calculate_similarity_matrix(utterances_female + utterances_pseudo_female)[len(utterances_female):, :len(utterances_female)]
matrix_pp_female = calculate_similarity_matrix(utterances_pseudo_female)

deid_female = calculate_deid(matrix_oo_female, matrix_op_female)
gvd_female = calculate_gvd(matrix_oo_female, matrix_pp_female)

# Displaying results
print(f"Male DeID: {deid_male:.2f}, Male GVD: {gvd_male:.2f}")
print(f"Female DeID: {deid_female:.2f}, Female GVD: {gvd_female:.2f}")


Male DeID: 0.86, Male GVD: -9.48
Female DeID: 0.95, Female GVD: -11.92


mcadams

In [9]:
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/libri_dev/mcadams_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Separating male and female speakers
pseudo_male_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'm'}
pseudo_female_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'f'}

utterances_pseudo_male = [aggregate_speaker_features(paths) for spk, paths in pseudo_male_speakers.items()]
utterances_pseudo_female = [aggregate_speaker_features(paths) for spk, paths in pseudo_female_speakers.items()]

# Male DeID and GVD
matrix_op_male = calculate_similarity_matrix(utterances_male + utterances_pseudo_male)[len(utterances_male):, :len(utterances_male)]
matrix_pp_male = calculate_similarity_matrix(utterances_pseudo_male)

deid_male = calculate_deid(matrix_oo_male, matrix_op_male)
gvd_male = calculate_gvd(matrix_oo_male, matrix_pp_male)

# Female DeID and GVD
matrix_op_female = calculate_similarity_matrix(utterances_female + utterances_pseudo_female)[len(utterances_female):, :len(utterances_female)]
matrix_pp_female = calculate_similarity_matrix(utterances_pseudo_female)

deid_female = calculate_deid(matrix_oo_female, matrix_op_female)
gvd_female = calculate_gvd(matrix_oo_female, matrix_pp_female)

# Displaying results
print(f"Male DeID: {deid_male:.2f}, Male GVD: {gvd_male:.2f}")
print(f"Female DeID: {deid_female:.2f}, Female GVD: {gvd_female:.2f}")

Male DeID: 0.58, Male GVD: 4.22
Female DeID: 0.56, Female GVD: 3.42


NAC

In [10]:
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/libri_dev/nac_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Separating male and female speakers
pseudo_male_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'm'}
pseudo_female_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'f'}

utterances_pseudo_male = [aggregate_speaker_features(paths) for spk, paths in pseudo_male_speakers.items()]
utterances_pseudo_female = [aggregate_speaker_features(paths) for spk, paths in pseudo_female_speakers.items()]

# Male DeID and GVD
matrix_op_male = calculate_similarity_matrix(utterances_male + utterances_pseudo_male)[len(utterances_male):, :len(utterances_male)]
matrix_pp_male = calculate_similarity_matrix(utterances_pseudo_male)

deid_male = calculate_deid(matrix_oo_male, matrix_op_male)
gvd_male = calculate_gvd(matrix_oo_male, matrix_pp_male)

# Female DeID and GVD
matrix_op_female = calculate_similarity_matrix(utterances_female + utterances_pseudo_female)[len(utterances_female):, :len(utterances_female)]
matrix_pp_female = calculate_similarity_matrix(utterances_pseudo_female)

deid_female = calculate_deid(matrix_oo_female, matrix_op_female)
gvd_female = calculate_gvd(matrix_oo_female, matrix_pp_female)

# Displaying results
print(f"Male DeID: {deid_male:.2f}, Male GVD: {gvd_male:.2f}")
print(f"Female DeID: {deid_female:.2f}, Female GVD: {gvd_female:.2f}")

Male DeID: 0.89, Male GVD: -10.74
Female DeID: 0.97, Female GVD: -11.16


SSL_SAS

In [11]:

base_dir_pseudo = 'data/anonymized/ssl_sas/ssl_sas_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Separating male and female speakers
pseudo_male_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'm'}
pseudo_female_speakers = {spk: paths for spk, paths in speaker_files_pseudo.items() if spk2gender[spk] == 'f'}

utterances_pseudo_male = [aggregate_speaker_features(paths) for spk, paths in pseudo_male_speakers.items()]
utterances_pseudo_female = [aggregate_speaker_features(paths) for spk, paths in pseudo_female_speakers.items()]

# Male DeID and GVD
matrix_op_male = calculate_similarity_matrix(utterances_male + utterances_pseudo_male)[len(utterances_male):, :len(utterances_male)]
matrix_pp_male = calculate_similarity_matrix(utterances_pseudo_male)

deid_male = calculate_deid(matrix_oo_male, matrix_op_male)
gvd_male = calculate_gvd(matrix_oo_male, matrix_pp_male)

# Female DeID and GVD
matrix_op_female = calculate_similarity_matrix(utterances_female + utterances_pseudo_female)[len(utterances_female):, :len(utterances_female)]
matrix_pp_female = calculate_similarity_matrix(utterances_pseudo_female)

deid_female = calculate_deid(matrix_oo_female, matrix_op_female)
gvd_female = calculate_gvd(matrix_oo_female, matrix_pp_female)

# Displaying results
print(f"Male DeID: {deid_male:.2f}, Male GVD: {gvd_male:.2f}")
print(f"Female DeID: {deid_female:.2f}, Female GVD: {gvd_female:.2f}")
        
        

Male DeID: 0.95, Male GVD: -3.85
Female DeID: 0.91, Female GVD: -6.20


# WER

In [14]:
# Load transcripts
transcripts_path = "data/dev-clean-audio/transcript.txt"

def transcribe_audio2(file_path, recognizer, model):
    y, sr = librosa.load(file_path, sr=None)
    # print(file_path)

    wav_io = BytesIO()
    sf.write(wav_io, y, 16000, format='WAV')
    wav_io.seek(0)

    audio_file = speech_recognition.AudioFile(wav_io)
    with audio_file as source:
            audio_data = recognizer.record(source)

    text = recognizer.recognize_whisper(audio_data, 'tiny')
    wav_io.close()
    if text == "":
        text = "empty"
    return text  

transcrips_dict = {}
with open(transcripts_path, 'r') as f:
    for line in f:
        key, sentence = line.split(' ', 1)  # Split on the first space
        key = key.strip()
        sentence = sentence.strip()
        transcrips_dict[key] = sentence

# Load paths for different audio sets
clean_paths = get_wav_files('data/dev-clean-audio')
asrbn_paths = get_wav_files('data/anonymized/vpc/libri_dev/asrbn_libri_dev')
mcadams_paths = get_wav_files('data/anonymized/vpc/libri_dev/mcadams_libri_dev')
nac_paths = get_wav_files('data/anonymized/vpc/libri_dev/nac_libri_dev')
sslsas_paths = get_wav_files('data/anonymized/ssl_sas/ssl_sas_libri_dev', '_')

# Initialize lists to store WER values for male and female speakers
clean_wer_male = []
clean_wer_female = []
asrbn_wer_male = []
asrbn_wer_female = []
mcadams_wer_male = []
mcadams_wer_female = []
nac_wer_male = []
nac_wer_female = []
sslsas_wer_male = []
sslsas_wer_female = []

# Initialize recognizer
recognizer = speech_recognition.Recognizer()

# Calculate WER separately for male and female speakers
for key, t in tqdm(transcrips_dict.items()):
    transcript = t.lower()
    speaker_id = key.split('-')[0]
    gender = spk2gender[speaker_id]
    
    # clean
    if key in clean_paths:
        text = transcribe_audio2(clean_paths[key], recognizer, 'tiny')
        if text != "empty":
            if gender == 'm':
                clean_wer_male.append(jiwer.wer(text.lower(), transcript))
            else:
                clean_wer_female.append(jiwer.wer(text.lower(), transcript))
    
    # asrbn
    if key in asrbn_paths:
        text = transcribe_audio2(asrbn_paths[key], recognizer, 'tiny')
        if text != "empty":
            if gender == 'm':
                asrbn_wer_male.append(jiwer.wer(text.lower(), transcript))
            else:
                asrbn_wer_female.append(jiwer.wer(text.lower(), transcript))
    
    # mcadams
    if key in mcadams_paths:
        text = transcribe_audio2(mcadams_paths[key], recognizer, 'tiny')
        if text != "empty":
            if gender == 'm':
                mcadams_wer_male.append(jiwer.wer(text.lower(), transcript))
            else:
                mcadams_wer_female.append(jiwer.wer(text.lower(), transcript))
    
    # nac
    if key in nac_paths:
        text = transcribe_audio2(nac_paths[key], recognizer, 'tiny')
        if text != "empty":
            if gender == 'm':
                nac_wer_male.append(jiwer.wer(text.lower(), transcript))
            else:
                nac_wer_female.append(jiwer.wer(text.lower(), transcript))
    
    # sslsas
    if key in sslsas_paths:
        text = transcribe_audio2(sslsas_paths[key], recognizer, 'tiny')
        if text != "empty":
            if gender == 'm':
                sslsas_wer_male.append(jiwer.wer(text.lower(), transcript))
            else:
                sslsas_wer_female.append(jiwer.wer(text.lower(), transcript))

# Calculate average WER for male and female speakers
clean_wer_male_avg = sum(clean_wer_male) / len(clean_wer_male) if clean_wer_male else float('nan')
clean_wer_female_avg = sum(clean_wer_female) / len(clean_wer_female) if clean_wer_female else float('nan')

asrbn_wer_male_avg = sum(asrbn_wer_male) / len(asrbn_wer_male) if asrbn_wer_male else float('nan')
asrbn_wer_female_avg = sum(asrbn_wer_female) / len(asrbn_wer_female) if asrbn_wer_female else float('nan')

mcadams_wer_male_avg = sum(mcadams_wer_male) / len(mcadams_wer_male) if mcadams_wer_male else float('nan')
mcadams_wer_female_avg = sum(mcadams_wer_female) / len(mcadams_wer_female) if mcadams_wer_female else float('nan')

nac_wer_male_avg = sum(nac_wer_male) / len(nac_wer_male) if nac_wer_male else float('nan')
nac_wer_female_avg = sum(nac_wer_female) / len(nac_wer_female) if nac_wer_female else float('nan')

sslsas_wer_male_avg = sum(sslsas_wer_male) / len(sslsas_wer_male) if sslsas_wer_male else float('nan')
sslsas_wer_female_avg = sum(sslsas_wer_female) / len(sslsas_wer_female) if sslsas_wer_female else float('nan')

# Display results
print(f"Clean WER Male: {clean_wer_male_avg}")
print(f"Clean WER Female: {clean_wer_female_avg}")
print(f"ASRBN WER Male: {asrbn_wer_male_avg}")
print(f"ASRBN WER Female: {asrbn_wer_female_avg}")
print(f"McAdams WER Male: {mcadams_wer_male_avg}")
print(f"McAdams WER Female: {mcadams_wer_female_avg}")
print(f"NAC WER Male: {nac_wer_male_avg}")
print(f"NAC WER Female: {nac_wer_female_avg}")
print(f"SSL-SAS WER Male: {sslsas_wer_male_avg}")
print(f"SSL-SAS WER Female: {sslsas_wer_female_avg}")

100%|██████████| 2321/2321 [42:19<00:00,  1.09s/it] 

Clean WER Male: 0.24064532517504425
Clean WER Female: 0.2263667482594354
ASRBN WER Male: 0.27644587722404496
ASRBN WER Female: 0.2635742301886673
McAdams WER Male: 0.6672536801121404
McAdams WER Female: 0.5171989811663409
NAC WER Male: 0.3297330855097981
NAC WER Female: 0.3079999092872537
SSL-SAS WER Male: 0.2471018092813415
SSL-SAS WER Female: 0.2363985067938919



