In [1]:
import numpy as np
import librosa
import soundfile as sf
import sounddevice as sd
import soundfile
import numpy as np
import wave
import os
from numpy.fft import fft, ifft
from IPython.display import Audio, Markdown
from collections import defaultdict
import speech_recognition
from io import BytesIO
import jiwer
from tqdm import tqdm

In [2]:
def extract_mfcc(file_path, n_mfcc=13):
    """
    Extract MFCC features from an audio file.
    
    Parameters:
    file_path (str): Path to the audio file.
    n_mfcc (int): Number of MFCC features to extract.
    
    Returns:
    np.ndarray: Extracted MFCC features.
    """
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)  # Average across time frames

def aggregate_speaker_features(file_paths):
    """
    Aggregate MFCC features for all recordings of a speaker.
    
    Parameters:
    file_paths (list): List of file paths for a single speaker's recordings.
    
    Returns:
    np.ndarray: Aggregated feature vector for the speaker.
    """
    features = [extract_mfcc(file) for file in file_paths]
    return np.mean(features, axis=0)

def get_speaker_files(base_path, split_token='-'):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with speaker IDs as keys and lists of file paths as values.
    """
    speaker_files = defaultdict(list)
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            speaker_id = file.split(split_token)[0]
            speaker_files[speaker_id].append(os.path.join(base_path, file))
    return speaker_files

def calculate_similarity_matrix(utterances):
    """
    Compute the voice similarity matrix for given utterances.
    
    Parameters:
    utterances (list): List of feature vectors representing each speaker.
    
    Returns:
    np.ndarray: Similarity matrix.
    """
    num_speakers = len(utterances)
    similarity_matrix = np.zeros((num_speakers, num_speakers))
    
    for i in range(num_speakers):
        for j in range(num_speakers):
            similarity_matrix[i, j] = np.dot(utterances[i], utterances[j]) / (np.linalg.norm(utterances[i]) * np.linalg.norm(utterances[j]))
    
    return similarity_matrix

def calculate_diagonal_dominance(matrix):
    """
    Calculate the diagonal dominance of a given similarity matrix.
    
    Parameters:
    matrix (np.ndarray): Similarity matrix.
    
    Returns:
    float: Diagonal dominance value.
    """
    N = matrix.shape[0]
    diag_avg = np.mean(np.diag(matrix))
    off_diag_mask = np.ones(matrix.shape, dtype=bool)
    np.fill_diagonal(off_diag_mask, 0)
    off_diag_avg = np.mean(matrix[off_diag_mask])
    
    return abs(diag_avg - off_diag_avg)

def calculate_deid(matrix_oo, matrix_op):
    """
    Calculate the DeID metric given the original and original-protected similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_op (np.ndarray): Original-protected similarity matrix.
    
    Returns:
    float: DeID metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_OP = calculate_diagonal_dominance(matrix_op)
    
    DeID = 1 - (D_diag_OP / D_diag_OO)
    return DeID

def calculate_gvd(matrix_oo, matrix_pp):
    """
    Calculate the GVD metric given the original and pseudonymised similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_pp (np.ndarray): Pseudonymised similarity matrix.
    
    Returns:
    float: GVD metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_PP = calculate_diagonal_dominance(matrix_pp)
    
    GVD = 10 * np.log10(D_diag_PP / D_diag_OO)
    return GVD

In [3]:
def get_wav_files(base_path, split_token='.'):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with audio IDs as keys and file path as values.
    """
    wav_files = {}
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            audio_id = file.split(split_token)[0]
            wav_files[audio_id] = os.path.join(base_path, file)
    return wav_files




def transcribe_audio(file_path, recognizer, model):
    print(file_path)
    
    audio_file = speech_recognition.AudioFile(file_path)
    with audio_file as source:
            audio_data = recognizer.record(source)
    
    text = recognizer.recognize_whisper(audio_data, model)
    if text == "":
        text = "empty"
    return text  

In [4]:
def transcribe_audio2(file_path, recognizer, model):
    y, sr = librosa.load(file_path, sr=None)
    print(file_path)

    wav_io = BytesIO()
    sf.write(wav_io, y, 16000, format='WAV')
    wav_io.seek(0)

    audio_file = speech_recognition.AudioFile(wav_io)
    with audio_file as source:
            audio_data = recognizer.record(source)

    text = recognizer.recognize_whisper(audio_data, 'tiny')
    wav_io.close()
    if text == "":
        text = "empty"
    return text  


# Libri-dev


voice privacy challange - asrbn

In [5]:
# Get speaker files and aggregate features
base_dir = 'data/dev-clean-audio'
speaker_files = get_speaker_files(base_dir)
utterances_original = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files.items()]

# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/libri_dev/asrbn_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD: {gvd:.2f}")

DeID: 0.93
GVD: -11.01


mcadams

In [6]:
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/libri_dev/mcadams_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD: {gvd:.2f}")

DeID: 0.56
GVD: 3.43


In [7]:
transcripts_path = "data/dev-clean-audio/transcript.txt"

transcrips_dict = {}
with open(transcripts_path, 'r') as f:
    for line in f:
        key, sentence = line.split(' ', 1)  # Split on the first space
        key = key.strip()
        sentence = sentence.strip()
        transcrips_dict[key] = sentence
        
clean_paths = get_wav_files('data/dev-clean-audio')
asrbn_paths = get_wav_files('data/anonymized/vpc/libri_dev/asrbn_libri_dev')
mcadams_paths = get_wav_files('data/anonymized/vpc/libri_dev/mcadams_libri_dev')
clean_wer = []
asrbn_wer = []
mcadams_wer = []


recognizer = speech_recognition.Recognizer()

for key, t in transcrips_dict.items():
    # clean
    transcript = t.lower()
    
    text = transcribe_audio(clean_paths[key], recognizer, 'tiny')
    if text != "empty":
        clean_wer.append(jiwer.wer(text.lower(), transcript))
    
    # asrbn
    text = transcribe_audio(asrbn_paths[key], recognizer, 'tiny')
    if text != "empty":
        asrbn_wer.append(jiwer.wer(text.lower(), transcript))
    
    # mcadams
    text = transcribe_audio(mcadams_paths[key], recognizer, 'tiny')
    if text != "empty":
        mcadams_wer.append(jiwer.wer(text.lower(), transcript))
    

print("clean: ", sum(clean_wer) / len(clean_wer))
print("asrbn: ", sum(asrbn_wer) / len(asrbn_wer))
print("mcadams: ", sum(mcadams_wer) / len(mcadams_wer))


data/dev-clean-audio\1272-128104-0000.wav
data/anonymized/vpc/libri_dev/asrbn_libri_dev\1272-128104-0000.wav
data/anonymized/vpc/libri_dev/mcadams_libri_dev\1272-128104-0000.wav
data/dev-clean-audio\1272-128104-0002.wav
data/anonymized/vpc/libri_dev/asrbn_libri_dev\1272-128104-0002.wav
data/anonymized/vpc/libri_dev/mcadams_libri_dev\1272-128104-0002.wav
data/dev-clean-audio\1272-128104-0003.wav
data/anonymized/vpc/libri_dev/asrbn_libri_dev\1272-128104-0003.wav
data/anonymized/vpc/libri_dev/mcadams_libri_dev\1272-128104-0003.wav
data/dev-clean-audio\1272-128104-0005.wav
data/anonymized/vpc/libri_dev/asrbn_libri_dev\1272-128104-0005.wav
data/anonymized/vpc/libri_dev/mcadams_libri_dev\1272-128104-0005.wav
data/dev-clean-audio\1272-128104-0006.wav
data/anonymized/vpc/libri_dev/asrbn_libri_dev\1272-128104-0006.wav
data/anonymized/vpc/libri_dev/mcadams_libri_dev\1272-128104-0006.wav
data/dev-clean-audio\1272-128104-0007.wav
data/anonymized/vpc/libri_dev/asrbn_libri_dev\1272-128104-0007.wav
d

In [8]:
print("clean: ", sum(clean_wer) / len(clean_wer))
print("asrbn: ", sum(asrbn_wer) / len(asrbn_wer))
print("mcadams: ", sum(mcadams_wer) / len(mcadams_wer))

clean:  0.23369438512576177
asrbn:  0.27050473295329663
mcadams:  0.5320727790382908


NAC

In [9]:
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/libri_dev/nac_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD: {gvd:.2f}")

# Calculate WER
nac_paths = get_wav_files('data/anonymized/vpc/libri_dev/nac_libri_dev')
nac_wer = []

recognizer = speech_recognition.Recognizer()

for key, t in transcrips_dict.items():
    # nac
    transcript = t.lower()
    text = transcribe_audio(nac_paths[key], recognizer, 'tiny')
    if nac_wer != "empty":
        nac_wer.append(jiwer.wer(text.lower(), transcript))
    

print("nac: ", sum(nac_wer) / len(nac_wer))

DeID: 0.94
GVD: -11.28
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0000.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0002.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0003.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0005.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0006.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0007.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0010.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0011.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0013.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-128104-0014.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-135031-0000.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-135031-0001.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-135031-0002.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-135031-0003.wav
data/anonymized/vpc/libri_dev/nac_libri_dev\1272-135031-0004.wav
da

SSL_SAS

In [10]:

base_dir_pseudo = 'data/anonymized/ssl_sas/ssl_sas_libri_dev'
speaker_files_SSLSAS = get_speaker_files(base_dir_pseudo)        
utterances_SSLSAS = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_SSLSAS.items()]

# sslsas
matrix_oo_SSLSAS = calculate_similarity_matrix(utterances_original)
matrix_op_SSLSAS = calculate_similarity_matrix(utterances_original + utterances_SSLSAS)[len(utterances_original):, :len(utterances_original)]
matrix_pp_SSLSAS = calculate_similarity_matrix(utterances_SSLSAS)

# Calculate DeID
deid_SSLSAS= calculate_deid(matrix_oo_SSLSAS, matrix_op_SSLSAS)
print(f"DeID ssl-sas NL: {deid_SSLSAS:.2f}")

# Calculate GVD
gvd_SSLSAS = calculate_gvd(matrix_oo_SSLSAS, matrix_pp_SSLSAS)
print(f"GVD ssl-sas NL: {gvd_SSLSAS:.2f}")
        
        

DeID ssl-sas NL: 0.92
GVD ssl-sas NL: -3.90


In [11]:
base_dir_pseudo = 'data/anonymized/ssl_sas/ssl_sas_libri_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
SSLSAS_paths = get_wav_files('data/anonymized/ssl_sas/ssl_sas_libri_dev', '_')

SSLSAS_wer = []
for key, t in transcrips_dict.items():
    transcript = t.lower()
    # SSL-SAS
    text = transcribe_audio2(SSLSAS_paths[key], recognizer, 'tiny')
    if text != "empty":
        SSLSAS_wer.append(jiwer.wer(text.lower(), transcript))
    
print("SSLSAS: ", sum(SSLSAS_wer) / len(SSLSAS_wer))

data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0000_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0002_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0003_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0005_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0006_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0007_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0010_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0011_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0013_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-128104-0014_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-135031-0000_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-135031-0001_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-135031-0002_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-135031-0003_gen.wav
data/anonymized/ssl_sas/ssl_sas_libri_dev\1272-135031-0004_gen